Index: sys/arch/amd64/amd64/conf.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/conf.c,v diff -u -p -r1.81 conf.c --- sys/arch/amd64/amd64/conf.c 12 Jun 2024 12:54:54 -0000 1.81 +++ sys/arch/amd64/amd64/conf.c 11 Sep 2024 11:29:24 -0000 @@ -137,6 +137,7 @@ cdev_decl(cy); #include "bktr.h" #include "ksyms.h" #include "kstat.h" +#include "llt.h" #include "usb.h" #include "uhid.h" #include "fido.h" @@ -215,7 +216,8 @@ struct cdevsw cdevsw[] = cdev_notdef(), /* 28 was LKM */ cdev_notdef(), /* 29 */ cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ - cdev_notdef(), /* 31 */ + cdev_lltrace_init(NLLT,lltrace), + /* 31: lltrace */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ cdev_notdef(), /* 34 */ Index: sys/arch/amd64/amd64/intr.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/intr.c,v diff -u -p -r1.61 intr.c --- sys/arch/amd64/amd64/intr.c 25 Jun 2024 12:02:48 -0000 1.61 +++ sys/arch/amd64/amd64/intr.c 11 Sep 2024 11:29:24 -0000 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -543,6 +544,9 @@ intr_handler(struct intrframe *frame, st return 0; } + LLTRACE_CPU(ci, lltrace_intr_enter, LLTRACE_INTR_T_HW, + ci->ci_isources[ih->ih_slot]->is_idtvec); + #ifdef MULTIPROCESSOR if (ih->ih_flags & IPL_MPSAFE) need_lock = 0; @@ -552,14 +556,22 @@ intr_handler(struct intrframe *frame, st if (need_lock) __mp_lock(&kernel_lock); #endif + floor = ci->ci_handled_intr_level; ci->ci_handled_intr_level = ih->ih_level; + + LLTRACE_CPU(ci, lltrace_fn_enter, ih->ih_fun); rc = (*ih->ih_fun)(ih->ih_arg ? ih->ih_arg : frame); + LLTRACE_CPU(ci, lltrace_fn_leave, ih->ih_fun); + ci->ci_handled_intr_level = floor; + #ifdef MULTIPROCESSOR if (need_lock) __mp_unlock(&kernel_lock); #endif + LLTRACE_CPU(ci, lltrace_intr_leave, LLTRACE_INTR_T_HW, + ci->ci_isources[ih->ih_slot]->is_idtvec); return rc; } Index: sys/arch/amd64/amd64/ipi.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/ipi.c,v diff -u -p -r1.18 ipi.c --- sys/arch/amd64/amd64/ipi.c 10 Nov 2022 08:26:54 -0000 1.18 +++ sys/arch/amd64/amd64/ipi.c 11 Sep 2024 11:29:24 -0000 @@ -35,9 +35,10 @@ #include #include #include +#include +#include #include -#include #include #include #include @@ -45,6 +46,8 @@ void x86_send_ipi(struct cpu_info *ci, int ipimask) { + LLTRACE(lltrace_ipi, ci->ci_cpuid); + x86_atomic_setbits_u32(&ci->ci_ipis, ipimask); /* Don't send IPI to cpu which isn't (yet) running. 
*/ @@ -57,6 +60,8 @@ x86_send_ipi(struct cpu_info *ci, int ip int x86_fast_ipi(struct cpu_info *ci, int ipi) { + LLTRACE(lltrace_ipi, ci->ci_cpuid); + if (!(ci->ci_flags & CPUF_RUNNING)) return (ENOENT); @@ -72,6 +77,8 @@ x86_broadcast_ipi(int ipimask) int count = 0; CPU_INFO_ITERATOR cii; + LLTRACE_CPU(self, lltrace_ipi, ~0); + CPU_INFO_FOREACH(cii, ci) { if (ci == self) continue; @@ -95,17 +102,22 @@ x86_ipi_handler(void) int bit; int floor; + LLTRACE_CPU(ci, lltrace_intr_enter, LLTRACE_INTR_T_IPI, 0); + floor = ci->ci_handled_intr_level; ci->ci_handled_intr_level = ci->ci_ilevel; pending = atomic_swap_uint(&ci->ci_ipis, 0); for (bit = 0; bit < X86_NIPI && pending; bit++) { if (pending & (1 << bit)) { - pending &= ~(1 << bit); + LLTRACE_CPU(ci, lltrace_fn_enter, ipifunc[bit]); (*ipifunc[bit])(ci); + LLTRACE_CPU(ci, lltrace_fn_leave, ipifunc[bit]); evcount_inc(&ipi_count); } } ci->ci_handled_intr_level = floor; + + LLTRACE_CPU(ci, lltrace_intr_leave, LLTRACE_INTR_T_IPI, 0); } Index: sys/arch/amd64/amd64/lapic.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v diff -u -p -r1.72 lapic.c --- sys/arch/amd64/amd64/lapic.c 3 Apr 2024 02:01:21 -0000 1.72 +++ sys/arch/amd64/amd64/lapic.c 11 Sep 2024 11:29:24 -0000 @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -476,12 +477,16 @@ lapic_clockintr(void *arg, struct intrfr struct cpu_info *ci = curcpu(); int floor; + LLTRACE_CPU(ci, lltrace_intr_enter, LLTRACE_INTR_T_CLOCK, 0); + floor = ci->ci_handled_intr_level; ci->ci_handled_intr_level = ci->ci_ilevel; clockintr_dispatch(&frame); ci->ci_handled_intr_level = floor; evcount_inc(&clk_count); + + LLTRACE_CPU(ci, lltrace_intr_leave, LLTRACE_INTR_T_CLOCK, 0); } void Index: sys/arch/amd64/amd64/softintr.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/softintr.c,v diff -u -p -r1.10 softintr.c --- sys/arch/amd64/amd64/softintr.c 11 Sep 2020 09:27:09 -0000 1.10 +++ sys/arch/amd64/amd64/softintr.c 11 Sep 2024 11:29:24 -0000 @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -82,6 +83,8 @@ softintr_dispatch(int which) struct x86_soft_intrhand *sih; int floor; + LLTRACE_CPU(ci, lltrace_intr_enter, LLTRACE_INTR_T_SW, which); + floor = ci->ci_handled_intr_level; ci->ci_handled_intr_level = ci->ci_ilevel; @@ -99,12 +102,15 @@ softintr_dispatch(int which) uvmexp.softs++; mtx_leave(&si->softintr_lock); - + LLTRACE_CPU(ci, lltrace_fn_enter, sih->sih_fn); (*sih->sih_fn)(sih->sih_arg); + LLTRACE_CPU(ci, lltrace_fn_leave, sih->sih_fn); } KERNEL_UNLOCK(); ci->ci_handled_intr_level = floor; + + LLTRACE_CPU(ci, lltrace_intr_leave, LLTRACE_INTR_T_SW, which); } /* Index: sys/arch/arm64/arm64/conf.c =================================================================== RCS file: /cvs/src/sys/arch/arm64/arm64/conf.c,v diff -u -p -r1.24 conf.c --- sys/arch/arm64/arm64/conf.c 12 Jun 2024 02:50:25 -0000 1.24 +++ sys/arch/arm64/arm64/conf.c 11 Sep 2024 11:29:24 -0000 @@ -91,6 +91,7 @@ cdev_decl(lpt); #include "bktr.h" #include "ksyms.h" #include "kstat.h" +#include "llt.h" #include "usb.h" #include "uhid.h" #include "fido.h" @@ -156,7 +157,8 @@ struct cdevsw cdevsw[] = cdev_notdef(), /* 28 was LKM */ cdev_notdef(), /* 29 */ cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ - cdev_notdef(), /* 31 */ + cdev_lltrace_init(NLLT,lltrace), + /* 31: lltrace */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ cdev_notdef(), /* 34 */ Index: sys/arch/arm64/dev/agintc.c 
=================================================================== RCS file: /cvs/src/sys/arch/arm64/dev/agintc.c,v diff -u -p -r1.59 agintc.c --- sys/arch/arm64/dev/agintc.c 3 Jul 2024 22:37:00 -0000 1.59 +++ sys/arch/arm64/dev/agintc.c 11 Sep 2024 11:29:24 -0000 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1121,7 +1122,11 @@ agintc_run_handler(struct intrhand *ih, else arg = frame; + LLTRACE(lltrace_irq, ih->ih_ipl == IPL_IPI ? LLTRACE_IRQ_IPI : 0, + ih->ih_irq); handled = ih->ih_func(arg); + LLTRACE(lltrace_irqret, ih->ih_ipl == IPL_IPI ? LLTRACE_IRQ_IPI : 0, + ih->ih_irq); if (handled) ih->ih_count.ec_count++; @@ -1466,6 +1471,8 @@ agintc_send_ipi(struct cpu_info *ci, int { struct agintc_softc *sc = agintc_sc; uint64_t sendmask; + + LLTRACE(lltrace_ipi, ci->ci_cpuid); if (ci == curcpu() && id == ARM_IPI_NOP) return; Index: sys/arch/sparc64/conf/files.sparc64 =================================================================== RCS file: /cvs/src/sys/arch/sparc64/conf/files.sparc64,v diff -u -p -r1.156 files.sparc64 --- sys/arch/sparc64/conf/files.sparc64 29 Mar 2024 21:11:31 -0000 1.156 +++ sys/arch/sparc64/conf/files.sparc64 11 Sep 2024 11:29:26 -0000 @@ -108,19 +108,19 @@ file arch/sparc64/dev/gfb.c gfb include "dev/pci/files.pci" major {wd = 12} -device psycho: pcibus, iommu +device psycho: pcibus, iommu, vmem attach psycho at mainbus file arch/sparc64/dev/psycho.c psycho -device schizo: pcibus, iommu +device schizo: pcibus, iommu, vmem attach schizo at mainbus file arch/sparc64/dev/schizo.c schizo -device pyro: pcibus, iommu, msi +device pyro: pcibus, iommu, vmem, msi attach pyro at mainbus file arch/sparc64/dev/pyro.c pyro -device vpci: pcibus, viommu, msi +device vpci: pcibus, viommu, vmem, msi attach vpci at mainbus file arch/sparc64/dev/vpci.c vpci Index: sys/arch/sparc64/dev/ebus.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/ebus.c,v diff -u -p -r1.27 ebus.c --- sys/arch/sparc64/dev/ebus.c 14 May 2024 08:26:13 -0000 1.27 +++ sys/arch/sparc64/dev/ebus.c 11 Sep 2024 11:29:26 -0000 @@ -53,7 +53,6 @@ int ebus_debug = 0x0; #include #include #include -#include #include #include #include Index: sys/arch/sparc64/dev/ebus_mainbus.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/ebus_mainbus.c,v diff -u -p -r1.13 ebus_mainbus.c --- sys/arch/sparc64/dev/ebus_mainbus.c 29 Mar 2024 21:29:33 -0000 1.13 +++ sys/arch/sparc64/dev/ebus_mainbus.c 11 Sep 2024 11:29:26 -0000 @@ -33,7 +33,6 @@ extern int ebus_debug; #include #include #include -#include #include #include #include Index: sys/arch/sparc64/dev/iommu.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/iommu.c,v diff -u -p -r1.83 iommu.c --- sys/arch/sparc64/dev/iommu.c 18 Oct 2023 14:24:29 -0000 1.83 +++ sys/arch/sparc64/dev/iommu.c 11 Sep 2024 11:29:26 -0000 @@ -35,7 +35,6 @@ * UltraSPARC IOMMU support; used by both the sbus and pci code. 
*/ #include -#include #include #include #include @@ -249,10 +248,11 @@ iommu_init(char *name, const struct iomm (unsigned long long)is->is_ptsb, (unsigned long long)(is->is_ptsb + size)); #endif - is->is_dvmamap = extent_create(name, - is->is_dvmabase, (u_long)is->is_dvmaend + 1, - M_DEVBUF, NULL, 0, EX_NOCOALESCE); - mtx_init(&is->is_mtx, IPL_HIGH); + is->is_dvmamap = vmem_create(name, + is->is_dvmabase, ((u_long)is->is_dvmaend + 1) - is->is_dvmabase, + PAGE_SIZE, + /* allocfn = */ NULL, /* freefn = */ NULL, /* backend = */ NULL, + 0, VM_NOSLEEP, IPL_VM); /* * Now actually start up the IOMMU. @@ -341,7 +341,7 @@ iommu_enter(struct iommu_state *is, stru if (tte & IOTTE_V) { printf("Overwriting valid tte entry (dva %lx pa %lx " "&tte %p tte %llx)\n", va, pa, tte_ptr, tte); - extent_print(is->is_dvmamap); + // extent_print(is->is_dvmamap); panic("IOMMU overwrite"); } #endif @@ -407,7 +407,7 @@ iommu_remove(struct iommu_state *is, str if ((tte & IOTTE_V) == 0) { printf("Removing invalid tte entry (dva %lx &tte %p " "tte %llx)\n", va, tte_ptr, tte); - extent_print(is->is_dvmamap); + // extent_print(is->is_dvmamap); panic("IOMMU remove overwrite"); } #endif @@ -679,7 +679,8 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_ { int err = 0; bus_size_t sgsize; - u_long dvmaddr, sgstart, sgend; + vmem_addr_t dvmaddr; + u_long sgstart, sgend; bus_size_t align, boundary; struct iommu_state *is; struct iommu_map_state *ims; @@ -765,27 +766,25 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_ } sgsize = ims->ims_map.ipm_pagecnt * PAGE_SIZE; - mtx_enter(&is->is_mtx); if (flags & BUS_DMA_24BIT) { - sgstart = MAX(is->is_dvmamap->ex_start, 0xff000000); - sgend = MIN(is->is_dvmamap->ex_end, 0xffffffff); + sgstart = 0x000000; + sgend = 0xffffff; } else { - sgstart = is->is_dvmamap->ex_start; - sgend = is->is_dvmamap->ex_end; + sgstart = VMEM_ADDR_MIN; + sgend = VMEM_ADDR_MAX; } /* * If our segment size is larger than the boundary we need to * split the transfer up into little pieces ourselves. */ - err = extent_alloc_subregion_with_descr(is->is_dvmamap, sgstart, sgend, - sgsize, align, 0, (sgsize > boundary) ? 0 : boundary, - EX_NOWAIT | EX_BOUNDZERO, &ims->ims_er, (u_long *)&dvmaddr); - mtx_leave(&is->is_mtx); + err = vmem_xalloc(is->is_dvmamap, sgsize, align, 0, + (sgsize > boundary) ? 0 : boundary, + sgstart, sgend, VM_NOSLEEP | VM_INSTANTFIT, &dvmaddr); #ifdef DEBUG if (err || (dvmaddr == (bus_addr_t)-1)) { - printf("iommu_dvmamap_load(): extent_alloc(%d, %x) failed!\n", + printf("iommu_dvmamap_load(): vmem_xalloc(%d, %x) failed!\n", (int)sgsize, flags); #ifdef DDB if (iommudebug & IDB_BREAK) @@ -889,7 +888,8 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t, int err = 0; bus_size_t sgsize; bus_size_t boundary, align; - u_long dvmaddr, sgstart, sgend; + vmem_addr_t dvmaddr; + u_long sgstart, sgend; struct iommu_state *is; struct iommu_map_state *ims; @@ -986,23 +986,21 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t, } sgsize = ims->ims_map.ipm_pagecnt * PAGE_SIZE; - mtx_enter(&is->is_mtx); if (flags & BUS_DMA_24BIT) { - sgstart = MAX(is->is_dvmamap->ex_start, 0xff000000); - sgend = MIN(is->is_dvmamap->ex_end, 0xffffffff); + sgstart = 0x000000; + sgend = 0xffffff; } else { - sgstart = is->is_dvmamap->ex_start; - sgend = is->is_dvmamap->ex_end; + sgstart = VMEM_ADDR_MIN; + sgend = VMEM_ADDR_MAX; } /* * If our segment size is larger than the boundary we need to * split the transfer up into little pieces ourselves. */ - err = extent_alloc_subregion_with_descr(is->is_dvmamap, sgstart, sgend, - sgsize, align, 0, (sgsize > boundary) ? 
0 : boundary, - EX_NOWAIT | EX_BOUNDZERO, &ims->ims_er, (u_long *)&dvmaddr); - mtx_leave(&is->is_mtx); + err = vmem_xalloc(is->is_dvmamap, sgsize, align, 0, + (sgsize > boundary) ? 0 : boundary, + sgstart, sgend, VM_NOSLEEP | VM_INSTANTFIT, &dvmaddr); if (err != 0) { iommu_iomap_clear_pages(ims); @@ -1011,7 +1009,7 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t, #ifdef DEBUG if (dvmaddr == (bus_addr_t)-1) { - printf("iommu_dvmamap_load_raw(): extent_alloc(%d, %x) " + printf("iommu_dvmamap_load_raw(): vmem_xalloc(%d, %x) " "failed!\n", (int)sgsize, flags); #ifdef DDB if (iommudebug & IDB_BREAK) @@ -1326,7 +1324,6 @@ iommu_dvmamap_unload(bus_dma_tag_t t, bu struct iommu_map_state *ims; bus_addr_t dvmaddr = map->_dm_dvmastart; bus_size_t sgsize = map->_dm_dvmasize; - int error; if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) { bus_dmamap_unload(t->_parent, map); @@ -1365,13 +1362,9 @@ iommu_dvmamap_unload(bus_dma_tag_t t, bu map->dm_mapsize = 0; map->dm_nsegs = 0; - mtx_enter(&is->is_mtx); - error = extent_free(is->is_dvmamap, dvmaddr, sgsize, EX_NOWAIT); + vmem_xfree(is->is_dvmamap, dvmaddr, sgsize); map->_dm_dvmastart = 0; map->_dm_dvmasize = 0; - mtx_leave(&is->is_mtx); - if (error != 0) - printf("warning: %ld of DVMA space lost\n", sgsize); } #ifdef DEBUG Index: sys/arch/sparc64/dev/iommuvar.h =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/iommuvar.h,v diff -u -p -r1.19 iommuvar.h --- sys/arch/sparc64/dev/iommuvar.h 11 Mar 2021 11:17:00 -0000 1.19 +++ sys/arch/sparc64/dev/iommuvar.h 11 Sep 2024 11:29:26 -0000 @@ -37,7 +37,7 @@ #include #endif -#include +#include #include /* @@ -95,7 +95,6 @@ struct iommu_map_state { struct strbuf_ctl *ims_sb; /* Link to parent */ struct iommu_state *ims_iommu; int ims_flags; - struct extent_region ims_er; struct iommu_page_map ims_map; /* map must be last (array at end) */ }; #define IOMMU_MAP_STREAM 1 @@ -125,8 +124,7 @@ struct iommu_state { u_int is_dvmabase; u_int is_dvmaend; int64_t is_cr; /* Control register value */ - struct mutex is_mtx; - struct extent *is_dvmamap; /* DVMA map for this instance */ + vmem_t *is_dvmamap; /* DVMA map for this instance */ const struct iommu_hw *is_hw; struct strbuf_ctl *is_sb[2]; /* Streaming buffers if any */ Index: sys/arch/sparc64/dev/psycho.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/psycho.c,v diff -u -p -r1.84 psycho.c --- sys/arch/sparc64/dev/psycho.c 29 Mar 2024 21:29:33 -0000 1.84 +++ sys/arch/sparc64/dev/psycho.c 11 Sep 2024 11:29:26 -0000 @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include Index: sys/arch/sparc64/dev/sbus.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/sbus.c,v diff -u -p -r1.47 sbus.c --- sys/arch/sparc64/dev/sbus.c 29 Mar 2024 21:29:33 -0000 1.47 +++ sys/arch/sparc64/dev/sbus.c 11 Sep 2024 11:29:26 -0000 @@ -101,7 +101,6 @@ #include #include #include -#include #include #include #include @@ -374,15 +373,9 @@ sbus_mb_attach(struct device *parent, st * NULL DMA pointer will be translated by the first page of the IOTSB. * To avoid bugs we'll alloc and ignore the first entry in the IOTSB. 
*/ - { - u_long dummy; - - if (extent_alloc_subregion(sc->sc_is.is_dvmamap, - sc->sc_is.is_dvmabase, sc->sc_is.is_dvmabase + NBPG, NBPG, - NBPG, 0, 0, EX_NOWAIT | EX_BOUNDZERO, - (u_long *)&dummy) != 0) - panic("sbus iommu: can't toss first dvma page"); - } + if (vmem_xalloc_addr(sc->sc_is.is_dvmamap, 0x0, NBPG, + VM_NOSLEEP) != 0) + panic("sbus iommu: can't toss first dvma page"); sc->sc_dmatag = sbus_alloc_dma_tag(sc, ma->ma_dmatag); Index: sys/arch/sparc64/dev/schizo.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/schizo.c,v diff -u -p -r1.70 schizo.c --- sys/arch/sparc64/dev/schizo.c 29 Mar 2024 21:29:33 -0000 1.70 +++ sys/arch/sparc64/dev/schizo.c 11 Sep 2024 11:29:26 -0000 @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include Index: sys/arch/sparc64/dev/stp_sbus.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/stp_sbus.c,v diff -u -p -r1.13 stp_sbus.c --- sys/arch/sparc64/dev/stp_sbus.c 16 Oct 2022 01:22:39 -0000 1.13 +++ sys/arch/sparc64/dev/stp_sbus.c 11 Sep 2024 11:29:26 -0000 @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include Index: sys/arch/sparc64/dev/viommu.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/viommu.c,v diff -u -p -r1.20 viommu.c --- sys/arch/sparc64/dev/viommu.c 16 May 2021 15:10:19 -0000 1.20 +++ sys/arch/sparc64/dev/viommu.c 11 Sep 2024 11:29:26 -0000 @@ -37,12 +37,13 @@ */ #include -#include +#include #include #include #include #include #include +#include #include @@ -76,8 +77,8 @@ extern int iommudebug; #define DPRINTF(l, s) #endif -void viommu_enter(struct iommu_state *, struct strbuf_ctl *, bus_addr_t, - paddr_t, int); +void viommu_enter(struct iommu_state *, struct iommu_map_state *, + struct strbuf_ctl *, bus_addr_t, paddr_t, int); void viommu_remove(struct iommu_state *, struct strbuf_ctl *, bus_addr_t); int viommu_dvmamap_load_seg(bus_dma_tag_t, struct iommu_state *, bus_dmamap_t, bus_dma_segment_t *, int, int, bus_size_t, bus_size_t); @@ -135,10 +136,11 @@ viommu_init(char *name, struct iommu_sta * Allocate a dvma map. */ printf("dvma map %x-%x", is->is_dvmabase, is->is_dvmaend); - is->is_dvmamap = extent_create(name, - is->is_dvmabase, (u_long)is->is_dvmaend + 1, - M_DEVBUF, NULL, 0, EX_NOCOALESCE); - mtx_init(&is->is_mtx, IPL_HIGH); + is->is_dvmamap = vmem_create(name, + is->is_dvmabase, ((u_long)is->is_dvmaend + 1) - is->is_dvmabase, + PAGE_SIZE, + /* allocfn = */ NULL, /* freefn = */ NULL, /* backend = */ NULL, + 0, VM_NOSLEEP, IPL_VM); printf("\n"); } @@ -147,11 +149,12 @@ viommu_init(char *name, struct iommu_sta * Add an entry to the IOMMU table. 
*/ void -viommu_enter(struct iommu_state *is, struct strbuf_ctl *sb, bus_addr_t va, - paddr_t pa, int flags) +viommu_enter(struct iommu_state *is, struct iommu_map_state *ims, + struct strbuf_ctl *sb, bus_addr_t va, paddr_t pa, int flags) { u_int64_t tsbid = IOTSBSLOT(va, is->is_tsbsize); - paddr_t page_list[1], addr; + struct strbuf_flush *sbf = &ims->ims_flush; + paddr_t *page_list = (paddr_t *)&sbf->sbf_area; u_int64_t attr, nmapped; int err; @@ -162,17 +165,18 @@ viommu_enter(struct iommu_state *is, str panic("viommu_enter: va %#lx not in DVMA space", va); #endif + page_list[0] = trunc_page(pa); + attr = PCI_MAP_ATTR_READ | PCI_MAP_ATTR_WRITE; if (flags & BUS_DMA_READ) attr &= ~PCI_MAP_ATTR_READ; if (flags & BUS_DMA_WRITE) attr &= ~PCI_MAP_ATTR_WRITE; - page_list[0] = trunc_page(pa); - if (!pmap_extract(pmap_kernel(), (vaddr_t)page_list, &addr)) - panic("viommu_enter: pmap_extract failed"); + LLTRACE(lltrace_fn_enter, hv_pci_iommu_map); err = hv_pci_iommu_map(is->is_devhandle, tsbid, 1, attr, - addr, &nmapped); + sbf->sbf_flushpa, &nmapped); + LLTRACE(lltrace_fn_leave, hv_pci_iommu_map); if (err != H_EOK || nmapped != 1) panic("hv_pci_iommu_map: err=%d", err); } @@ -198,7 +202,9 @@ viommu_remove(struct iommu_state *is, st } #endif + LLTRACE(lltrace_fn_enter, hv_pci_iommu_demap); err = hv_pci_iommu_demap(is->is_devhandle, tsbid, 1, &ndemapped); + LLTRACE(lltrace_fn_leave, hv_pci_iommu_demap); if (err != H_EOK || ndemapped != 1) panic("hv_pci_iommu_unmap: err=%d", err); } @@ -256,8 +262,8 @@ viommu_dvmamap_destroy(bus_dma_tag_t t, if (map->dm_nsegs) bus_dmamap_unload(t0, map); - if (map->_dm_cookie) - iommu_iomap_destroy(map->_dm_cookie); + if (map->_dm_cookie) + iommu_iomap_destroy(map->_dm_cookie); map->_dm_cookie = NULL; BUS_DMA_FIND_PARENT(t, _dmamap_destroy); @@ -279,7 +285,7 @@ viommu_dvmamap_load(bus_dma_tag_t t, bus { int err = 0; bus_size_t sgsize; - u_long dvmaddr, sgstart, sgend; + vmem_addr_t dvmaddr, sgstart, sgend; bus_size_t align, boundary; struct iommu_state *is; struct iommu_map_state *ims = map->_dm_cookie; @@ -360,27 +366,25 @@ viommu_dvmamap_load(bus_dma_tag_t t, bus } sgsize = ims->ims_map.ipm_pagecnt * PAGE_SIZE; - mtx_enter(&is->is_mtx); if (flags & BUS_DMA_24BIT) { - sgstart = MAX(is->is_dvmamap->ex_start, 0xff000000); - sgend = MIN(is->is_dvmamap->ex_end, 0xffffffff); + sgstart = 0xff000000; + sgend = 0xffffffff; } else { - sgstart = is->is_dvmamap->ex_start; - sgend = is->is_dvmamap->ex_end; + sgstart = VMEM_ADDR_MIN; + sgend = VMEM_ADDR_MAX; } - /* - * If our segment size is larger than the boundary we need to - * split the transfer up into little pieces ourselves. - */ - err = extent_alloc_subregion_with_descr(is->is_dvmamap, sgstart, sgend, - sgsize, align, 0, (sgsize > boundary) ? 0 : boundary, - EX_NOWAIT | EX_BOUNDZERO, &ims->ims_er, (u_long *)&dvmaddr); - mtx_leave(&is->is_mtx); - +//printf("%s[%u] size %lu align %lu boundary 0x%lx\n", __func__, __LINE__, +// sgsize, align, boundary); +// err = extent_alloc_subregion_with_descr(is->is_dvmamap, sgstart, sgend, +// sgsize, align, 0, (sgsize > boundary) ? 0 : boundary, +// EX_NOWAIT | EX_BOUNDZERO, &ims->ims_er, (u_long *)&dvmaddr); + err = vmem_xalloc(is->is_dvmamap, sgsize, align, 0, + (sgsize > boundary) ? 
0 : boundary, + sgstart, sgend, VM_NOSLEEP | VM_INSTANTFIT, &dvmaddr); #ifdef DEBUG if (err || (dvmaddr == (bus_addr_t)-1)) { - printf("iommu_dvmamap_load(): extent_alloc(%d, %x) failed!\n", + printf("iommu_dvmamap_load(): vmem_xalloc(%d, %x) failed!\n", (int)sgsize, flags); #ifdef DDB if (iommudebug & IDB_BREAK) @@ -392,6 +396,7 @@ viommu_dvmamap_load(bus_dma_tag_t t, bus iommu_iomap_clear_pages(ims); return (err); } +//printf("%s[%u] addr 0x%lx size %lu\n", __func__, __LINE__, dvmaddr, sgsize); /* Set the active DVMA map */ map->_dm_dvmastart = dvmaddr; @@ -466,10 +471,18 @@ viommu_dvmamap_load_raw(bus_dma_tag_t t, int err = 0; bus_size_t sgsize; bus_size_t boundary, align; - u_long dvmaddr, sgstart, sgend; + vmem_addr_t dvmaddr, sgstart, sgend; struct iommu_state *is; struct iommu_map_state *ims = map->_dm_cookie; + if (ISSET(flags, BUS_DMA_BUS4)) { + for (i = 0; i < nsegs; i++) { + printf("%d: %llu @ %llx\n", i, + (unsigned long long)segs[i].ds_len, + (unsigned long long)segs[i].ds_addr); + } + } + #ifdef DIAGNOSTIC if (ims == NULL) panic("viommu_dvmamap_load_raw: null map state"); @@ -550,32 +563,32 @@ viommu_dvmamap_load_raw(bus_dma_tag_t t, } sgsize = ims->ims_map.ipm_pagecnt * PAGE_SIZE; - mtx_enter(&is->is_mtx); if (flags & BUS_DMA_24BIT) { - sgstart = MAX(is->is_dvmamap->ex_start, 0xff000000); - sgend = MIN(is->is_dvmamap->ex_end, 0xffffffff); + sgstart = 0xff000000; + sgend = 0xffffffff; } else { - sgstart = is->is_dvmamap->ex_start; - sgend = is->is_dvmamap->ex_end; + sgstart = VMEM_ADDR_MIN; + sgend = VMEM_ADDR_MAX; } /* * If our segment size is larger than the boundary we need to * split the transfer up into little pieces ourselves. */ - err = extent_alloc_subregion_with_descr(is->is_dvmamap, sgstart, sgend, - sgsize, align, 0, (sgsize > boundary) ? 0 : boundary, - EX_NOWAIT | EX_BOUNDZERO, &ims->ims_er, (u_long *)&dvmaddr); - mtx_leave(&is->is_mtx); - +//printf("%s[%u] size %lu align %lu boundary 0x%lx\n", __func__, __LINE__, +// sgsize, align, boundary); + err = vmem_xalloc(is->is_dvmamap, sgsize, align, 0, + (sgsize > boundary) ? 
0 : boundary, + sgstart, sgend, VM_NOSLEEP | VM_INSTANTFIT, &dvmaddr); if (err != 0) { iommu_iomap_clear_pages(ims); return (err); } +//printf("%s[%u] addr 0x%lx size %lu\n", __func__, __LINE__, dvmaddr, sgsize); #ifdef DEBUG if (dvmaddr == (bus_addr_t)-1) { - printf("iommu_dvmamap_load_raw(): extent_alloc(%d, %x) " + printf("iommu_dvmamap_load_raw(): vmem_xalloc(%d, %x) " "failed!\n", (int)sgsize, flags); #ifdef DDB if (iommudebug & IDB_BREAK) @@ -836,7 +849,6 @@ viommu_dvmamap_unload(bus_dma_tag_t t, b struct iommu_map_state *ims = map->_dm_cookie; bus_addr_t dvmaddr = map->_dm_dvmastart; bus_size_t sgsize = map->_dm_dvmasize; - int error; #ifdef DEBUG if (ims == NULL) @@ -859,13 +871,10 @@ viommu_dvmamap_unload(bus_dma_tag_t t, b map->dm_mapsize = 0; map->dm_nsegs = 0; - mtx_enter(&is->is_mtx); - error = extent_free(is->is_dvmamap, dvmaddr, sgsize, EX_NOWAIT); +//printf("%s[%u] addr 0x%lx size %lu\n", __func__, __LINE__, dvmaddr, sgsize); + vmem_xfree(is->is_dvmamap, dvmaddr, sgsize); map->_dm_dvmastart = 0; map->_dm_dvmasize = 0; - mtx_leave(&is->is_mtx); - if (error != 0) - printf("warning: %ld of DVMA space lost\n", sgsize); } void @@ -883,16 +892,15 @@ viommu_dvmamap_sync(bus_dma_tag_t t, bus if (len == 0) return; - if (ops & BUS_DMASYNC_PREWRITE) + if (ISSET(ops, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_PREWRITE)) __membar("#MemIssue"); - #if 0 if (ops & (BUS_DMASYNC_POSTREAD | BUS_DMASYNC_PREWRITE)) _viommu_dvmamap_sync(t, t0, map, offset, len, ops); -#endif if (ops & BUS_DMASYNC_POSTREAD) __membar("#MemIssue"); +#endif } int @@ -928,6 +936,7 @@ struct iommu_map_state * viommu_iomap_create(int n) { struct iommu_map_state *ims; + struct strbuf_flush *sbf; /* Safety for heavily fragmented data, such as mbufs */ n += 4; @@ -943,6 +952,10 @@ viommu_iomap_create(int n) ims->ims_map.ipm_maxpage = n; SPLAY_INIT(&ims->ims_map.ipm_tree); + /* (Ab)use the flush area for use with the pci_iommu_map hypercall */ + sbf = &ims->ims_flush; + pmap_extract(pmap_kernel(), (vaddr_t)sbf->sbf_area, &sbf->sbf_flushpa); + return (ims); } @@ -960,7 +973,7 @@ viommu_iomap_load_map(struct iommu_state for (i = 0, e = ipm->ipm_map; i < ipm->ipm_pagecnt; ++i, ++e) { e->ipe_va = vmaddr; - viommu_enter(is, NULL, e->ipe_va, e->ipe_pa, flags); + viommu_enter(is, ims, NULL, e->ipe_va, e->ipe_pa, flags); vmaddr += PAGE_SIZE; } } Index: sys/arch/sparc64/include/bus.h =================================================================== RCS file: /cvs/src/sys/arch/sparc64/include/bus.h,v diff -u -p -r1.37 bus.h --- sys/arch/sparc64/include/bus.h 24 Dec 2023 11:12:34 -0000 1.37 +++ sys/arch/sparc64/include/bus.h 11 Sep 2024 11:29:26 -0000 @@ -70,6 +70,8 @@ #ifdef _KERNEL +#include + /* * Debug hooks */ @@ -477,10 +479,12 @@ struct sparc_bus_dma_tag { #define _BD_PRECALL(t,f) \ while (t->f == NULL) { \ t = t->_parent; \ - } + } \ + LLTRACE(lltrace_fn_enter, t->f) #define _BD_CALL(t,f) \ (*(t)->f) -#define _BD_POSTCALL +#define _BD_POSTCALL(t,f) \ + LLTRACE(lltrace_fn_leave, t->f) static inline int bus_dmamap_create(bus_dma_tag_t t, bus_size_t s, int n, bus_size_t m, @@ -490,7 +494,7 @@ bus_dmamap_create(bus_dma_tag_t t, bus_s const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamap_create); r = _BD_CALL(t, _dmamap_create)(t, t0, s, n, m, b, f, p); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_create); return (r); } static inline void @@ -499,7 +503,7 @@ bus_dmamap_destroy(bus_dma_tag_t t, bus_ const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamap_destroy); _BD_CALL(t, _dmamap_destroy)(t, t0, p); - _BD_POSTCALL; + _BD_POSTCALL(t, 
_dmamap_destroy); } static inline int bus_dmamap_load(bus_dma_tag_t t, bus_dmamap_t m, void *b, bus_size_t s, @@ -509,7 +513,7 @@ bus_dmamap_load(bus_dma_tag_t t, bus_dma int r; _BD_PRECALL(t, _dmamap_load); r = _BD_CALL(t, _dmamap_load)(t, t0, m, b, s, p, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_load); return (r); } static inline int @@ -520,7 +524,7 @@ bus_dmamap_load_mbuf(bus_dma_tag_t t, bu int r; _BD_PRECALL(t, _dmamap_load_mbuf); r = _BD_CALL(t, _dmamap_load_mbuf)(t, t0, m, b, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_load_mbuf); return (r); } static inline int @@ -530,7 +534,7 @@ bus_dmamap_load_uio(bus_dma_tag_t t, bus int r; _BD_PRECALL(t, _dmamap_load_uio); r = _BD_CALL(t, _dmamap_load_uio)(t, t0, m, u, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_load_uio); return (r); } static inline int @@ -541,7 +545,7 @@ bus_dmamap_load_raw(bus_dma_tag_t t, bus int r; _BD_PRECALL(t, _dmamap_load_raw); r = _BD_CALL(t, _dmamap_load_raw)(t, t0, m, sg, n, s, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_load_raw); return (r); } static inline void @@ -550,7 +554,7 @@ bus_dmamap_unload(bus_dma_tag_t t, bus_d const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamap_unload); _BD_CALL(t, _dmamap_unload)(t, t0, p); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_unload); } static inline void bus_dmamap_sync(bus_dma_tag_t t, bus_dmamap_t p, bus_addr_t o, bus_size_t l, @@ -559,7 +563,7 @@ bus_dmamap_sync(bus_dma_tag_t t, bus_dma const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamap_sync); _BD_CALL(t, _dmamap_sync)(t, t0, p, o, l, ops); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_sync); } static inline int bus_dmamem_alloc(bus_dma_tag_t t, bus_size_t s, bus_size_t a, bus_size_t b, @@ -569,7 +573,7 @@ bus_dmamem_alloc(bus_dma_tag_t t, bus_si int ret; _BD_PRECALL(t, _dmamem_alloc); ret = _BD_CALL(t, _dmamem_alloc)(t, t0, s, a, b, sg, n, r, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamem_alloc); return (ret); } static inline void @@ -578,7 +582,7 @@ bus_dmamem_free(bus_dma_tag_t t, bus_dma const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamem_free); _BD_CALL(t, _dmamem_free)(t, t0, sg, n); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamem_free); } static inline int bus_dmamem_map(bus_dma_tag_t t, bus_dma_segment_t *sg, int n, size_t s, @@ -588,7 +592,7 @@ bus_dmamem_map(bus_dma_tag_t t, bus_dma_ int r; _BD_PRECALL(t, _dmamem_map); r = _BD_CALL(t, _dmamem_map)(t, t0, sg, n, s, k, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamem_map); return (r); } static inline void @@ -597,7 +601,7 @@ bus_dmamem_unmap(bus_dma_tag_t t, caddr_ const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamem_unmap); _BD_CALL(t, _dmamem_unmap)(t, t0, k, s); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamem_unmap); } static inline paddr_t bus_dmamem_mmap(bus_dma_tag_t t, bus_dma_segment_t *sg, int n, off_t o, int p, @@ -607,7 +611,7 @@ bus_dmamem_mmap(bus_dma_tag_t t, bus_dma int r; _BD_PRECALL(t, _dmamem_mmap); r = _BD_CALL(t, _dmamem_mmap)(t, t0, sg, n, o, p, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamem_mmap); return (r); } Index: sys/arch/sparc64/sparc64/conf.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/sparc64/conf.c,v diff -u -p -r1.90 conf.c --- sys/arch/sparc64/sparc64/conf.c 11 Jun 2024 09:21:32 -0000 1.90 +++ sys/arch/sparc64/sparc64/conf.c 11 Sep 2024 11:29:26 -0000 @@ -110,6 +110,7 @@ cdev_decl(pci); #include "ksyms.h" #include "kstat.h" +#include "llt.h" #include "hotplug.h" #include "vscsi.h" @@ -180,7 +181,8 @@ struct cdevsw cdevsw[] = cdev_notdef(), /* 28 */ cdev_notdef(), /* 29 */ 
cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ - cdev_notdef(), /* 31 */ + cdev_lltrace_init(NLLT,lltrace), + /* 31: lltrace */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ cdev_notdef(), /* 34 */ Index: sys/arch/sparc64/sparc64/intr.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/sparc64/intr.c,v diff -u -p -r1.67 intr.c --- sys/arch/sparc64/sparc64/intr.c 29 Mar 2024 21:29:34 -0000 1.67 +++ sys/arch/sparc64/sparc64/intr.c 11 Sep 2024 11:29:26 -0000 @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -78,6 +79,8 @@ intr_handler(struct trapframe *tf, struc #ifdef MULTIPROCESSOR int need_lock; + LLTRACE(lltrace_intr_enter, LLTRACE_INTR_T_HW, ih->ih_number); + if (ih->ih_mpsafe) need_lock = 0; else @@ -86,11 +89,16 @@ intr_handler(struct trapframe *tf, struc if (need_lock) KERNEL_LOCK(); #endif + LLTRACE(lltrace_fn_enter, ih->ih_fun); rc = (*ih->ih_fun)(ih->ih_arg ? ih->ih_arg : tf); + LLTRACE(lltrace_fn_leave, ih->ih_fun); #ifdef MULTIPROCESSOR if (need_lock) KERNEL_UNLOCK(); #endif + + LLTRACE(lltrace_intr_leave, LLTRACE_INTR_T_HW, ih->ih_number); + return rc; } @@ -109,7 +117,9 @@ intr_list_handler(void *arg) sparc_wrpr(pil, ih->ih_pil, 0); ci->ci_handled_intr_level = ih->ih_pil; + LLTRACE_CPU(ci, lltrace_fn_enter, ih->ih_fun); rv = ih->ih_fun(ih->ih_arg); + LLTRACE_CPU(ci, lltrace_fn_leave, ih->ih_fun); if (rv) { ih->ih_count.ec_count++; claimed = 1; Index: sys/arch/sparc64/sparc64/ipifuncs.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/sparc64/ipifuncs.c,v diff -u -p -r1.22 ipifuncs.c --- sys/arch/sparc64/sparc64/ipifuncs.c 14 Apr 2024 19:08:09 -0000 1.22 +++ sys/arch/sparc64/sparc64/ipifuncs.c 11 Sep 2024 11:29:26 -0000 @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -74,6 +75,8 @@ sun4u_send_ipi(int itid, void (*func)(vo KASSERT((u_int64_t)func > MAXINTNUM); + LLTRACE(lltrace_ipi, itid); + /* * UltraSPARC-IIIi CPUs select the BUSY/NACK pair based on the * lower two bits of the ITID. 
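The LLTRACE() and LLTRACE_CPU() calls added to the interrupt, soft-interrupt, clock and IPI paths above are thin wrappers around the lltrace_*() emitters in sys/dev/lltrace.c. The header that defines the wrappers is not part of this excerpt, so the sketch below is only an illustration of the shape they are expected to have, assuming the per-CPU buffer is the spc_lltrace member that lltrace.c installs, that lltrace_enter() returns that buffer or NULL, and that NLLT from "llt.h" compiles the hooks away when the pseudo-device is not configured:

#if NLLT > 0
/* illustrative sketch only; not the real header */
#define LLTRACE_CPU(_ci, _fn, ...) do {					\
	struct lltrace_cpu *_llt = (_ci)->ci_schedstate.spc_lltrace;	\
	if (_llt != NULL)						\
		(_fn)(_llt, ##__VA_ARGS__);				\
} while (0)

#define LLTRACE(_fn, ...) do {						\
	struct lltrace_cpu *_llt = lltrace_enter();			\
	if (_llt != NULL)						\
		(_fn)(_llt, ##__VA_ARGS__);				\
} while (0)
#else
#define LLTRACE_CPU(_ci, _fn, ...)	do { } while (0)
#define LLTRACE(_fn, ...)		do { } while (0)
#endif

Written like that, a disabled tracer costs one pointer load and a branch per hook, which is what makes it acceptable to leave the calls permanently in intr_handler(), softintr_dispatch() and the IPI send paths.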
@@ -127,6 +130,8 @@ sun4v_send_ipi(int itid, void (*func)(vo u_int64_t s; int err, i; + LLTRACE(lltrace_ipi, itid); + s = intr_disable(); stha(ci->ci_cpuset, ASI_PHYS_CACHED, itid); @@ -154,6 +159,8 @@ sun4v_send_ipi(int itid, void (*func)(vo void sparc64_broadcast_ipi(void (*func)(void), u_int64_t arg0, u_int64_t arg1) { + LLTRACE(lltrace_ipi, ~0x0); + if (CPU_ISSUN4V) sun4v_broadcast_ipi(func, arg0, arg1); else @@ -180,6 +187,8 @@ sun4v_broadcast_ipi(void (*func)(void), struct cpu_info *ci = curcpu(); paddr_t cpuset = ci->ci_cpuset; int err, i, ncpus = 0; + + LLTRACE(lltrace_ipi, ~0x0); for (ci = cpus; ci != NULL; ci = ci->ci_next) { if (ci->ci_cpuid == cpu_number()) Index: sys/arch/sparc64/sparc64/machdep.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/sparc64/machdep.c,v diff -u -p -r1.218 machdep.c --- sys/arch/sparc64/sparc64/machdep.c 22 May 2024 05:51:49 -0000 1.218 +++ sys/arch/sparc64/sparc64/machdep.c 11 Sep 2024 11:29:26 -0000 @@ -991,12 +991,20 @@ _bus_dmamap_load_mbuf(bus_dma_tag_t t, b buflen -= incr; vaddr += incr; - if (i > 0 && pa == (segs[i - 1].ds_addr + - segs[i - 1].ds_len) && ((segs[i - 1].ds_len + incr) - < map->_dm_maxsegsz)) { - /* Hey, waddyaknow, they're contiguous */ - segs[i - 1].ds_len += incr; - continue; + if (i > 0) { + bus_dma_segment_t *pseg = &segs[i - 1]; + if (pa == pseg->ds_addr + pseg->ds_len) { + /* waddyaknow, they're contiguous */ + long nlen = pseg->ds_len + incr; + if (nlen <= map->_dm_maxsegsz) { + pseg->ds_len = nlen; + continue; + } + pseg->ds_len = map->_dm_maxsegsz; + + pa = pseg->ds_addr + map->_dm_maxsegsz; + incr = nlen - map->_dm_maxsegsz; + } } segs[i].ds_addr = pa; segs[i].ds_len = incr; Index: sys/conf/GENERIC =================================================================== RCS file: /cvs/src/sys/conf/GENERIC,v diff -u -p -r1.297 GENERIC --- sys/conf/GENERIC 31 Aug 2024 04:17:14 -0000 1.297 +++ sys/conf/GENERIC 11 Sep 2024 11:29:26 -0000 @@ -81,6 +81,7 @@ pseudo-device endrun 1 # EndRun line dis pseudo-device vnd 4 # vnode disk devices pseudo-device ksyms 1 # kernel symbols device pseudo-device kstat # kernel statistics device +pseudo-device llt # low-level tracing device # clonable devices pseudo-device bpfilter # packet filter Index: sys/conf/files =================================================================== RCS file: /cvs/src/sys/conf/files,v diff -u -p -r1.736 files --- sys/conf/files 31 Aug 2024 04:17:14 -0000 1.736 +++ sys/conf/files 11 Sep 2024 11:29:26 -0000 @@ -24,6 +24,8 @@ define video {} define intrmap {} define fdt {[early = 0]} +define vmem + # filesystem firmware loading attribute define firmload @@ -602,6 +604,9 @@ file dev/ksyms.c ksyms needs-flag pseudo-device kstat file dev/kstat.c kstat needs-flag +pseudo-device llt +file dev/lltrace.c llt needs-flag + pseudo-device fuse file miscfs/fuse/fuse_device.c fuse needs-flag file miscfs/fuse/fuse_file.c fuse @@ -744,6 +749,7 @@ file kern/subr_blist.c file kern/subr_disk.c file kern/subr_evcount.c file kern/subr_extent.c +file kern/subr_vmem.c vmem file kern/subr_suspend.c suspend file kern/subr_hibernate.c hibernate file kern/subr_kubsan.c kubsan Index: sys/dev/lltrace.c =================================================================== RCS file: sys/dev/lltrace.c diff -N sys/dev/lltrace.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/dev/lltrace.c 11 Sep 2024 11:29:26 -0000 @@ -0,0 +1,1104 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2022 The University of Queensland + * + * Permission to use, copy, 
modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * This code was written by David Gwynne as part
+ * of the Information Technology Infrastructure Group (ITIG) in the
+ * Faculty of Engineering, Architecture and Information Technology
+ * (EAIT).
+ *
+ * It was heavily inspired by the KUTrace (kernel/userland tracing)
+ * framework by Richard L. Sites.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#if defined(__amd64__) || defined(__i386__)
+
+static inline unsigned int
+lltrace_cas(unsigned int *p, unsigned int e, unsigned int n)
+{
+	__asm volatile("cmpxchgl %2, %1"
+	    : "=a" (e), "=m" (*p)
+	    : "r" (n), "a" (e), "m" (*p));
+
+	return (e);
+}
+
+static inline uint64_t
+lltrace_ts(void)
+{
+	unsigned int hi, lo;
+
+	__asm volatile("lfence; rdtsc" : "=d" (hi), "=a" (lo));
+
+	return (lo & (LLTRACE_TS_MASK << LLTRACE_TS_SHIFT));
+}
+
+static inline uint64_t
+lltrace_ts_long(void)
+{
+	return (rdtsc_lfence() & ~LLTRACE_MASK(LLTRACE_TS_SHIFT));
+}
+
+#elif defined(__aarch64__)
+
+#define lltrace_cas(_p, _e, _n)	atomic_cas_uint((_p), (_e), (_n))
+
+static inline uint64_t
+lltrace_ts_long(void)
+{
+	uint64_t ts;
+
+	__asm volatile("mrs %x0, cntvct_el0" : "=r" (ts));
+
+	return (ts << LLTRACE_TS_SHIFT);
+}
+
+static inline uint64_t
+lltrace_ts(void)
+{
+	uint64_t ts = lltrace_ts_long();
+
+	return (ts & (LLTRACE_TS_MASK << LLTRACE_TS_SHIFT));
+}
+
+#elif defined(__sparc64__)
+
+#define lltrace_cas(_p, _e, _n)	atomic_cas_uint((_p), (_e), (_n))
+
+static inline uint64_t
+lltrace_ts_long(void)
+{
+	uint64_t ts;
+
+	ts = sys_tick();
+
+	return (ts << LLTRACE_TS_SHIFT);
+}
+
+static inline uint64_t
+lltrace_ts(void)
+{
+	uint64_t ts = lltrace_ts_long();
+
+	return (ts & (LLTRACE_TS_MASK << LLTRACE_TS_SHIFT));
+}
+
+#else /* not x86, arm64, or sparc64 */
+
+#error not supported (yet)
+
+static unsigned int
+lltrace_cas(unsigned int *p, unsigned int e, unsigned int n)
+{
+	unsigned int o;
+	int s;
+
+	s = intr_disable();
+	o = *p;
+	if (o == e)
+		*p = n;
+	intr_restore(s);
+
+	return (o);
+}
+
+static inline uint64_t
+lltrace_ts(void)
+{
+	return (countertime());
+}
+
+static inline uint64_t
+lltrace_ts_long(void)
+{
+	return (countertime());
+}
+
+#endif
+
+#define LLTRACE_MB2NBUF(_mb) \
+	(((_mb) * (1U << 20)) / sizeof(struct lltrace_buffer))
+#define LLTRACE_NBUF2MB(_nbuf) \
+	(((_nbuf) * sizeof(struct lltrace_buffer)) / (1U << 20))
+
+#define LLTRACE_BLEN_DEFAULT	16
+
+struct lltrace_cpu {
+	SIMPLEQ_ENTRY(lltrace_cpu)
+				llt_entry;
+	struct lltrace_buffer	llt_buffer;
+	unsigned int		llt_slot;
+	unsigned int		llt_pid;
+	unsigned int		llt_tid;
+	uint64_t		llt_wakeid;
+};
+
+SIMPLEQ_HEAD(lltrace_cpu_list, lltrace_cpu);
+
+struct lltrace_softc {
+	unsigned int		sc_running;
+	unsigned int		sc_mode;
+	struct rwlock		sc_lock;
+	unsigned int		sc_nbuffers;
+
+	unsigned int		sc_free;
+
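+	/* ring cursors: next free buffer to hand out / next used slot to fill */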
unsigned int sc_used; + struct lltrace_cpu **sc_ring; + struct lltrace_cpu *sc_buffers; + + unsigned int sc_read; + unsigned int sc_reading; + struct selinfo sc_sel; + + uint64_t sc_boottime; + uint64_t sc_monotime; +}; + +static int lltrace_start(struct lltrace_softc *, struct proc *); +static int lltrace_stop(struct lltrace_softc *, struct proc *); +static int lltrace_flush(struct lltrace_softc *); + +static struct lltrace_softc *lltrace_sc; + +int +lltattach(int num) +{ + return (0); +} + +int +lltraceopen(dev_t dev, int flag, int mode, struct proc *p) +{ + struct lltrace_softc *sc; + int error; + + if (minor(dev) != 0) + return (ENXIO); + + error = suser(p); + if (error != 0) + return (error); + + if (lltrace_sc != NULL) + return (EBUSY); + + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO); + if (sc == NULL) + return (ENOMEM); + + sc->sc_running = 0; + sc->sc_nbuffers = LLTRACE_MB2NBUF(LLTRACE_BLEN_DEFAULT); + + rw_init(&sc->sc_lock, "lltlk"); + + sc->sc_read = 0; + sc->sc_reading = 0; + klist_init_rwlock(&sc->sc_sel.si_note, &sc->sc_lock); + + /* commit */ + if (atomic_cas_ptr(&lltrace_sc, NULL, sc) != NULL) { + free(sc, M_DEVBUF, sizeof(*sc)); + return (EBUSY); + } + + return (0); +} + +int +lltraceclose(dev_t dev, int flag, int mode, struct proc *p) +{ + struct lltrace_softc *sc = lltrace_sc; + + rw_enter_write(&sc->sc_lock); + lltrace_stop(sc, p); + lltrace_flush(sc); + rw_exit_write(&sc->sc_lock); + + lltrace_sc = NULL; + membar_sync(); + + free(sc, M_DEVBUF, sizeof(*sc)); + + return (0); +} + +static int +lltrace_fionread(struct lltrace_softc *sc) +{ + int canread; + + rw_enter_read(&sc->sc_lock); + canread = !sc->sc_running && (sc->sc_buffers != NULL) && + (sc->sc_read < sc->sc_nbuffers); + rw_exit_read(&sc->sc_lock); + + return (canread ? 
sizeof(struct lltrace_buffer) : 0); +} + +static void +lltrace_cpu_init(struct lltrace_cpu *llt, struct lltrace_softc *sc, + struct cpu_info *ci, unsigned int pid, unsigned int tid, uint64_t wakeid) +{ + struct lltrace_header *llh; + + llh = (struct lltrace_header *)&llt->llt_buffer; + llh->h_cpu = cpu_number(); + llh->h_idletid = ci->ci_schedstate.spc_idleproc->p_tid; + llh->h_boottime = sc->sc_boottime; + llh->h_start_cy = lltrace_ts_long(); + llh->h_start_ns = nsecuptime() - sc->sc_monotime; + llh->h_end_cy = 0; + llh->h_end_ns = 0; + llh->h_pid = pid; + llh->h_tid = tid; + llh->h_zero = 0; + + llt->llt_pid = pid; + llt->llt_tid = tid; + llt->llt_slot = 8; + llt->llt_wakeid = wakeid; +} + +static void +lltrace_cpu_fini(struct lltrace_cpu *llt, struct lltrace_softc *sc) +{ + struct lltrace_header *llh; + + llh = (struct lltrace_header *)&llt->llt_buffer; + llh->h_end_cy = lltrace_ts_long(); + llh->h_end_ns = nsecuptime() - sc->sc_monotime; +} + +static int +lltrace_set_mode(struct lltrace_softc *sc, unsigned int mode) +{ + int error; + + if (mode >= LLTRACE_MODE_COUNT) + return (EINVAL); + + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + return (error); + + if (sc->sc_running) + error = EBUSY; + else + sc->sc_mode = mode; + + rw_exit(&sc->sc_lock); + return (error); +} + +static int +lltrace_set_blen(struct lltrace_softc *sc, unsigned int blen) +{ + int error; + unsigned int nbuffers; + + if (blen < LLTRACE_BLEN_MIN || blen > LLTRACE_BLEN_MAX) + return (EINVAL); + + /* convert megabytes to the number of buffers */ + nbuffers = LLTRACE_MB2NBUF(blen); + if (nbuffers <= ncpus) + return (EINVAL); + + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + return (error); + + if (sc->sc_buffers != NULL) + error = EBUSY; + else + sc->sc_nbuffers = nbuffers; + + rw_exit(&sc->sc_lock); + return (error); +} + +static int +lltrace_start(struct lltrace_softc *sc, struct proc *p) +{ + struct process *ps = p->p_p; + struct bintime boottime; + unsigned int i; + size_t sz; + struct lltrace_cpu_list l = SIMPLEQ_HEAD_INITIALIZER(l); + struct lltrace_cpu *llt; + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + unsigned int pid, tid; + + if (sc->sc_running) + return EINVAL; + + if (sc->sc_nbuffers <= (ncpus * 2 + 1)) + return (EINVAL); + + lltrace_flush(sc); + + sc->sc_monotime = nsecuptime(); + + binboottime(&boottime); + sc->sc_boottime = BINTIME_TO_NSEC(&boottime) + sc->sc_monotime; + + sz = roundup(sc->sc_nbuffers * sizeof(*sc->sc_buffers), PAGE_SIZE); + sc->sc_buffers = km_alloc(sz, &kv_any, &kp_dirty, &kd_waitok); + if (sc->sc_buffers == NULL) + return (ENOMEM); + sc->sc_ring = mallocarray(sc->sc_nbuffers, sizeof(*sc->sc_ring), + M_DEVBUF, M_WAITOK); + for (i = 0; i < sc->sc_nbuffers; i++) { + llt = &sc->sc_buffers[i]; + llt->llt_slot = 0; + sc->sc_ring[i] = llt; + } + + sc->sc_free = 0; /* next slot to pull a free buffer from */ + sc->sc_used = 0; /* next slot to put a used buffer in */ + + CPU_INFO_FOREACH(cii, ci) { + i = sc->sc_free++; /* can't wrap yet */ + + llt = sc->sc_ring[i]; + sc->sc_ring[i] = NULL; + + SIMPLEQ_INSERT_HEAD(&l, llt, llt_entry); + } + + tid = p->p_tid; + pid = ps->ps_pid; + if (ISSET(ps->ps_flags, PS_SYSTEM)) + pid |= (1U << 31); + + CPU_INFO_FOREACH(cii, ci) { + sched_peg_curproc(ci); + + llt = SIMPLEQ_FIRST(&l); + SIMPLEQ_REMOVE_HEAD(&l, llt_entry); + + lltrace_cpu_init(llt, sc, ci, pid, tid, 0x1); + lltrace_pidname(llt, p); + + membar_producer(); + ci->ci_schedstate.spc_lltrace = llt; + } + atomic_clearbits_int(&p->p_flag, P_CPUPEG); + 
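+	/*
+	 * Every CPU has had a buffer installed in spc_lltrace by the
+	 * loop above (published with membar_producer()); mark the
+	 * device as running so stop/flush/read see tracing is active.
+	 */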
+ sc->sc_running = 1; + + return (0); +} + +static int +lltrace_stop(struct lltrace_softc *sc, struct proc *p) +{ + struct lltrace_cpu *llt; + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + unsigned long s; + + if (!sc->sc_running) + return (EALREADY); + + sc->sc_running = 0; + + /* visit each cpu to take llt away safely */ + CPU_INFO_FOREACH(cii, ci) { + sched_peg_curproc(ci); + + s = intr_disable(); + llt = ci->ci_schedstate.spc_lltrace; + ci->ci_schedstate.spc_lltrace = NULL; + intr_restore(s); + + lltrace_cpu_fini(llt, sc); + } + atomic_clearbits_int(&p->p_flag, P_CPUPEG); + + return (0); +} + +static int +lltrace_flush(struct lltrace_softc *sc) +{ + size_t sz; + + rw_assert_wrlock(&sc->sc_lock); + if (sc->sc_running) + return (EBUSY); + + if (sc->sc_buffers == NULL) + return (0); + + sz = roundup(sc->sc_nbuffers * sizeof(*sc->sc_buffers), PAGE_SIZE); + km_free(sc->sc_buffers, sz, &kv_any, &kp_dirty); + free(sc->sc_ring, M_DEVBUF, sc->sc_nbuffers * sizeof(*sc->sc_ring)); + + sc->sc_buffers = NULL; + sc->sc_ring = NULL; + sc->sc_read = 0; + + return (0); +} + +int +lltraceioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + struct lltrace_softc *sc = lltrace_sc; + int error = 0; + + KERNEL_UNLOCK(); + + switch (cmd) { + case FIONREAD: + *(int *)data = lltrace_fionread(sc); + break; + case FIONBIO: + /* vfs tracks this for us if we let it */ + break; + + case LLTIOCSTART: + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + break; + error = lltrace_start(sc, p); + rw_exit(&sc->sc_lock); + break; + case LLTIOCSTOP: + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + break; + error = lltrace_stop(sc, p); + rw_exit(&sc->sc_lock); + break; + case LLTIOCFLUSH: + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + break; + error = lltrace_flush(sc); + rw_exit(&sc->sc_lock); + break; + + case LLTIOCSBLEN: + error = lltrace_set_blen(sc, *(unsigned int *)data); + break; + case LLTIOCGBLEN: + *(unsigned int *)data = LLTRACE_NBUF2MB(sc->sc_nbuffers); + break; + + case LLTIOCSMODE: + error = lltrace_set_mode(sc, *(unsigned int *)data); + break; + case LLTIOCGMODE: + *(unsigned int *)data = sc->sc_mode; + break; + + default: + error = ENOTTY; + break; + } + + KERNEL_LOCK(); + + return (error); +} + +int +lltraceread(dev_t dev, struct uio *uio, int ioflag) +{ + struct lltrace_softc *sc = lltrace_sc; + struct lltrace_cpu *llt; + unsigned int slot; + int error; + + KERNEL_UNLOCK(); + + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + goto lock; + + if (sc->sc_running) { + if (ISSET(ioflag, IO_NDELAY)) { + error = EWOULDBLOCK; + goto unlock; + } + + do { + sc->sc_reading++; + error = rwsleep_nsec(&sc->sc_reading, &sc->sc_lock, + PRIBIO|PCATCH, "lltread", INFSLP); + sc->sc_reading--; + if (error != 0) + goto unlock; + } while (sc->sc_running); + } + + if (sc->sc_buffers == NULL) { + error = 0; + goto unlock; + } + + slot = sc->sc_read; + for (;;) { + if (slot >= sc->sc_nbuffers) { + error = 0; + goto unlock; + } + + llt = &sc->sc_buffers[slot]; + KASSERT(llt->llt_slot <= nitems(llt->llt_buffer.llt_slots)); + if (llt->llt_slot > 0) + break; + + slot++; + } + + error = uiomove(&llt->llt_buffer, + llt->llt_slot * sizeof(llt->llt_buffer.llt_slots[0]), uio); + if (error != 0) + goto unlock; + + sc->sc_read = slot + 1; + +unlock: + rw_exit(&sc->sc_lock); +lock: + KERNEL_LOCK(); + return (error); +} + +static void +lltrace_filt_detach(struct knote *kn) +{ + struct lltrace_softc *sc = kn->kn_hook; + + 
klist_remove(&sc->sc_sel.si_note, kn); +} + +static int +lltrace_filt_event(struct knote *kn, long hint) +{ + struct lltrace_softc *sc = kn->kn_hook; + int canread; + + canread = !sc->sc_running && (sc->sc_buffers != NULL) && + (sc->sc_read < sc->sc_nbuffers); + + kn->kn_data = canread ? sizeof(struct lltrace_buffer) : 0; + + return (canread); +} + +static int +lltrace_filt_modify(struct kevent *kev, struct knote *kn) +{ + struct lltrace_softc *sc = kn->kn_hook; + int active; + + rw_enter_write(&sc->sc_lock); + active = knote_modify_fn(kev, kn, lltrace_filt_event); + rw_exit_write(&sc->sc_lock); + + return (active); +} + +static int +lltrace_filt_process(struct knote *kn, struct kevent *kev) +{ + struct lltrace_softc *sc = kn->kn_hook; + int active; + + rw_enter_write(&sc->sc_lock); + active = knote_process_fn(kn, kev, lltrace_filt_event); + rw_exit_write(&sc->sc_lock); + + return (active); +} + +static const struct filterops lltrace_filtops = { + .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, + .f_attach = NULL, + .f_detach = lltrace_filt_detach, + .f_event = lltrace_filt_event, + .f_modify = lltrace_filt_modify, + .f_process = lltrace_filt_process, +}; + +int +lltracekqfilter(dev_t dev, struct knote *kn) +{ + struct lltrace_softc *sc = lltrace_sc; + struct klist *klist; + + switch (kn->kn_filter) { + case EVFILT_READ: + klist = &sc->sc_sel.si_note; + kn->kn_fop = &lltrace_filtops; + break; + default: + return (EINVAL); + } + + kn->kn_hook = sc; + klist_insert(klist, kn); + + return (0); +} + +static struct lltrace_cpu * +lltrace_next(struct lltrace_cpu *llt) +{ + struct lltrace_softc *sc = lltrace_sc; + struct cpu_info *ci = curcpu(); + struct lltrace_cpu *nllt; + unsigned int slot, oslot, nslot; + + /* check if we were preempted */ + nllt = ci->ci_schedstate.spc_lltrace; + if (nllt != llt) { + /* something preempted us and swapped buffers already */ + return (nllt); + } + + slot = sc->sc_free; + for (;;) { + nslot = slot + 1; + if (nslot > sc->sc_nbuffers) { + if (sc->sc_mode == LLTRACE_MODE_HEAD) + return (NULL); + } + + oslot = atomic_cas_uint(&sc->sc_free, slot, nslot); + if (slot == oslot) + break; + + slot = oslot; + } + + slot %= sc->sc_nbuffers; + nllt = sc->sc_ring[slot]; + sc->sc_ring[slot] = NULL; + + slot = sc->sc_used; + for (;;) { + nslot = slot + 1; + + oslot = atomic_cas_uint(&sc->sc_used, slot, nslot); + if (slot == oslot) + break; + + slot = oslot; + } + + lltrace_cpu_init(nllt, sc, ci, llt->llt_pid, llt->llt_tid, + llt->llt_wakeid); + lltrace_cpu_fini(llt, sc); + + slot %= sc->sc_nbuffers; + sc->sc_ring[slot] = llt; + + ci->ci_schedstate.spc_lltrace = nllt; + + return (nllt); +} + +static struct lltrace_cpu * +lltrace_insert_record(struct lltrace_cpu *llt, uint64_t type, uint64_t record, + const uint64_t *extra, unsigned int n) +{ + unsigned int slot, oslot, nslot; + uint64_t *slots; + + record |= type << LLTRACE_TYPE_SHIFT; + record |= n++ << LLTRACE_LEN_SHIFT; + + slot = llt->llt_slot; + for (;;) { + nslot = slot + n; + if (nslot > nitems(llt->llt_buffer.llt_slots)) { + unsigned long s; + + s = intr_disable(); + llt = lltrace_next(llt); + intr_restore(s); + + if (llt == NULL) + return (NULL); + + slot = llt->llt_slot; + continue; + } + + oslot = lltrace_cas(&llt->llt_slot, slot, nslot); + if (slot == oslot) + break; + + slot = oslot; + } + + slots = llt->llt_buffer.llt_slots + slot; + *slots = record; + while (n > 1) { + *(++slots) = *(extra++); + n--; + } + + return (llt); +} + +static struct lltrace_cpu * +lltrace_insert(struct lltrace_cpu *llt, uint64_t type, 
uint64_t record, + const uint64_t *extra, unsigned int n) +{ + record |= lltrace_ts(); + return (lltrace_insert_record(llt, type, record, extra, n)); +} + +void +lltrace_statclock(struct lltrace_cpu *llt, int usermode, unsigned long pc) +{ +#if 0 + uint64_t event = usermode ? LLTRACE_EVENT_PC_U : LLTRACE_EVENT_PC_K; + uint64_t extra[1] = { pc }; + + lltrace_insert(llt, (event | nitems(extra)) << LLTRACE_EVENT_SHIFT, + extra, nitems(extra)); +#endif +} + +void +lltrace_syscall(struct lltrace_cpu *llt, register_t code, + size_t argsize, const register_t *args) +{ + uint64_t record = LLTRACE_EVENT_PHASE_START << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_SYSCALL << + LLTRACE_EVENT_CLASS_SHIFT; + record |= ((uint64_t)code & LLTRACE_SYSCALL_MASK) << + LLTRACE_SYSCALL_SHIFT; + + if (argsize > 0) + record |= (uint64_t)args[0] << LLTRACE_SYSCALL_V_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_sysret(struct lltrace_cpu *llt, register_t code, + int error, const register_t retvals[2]) +{ + uint64_t record; + + record = LLTRACE_EVENT_PHASE_END << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_SYSCALL << + LLTRACE_EVENT_CLASS_SHIFT; + record |= ((uint64_t)code & LLTRACE_SYSCALL_MASK) << + LLTRACE_SYSCALL_SHIFT; + record |= (uint64_t)error << LLTRACE_SYSCALL_V_SHIFT; + + llt = lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); + if (llt == NULL) { + struct lltrace_softc *sc = lltrace_sc; + + rw_enter_write(&sc->sc_lock); + if (sc->sc_running) + lltrace_stop(sc, curproc); + + knote_locked(&sc->sc_sel.si_note, 0); + if (sc->sc_reading) + wakeup(&sc->sc_reading); + rw_exit_write(&sc->sc_lock); + } +} + +struct lltrace_cpu * +lltrace_pidname(struct lltrace_cpu *llt, struct proc *p) +{ + struct process *ps = p->p_p; + uint64_t record; + uint64_t extra[3]; + unsigned int l, n; + + CTASSERT(sizeof(extra) == sizeof(ps->ps_comm)); + + record = LLTRACE_ID_TYPE_TID << LLTRACE_ID_TYPE_SHIFT; + record |= (uint64_t)p->p_tid << LLTRACE_ID_TID_SHIFT; + record |= (uint64_t)ps->ps_pid << LLTRACE_ID_TID_PID_SHIFT; + if (ISSET(ps->ps_flags, PS_SYSTEM)) + record |= LLTRACE_ID_TID_SYSTEM; + + extra[0] = extra[1] = extra[2] = 0; /* memset */ + l = strlcpy((char *)extra, p->p_p->ps_comm, sizeof(extra)); + + /* turn the string length into the number of slots we need */ + n = howmany(l, sizeof(uint64_t)); + + return (lltrace_insert_record(llt, LLTRACE_TYPE_ID, record, extra, n)); +} + +void +lltrace_switch(struct lltrace_cpu *llt, struct proc *op, struct proc *np) +{ + struct process *nps = np->p_p; + uint64_t state; + uint64_t record; + unsigned int pid; + unsigned int wake; + + llt = lltrace_pidname(llt, np); + if (llt == NULL) + return; + + record = LLTRACE_EVENT_PHASE_INSTANT << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_SCHED << + LLTRACE_EVENT_CLASS_SHIFT; + record |= (uint64_t)np->p_tid << LLTRACE_EVENT_DATA_SHIFT; + + /* record what we think the state of the outgoing thread is */ + if (op == NULL) + state = LLTRACE_SCHED_STATE_DEAD; + else if (ISSET(op->p_flag, P_WEXIT)) + state = LLTRACE_SCHED_STATE_DYING; + else if (ISSET(op->p_flag, P_WSLEEP)) + state = LLTRACE_SCHED_STATE_SUSPENDED; + else + state = LLTRACE_SCHED_STATE_BLOCKED; + + record |= (state << LLTRACE_SCHED_STATE_SHIFT); + + pid = nps->ps_pid; + if (ISSET(nps->ps_flags, PS_SYSTEM)) + pid |= (1U << 31); + + llt->llt_pid = pid; + llt->llt_tid = np->p_tid; + + wake = np->p_wakeid != 0; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, &np->p_wakeid, wake); + + 
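+	/*
+	 * A non-zero p_wakeid was stamped on the incoming thread by
+	 * lltrace_runnable(); the switch record above carried it, so
+	 * clear it to ensure the wakeup is only matched once.
+	 */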
if (wake) + np->p_wakeid = 0; +} + +void +lltrace_runnable(struct lltrace_cpu *llt, struct proc *p) +{ + uint64_t record; + uint64_t wakeid; + + llt = lltrace_pidname(llt, p); + if (llt == NULL) + return; + + record = LLTRACE_EVENT_PHASE_INSTANT << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_WAKE << + LLTRACE_EVENT_CLASS_SHIFT; + record |= (uint64_t)p->p_tid << LLTRACE_EVENT_DATA_SHIFT; + + wakeid = (uint64_t)cpu_number() << 48; + wakeid |= (llt->llt_wakeid += 2) & LLTRACE_MASK(48); + p->p_wakeid = wakeid; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, &p->p_wakeid, 1); +} + +void +lltrace_sched_enter(struct lltrace_cpu *llt) +{ + uint64_t record = LLTRACE_EVENT_PHASE_START << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_SCHED << + LLTRACE_EVENT_CLASS_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_sched_leave(struct lltrace_cpu *llt) +{ + uint64_t record = LLTRACE_EVENT_PHASE_END << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_SCHED << + LLTRACE_EVENT_CLASS_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_idle(struct lltrace_cpu *llt, unsigned int idle) +{ + uint64_t record = + (idle ? LLTRACE_EVENT_PHASE_START : LLTRACE_EVENT_PHASE_END) << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_IDLE << LLTRACE_EVENT_CLASS_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_event_start(struct lltrace_cpu *llt, unsigned int class) +{ + uint64_t record = LLTRACE_EVENT_PHASE_START << + LLTRACE_EVENT_PHASE_SHIFT; + record |= class << LLTRACE_EVENT_CLASS_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_event_end(struct lltrace_cpu *llt, unsigned int class) +{ + uint64_t record = LLTRACE_EVENT_PHASE_END << + LLTRACE_EVENT_PHASE_SHIFT; + record |= class << LLTRACE_EVENT_CLASS_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +static inline void +lltrace_intr(struct lltrace_cpu *llt, uint64_t phase, + uint64_t type, uint64_t data) +{ + uint64_t record = phase << LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_INTR << LLTRACE_EVENT_CLASS_SHIFT; + record |= type << LLTRACE_INTR_T_SHIFT; + record |= data << LLTRACE_INTR_DATA_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_ipi(struct lltrace_cpu *llt, unsigned int cpu) +{ + lltrace_intr(llt, LLTRACE_EVENT_PHASE_INSTANT, + LLTRACE_INTR_T_IPI, cpu); +} + +void +lltrace_intr_enter(struct lltrace_cpu *llt, unsigned int type, unsigned int vec) +{ + lltrace_intr(llt, LLTRACE_EVENT_PHASE_START, type, vec); +} + +void +lltrace_intr_leave(struct lltrace_cpu *llt, unsigned int type, unsigned int vec) +{ + lltrace_intr(llt, LLTRACE_EVENT_PHASE_END, type, vec); +} + +void +lltrace_lock(struct lltrace_cpu *llt, void *lock, + unsigned int type, unsigned int step) +{ + uint64_t record = (uint64_t)type << LLTRACE_LK_TYPE_SHIFT; + record |= (uint64_t)step << LLTRACE_LK_PHASE_SHIFT; + record |= (uint64_t)lock << LLTRACE_LK_ADDR_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_LOCKING, record, NULL, 0); +} + +void +lltrace_count(struct lltrace_cpu *llt, unsigned int t, unsigned int v) +{ + uint64_t record; + + record = LLTRACE_EVENT_PHASE_INSTANT << LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_COUNT << LLTRACE_EVENT_CLASS_SHIFT; + record |= (uint64_t)t << LLTRACE_COUNT_T_SHIFT; + record |= (uint64_t)v << LLTRACE_COUNT_V_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, 
record, NULL, 0); +} + +void +lltrace_mark(struct lltrace_cpu *llt) +{ +#if 0 + uint64_t record = LLTRACE_EVENT_MARK << LLTRACE_EVENT_SHIFT; + + lltrace_insert(llt, record, NULL, 0); +#endif +} + +static void +lltrace_fn(struct lltrace_cpu *llt, unsigned int phase, void *fn) +{ + uint64_t record = (uint64_t)phase << LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_FUNC << LLTRACE_EVENT_CLASS_SHIFT; + /* 32 bits is enough to identify most symbols */ + record |= (uint64_t)fn << LLTRACE_EVENT_DATA_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_fn_enter(struct lltrace_cpu *llt, void *fn) +{ + lltrace_fn(llt, LLTRACE_EVENT_PHASE_START, fn); +} + +void +lltrace_fn_leave(struct lltrace_cpu *llt, void *fn) +{ + lltrace_fn(llt, LLTRACE_EVENT_PHASE_END, fn); +} + +void +__cyg_profile_func_enter(void *fn, void *pc) +{ + struct lltrace_cpu *llt; + + llt = lltrace_enter(); + if (llt == NULL) + return; + + lltrace_fn_enter(llt, fn); +} + +void +__cyg_profile_func_exit(void *fn, void *pc) +{ + struct lltrace_cpu *llt; + + llt = lltrace_enter(); + if (llt == NULL) + return; + + lltrace_fn_leave(llt, fn); +} Index: sys/dev/pci/if_em.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_em.c,v diff -u -p -r1.378 if_em.c --- sys/dev/pci/if_em.c 31 Aug 2024 16:23:09 -0000 1.378 +++ sys/dev/pci/if_em.c 11 Sep 2024 11:29:26 -0000 @@ -452,7 +452,7 @@ em_attach(struct device *parent, struct sc->hw.wait_autoneg_complete = WAIT_FOR_AUTO_NEG_DEFAULT; sc->hw.autoneg_advertised = AUTONEG_ADV_DEFAULT; sc->hw.tbi_compatibility_en = TRUE; - sc->sc_rx_buffer_len = EM_RXBUFFER_2048; + sc->sc_rx_buffer_len = EM_MCLBYTES; sc->hw.phy_init_script = 1; sc->hw.phy_reset_disable = FALSE; @@ -780,7 +780,7 @@ em_ioctl(struct ifnet *ifp, u_long comma case SIOCGIFRXR: error = if_rxr_ioctl((struct if_rxrinfo *)ifr->ifr_data, - NULL, EM_MCLBYTES, &sc->queues->rx.sc_rx_ring); + NULL, sc->sc_rx_buffer_len, &sc->queues->rx.sc_rx_ring); break; case SIOCGIFSFFPAGE: @@ -2712,16 +2712,18 @@ em_get_buf(struct em_queue *que, int i) KASSERT(pkt->pkt_m == NULL); - m = MCLGETL(NULL, M_DONTWAIT, EM_MCLBYTES); + m = MCLGETL(NULL, M_DONTWAIT, sc->sc_rx_buffer_len + ETHER_ALIGN); if (m == NULL) { sc->mbuf_cluster_failed++; return (ENOBUFS); } - m->m_len = m->m_pkthdr.len = EM_MCLBYTES; - m_adj(m, ETHER_ALIGN); + + m->m_data += ETHER_ALIGN; + m->m_len = m->m_pkthdr.len = sc->sc_rx_buffer_len; error = bus_dmamap_load_mbuf(sc->sc_dmat, pkt->pkt_map, - m, BUS_DMA_NOWAIT); + m, BUS_DMA_NOWAIT | + (ISSET(sc->sc_ac.ac_if.if_flags, IFF_LINK0) ? 
BUS_DMA_BUS4 : 0)); if (error) { m_freem(m); return (error); @@ -2771,8 +2773,9 @@ em_allocate_receive_structures(struct em for (i = 0; i < sc->sc_rx_slots; i++) { pkt = &que->rx.sc_rx_pkts_ring[i]; - error = bus_dmamap_create(sc->sc_dmat, EM_MCLBYTES, 1, - EM_MCLBYTES, 0, BUS_DMA_NOWAIT, &pkt->pkt_map); + error = bus_dmamap_create(sc->sc_dmat, + sc->sc_rx_buffer_len, 1, sc->sc_rx_buffer_len, 0, + BUS_DMA_NOWAIT, &pkt->pkt_map); if (error != 0) { printf("%s: em_allocate_receive_structures: " "bus_dmamap_create failed; error %u\n", @@ -2801,7 +2804,6 @@ em_setup_receive_structures(struct em_so { struct ifnet *ifp = &sc->sc_ac.ac_if; struct em_queue *que; - u_int lwm; if (em_allocate_receive_structures(sc)) return (ENOMEM); @@ -2814,8 +2816,9 @@ em_setup_receive_structures(struct em_so que->rx.sc_rx_desc_tail = 0; que->rx.sc_rx_desc_head = sc->sc_rx_slots - 1; - lwm = max(4, 2 * ((ifp->if_hardmtu / MCLBYTES) + 1)); - if_rxr_init(&que->rx.sc_rx_ring, lwm, sc->sc_rx_slots); + if_rxr_init(&que->rx.sc_rx_ring, + 2 * howmany(ifp->if_hardmtu, sc->sc_rx_buffer_len) + 1, + sc->sc_rx_slots - 1); if (em_rxfill(que) == 0) { printf("%s: unable to fill any rx descriptors\n", Index: sys/dev/pci/if_em.h =================================================================== RCS file: /cvs/src/sys/dev/pci/if_em.h,v diff -u -p -r1.83 if_em.h --- sys/dev/pci/if_em.h 16 Feb 2024 22:30:54 -0000 1.83 +++ sys/dev/pci/if_em.h 11 Sep 2024 11:29:26 -0000 @@ -268,7 +268,7 @@ typedef int boolean_t; #define EM_RXBUFFER_8192 8192 #define EM_RXBUFFER_16384 16384 -#define EM_MCLBYTES (EM_RXBUFFER_2048 + ETHER_ALIGN) +#define EM_MCLBYTES EM_RXBUFFER_2048 #define EM_MAX_SCATTER 64 #define EM_TSO_SIZE 65535 Index: sys/dev/pci/if_ix.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_ix.c,v diff -u -p -r1.217 if_ix.c --- sys/dev/pci/if_ix.c 4 Sep 2024 07:54:52 -0000 1.217 +++ sys/dev/pci/if_ix.c 11 Sep 2024 11:29:26 -0000 @@ -764,7 +764,7 @@ ixgbe_init(void *arg) ixgbe_initialize_transmit_units(sc); /* Use 2k clusters, even for jumbo frames */ - sc->rx_mbuf_sz = MCLBYTES + ETHER_ALIGN; + sc->rx_mbuf_sz = MCLBYTES; /* Prepare receive descriptors and buffers */ if (ixgbe_setup_receive_structures(sc)) { @@ -1475,10 +1475,10 @@ ixgbe_encap(struct ix_txring *txr, struc for (j = 0; j < map->dm_nsegs; j++) { txd = &txr->tx_base[i]; - txd->read.buffer_addr = htole64(map->dm_segs[j].ds_addr); - txd->read.cmd_type_len = htole32(txr->txd_cmd | + htolem64(&txd->read.buffer_addr, map->dm_segs[j].ds_addr); + htolem32(&txd->read.cmd_type_len, txr->txd_cmd | cmd_type_len | map->dm_segs[j].ds_len); - txd->read.olinfo_status = htole32(olinfo_status); + htolem32(&txd->read.olinfo_status, olinfo_status); last = i; /* descriptor that will get completion IRQ */ if (++i == sc->num_tx_desc) @@ -2620,9 +2620,6 @@ ixgbe_txeof(struct ix_txring *txr) struct ixgbe_tx_buf *tx_buffer; struct ixgbe_legacy_tx_desc *tx_desc; - if (!ISSET(ifp->if_flags, IFF_RUNNING)) - return FALSE; - head = txr->next_avail_desc; tail = txr->next_to_clean; @@ -2697,12 +2694,11 @@ ixgbe_get_buf(struct ix_rxring *rxr, int return (ENOBUFS); } - /* needed in any case so preallocate since this one will fail for sure */ - mp = MCLGETL(NULL, M_DONTWAIT, sc->rx_mbuf_sz); + mp = MCLGETL(NULL, M_DONTWAIT, sc->rx_mbuf_sz + ETHER_ALIGN); if (!mp) return (ENOBUFS); - mp->m_data += (mp->m_ext.ext_size - sc->rx_mbuf_sz); + mp->m_data += ETHER_ALIGN; mp->m_len = mp->m_pkthdr.len = sc->rx_mbuf_sz; error = bus_dmamap_load_mbuf(rxr->rxdma.dma_tag, 
rxbuf->map, @@ -2716,7 +2712,7 @@ ixgbe_get_buf(struct ix_rxring *rxr, int 0, rxbuf->map->dm_mapsize, BUS_DMASYNC_PREREAD); rxbuf->buf = mp; - rxdesc->read.pkt_addr = htole64(rxbuf->map->dm_segs[0].ds_addr); + htolem64(&rxdesc->read.pkt_addr, rxbuf->map->dm_segs[0].ds_addr); return (0); } @@ -2747,8 +2743,9 @@ ixgbe_allocate_receive_buffers(struct ix rxbuf = rxr->rx_buffers; for (i = 0; i < sc->num_rx_desc; i++, rxbuf++) { - error = bus_dmamap_create(rxr->rxdma.dma_tag, 16 * 1024, 1, - 16 * 1024, 0, BUS_DMA_NOWAIT, &rxbuf->map); + error = bus_dmamap_create(rxr->rxdma.dma_tag, + sc->rx_mbuf_sz, 1, sc->rx_mbuf_sz, 0, + BUS_DMA_NOWAIT, &rxbuf->map); if (error) { printf("%s: Unable to create Pack DMA map\n", ifp->if_xname); @@ -2789,7 +2786,8 @@ ixgbe_setup_receive_ring(struct ix_rxrin rxr->next_to_check = 0; rxr->last_desc_filled = sc->num_rx_desc - 1; - if_rxr_init(&rxr->rx_ring, 2 * ((ifp->if_hardmtu / MCLBYTES) + 1), + if_rxr_init(&rxr->rx_ring, + 2 * howmany(ifp->if_hardmtu, MCLBYTES) + 1, sc->num_rx_desc - 1); ixgbe_rxfill(rxr); @@ -2924,7 +2922,7 @@ ixgbe_initialize_receive_units(struct ix IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl); } - bufsz = (sc->rx_mbuf_sz - ETHER_ALIGN) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; + bufsz = sc->rx_mbuf_sz >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; for (i = 0; i < sc->num_queues; i++, rxr++) { uint64_t rdba = rxr->rxdma.dma_map->dm_segs[0].ds_addr; @@ -3139,7 +3137,7 @@ ixgbe_rxeof(struct ix_rxring *rxr) dsize * i, dsize, BUS_DMASYNC_POSTREAD); rxdesc = &rxr->rx_base[i]; - staterr = letoh32(rxdesc->wb.upper.status_error); + staterr = lemtoh32(&rxdesc->wb.upper.status_error); if (!ISSET(staterr, IXGBE_RXD_STAT_DD)) { bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, dsize * i, dsize, @@ -3157,8 +3155,8 @@ ixgbe_rxeof(struct ix_rxring *rxr) bus_dmamap_unload(rxr->rxdma.dma_tag, rxbuf->map); mp = rxbuf->buf; - len = letoh16(rxdesc->wb.upper.length); - vtag = letoh16(rxdesc->wb.upper.vlan); + len = lemtoh16(&rxdesc->wb.upper.length); + vtag = lemtoh16(&rxdesc->wb.upper.vlan); eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0); hash = lemtoh32(&rxdesc->wb.lower.hi_dword.rss); hashtype = Index: sys/dev/pci/if_ix.h =================================================================== RCS file: /cvs/src/sys/dev/pci/if_ix.h,v diff -u -p -r1.47 if_ix.h --- sys/dev/pci/if_ix.h 21 May 2024 11:19:39 -0000 1.47 +++ sys/dev/pci/if_ix.h 11 Sep 2024 11:29:26 -0000 @@ -63,7 +63,7 @@ * against the system mbuf pool limit, you can tune nmbclusters * to adjust for this. 
*/ -#define DEFAULT_RXD 256 +#define DEFAULT_RXD 2048 #define PERFORM_RXD 2048 #define MAX_RXD 4096 #define MIN_RXD 64 Index: sys/dev/pci/if_ixl.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_ixl.c,v diff -u -p -r1.101 if_ixl.c --- sys/dev/pci/if_ixl.c 24 May 2024 06:02:53 -0000 1.101 +++ sys/dev/pci/if_ixl.c 11 Sep 2024 11:29:26 -0000 @@ -3373,8 +3373,9 @@ ixl_rxfill(struct ixl_softc *sc, struct m = MCLGETL(NULL, M_DONTWAIT, MCLBYTES + ETHER_ALIGN); if (m == NULL) break; - m->m_data += (m->m_ext.ext_size - (MCLBYTES + ETHER_ALIGN)); - m->m_len = m->m_pkthdr.len = MCLBYTES + ETHER_ALIGN; + + m->m_data += ETHER_ALIGN; + m->m_len = m->m_pkthdr.len = MCLBYTES; map = rxm->rxm_map; Index: sys/kern/kern_clockintr.c =================================================================== RCS file: /cvs/src/sys/kern/kern_clockintr.c,v diff -u -p -r1.70 kern_clockintr.c --- sys/kern/kern_clockintr.c 25 Feb 2024 19:15:50 -0000 1.70 +++ sys/kern/kern_clockintr.c 11 Sep 2024 11:29:29 -0000 @@ -30,6 +30,7 @@ #include #include #include +#include void clockintr_cancel_locked(struct clockintr *); void clockintr_hardclock(struct clockrequest *, void *, void *); @@ -209,7 +210,9 @@ clockintr_dispatch(void *frame) cq->cq_running = cl; mtx_leave(&cq->cq_mtx); + LLTRACE_CPU(ci, lltrace_fn_enter, func); func(request, frame, arg); + LLTRACE_CPU(ci, lltrace_fn_leave, func); mtx_enter(&cq->cq_mtx); cq->cq_running = NULL; Index: sys/kern/kern_exec.c =================================================================== RCS file: /cvs/src/sys/kern/kern_exec.c,v diff -u -p -r1.258 kern_exec.c --- sys/kern/kern_exec.c 21 Aug 2024 03:07:45 -0000 1.258 +++ sys/kern/kern_exec.c 11 Sep 2024 11:29:29 -0000 @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -539,6 +540,8 @@ sys_execve(struct proc *p, void *v, regi memset(pr->ps_comm, 0, sizeof(pr->ps_comm)); strlcpy(pr->ps_comm, nid.ni_cnd.cn_nameptr, sizeof(pr->ps_comm)); pr->ps_acflag &= ~AFORK; + + LLTRACE(lltrace_pidname, p); /* record proc's vnode, for use by sysctl */ otvp = pr->ps_textvp; Index: sys/kern/kern_intrmap.c =================================================================== RCS file: /cvs/src/sys/kern/kern_intrmap.c,v diff -u -p -r1.3 kern_intrmap.c --- sys/kern/kern_intrmap.c 23 Jun 2020 01:40:03 -0000 1.3 +++ sys/kern/kern_intrmap.c 11 Sep 2024 11:29:29 -0000 @@ -103,6 +103,8 @@ intrmap_cpus_get(void) M_DEVBUF, M_WAITOK); CPU_INFO_FOREACH(cii, ci) { + if (icpus > 0) + continue; #ifdef __HAVE_CPU_TOPOLOGY if (ci->ci_smt_id > 0) continue; Index: sys/kern/kern_lock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_lock.c,v diff -u -p -r1.75 kern_lock.c --- sys/kern/kern_lock.c 3 Jul 2024 01:36:50 -0000 1.75 +++ sys/kern/kern_lock.c 11 Sep 2024 11:29:29 -0000 @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -129,6 +130,7 @@ __mp_lock(struct __mp_lock *mpl) { struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[cpu_number()]; unsigned long s; + unsigned int depth; #ifdef WITNESS if (!__mp_lock_held(mpl, curcpu())) @@ -136,15 +138,22 @@ __mp_lock(struct __mp_lock *mpl) LOP_EXCLUSIVE | LOP_NEWORDER, NULL); #endif + s = intr_disable(); - if (cpu->mplc_depth++ == 0) + depth = cpu->mplc_depth++; + if (depth == 0) { + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_A_START); cpu->mplc_ticket = atomic_inc_int_nv(&mpl->mpl_users); + } intr_restore(s); __mp_lock_spin(mpl, cpu->mplc_ticket); membar_enter_after_atomic(); 
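+	/*
+	 * depth was sampled before the increment above, so only the
+	 * outermost acquisition is traced: LLTRACE_LK_A_START before
+	 * spinning for the ticket, LLTRACE_LK_A_EXCL below once the
+	 * lock is held.
+	 */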
WITNESS_LOCK(&mpl->mpl_lock_obj, LOP_EXCLUSIVE); + + if (depth == 0) + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_A_EXCL); } void @@ -164,6 +173,7 @@ __mp_unlock(struct __mp_lock *mpl) s = intr_disable(); if (--cpu->mplc_depth == 0) { + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_R_EXCL); membar_exit(); mpl->mpl_ticket++; } @@ -180,6 +190,8 @@ __mp_release_all(struct __mp_lock *mpl) int i; #endif + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_R_EXCL); + s = intr_disable(); rv = cpu->mplc_depth; #ifdef WITNESS @@ -227,29 +239,60 @@ __mtx_init(struct mutex *mtx, int wantip void mtx_enter(struct mutex *mtx) { - struct schedstate_percpu *spc = &curcpu()->ci_schedstate; + struct cpu_info *owner, *ci = curcpu(); + struct schedstate_percpu *spc = &ci->ci_schedstate; + int s; #ifdef MP_LOCKDEBUG int nticks = __mp_lock_spinout; #endif +#if NLLT > 0 + unsigned int lltev = LLTRACE_LK_I_EXCL; +#endif WITNESS_CHECKORDER(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE | LOP_NEWORDER, NULL); - spc->spc_spinning++; - while (mtx_enter_try(mtx) == 0) { + if (mtx->mtx_wantipl != IPL_NONE) + s = splraise(mtx->mtx_wantipl); + + owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci); +#ifdef DIAGNOSTIC + if (__predict_false(owner == ci)) + panic("mtx %p: locking against myself", mtx); +#endif + if (owner != NULL) { + LLTRACE_SPC(spc, lltrace_lock, mtx, LLTRACE_LK_MTX, + LLTRACE_LK_A_START); + + spc->spc_spinning++; do { - CPU_BUSY_CYCLE(); + do { + CPU_BUSY_CYCLE(); #ifdef MP_LOCKDEBUG - if (--nticks == 0) { - db_printf("%s: %p lock spun out\n", - __func__, mtx); - db_enter(); - nticks = __mp_lock_spinout; - } + if (--nticks == 0) { + db_printf("%s: %p lock spun out\n", + __func__, mtx); + db_enter(); + nticks = __mp_lock_spinout; + } +#endif + } while (mtx->mtx_owner != NULL); + } while (atomic_cas_ptr(&mtx->mtx_owner, NULL, ci) != NULL); + spc->spc_spinning--; + +#if NLLT > 0 + lltev = LLTRACE_LK_A_EXCL; #endif - } while (mtx->mtx_owner != NULL); } - spc->spc_spinning--; + + membar_enter_after_atomic(); + if (mtx->mtx_wantipl != IPL_NONE) + mtx->mtx_oldipl = s; +#ifdef DIAGNOSTIC + ci->ci_mutex_level++; +#endif + WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); + LLTRACE_SPC(spc, lltrace_lock, mtx, LLTRACE_LK_MTX, lltev); } int @@ -278,12 +321,15 @@ mtx_enter_try(struct mutex *mtx) ci->ci_mutex_level++; #endif WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); + LLTRACE_CPU(ci, lltrace_lock, mtx, LLTRACE_LK_MTX, + LLTRACE_LK_I_EXCL); return (1); } if (mtx->mtx_wantipl != IPL_NONE) splx(s); + LLTRACE_CPU(ci, lltrace_lock, mtx, LLTRACE_LK_MTX, LLTRACE_LK_I_FAIL); return (0); } #else @@ -313,6 +359,7 @@ mtx_enter(struct mutex *mtx) ci->ci_mutex_level++; #endif WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); + LLTRACE(lltrace_lock, mtx, LLTRACE_LK_MTX, LLTRACE_LK_I_EXCL); } int @@ -333,6 +380,7 @@ mtx_leave(struct mutex *mtx) return; MUTEX_ASSERT_LOCKED(mtx); + LLTRACE(lltrace_lock, mtx, LLTRACE_LK_MTX, LLTRACE_LK_R_EXCL); WITNESS_UNLOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); #ifdef DIAGNOSTIC Index: sys/kern/kern_rwlock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_rwlock.c,v diff -u -p -r1.50 kern_rwlock.c --- sys/kern/kern_rwlock.c 14 Jul 2023 07:07:08 -0000 1.50 +++ sys/kern/kern_rwlock.c 11 Sep 2024 11:29:29 -0000 @@ -25,6 +25,7 @@ #include #include #include +#include void rw_do_exit(struct rwlock *, unsigned long); @@ -110,6 +111,7 @@ rw_enter_read(struct rwlock *rwl) membar_enter_after_atomic(); 
WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, NULL); WITNESS_LOCK(&rwl->rwl_lock_obj, 0); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_I_SHARED); } } @@ -126,6 +128,7 @@ rw_enter_write(struct rwlock *rwl) WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_EXCLUSIVE | LOP_NEWORDER, NULL); WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_I_EXCL); } } @@ -135,6 +138,7 @@ rw_exit_read(struct rwlock *rwl) unsigned long owner; rw_assert_rdlock(rwl); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_R_SHARED); WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0); membar_exit_before_atomic(); @@ -150,6 +154,7 @@ rw_exit_write(struct rwlock *rwl) unsigned long owner; rw_assert_wrlock(rwl); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_R_SHARED); WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE); membar_exit_before_atomic(); @@ -249,6 +254,7 @@ rw_enter(struct rwlock *rwl, int flags) op = &rw_ops[(flags & RW_OPMASK) - 1]; inc = op->inc + RW_PROC(curproc) * op->proc_mult; + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_A_START); retry: while (__predict_false(((o = rwl->rwl_owner) & op->check) != 0)) { unsigned long set = o | op->wait_set; @@ -272,8 +278,10 @@ retry: rw_enter_diag(rwl, flags); - if (flags & RW_NOSLEEP) - return (EBUSY); + if (flags & RW_NOSLEEP) { + error = EBUSY; + goto abort; + } prio = op->wait_prio; if (flags & RW_INTR) @@ -285,15 +293,28 @@ retry: error = sleep_finish(0, do_sleep); if ((flags & RW_INTR) && (error != 0)) - return (error); - if (flags & RW_SLEEPFAIL) - return (EAGAIN); + goto abort; + if (flags & RW_SLEEPFAIL) { + error = EAGAIN; + goto abort; + } } if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc))) goto retry; membar_enter_after_atomic(); + if (flags & RW_DOWNGRADE) { + WITNESS_DOWNGRADE(&rwl->rwl_lock_obj, lop_flags); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, + LLTRACE_LK_DOWNGRADE); + } else { + WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, + ISSET(flags, RW_WRITE) ? + LLTRACE_LK_A_EXCL : LLTRACE_LK_A_SHARED); + } + /* * If old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we * downgraded a write lock and had possible read waiter, wake them @@ -303,12 +324,10 @@ retry: (RWLOCK_WRLOCK|RWLOCK_WAIT))) wakeup(rwl); - if (flags & RW_DOWNGRADE) - WITNESS_DOWNGRADE(&rwl->rwl_lock_obj, lop_flags); - else - WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags); - return (0); +abort: + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_A_ABORT); + return (error); } void @@ -325,6 +344,8 @@ rw_exit(struct rwlock *rwl) rw_assert_wrlock(rwl); else rw_assert_rdlock(rwl); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, + wrlock ? LLTRACE_LK_R_EXCL : LLTRACE_LK_R_SHARED); WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0); membar_exit_before_atomic(); Index: sys/kern/kern_sched.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sched.c,v diff -u -p -r1.100 kern_sched.c --- sys/kern/kern_sched.c 9 Jul 2024 08:44:36 -0000 1.100 +++ sys/kern/kern_sched.c 11 Sep 2024 11:29:29 -0000 @@ -191,7 +191,10 @@ sched_idle(void *v) wakeup(spc); } #endif + + LLTRACE(lltrace_idle, 1); cpu_idle_cycle(); + LLTRACE(lltrace_idle, 0); } cpu_idle_leave(); cpuset_del(&sched_idle_cpus, ci); @@ -609,6 +612,7 @@ sched_proc_to_cpu_cost(struct cpu_info * if (cpuset_isset(&sched_queued_cpus, ci)) cost += spc->spc_nrun * sched_cost_runnable; +#if 0 /* * Try to avoid the primary cpu as it handles hardware interrupts. 
* @@ -617,6 +621,7 @@ sched_proc_to_cpu_cost(struct cpu_info * */ if (CPU_IS_PRIMARY(ci)) cost += sched_cost_runnable; +#endif /* * If the proc is on this cpu already, lower the cost by how much Index: sys/kern/kern_sensors.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sensors.c,v diff -u -p -r1.40 kern_sensors.c --- sys/kern/kern_sensors.c 5 Dec 2022 23:18:37 -0000 1.40 +++ sys/kern/kern_sensors.c 11 Sep 2024 11:29:29 -0000 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "hotplug.h" @@ -260,8 +261,11 @@ sensor_task_work(void *xst) atomic_inc_int(&sensors_running); rw_enter_write(&st->lock); period = st->period; - if (period > 0 && !sensors_quiesced) + if (period > 0 && !sensors_quiesced) { + LLTRACE(lltrace_fn_enter, st->func); st->func(st->arg); + LLTRACE(lltrace_fn_leave, st->func); + } rw_exit_write(&st->lock); if (atomic_dec_int_nv(&sensors_running) == 0) { if (sensors_quiesced) Index: sys/kern/kern_synch.c =================================================================== RCS file: /cvs/src/sys/kern/kern_synch.c,v diff -u -p -r1.206 kern_synch.c --- sys/kern/kern_synch.c 23 Jul 2024 08:38:02 -0000 1.206 +++ sys/kern/kern_synch.c 11 Sep 2024 11:29:29 -0000 @@ -37,6 +37,8 @@ * @(#)kern_synch.c 8.6 (Berkeley) 1/21/94 */ +#include "llt.h" + #include #include #include @@ -521,6 +523,7 @@ unsleep(struct proc *p) p->p_wmesg = NULL; TRACEPOINT(sched, unsleep, p->p_tid + THREAD_PID_OFFSET, p->p_p->ps_pid); + LLTRACE(lltrace_runnable, p); } } @@ -557,6 +560,7 @@ wakeup_n(const volatile void *ident, int TAILQ_REMOVE(&wakeq, p, p_runq); TRACEPOINT(sched, unsleep, p->p_tid + THREAD_PID_OFFSET, p->p_p->ps_pid); + LLTRACE(lltrace_runnable, p); if (p->p_stat == SSLEEP) setrunnable(p); } Index: sys/kern/kern_task.c =================================================================== RCS file: /cvs/src/sys/kern/kern_task.c,v diff -u -p -r1.35 kern_task.c --- sys/kern/kern_task.c 14 May 2024 08:26:13 -0000 1.35 +++ sys/kern/kern_task.c 11 Sep 2024 11:29:29 -0000 @@ -24,6 +24,7 @@ #include #include #include +#include #include "kcov.h" #if NKCOV > 0 @@ -443,7 +444,9 @@ taskq_thread(void *xtq) #if NKCOV > 0 kcov_remote_enter(KCOV_REMOTE_COMMON, work.t_process); #endif + LLTRACE(lltrace_fn_enter, work.t_func); (*work.t_func)(work.t_arg); + LLTRACE(lltrace_fn_leave, work.t_func); #if NKCOV > 0 kcov_remote_leave(KCOV_REMOTE_COMMON, work.t_process); #endif Index: sys/kern/kern_timeout.c =================================================================== RCS file: /cvs/src/sys/kern/kern_timeout.c,v diff -u -p -r1.99 kern_timeout.c --- sys/kern/kern_timeout.c 11 Aug 2024 00:49:34 -0000 1.99 +++ sys/kern/kern_timeout.c 11 Sep 2024 11:29:29 -0000 @@ -35,6 +35,7 @@ #include /* _Q_INVALIDATE */ #include #include +#include #ifdef DDB #include @@ -658,7 +659,9 @@ timeout_run(struct timeout *to) #if NKCOV > 0 kcov_remote_enter(KCOV_REMOTE_COMMON, kcov_process); #endif + LLTRACE(lltrace_fn_enter, fn); fn(arg); + LLTRACE(lltrace_fn_leave, fn); #if NKCOV > 0 kcov_remote_leave(KCOV_REMOTE_COMMON, kcov_process); #endif @@ -736,6 +739,8 @@ softclock(void *arg) int need_proc_mp; #endif + //LLTRACE(lltrace_irq, LLTRACE_IRQ_BOTTOM_HALF, 0); + first_new = NULL; new = 0; @@ -770,6 +775,8 @@ softclock(void *arg) if (need_proc_mp) wakeup(&timeout_proc_mp); #endif + + //LLTRACE(lltrace_irqret, LLTRACE_IRQ_BOTTOM_HALF, 0); } void Index: sys/kern/sched_bsd.c =================================================================== RCS file: 
/cvs/src/sys/kern/sched_bsd.c,v diff -u -p -r1.94 sched_bsd.c --- sys/kern/sched_bsd.c 8 Jul 2024 13:17:12 -0000 1.94 +++ sys/kern/sched_bsd.c 11 Sep 2024 11:29:29 -0000 @@ -350,6 +350,8 @@ mi_switch(void) int hold_count; #endif + LLTRACE(lltrace_sched_enter); + KASSERT(p->p_stat != SONPROC); SCHED_ASSERT_LOCKED(); @@ -410,14 +412,19 @@ mi_switch(void) uvmexp.swtch++; TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET, nextproc->p_p->ps_pid); + LLTRACE(lltrace_switch, p, nextproc); cpu_switchto(p, nextproc); TRACEPOINT(sched, on__cpu, NULL); + + //LLTRACE(lltrace_pidname, p); } else { TRACEPOINT(sched, remain__cpu, NULL); p->p_stat = SONPROC; } clear_resched(curcpu()); + + LLTRACE(lltrace_sched_leave); SCHED_ASSERT_LOCKED(); Index: sys/kern/subr_vmem.c =================================================================== RCS file: sys/kern/subr_vmem.c diff -N sys/kern/subr_vmem.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/kern/subr_vmem.c 11 Sep 2024 11:29:29 -0000 @@ -0,0 +1,1660 @@ +/* $NetBSD: subr_vmem.c,v 1.116 2024/04/24 02:08:03 thorpej Exp $ */ + +/*- + * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi, + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * reference: + * - Magazines and Vmem: Extending the Slab Allocator + * to Many CPUs and Arbitrary Resources + * http://www.usenix.org/event/usenix01/bonwick.html + * + * locking & the boundary tag pool: + * - A pool(9) is used for vmem boundary tags + * - During pool_put calls no vmem mutexes are locked. 
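+ * - In this port the NetBSD condvar and pool_cache paths are stubbed
+ *   out; bt_alloc() simply retries after a short msleep_nsec(9).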
+ */ + +#include +#include +#include + +#include +#include /* hz */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define VMEM_MAXORDER (sizeof(vmem_size_t) * CHAR_BIT) + +typedef struct vmem_btag bt_t; + +TAILQ_HEAD(vmem_seglist, vmem_btag); +LIST_HEAD(vmem_freelist, vmem_btag); +LIST_HEAD(vmem_hashlist, vmem_btag); + +#define VMEM_NAME_MAX 16 + +/* vmem arena */ +struct vmem { + struct mutex vm_lock; + + vm_flag_t vm_flags; + vmem_import_t *vm_importfn; + vmem_release_t *vm_releasefn; + size_t vm_nfreetags; + LIST_HEAD(, vmem_btag) vm_freetags; + void *vm_arg; + struct vmem_seglist vm_seglist; + struct vmem_freelist vm_freelist[VMEM_MAXORDER]; + size_t vm_hashsize; + size_t vm_hashmask; + size_t vm_nbusytag; + size_t vm_maxbusytag; + struct vmem_hashlist *vm_hashlist; + struct vmem_hashlist vm_hash0; + size_t vm_quantum_mask; + int vm_quantum_shift; + size_t vm_size; + size_t vm_inuse; + char vm_name[VMEM_NAME_MAX+1]; + LIST_ENTRY(vmem) vm_alllist; +}; + +/* boundary tag */ +struct vmem_btag { + TAILQ_ENTRY(vmem_btag) bt_seglist; + union { + LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */ + LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */ + } bt_u; +#define bt_hashlist bt_u.u_hashlist +#define bt_freelist bt_u.u_freelist + vmem_addr_t bt_start; + vmem_size_t bt_size; + short bt_type; + short bt_flags; +}; + +#define BT_TYPE_SPAN 1 +#define BT_TYPE_SPAN_STATIC 2 +#define BT_TYPE_FREE 3 +#define BT_TYPE_BUSY 4 +#define BT_ISSPAN_P(bt) ((bt)->bt_type <= BT_TYPE_SPAN_STATIC) + +#define BT_F_PRIVATE 0x0001 + +#define BT_END(bt) ((bt)->bt_start + (bt)->bt_size - 1) + +/* + * Provide an estimated number of boundary tags needed for a given + * number of allocations from the vmem arena. This estimate is + * based on 2 boundary tags per allocation (see vmem_xalloc()) and + * 2 boundary tags per added span (one to describe the span, one to + * describe the span's free space; see vmem_add1()). 
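+ * e.g. an arena expected to carry 4 spans and 100 live allocations
+ * should reserve VMEM_EST_BTCOUNT(4, 100) == (4 * 2) + (100 * 2) == 208
+ * boundary tags.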
+ */ +#define VMEM_EST_BTCOUNT(ns, na) (((ns) * 2) + ((na) * 2)) + +vmem_t * vmem_init(vmem_t *, const char *, vmem_addr_t, vmem_size_t, + vmem_size_t, vmem_import_t *, vmem_release_t *, vmem_t *, + vmem_size_t, vm_flag_t, int); +void vmem_add_bts(vmem_t *, struct vmem_btag *, unsigned int); + +#if defined(VMEM_SANITY) +static void vmem_check(vmem_t *); +#else /* defined(VMEM_SANITY) */ +#define vmem_check(vm) /* nothing */ +#endif /* defined(VMEM_SANITY) */ + +#define VMEM_HASHSIZE_MIN 1 /* XXX */ +#define VMEM_HASHSIZE_MAX 65536 /* XXX */ +#define VMEM_HASHSIZE_INIT 1 + +#define VM_FITMASK (VM_BESTFIT | VM_INSTANTFIT) + +static bool vmem_bootstrapped; +struct rwlock vmem_list_lock = RWLOCK_INITIALIZER("vmemlist"); +static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list); + +/* ---- misc */ + +#define VMEM_LOCK(vm) mtx_enter(&(vm)->vm_lock) +#define VMEM_TRYLOCK(vm) mtx_tryenter(&(vm)->vm_lock) +#define VMEM_UNLOCK(vm) mtx_leave(&(vm)->vm_lock) +#define VMEM_LOCK_INIT(vm, ipl) mtx_init(&(vm)->vm_lock, (ipl)) +#define VMEM_LOCK_DESTROY(vm) /* nop */ +#define VMEM_ASSERT_LOCKED(vm) MUTEX_ASSERT_LOCKED(&(vm)->vm_lock) + +#define VMEM_ALIGNUP(addr, align) \ + (-(-(addr) & -(align))) + +#define VMEM_CROSS_P(addr1, addr2, boundary) \ + ((((addr1) ^ (addr2)) & -(boundary)) != 0) + +#define ORDER2SIZE(order) ((vmem_size_t)1 << (order)) +#define SIZE2ORDER(size) (flsl(size) - 1) + +static void +vmem_kick_pdaemon(void) +{ + printf("%s\n", __func__); +#if defined(_KERNEL) + //uvm_kick_pdaemon(); +#endif +} + +static void vmem_xfree_bt(vmem_t *, bt_t *); + +/* + * This reserve is 4 for each arena involved in allocating vmems memory. + * BT_MAXFREE: don't cache excessive counts of bts in arenas + */ +#define BT_MINRESERVE 4 +#define BT_MAXFREE 64 + +static struct pool vmem_btag_pool; + +/* ---- boundary tag */ + +static int bt_refill(vmem_t *vm); +static int bt_refill_locked(vmem_t *vm); + +static int +bt_refill_locked(vmem_t *vm) +{ + bt_t *bt; + + VMEM_ASSERT_LOCKED(vm); + + if (vm->vm_nfreetags > BT_MINRESERVE) { + return 0; + } + + while (vm->vm_nfreetags <= BT_MINRESERVE) { + VMEM_UNLOCK(vm); + KASSERT(vmem_btag_pool.pr_size); + bt = pool_get(&vmem_btag_pool, PR_NOWAIT); + VMEM_LOCK(vm); + if (bt == NULL) + break; + bt->bt_flags = 0; + LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); + vm->vm_nfreetags++; + } + + if (vm->vm_nfreetags <= BT_MINRESERVE) { + return ENOMEM; + } + + return 0; +} + +static int +bt_refill(vmem_t *vm) +{ + int rv; + + VMEM_LOCK(vm); + rv = bt_refill_locked(vm); + VMEM_UNLOCK(vm); + return rv; +} + +static bt_t * +bt_alloc(vmem_t *vm, vm_flag_t flags) +{ + bt_t *bt; + + VMEM_ASSERT_LOCKED(vm); + + while (vm->vm_nfreetags <= BT_MINRESERVE && (flags & VM_POPULATING) == 0) { + if (bt_refill_locked(vm)) { + if ((flags & VM_NOSLEEP) != 0) { + return NULL; + } + + /* + * It would be nice to wait for something specific here + * but there are multiple ways that a retry could + * succeed and we can't wait for multiple things + * simultaneously. So we'll just sleep for an arbitrary + * short period of time and retry regardless. + * This should be a very rare case. 
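+			 * (the msleep_nsec() below uses a 1ns timeout purely
+			 * as a brief, arbitrary pause before the retry.)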
+ */ + + vmem_kick_pdaemon(); + msleep_nsec(&vm->vm_freetags, &vm->vm_lock, PWAIT, + "btalloc", 1); + } + } + bt = LIST_FIRST(&vm->vm_freetags); + LIST_REMOVE(bt, bt_freelist); + vm->vm_nfreetags--; + + return bt; +} + +static void +bt_free(vmem_t *vm, bt_t *bt) +{ + + VMEM_ASSERT_LOCKED(vm); + + LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); + vm->vm_nfreetags++; +} + +static void +bt_freetrim(vmem_t *vm, int freelimit) +{ + bt_t *bt, *next_bt; + LIST_HEAD(, vmem_btag) tofree; + + VMEM_ASSERT_LOCKED(vm); + + LIST_INIT(&tofree); + + LIST_FOREACH_SAFE(bt, &vm->vm_freetags, bt_freelist, next_bt) { + if (vm->vm_nfreetags <= freelimit) { + break; + } + if (bt->bt_flags & BT_F_PRIVATE) { + continue; + } + LIST_REMOVE(bt, bt_freelist); + vm->vm_nfreetags--; + LIST_INSERT_HEAD(&tofree, bt, bt_freelist); + } + + VMEM_UNLOCK(vm); + while (!LIST_EMPTY(&tofree)) { + bt = LIST_FIRST(&tofree); + LIST_REMOVE(bt, bt_freelist); + pool_put(&vmem_btag_pool, bt); + } +} + +/* + * Add private boundary tags (statically-allocated by the caller) + * to a vmem arena's free tag list. + */ +void +vmem_add_bts(vmem_t *vm, struct vmem_btag *bts, unsigned int nbts) +{ + VMEM_LOCK(vm); + while (nbts != 0) { + bts->bt_flags = BT_F_PRIVATE; + LIST_INSERT_HEAD(&vm->vm_freetags, bts, bt_freelist); + vm->vm_nfreetags++; + bts++; + nbts--; + } + VMEM_UNLOCK(vm); +} + +/* + * freelist[0] ... [1, 1] + * freelist[1] ... [2, 3] + * freelist[2] ... [4, 7] + * freelist[3] ... [8, 15] + * : + * freelist[n] ... [(1 << n), (1 << (n + 1)) - 1] + * : + */ + +static struct vmem_freelist * +bt_freehead_tofree(vmem_t *vm, vmem_size_t size) +{ + const vmem_size_t qsize = size >> vm->vm_quantum_shift; + const int idx = SIZE2ORDER(qsize); + + KASSERT(size != 0); + KASSERT(qsize != 0); + KASSERT((size & vm->vm_quantum_mask) == 0); + KASSERT(idx >= 0); + KASSERT(idx < VMEM_MAXORDER); + + return &vm->vm_freelist[idx]; +} + +/* + * bt_freehead_toalloc: return the freelist for the given size and allocation + * strategy. + * + * for VM_INSTANTFIT, return the list in which any blocks are large enough + * for the requested size. otherwise, return the list which can have blocks + * large enough for the requested size. + */ + +static struct vmem_freelist * +bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, vm_flag_t strat) +{ + const vmem_size_t qsize = size >> vm->vm_quantum_shift; + int idx = SIZE2ORDER(qsize); + + KASSERT(size != 0); + KASSERT(qsize != 0); + KASSERT((size & vm->vm_quantum_mask) == 0); + + if (strat == VM_INSTANTFIT && ORDER2SIZE(idx) != qsize) { + idx++; + /* check too large request? 
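+		 * (rounding idx up guarantees every block on the chosen list
+		 * is at least the requested size: a 3-quantum request has
+		 * order 1, but is served from freelist[2], which holds blocks
+		 * of 4..7 quanta.)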
*/ + } + KASSERT(idx >= 0); + KASSERT(idx < VMEM_MAXORDER); + + return &vm->vm_freelist[idx]; +} + +/* ---- boundary tag hash */ + +static struct vmem_hashlist * +bt_hashhead(vmem_t *vm, vmem_addr_t addr) +{ + struct vmem_hashlist *list; + unsigned long hash; + + hash = addr >> vm->vm_quantum_shift; + list = &vm->vm_hashlist[hash & vm->vm_hashmask]; + + return list; +} + +static bt_t * +bt_lookupbusy(vmem_t *vm, vmem_addr_t addr) +{ + struct vmem_hashlist *list; + bt_t *bt; + + list = bt_hashhead(vm, addr); + LIST_FOREACH(bt, list, bt_hashlist) { + if (bt->bt_start == addr) { + break; + } + } + + return bt; +} + +static void +bt_rembusy(vmem_t *vm, bt_t *bt) +{ + + KASSERT(vm->vm_nbusytag > 0); + vm->vm_inuse -= bt->bt_size; + vm->vm_nbusytag--; + LIST_REMOVE(bt, bt_hashlist); +} + +static void +bt_insbusy(vmem_t *vm, bt_t *bt) +{ + struct vmem_hashlist *list; + + KASSERT(bt->bt_type == BT_TYPE_BUSY); + + list = bt_hashhead(vm, bt->bt_start); + LIST_INSERT_HEAD(list, bt, bt_hashlist); + if (++vm->vm_nbusytag > vm->vm_maxbusytag) { + vm->vm_maxbusytag = vm->vm_nbusytag; + } + vm->vm_inuse += bt->bt_size; +} + +/* ---- boundary tag list */ + +static void +bt_remseg(vmem_t *vm, bt_t *bt) +{ + + TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist); +} + +static void +bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev) +{ + + TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist); +} + +static void +bt_insseg_tail(vmem_t *vm, bt_t *bt) +{ + + TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist); +} + +static void +bt_remfree(vmem_t *vm, bt_t *bt) +{ + + KASSERT(bt->bt_type == BT_TYPE_FREE); + + LIST_REMOVE(bt, bt_freelist); +} + +static void +bt_insfree(vmem_t *vm, bt_t *bt) +{ + struct vmem_freelist *list; + + list = bt_freehead_tofree(vm, bt->bt_size); + LIST_INSERT_HEAD(list, bt, bt_freelist); +} + +/* ---- vmem internal functions */ + +static void +vmem_bootstrap(void) +{ + pool_init(&vmem_btag_pool, sizeof(bt_t), CACHELINESIZE, IPL_VM, 0, + "vmembt", NULL); + pool_setlowat(&vmem_btag_pool, 200); + pool_prime(&vmem_btag_pool, 200); +#if 0 +#ifdef MULTIPROCESSOR + pool_cache_init(&vmem_btag_pool); +#endif +#endif + + vmem_rehash_start(); +} + +static int +vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, vm_flag_t flags, + int spanbttype) +{ + bt_t *btspan; + bt_t *btfree; + + VMEM_ASSERT_LOCKED(vm); + KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + KASSERT(spanbttype == BT_TYPE_SPAN || + spanbttype == BT_TYPE_SPAN_STATIC); + + btspan = bt_alloc(vm, flags); + if (btspan == NULL) { + return ENOMEM; + } + btfree = bt_alloc(vm, flags); + if (btfree == NULL) { + bt_free(vm, btspan); + return ENOMEM; + } + + btspan->bt_type = spanbttype; + btspan->bt_start = addr; + btspan->bt_size = size; + + btfree->bt_type = BT_TYPE_FREE; + btfree->bt_start = addr; + btfree->bt_size = size; + + bt_insseg_tail(vm, btspan); + bt_insseg(vm, btfree, btspan); + bt_insfree(vm, btfree); + vm->vm_size += size; + + return 0; +} + +static void +vmem_destroy1(vmem_t *vm) +{ + +#if defined(QCACHE) + qc_destroy(vm); +#endif /* defined(QCACHE) */ + VMEM_LOCK(vm); + + for (int i = 0; i < vm->vm_hashsize; i++) { + bt_t *bt; + + while ((bt = LIST_FIRST(&vm->vm_hashlist[i])) != NULL) { + KASSERT(bt->bt_type == BT_TYPE_SPAN_STATIC); + LIST_REMOVE(bt, bt_hashlist); + bt_free(vm, bt); + } + } + + /* bt_freetrim() drops the lock. 
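+	 * (with a freelimit of 0 it returns every non-private tag to
+	 * vmem_btag_pool and leaves the arena unlocked.)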
*/ + bt_freetrim(vm, 0); + if (vm->vm_hashlist != &vm->vm_hash0) { + free(vm->vm_hashlist, M_DEVBUF, + sizeof(*vm->vm_hashlist) * vm->vm_hashsize); + } + + VMEM_LOCK_DESTROY(vm); + free(vm, M_DEVBUF, sizeof(*vm)); +} + +static int +vmem_import(vmem_t *vm, vmem_size_t size, vm_flag_t flags) +{ + vmem_addr_t addr; + int rc; + + VMEM_ASSERT_LOCKED(vm); + + if (vm->vm_importfn == NULL) { + return EINVAL; + } + + if (vm->vm_flags & VM_LARGEIMPORT) { + size *= 16; + } + + VMEM_UNLOCK(vm); + rc = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr); + VMEM_LOCK(vm); + + if (rc) { + return ENOMEM; + } + + if (vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN) != 0) { + VMEM_UNLOCK(vm); + (*vm->vm_releasefn)(vm->vm_arg, addr, size); + VMEM_LOCK(vm); + return ENOMEM; + } + + return 0; +} + +#if defined(_KERNEL) +static int +vmem_rehash(vmem_t *vm, size_t newhashsize, vm_flag_t flags) +{ + bt_t *bt; + int i; + struct vmem_hashlist *newhashlist; + struct vmem_hashlist *oldhashlist; + size_t oldhashsize; + + KASSERT(newhashsize > 0); + + /* Round hash size up to a power of 2. */ + newhashsize = 1 << fls(newhashsize); + + newhashlist = mallocarray(newhashsize, sizeof(*newhashlist), + M_DEVBUF, ISSET(flags, VM_SLEEP) ? M_WAITOK : M_NOWAIT); + if (newhashlist == NULL) { + return ENOMEM; + } + for (i = 0; i < newhashsize; i++) { + LIST_INIT(&newhashlist[i]); + } + + VMEM_LOCK(vm); + /* Decay back to a small hash slowly. */ + if (vm->vm_maxbusytag >= 2) { + vm->vm_maxbusytag = vm->vm_maxbusytag / 2 - 1; + if (vm->vm_nbusytag > vm->vm_maxbusytag) { + vm->vm_maxbusytag = vm->vm_nbusytag; + } + } else { + vm->vm_maxbusytag = vm->vm_nbusytag; + } + oldhashlist = vm->vm_hashlist; + oldhashsize = vm->vm_hashsize; + vm->vm_hashlist = newhashlist; + vm->vm_hashsize = newhashsize; + vm->vm_hashmask = newhashsize - 1; + if (oldhashlist == NULL) { + VMEM_UNLOCK(vm); + return 0; + } + for (i = 0; i < oldhashsize; i++) { + while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) { + bt_rembusy(vm, bt); /* XXX */ + bt_insbusy(vm, bt); + } + } + VMEM_UNLOCK(vm); + + if (oldhashlist != &vm->vm_hash0) { + free(oldhashlist, M_DEVBUF, + sizeof(*oldhashlist) * oldhashsize); + } + + return 0; +} +#endif /* _KERNEL */ + +/* + * vmem_fit: check if a bt can satisfy the given restrictions. + * + * it's a caller's responsibility to ensure the region is big enough + * before calling us. + */ + +static int +vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align, + vmem_size_t phase, vmem_size_t nocross, + vmem_addr_t minaddr, vmem_addr_t maxaddr, vmem_addr_t *addrp) +{ + vmem_addr_t start; + vmem_addr_t end; + + KASSERT(size > 0); + KASSERT(bt->bt_size >= size); /* caller's responsibility */ + + /* + * XXX assumption: vmem_addr_t and vmem_size_t are + * unsigned integer of the same size. 
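+	 *
+	 * e.g. with align 16 and phase 4 the only candidate start
+	 * addresses are 4, 20, 36, ...: VMEM_ALIGNUP(start - phase, align)
+	 * + phase below rounds start up to the next address that is
+	 * congruent to phase modulo align.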
+ */ + + start = bt->bt_start; + if (start < minaddr) { + start = minaddr; + } + end = BT_END(bt); + if (end > maxaddr) { + end = maxaddr; + } + if (start > end) { + return ENOMEM; + } + + start = VMEM_ALIGNUP(start - phase, align) + phase; + if (start < bt->bt_start) { + start += align; + } + if (VMEM_CROSS_P(start, start + size - 1, nocross)) { + KASSERT(align < nocross); + start = VMEM_ALIGNUP(start - phase, nocross) + phase; + } + if (start <= end && end - start >= size - 1) { + KASSERT((start & (align - 1)) == phase); + KASSERT(!VMEM_CROSS_P(start, start + size - 1, nocross)); + KASSERT(minaddr <= start); + KASSERT(maxaddr == 0 || start + size - 1 <= maxaddr); + KASSERT(bt->bt_start <= start); + KASSERT(BT_END(bt) - start >= size - 1); + *addrp = start; + return 0; + } + return ENOMEM; +} + +/* ---- vmem API */ + +/* + * vmem_init: creates a vmem arena. + */ + +vmem_t * +vmem_init(vmem_t *vm, const char *name, + vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, + vmem_import_t *importfn, vmem_release_t *releasefn, + vmem_t *arg, vmem_size_t qcache_max, vm_flag_t flags, int ipl) +{ + int i; + KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + KASSERT(quantum > 0); + KASSERT(powerof2(quantum)); + + /* + * If private tags are going to be used, they must + * be added to the arena before the first span is + * added. + */ + KASSERT((flags & VM_PRIVTAGS) == 0 || size == 0); + +#if defined(_KERNEL) + /* XXX: SMP, we get called early... */ + if (!vmem_bootstrapped) { + vmem_bootstrap(); + vmem_bootstrapped = 1; + } +#endif /* defined(_KERNEL) */ + + if (vm == NULL) { + vm = malloc(sizeof(*vm), M_DEVBUF, M_WAITOK|M_CANFAIL); + } + if (vm == NULL) { + return NULL; + } + + VMEM_LOCK_INIT(vm, ipl); + vm->vm_flags = flags; + vm->vm_nfreetags = 0; + LIST_INIT(&vm->vm_freetags); + strlcpy(vm->vm_name, name, sizeof(vm->vm_name)); + vm->vm_quantum_mask = quantum - 1; + vm->vm_quantum_shift = SIZE2ORDER(quantum); + KASSERT(ORDER2SIZE(vm->vm_quantum_shift) == quantum); + vm->vm_importfn = importfn; + vm->vm_releasefn = releasefn; + vm->vm_arg = arg; + vm->vm_nbusytag = 0; + vm->vm_maxbusytag = 0; + vm->vm_size = 0; + vm->vm_inuse = 0; +#if defined(QCACHE) + qc_init(vm, qcache_max, ipl); +#endif /* defined(QCACHE) */ + + TAILQ_INIT(&vm->vm_seglist); + for (i = 0; i < VMEM_MAXORDER; i++) { + LIST_INIT(&vm->vm_freelist[i]); + } + memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0)); + vm->vm_hashsize = 1; + vm->vm_hashmask = vm->vm_hashsize - 1; + vm->vm_hashlist = &vm->vm_hash0; + + if (size != 0) { + if (vmem_add(vm, base, size, flags) != 0) { + vmem_destroy1(vm); + return NULL; + } + } + +#if defined(_KERNEL) + if (flags & VM_BOOTSTRAP) { + bt_refill(vm); + } + + rw_enter_write(&vmem_list_lock); + LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist); + rw_exit_write(&vmem_list_lock); +#endif /* defined(_KERNEL) */ + + return vm; +} + + + +/* + * vmem_create: create an arena. + * + * => must not be called from interrupt context. 
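+ *
+ * e.g. a page-granular arena with no backing source could be set up
+ * and used as sketched below (illustrative only; base and len stand
+ * for a previously reserved address range):
+ *
+ *	vmem_t *vm = vmem_create("example", base, len, PAGE_SIZE,
+ *	    NULL, NULL, NULL, 0, VM_SLEEP, IPL_NONE);
+ *	vmem_addr_t addr;
+ *	int error = vmem_alloc(vm, 3 * PAGE_SIZE,
+ *	    VM_INSTANTFIT | VM_SLEEP, &addr);
+ *	...
+ *	vmem_free(vm, addr, 3 * PAGE_SIZE);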
+ */ + +vmem_t * +vmem_create(const char *name, vmem_addr_t base, vmem_size_t size, + vmem_size_t quantum, vmem_import_t *importfn, vmem_release_t *releasefn, + vmem_t *source, vmem_size_t qcache_max, vm_flag_t flags, int ipl) +{ + return vmem_init(NULL, name, base, size, quantum, + importfn, releasefn, source, qcache_max, flags, ipl); +} + +void +vmem_destroy(vmem_t *vm) +{ + +#if defined(_KERNEL) + rw_enter_write(&vmem_list_lock); + LIST_REMOVE(vm, vm_alllist); + rw_exit_write(&vmem_list_lock); +#endif /* defined(_KERNEL) */ + + vmem_destroy1(vm); +} + +vmem_size_t +vmem_roundup_size(vmem_t *vm, vmem_size_t size) +{ + + return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask; +} + +/* + * vmem_alloc: allocate resource from the arena. + */ + +int +vmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags, vmem_addr_t *addrp) +{ + int error; + + KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + + KASSERT(size > 0); + KASSERT(!ISSET(flags, VM_BESTFIT) != !ISSET(flags, VM_INSTANTFIT)); +#if 0 + if ((flags & VM_SLEEP) != 0) { + ASSERT_SLEEPABLE(); + } +#endif + +#if defined(QCACHE) + if (size <= vm->vm_qcache_max) { + void *p; + int qidx = (size + vm->vm_quantum_mask) >> vm->vm_quantum_shift; + qcache_t *qc = vm->vm_qcache[qidx - 1]; + + p = pool_cache_get(qc->qc_cache, vmf_to_prf(flags)); + if (addrp != NULL) + *addrp = (vmem_addr_t)p; + error = (p == NULL) ? ENOMEM : 0; + goto out; + } +#endif /* defined(QCACHE) */ + + error = vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, + flags, addrp); +#if defined(QCACHE) + out: +#endif /* defined(QCACHE) */ + KASSERTMSG(error || addrp == NULL || + (*addrp & vm->vm_quantum_mask) == 0, + "vmem %s mask=0x%llx addr=0x%llx", vm->vm_name, + (unsigned long long)vm->vm_quantum_mask, + (unsigned long long)*addrp); + KASSERT(error == 0 || (flags & VM_SLEEP) == 0); + return error; +} + +int +vmem_xalloc_addr(vmem_t *vm, const vmem_addr_t addr, const vmem_size_t size, + vm_flag_t flags) +{ + vmem_addr_t result; + int error; + + KASSERT((addr & vm->vm_quantum_mask) == 0); + KASSERT(size != 0); + + flags = (flags & ~VM_INSTANTFIT) | VM_BESTFIT; + + error = vmem_xalloc(vm, size, 0, 0, 0, addr, addr + size - 1, + flags, &result); + + KASSERT(error || result == addr); + KASSERT(error == 0 || (flags & VM_SLEEP) == 0); + return error; +} + +int +vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align, + const vmem_size_t phase, const vmem_size_t nocross, + const vmem_addr_t minaddr, const vmem_addr_t maxaddr, const vm_flag_t flags, + vmem_addr_t *addrp) +{ + struct vmem_freelist *list; + struct vmem_freelist *first; + struct vmem_freelist *end; + bt_t *bt; + bt_t *btnew; + bt_t *btnew2; + const vmem_size_t size = vmem_roundup_size(vm, size0); + vm_flag_t strat = flags & VM_FITMASK; + vmem_addr_t start; + int rc; + + KASSERT(size0 > 0); + KASSERT(size > 0); + KASSERT(!ISSET(flags, VM_BESTFIT) != !ISSET(flags, VM_INSTANTFIT)); +#if 0 + if ((flags & VM_SLEEP) != 0) { + ASSERT_SLEEPABLE(); + } +#endif + KASSERT((align & vm->vm_quantum_mask) == 0); + KASSERT((align & (align - 1)) == 0); + KASSERT((phase & vm->vm_quantum_mask) == 0); + KASSERT((nocross & vm->vm_quantum_mask) == 0); + KASSERT((nocross & (nocross - 1)) == 0); + KASSERT(align == 0 || phase < align); + KASSERT(phase == 0 || phase < align); + KASSERT(nocross == 0 || nocross >= size); + KASSERT(minaddr <= maxaddr); + KASSERT(!VMEM_CROSS_P(phase, phase + size - 1, nocross)); + + if (align == 0) { + align = vm->vm_quantum_mask + 1; + } + + /* 
+ * allocate boundary tags before acquiring the vmem lock. + */ + VMEM_LOCK(vm); + btnew = bt_alloc(vm, flags); + if (btnew == NULL) { + VMEM_UNLOCK(vm); + return ENOMEM; + } + btnew2 = bt_alloc(vm, flags); /* XXX not necessary if no restrictions */ + if (btnew2 == NULL) { + bt_free(vm, btnew); + VMEM_UNLOCK(vm); + return ENOMEM; + } + + /* + * choose a free block from which we allocate. + */ +retry_strat: + first = bt_freehead_toalloc(vm, size, strat); + end = &vm->vm_freelist[VMEM_MAXORDER]; +retry: + bt = NULL; + vmem_check(vm); + if (strat == VM_INSTANTFIT) { + /* + * just choose the first block which satisfies our restrictions. + * + * note that we don't need to check the size of the blocks + * because any blocks found on these list should be larger than + * the given size. + */ + for (list = first; list < end; list++) { + bt = LIST_FIRST(list); + if (bt != NULL) { + rc = vmem_fit(bt, size, align, phase, + nocross, minaddr, maxaddr, &start); + if (rc == 0) { + goto gotit; + } + /* + * don't bother to follow the bt_freelist link + * here. the list can be very long and we are + * told to run fast. blocks from the later free + * lists are larger and have better chances to + * satisfy our restrictions. + */ + } + } + } else { /* VM_BESTFIT */ + /* + * we assume that, for space efficiency, it's better to + * allocate from a smaller block. thus we will start searching + * from the lower-order list than VM_INSTANTFIT. + * however, don't bother to find the smallest block in a free + * list because the list can be very long. we can revisit it + * if/when it turns out to be a problem. + * + * note that the 'first' list can contain blocks smaller than + * the requested size. thus we need to check bt_size. + */ + for (list = first; list < end; list++) { + LIST_FOREACH(bt, list, bt_freelist) { + if (bt->bt_size >= size) { + rc = vmem_fit(bt, size, align, phase, + nocross, minaddr, maxaddr, &start); + if (rc == 0) { + goto gotit; + } + } + } + } + } +#if 1 + if (strat == VM_INSTANTFIT) { + strat = VM_BESTFIT; + goto retry_strat; + } +#endif + if (align != vm->vm_quantum_mask + 1 || phase != 0 || nocross != 0) { + + /* + * XXX should try to import a region large enough to + * satisfy restrictions? 
+ */ + + goto fail; + } + /* XXX eeek, minaddr & maxaddr not respected */ + if (vmem_import(vm, size, flags) == 0) { + goto retry; + } + /* XXX */ +#if 0 + if ((flags & VM_SLEEP) != 0) { + vmem_kick_pdaemon(); + VMEM_CONDVAR_WAIT(vm); + goto retry; + } +#endif +fail: + bt_free(vm, btnew); + bt_free(vm, btnew2); + VMEM_UNLOCK(vm); + return ENOMEM; + +gotit: + KASSERT(bt->bt_type == BT_TYPE_FREE); + KASSERT(bt->bt_size >= size); + bt_remfree(vm, bt); + vmem_check(vm); + if (bt->bt_start != start) { + btnew2->bt_type = BT_TYPE_FREE; + btnew2->bt_start = bt->bt_start; + btnew2->bt_size = start - bt->bt_start; + bt->bt_start = start; + bt->bt_size -= btnew2->bt_size; + bt_insfree(vm, btnew2); + bt_insseg(vm, btnew2, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); + btnew2 = NULL; + vmem_check(vm); + } + KASSERT(bt->bt_start == start); + if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) { + /* split */ + btnew->bt_type = BT_TYPE_BUSY; + btnew->bt_start = bt->bt_start; + btnew->bt_size = size; + bt->bt_start = bt->bt_start + size; + bt->bt_size -= size; + bt_insfree(vm, bt); + bt_insseg(vm, btnew, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); + bt_insbusy(vm, btnew); + vmem_check(vm); + } else { + bt->bt_type = BT_TYPE_BUSY; + bt_insbusy(vm, bt); + vmem_check(vm); + bt_free(vm, btnew); + btnew = bt; + } + if (btnew2 != NULL) { + bt_free(vm, btnew2); + } + KASSERT(btnew->bt_size >= size); + btnew->bt_type = BT_TYPE_BUSY; + if (addrp != NULL) + *addrp = btnew->bt_start; + VMEM_UNLOCK(vm); + KASSERTMSG(addrp == NULL || + (*addrp & vm->vm_quantum_mask) == 0, + "vmem %s mask=0x%llx addr=0x%llx", vm->vm_name, + (unsigned long long)vm->vm_quantum_mask, + (unsigned long long)*addrp); + return 0; +} + +/* + * vmem_free: free the resource to the arena. + */ + +void +vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) +{ + + KASSERT(size > 0); + KASSERTMSG((addr & vm->vm_quantum_mask) == 0, + "vmem %s mask=0x%llx addr=0x%llx", vm->vm_name, + (unsigned long long)vm->vm_quantum_mask, + (unsigned long long)addr); + +#if defined(QCACHE) + if (size <= vm->vm_qcache_max) { + int qidx = (size + vm->vm_quantum_mask) >> vm->vm_quantum_shift; + qcache_t *qc = vm->vm_qcache[qidx - 1]; + + pool_cache_put(qc->qc_cache, (void *)addr); + return; + } +#endif /* defined(QCACHE) */ + + vmem_xfree(vm, addr, size); +} + +void +vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) +{ + bt_t *bt; + + KASSERT(size > 0); + KASSERTMSG((addr & vm->vm_quantum_mask) == 0, + "vmem %s mask=0x%llx addr=0x%llx", vm->vm_name, + (unsigned long long)vm->vm_quantum_mask, + (unsigned long long)addr); + + VMEM_LOCK(vm); + + bt = bt_lookupbusy(vm, addr); + KASSERTMSG(bt != NULL, "vmem %s addr 0x%llx size 0x%llx", vm->vm_name, + (unsigned long long)addr, (unsigned long long)size); + KASSERT(bt->bt_start == addr); + KASSERT(bt->bt_size == vmem_roundup_size(vm, size) || + bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask); + + /* vmem_xfree_bt() drops the lock. */ + vmem_xfree_bt(vm, bt); +} + +void +vmem_xfreeall(vmem_t *vm) +{ + bt_t *bt; + +#if defined(QCACHE) + /* This can't be used if the arena has a quantum cache. */ + KASSERT(vm->vm_qcache_max == 0); +#endif /* defined(QCACHE) */ + + for (;;) { + VMEM_LOCK(vm); + TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { + if (bt->bt_type == BT_TYPE_BUSY) + break; + } + if (bt != NULL) { + /* vmem_xfree_bt() drops the lock. 
*/ + vmem_xfree_bt(vm, bt); + } else { + VMEM_UNLOCK(vm); + return; + } + } +} + +static void +vmem_xfree_bt(vmem_t *vm, bt_t *bt) +{ + bt_t *t; + + VMEM_ASSERT_LOCKED(vm); + + KASSERT(bt->bt_type == BT_TYPE_BUSY); + bt_rembusy(vm, bt); + bt->bt_type = BT_TYPE_FREE; + + /* coalesce */ + t = TAILQ_NEXT(bt, bt_seglist); + if (t != NULL && t->bt_type == BT_TYPE_FREE) { + KASSERT(BT_END(bt) < t->bt_start); /* YYY */ + bt_remfree(vm, t); + bt_remseg(vm, t); + bt->bt_size += t->bt_size; + bt_free(vm, t); + } + t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); + if (t != NULL && t->bt_type == BT_TYPE_FREE) { + KASSERT(BT_END(t) < bt->bt_start); /* YYY */ + bt_remfree(vm, t); + bt_remseg(vm, t); + bt->bt_size += t->bt_size; + bt->bt_start = t->bt_start; + bt_free(vm, t); + } + + t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); + KASSERT(t != NULL); + KASSERT(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY); + if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN && + t->bt_size == bt->bt_size) { + vmem_addr_t spanaddr; + vmem_size_t spansize; + + KASSERT(t->bt_start == bt->bt_start); + spanaddr = bt->bt_start; + spansize = bt->bt_size; + bt_remseg(vm, bt); + bt_free(vm, bt); + bt_remseg(vm, t); + bt_free(vm, t); + vm->vm_size -= spansize; + //VMEM_CONDVAR_BROADCAST(vm); + /* bt_freetrim() drops the lock. */ + bt_freetrim(vm, BT_MAXFREE); + (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize); + } else { + bt_insfree(vm, bt); + //VMEM_CONDVAR_BROADCAST(vm); + /* bt_freetrim() drops the lock. */ + bt_freetrim(vm, BT_MAXFREE); + } +} + +/* + * vmem_add: + * + * => caller must ensure appropriate spl, + * if the arena can be accessed from interrupt context. + */ + +int +vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, vm_flag_t flags) +{ + int rv; + + VMEM_LOCK(vm); + rv = vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN_STATIC); + VMEM_UNLOCK(vm); + + return rv; +} + +/* + * vmem_size: information about arenas size + * + * => return free/allocated size in arena + */ +vmem_size_t +vmem_size(vmem_t *vm, int typemask) +{ + + switch (typemask) { + case VMEM_ALLOC: + return vm->vm_inuse; + case VMEM_FREE: + return vm->vm_size - vm->vm_inuse; + case VMEM_FREE|VMEM_ALLOC: + return vm->vm_size; + default: + panic("vmem_size"); + } +} + +/* ---- rehash */ + +#if defined(_KERNEL) +static struct timeout vmem_rehash_tick; +static struct task vmem_rehash_task; +static int vmem_rehash_interval; + +static void +vmem_rehash_all(void *arg) +{ + vmem_t *vm; + + rw_enter_read(&vmem_list_lock); + LIST_FOREACH(vm, &vmem_list, vm_alllist) { + size_t desired; + size_t current; + + desired = READ_ONCE(vm->vm_maxbusytag); + current = READ_ONCE(vm->vm_hashsize); + + if (desired > VMEM_HASHSIZE_MAX) { + desired = VMEM_HASHSIZE_MAX; + } else if (desired < VMEM_HASHSIZE_MIN) { + desired = VMEM_HASHSIZE_MIN; + } + if (desired > current * 2 || desired * 2 < current) { + vmem_rehash(vm, desired, VM_NOSLEEP); + } + } + rw_exit_read(&vmem_list_lock); + +} + +static void +vmem_rehash_add(void *arg) +{ + timeout_add_sec(&vmem_rehash_tick, vmem_rehash_interval); + task_add(systqmp, &vmem_rehash_task); +} + +void +vmem_rehash_start(void) +{ + timeout_set(&vmem_rehash_tick, vmem_rehash_add, NULL); + task_set(&vmem_rehash_task, vmem_rehash_all, NULL); + + vmem_rehash_interval = 10; + timeout_add_sec(&vmem_rehash_tick, vmem_rehash_interval); +} +#endif /* defined(_KERNEL) */ + +/* ---- debug */ + +#if defined(DDB) || defined(UNITTEST) || defined(VMEM_SANITY) + +static void bt_dump(const bt_t *, void (*)(const char *, ...) 
+ __attribute__((__format__(__kprintf__,1,2)))); + +static const char * +bt_type_string(int type) +{ + static const char * const table[] = { + [BT_TYPE_BUSY] = "busy", + [BT_TYPE_FREE] = "free", + [BT_TYPE_SPAN] = "span", + [BT_TYPE_SPAN_STATIC] = "static span", + }; + + if (type >= nitems(table)) { + return "BOGUS"; + } + return table[type]; +} + +static void +bt_dump(const bt_t *bt, void (*pr)(const char *, ...)) +{ + + (*pr)("\t%p: %llu, %llu, %d(%s)\n", + bt, (uint64_t)bt->bt_start, (uint64_t)bt->bt_size, + bt->bt_type, bt_type_string(bt->bt_type)); +} + +static void +vmem_dump(const vmem_t *vm , void (*pr)(const char *, ...)) +{ + const bt_t *bt; + int i; + + (*pr)("vmem %p '%s'\n", vm, vm->vm_name); + TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { + bt_dump(bt, pr); + } + + for (i = 0; i < VMEM_MAXORDER; i++) { + const struct vmem_freelist *fl = &vm->vm_freelist[i]; + + if (LIST_EMPTY(fl)) { + continue; + } + + (*pr)("freelist[%d]\n", i); + LIST_FOREACH(bt, fl, bt_freelist) { + bt_dump(bt, pr); + } + } +} + +#endif /* defined(DDB) || defined(UNITTEST) || defined(VMEM_SANITY) */ + +#if defined(DDB) +static bt_t * +vmem_whatis_lookup(vmem_t *vm, uintptr_t addr) +{ + bt_t *bt; + + TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { + if (BT_ISSPAN_P(bt)) { + continue; + } + if (bt->bt_start <= addr && addr <= BT_END(bt)) { + return bt; + } + } + + return NULL; +} + +void +vmem_whatis(uintptr_t addr, void (*pr)(const char *, ...)) +{ + vmem_t *vm; + + LIST_FOREACH(vm, &vmem_list, vm_alllist) { + bt_t *bt; + + bt = vmem_whatis_lookup(vm, addr); + if (bt == NULL) { + continue; + } + (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n", + (void *)addr, (void *)bt->bt_start, + (size_t)(addr - bt->bt_start), vm->vm_name, + (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free"); + } +} + +void +vmem_printall(const char *modif, void (*pr)(const char *, ...)) +{ + const vmem_t *vm; + + LIST_FOREACH(vm, &vmem_list, vm_alllist) { + vmem_dump(vm, pr); + } +} + +void +vmem_print(uintptr_t addr, const char *modif, void (*pr)(const char *, ...)) +{ + const vmem_t *vm = (const void *)addr; + + vmem_dump(vm, pr); +} +#endif /* defined(DDB) */ + +#if defined(_KERNEL) +#define vmem_printf printf +#else +#include +#include + +static void +vmem_printf(const char *fmt, ...) 
+{ + va_list ap; + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} +#endif + +#if defined(VMEM_SANITY) + +static bool +vmem_check_sanity(vmem_t *vm) +{ + const bt_t *bt, *bt2; + + KASSERT(vm != NULL); + + TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { + if (bt->bt_start > BT_END(bt)) { + printf("corrupted tag\n"); + bt_dump(bt, vmem_printf); + return false; + } + } + TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { + TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) { + if (bt == bt2) { + continue; + } + if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) { + continue; + } + if (bt->bt_start <= BT_END(bt2) && + bt2->bt_start <= BT_END(bt)) { + printf("overwrapped tags\n"); + bt_dump(bt, vmem_printf); + bt_dump(bt2, vmem_printf); + return false; + } + } + } + + return true; +} + +static void +vmem_check(vmem_t *vm) +{ + + if (!vmem_check_sanity(vm)) { + panic("insanity vmem %p", vm); + } +} + +#endif /* defined(VMEM_SANITY) */ + +#if defined(UNITTEST) +int +main(void) +{ + int rc; + vmem_t *vm; + vmem_addr_t p; + struct reg { + vmem_addr_t p; + vmem_size_t sz; + bool x; + } *reg = NULL; + int nreg = 0; + int nalloc = 0; + int nfree = 0; + vmem_size_t total = 0; +#if 1 + vm_flag_t strat = VM_INSTANTFIT; +#else + vm_flag_t strat = VM_BESTFIT; +#endif + + vm = vmem_create("test", 0, 0, 1, NULL, NULL, NULL, 0, VM_SLEEP, +#ifdef _KERNEL + IPL_NONE +#else + 0 +#endif + ); + if (vm == NULL) { + printf("vmem_create\n"); + exit(EXIT_FAILURE); + } + vmem_dump(vm, vmem_printf); + + rc = vmem_add(vm, 0, 50, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 100, 200, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 2000, 1, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 40000, 65536, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 10000, 10000, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 500, 1000, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 0xffffff00, 0x100, VM_SLEEP); + assert(rc == 0); + rc = vmem_xalloc(vm, 0x101, 0, 0, 0, + 0xffffff00, 0xffffffff, strat|VM_SLEEP, &p); + assert(rc != 0); + rc = vmem_xalloc(vm, 50, 0, 0, 0, 0, 49, strat|VM_SLEEP, &p); + assert(rc == 0 && p == 0); + vmem_xfree(vm, p, 50); + rc = vmem_xalloc(vm, 25, 0, 0, 0, 0, 24, strat|VM_SLEEP, &p); + assert(rc == 0 && p == 0); + rc = vmem_xalloc(vm, 0x100, 0, 0, 0, + 0xffffff01, 0xffffffff, strat|VM_SLEEP, &p); + assert(rc != 0); + rc = vmem_xalloc(vm, 0x100, 0, 0, 0, + 0xffffff00, 0xfffffffe, strat|VM_SLEEP, &p); + assert(rc != 0); + rc = vmem_xalloc(vm, 0x100, 0, 0, 0, + 0xffffff00, 0xffffffff, strat|VM_SLEEP, &p); + assert(rc == 0); + vmem_dump(vm, vmem_printf); + for (;;) { + struct reg *r; + int t = rand() % 100; + + if (t > 45) { + /* alloc */ + vmem_size_t sz = rand() % 500 + 1; + bool x; + vmem_size_t align, phase, nocross; + vmem_addr_t minaddr, maxaddr; + + if (t > 70) { + x = true; + /* XXX */ + align = 1 << (rand() % 15); + phase = rand() % 65536; + nocross = 1 << (rand() % 15); + if (align <= phase) { + phase = 0; + } + if (VMEM_CROSS_P(phase, phase + sz - 1, + nocross)) { + nocross = 0; + } + do { + minaddr = rand() % 50000; + maxaddr = rand() % 70000; + } while (minaddr > maxaddr); + printf("=== xalloc %" PRIu64 + " align=%" PRIu64 ", phase=%" PRIu64 + ", nocross=%" PRIu64 ", min=%" PRIu64 + ", max=%" PRIu64 "\n", + (uint64_t)sz, + (uint64_t)align, + (uint64_t)phase, + (uint64_t)nocross, + (uint64_t)minaddr, + (uint64_t)maxaddr); + rc = vmem_xalloc(vm, sz, align, phase, nocross, + minaddr, maxaddr, strat|VM_SLEEP, &p); + } else { + x = false; + printf("=== alloc %" PRIu64 "\n", (uint64_t)sz); + rc = 
vmem_alloc(vm, sz, strat|VM_SLEEP, &p);
+			}
+			printf("-> %" PRIu64 "\n", (uint64_t)p);
+			vmem_dump(vm, vmem_printf);
+			if (rc != 0) {
+				if (x) {
+					continue;
+				}
+				break;
+			}
+			nreg++;
+			reg = realloc(reg, sizeof(*reg) * nreg);
+			r = &reg[nreg - 1];
+			r->p = p;
+			r->sz = sz;
+			r->x = x;
+			total += sz;
+			nalloc++;
+		} else if (nreg != 0) {
+			/* free */
+			r = &reg[rand() % nreg];
+			printf("=== free %" PRIu64 ", %" PRIu64 "\n",
+			    (uint64_t)r->p, (uint64_t)r->sz);
+			if (r->x) {
+				vmem_xfree(vm, r->p, r->sz);
+			} else {
+				vmem_free(vm, r->p, r->sz);
+			}
+			total -= r->sz;
+			vmem_dump(vm, vmem_printf);
+			*r = reg[nreg - 1];
+			nreg--;
+			nfree++;
+		}
+		printf("total=%" PRIu64 "\n", (uint64_t)total);
+	}
+	fprintf(stderr, "total=%" PRIu64 ", nalloc=%d, nfree=%d\n",
+	    (uint64_t)total, nalloc, nfree);
+	exit(EXIT_SUCCESS);
+}
+#endif /* defined(UNITTEST) */
Index: sys/sys/conf.h
===================================================================
RCS file: /cvs/src/sys/sys/conf.h,v
diff -u -p -r1.163 conf.h
--- sys/sys/conf.h	11 Jun 2024 01:49:17 -0000	1.163
+++ sys/sys/conf.h	11 Sep 2024 11:29:29 -0000
@@ -326,6 +326,21 @@ extern struct cdevsw cdevsw[];
 	(dev_type_stop((*))) enodev, 0, \
 	(dev_type_mmap((*))) enodev }
 
+/* open, close, read, ioctl, poll, kqfilter */
+#define cdev_lltrace_init(c,n) { \
+	.d_open = dev_init(c,n,open), \
+	.d_close = dev_init(c,n,close), \
+	.d_read = dev_init(c,n,read), \
+	.d_write = (dev_type_write((*))) enodev, \
+	.d_ioctl = dev_init(c,n,ioctl), \
+	.d_stop = (dev_type_stop((*))) enodev, \
+	.d_tty = NULL, \
+	.d_mmap = (dev_type_mmap((*))) enodev, \
+	.d_type = 0, \
+	.d_flags = 0, \
+	.d_kqfilter = dev_init(c,n,kqfilter), \
+}
+
 /* open, close, read, write, ioctl, stop, tty, mmap, kqfilter */
 #define cdev_wsdisplay_init(c,n) { \
 	dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \
@@ -615,6 +630,7 @@ cdev_decl(wsmux);
 cdev_decl(ksyms);
 cdev_decl(kstat);
+cdev_decl(lltrace);
 cdev_decl(bio);
 cdev_decl(vscsi);
Index: sys/sys/lltrace.h
===================================================================
RCS file: sys/sys/lltrace.h
diff -N sys/sys/lltrace.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/sys/lltrace.h	11 Sep 2024 11:29:29 -0000
@@ -0,0 +1,297 @@
+/*	$OpenBSD$ */
+
+/*
+ * Copyright (c) 2022 David Gwynne
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _SYS_LLTRACE_H_
+#define _SYS_LLTRACE_H_
+
+/*
+ * lltrace is heavily based on KUTrace (kernel/userland tracing) by
+ * Richard L. Sites.
+ */
+
+#define LLTRACE_NSLOTS	8192
+
+struct lltrace_buffer {
+	uint64_t	llt_slots[LLTRACE_NSLOTS];
+};
+
+#define LLTIOCSTART	_IO('t',128)
+#define LLTIOCSTOP	_IO('t',129)
+#define LLTIOCFLUSH	_IO('t',130)
+
+/*
+ * trace until all the buffers are used, or trace and reuse buffers.
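+ *
+ * LLTRACE_MODE_HEAD stops recording once every buffer has been filled
+ * (keeping the head of the trace); LLTRACE_MODE_TAIL keeps reusing the
+ * buffers (keeping the tail, ie, the most recent events).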
+ */ +#define LLTRACE_MODE_HEAD 0 +#define LLTRACE_MODE_TAIL 1 +#define LLTRACE_MODE_COUNT 2 + +#define LLTIOCSMODE _IOW('t', 131, unsigned int) +#define LLTIOCGMODE _IOR('t', 131, unsigned int) + +/* + * how much memory in MB to allocate for lltrace_buffer structs + * during tracing. + */ + +#define LLTRACE_BLEN_MIN 1 +#define LLTRACE_BLEN_MAX 128 + +#define LLTIOCSBLEN _IOW('t', 132, unsigned int) +#define LLTIOCGBLEN _IOR('t', 132, unsigned int) + +/* + * lltrace collects kernel events in per-CPU buffers. + */ + +/* + * The first 8 words of the per-CPU buffer are dedicated to metadata + * about the CPU and the period of time over which events were + * collected. + */ + +struct lltrace_header { + /* slots[0] */ + uint32_t h_cpu; + uint32_t h_idletid; + + /* slots[1] */ + uint64_t h_boottime; + + /* slots[2] */ + uint64_t h_start_cy; + /* slots[3] */ + uint64_t h_start_ns; + /* slots[4] */ + uint64_t h_end_cy; + /* slots[5] */ + uint64_t h_end_ns; + + /* slots[6] */ + uint32_t h_pid; + uint32_t h_tid; + /* slots[7] */ + uint64_t h_zero; +}; + +#define LLTRACE_MASK(_w) ((1ULL << (_w)) - 1) + +#define LLTRACE_TYPE_SHIFT 0 +#define LLTRACE_TYPE_WIDTH 3 +#define LLTRACE_TYPE_MASK LLTRACE_MASK(LLTRACE_TYPE_WIDTH) + +#define LLTRACE_TYPE_ID 0x0ULL +#define LLTRACE_TYPE_EVENT 0x1ULL +#define LLTRACE_TYPE_LOCKING 0x2ULL + +#define LLTRACE_LEN_SHIFT (LLTRACE_TYPE_SHIFT + LLTRACE_TYPE_WIDTH) +#define LLTRACE_LEN_WIDTH 3 +#define LLTRACE_LEN_MASK LLTRACE_MASK(LLTRACE_LEN_WIDTH) + +/* most records have a timestamp */ +#define LLTRACE_TS_TYPES ( \ + (1 << LLTRACE_TYPE_EVENT) | \ + (1 << LLTRACE_TYPE_LOCKING) \ + ) + +#define LLTRACE_TS_SHIFT (LLTRACE_LEN_SHIFT + LLTRACE_LEN_WIDTH) +#define LLTRACE_TS_WIDTH 20 +#define LLTRACE_TS_MASK LLTRACE_MASK(20) + +/* + * id records + */ + +/* tid record contains pid and kthread flag, followed by proc name */ +#define LLTRACE_ID_TYPE_SHIFT (LLTRACE_LEN_SHIFT + LLTRACE_LEN_WIDTH) +#define LLTRACE_ID_TYPE_WIDTH 6 +#define LLTRACE_ID_TYPE_MASK LLTRACE_MASK(3) +#define LLTRACE_ID_TYPE_TID 0x0 + +#define LLTRACE_ID_TID_SHIFT (LLTRACE_ID_TYPE_SHIFT + LLTRACE_ID_TYPE_WIDTH) +#define LLTRACE_ID_TID_WIDTH 20 /* >= than 19 bit TID_MASK */ +#define LLTRACE_ID_TID_MASK LLTRACE_MASK(LLTRACE_ID_TID_WIDTH) + +#define LLTRACE_ID_TID_PID_SHIFT 32 +#define LLTRACE_ID_TID_PID_WIDTH 20 /* >= whatever kernel pid range is */ +#define LLTRACE_ID_TID_PID_MASK LLTRACE_MASK(LLTRACE_ID_TID_PID_WIDTH) +#define LLTRACE_ID_TID_SYSTEM (1ULL << 63) /* kernel thread */ + +/* + * event records + */ + +#define LLTRACE_EVENT_PHASE_SHIFT (LLTRACE_TS_SHIFT + LLTRACE_TS_WIDTH) +#define LLTRACE_EVENT_PHASE_WIDTH 2 +#define LLTRACE_EVENT_PHASE_MASK LLTRACE_MASK(LLTRACE_EVENT_PHASE_WIDTH) +#define LLTRACE_EVENT_PHASE_INSTANT 0x0 +#define LLTRACE_EVENT_PHASE_START 0x1 +#define LLTRACE_EVENT_PHASE_STEP 0x2 +#define LLTRACE_EVENT_PHASE_END 0x3 + +#define LLTRACE_EVENT_CLASS_WIDTH 4 +#define LLTRACE_EVENT_CLASS_SHIFT \ + (LLTRACE_EVENT_PHASE_SHIFT + LLTRACE_EVENT_PHASE_WIDTH) +#define LLTRACE_EVENT_CLASS_MASK LLTRACE_MASK(LLTRACE_EVENT_CLASS_WIDTH) +#define LLTRACE_EVENT_CLASS_SYSCALL 0 +#define LLTRACE_EVENT_CLASS_IDLE 1 +#define LLTRACE_EVENT_CLASS_PAGEFAULT 2 +#define LLTRACE_EVENT_CLASS_INTR 3 +#define LLTRACE_EVENT_CLASS_SCHED 4 +#define LLTRACE_EVENT_CLASS_FUNC 5 +#define LLTRACE_EVENT_CLASS_WAKE 6 +#define LLTRACE_EVENT_CLASS_COUNT 7 + +#define LLTRACE_EVENT_DATA_SHIFT \ + (LLTRACE_EVENT_CLASS_SHIFT + LLTRACE_EVENT_CLASS_WIDTH) +#define LLTRACE_EVENT_DATA_SHIFT_CHECK 32 + +#define 
LLTRACE_SYSCALL_SHIFT LLTRACE_EVENT_DATA_SHIFT +#define LLTRACE_SYSCALL_WIDTH 10 +#define LLTRACE_SYSCALL_MASK LLTRACE_MASK(LLTRACE_SYSCALL_WIDTH) + +#define LLTRACE_SCHED_TID_SHIFT LLTRACE_EVENT_DATA_SHIFT +#define LLTRACE_SCHED_TID_WIDTH LLTRACE_ID_TID_WIDTH +#define LLTRACE_SCHED_TID_MASK LLTRACE_MASK(LLTRACE_SCHED_TID_WIDTH) +#define LLTRACE_SCHED_STATE_SHIFT \ + (LLTRACE_EVENT_DATA_SHIFT + LLTRACE_ID_TID_WIDTH) +#define LLTRACE_SCHED_STATE_WIDTH 4 +#define LLTRACE_SCHED_STATE_MASK LLTRACE_MASK(LLTRACE_SCHED_STATE_WIDTH) +#define LLTRACE_SCHED_STATE_NEW 0 +#define LLTRACE_SCHED_STATE_RUNNING 1 +#define LLTRACE_SCHED_STATE_SUSPENDED 2 +#define LLTRACE_SCHED_STATE_BLOCKED 3 +#define LLTRACE_SCHED_STATE_DYING 4 +#define LLTRACE_SCHED_STATE_DEAD 5 + +#define LLTRACE_SYSCALL_V_SHIFT \ + (LLTRACE_SYSCALL_SHIFT + LLTRACE_SYSCALL_WIDTH) + +#define LLTRACE_INTR_T_SHIFT LLTRACE_EVENT_DATA_SHIFT +#define LLTRACE_INTR_T_WIDTH 2 +#define LLTRACE_INTR_T_MASK LLTRACE_MASK(LLTRACE_INTR_T_WIDTH) +#define LLTRACE_INTR_T_HW 0ULL +#define LLTRACE_INTR_T_SW 1ULL +#define LLTRACE_INTR_T_IPI 2ULL +#define LLTRACE_INTR_T_CLOCK 3ULL + +#define LLTRACE_INTR_DATA_SHIFT \ + (LLTRACE_INTR_T_SHIFT + LLTRACE_INTR_T_WIDTH) + +/* record a count of something */ +#define LLTRACE_COUNT_T_SHIFT LLTRACE_EVENT_DATA_SHIFT +#define LLTRACE_COUNT_T_WIDTH 8 +#define LLTRACE_COUNT_T_MASK LLTRACE_MASK(LLTRACE_COUNT_T_WIDTH) + +#define LLTRACE_COUNT_T_PKTS_IFIQ 0 +#define LLTRACE_COUNT_T_PKTS_NETTQ 1 +#define LLTRACE_COUNT_T_PKTS_IFQ 2 +#define LLTRACE_COUNT_T_PKTS_QDROP 3 +#define LLTRACE_COUNT_T_PKTS_HDROP 4 + +#define LLTRACE_COUNT_V_SHIFT \ + (LLTRACE_COUNT_T_SHIFT + LLTRACE_COUNT_T_WIDTH) + +/* + * locking records + */ + +#define LLTRACE_LK_TYPE_SHIFT (LLTRACE_TS_SHIFT + LLTRACE_TS_WIDTH) +#define LLTRACE_LK_TYPE_WIDTH 2 +#define LLTRACE_LK_TYPE_MASK LLTRACE_MASK(LLTRACE_LK_TYPE_WIDTH) +#define LLTRACE_LK_RW 0x0 +#define LLTRACE_LK_MTX 0x1 +#define LLTRACE_LK_K 0x2 + +#define LLTRACE_LK_PHASE_SHIFT \ + (LLTRACE_LK_TYPE_SHIFT + LLTRACE_LK_TYPE_WIDTH) +#define LLTRACE_LK_PHASE_WIDTH 4 +#define LLTRACE_LK_PHASE_MASK LLTRACE_MASK(LLTRACE_LK_PHASE_WIDTH) +#define LLTRACE_LK_I_EXCL 0x0 /* instantly got wr lock */ +#define LLTRACE_LK_I_SHARED 0x1 /* instantly got rd lock */ +#define LLTRACE_LK_A_START 0x2 /* acquiring lock */ +#define LLTRACE_LK_A_EXCL 0x3 /* acquired wr lock */ +#define LLTRACE_LK_A_SHARED 0x4 /* acquired rd lock */ +#define LLTRACE_LK_A_ABORT 0x5 /* acquire aborted */ +#define LLTRACE_LK_DOWNGRADE 0x6 /* wr to rd lock */ +#define LLTRACE_LK_R_EXCL 0x7 /* released wr lock */ +#define LLTRACE_LK_R_SHARED 0x8 /* released rd lock */ +#define LLTRACE_LK_I_FAIL 0x9 /* try failed */ + +#define LLTRACE_LK_ADDR_SHIFT \ + (LLTRACE_LK_PHASE_SHIFT + LLTRACE_LK_PHASE_WIDTH) + +#ifdef _KERNEL + +struct lltrace_cpu; + +static inline struct lltrace_cpu * +lltrace_enter_spc(struct schedstate_percpu *spc) +{ + return (READ_ONCE(spc->spc_lltrace)); +} + +static inline struct lltrace_cpu * +lltrace_enter_cpu(struct cpu_info *ci) +{ + return lltrace_enter_spc(&ci->ci_schedstate); +} + +static inline struct lltrace_cpu * +lltrace_enter(void) +{ + return lltrace_enter_cpu(curcpu()); +} + +void lltrace_idle(struct lltrace_cpu *, unsigned int); +void lltrace_statclock(struct lltrace_cpu *, int, unsigned long); + +void lltrace_syscall(struct lltrace_cpu *, register_t, + size_t, const register_t *); +void lltrace_sysret(struct lltrace_cpu *, register_t, + int, const register_t [2]); +struct lltrace_cpu * + lltrace_pidname(struct lltrace_cpu 
*, struct proc *); +void lltrace_switch(struct lltrace_cpu *, struct proc *, struct proc *); +void lltrace_sched_enter(struct lltrace_cpu *); +void lltrace_sched_leave(struct lltrace_cpu *); +void lltrace_runnable(struct lltrace_cpu *, struct proc *); + +void lltrace_event_start(struct lltrace_cpu *, unsigned int); +void lltrace_event_end(struct lltrace_cpu *, unsigned int); +void lltrace_count(struct lltrace_cpu *, unsigned int, unsigned int); + +void lltrace_lock(struct lltrace_cpu *, void *, unsigned int, unsigned int); + +void lltrace_pkts(struct lltrace_cpu *, unsigned int, unsigned int); +void lltrace_mark(struct lltrace_cpu *); + +void lltrace_fn_enter(struct lltrace_cpu *, void *); +void lltrace_fn_leave(struct lltrace_cpu *, void *); + +/* MD bits */ + +void lltrace_ipi(struct lltrace_cpu *, unsigned int); +#define lltrace_ipi_bcast(_llt) lltrace_ipi((_llt), ~0U); + +void lltrace_intr_enter(struct lltrace_cpu *, unsigned int, unsigned int); +void lltrace_intr_leave(struct lltrace_cpu *, unsigned int, unsigned int); + +#endif /* _KERNEL */ + +#endif /* _SYS_LLTRACE_H_ */ Index: sys/sys/proc.h =================================================================== RCS file: /cvs/src/sys/sys/proc.h,v diff -u -p -r1.371 proc.h --- sys/sys/proc.h 1 Sep 2024 03:09:00 -0000 1.371 +++ sys/sys/proc.h 11 Sep 2024 11:29:29 -0000 @@ -363,6 +363,7 @@ struct proc { /* scheduling */ int p_cpticks; /* Ticks of cpu time. */ + uint64_t p_wakeid; /* [S] */ const volatile void *p_wchan; /* [S] Sleep address. */ struct timeout p_sleep_to;/* timeout for tsleep() */ const char *p_wmesg; /* [S] Reason for sleep. */ Index: sys/sys/sched.h =================================================================== RCS file: /cvs/src/sys/sys/sched.h,v diff -u -p -r1.73 sched.h --- sys/sys/sched.h 8 Jul 2024 14:46:47 -0000 1.73 +++ sys/sys/sched.h 11 Sep 2024 11:29:29 -0000 @@ -101,11 +101,13 @@ struct cpustats { #define SCHED_NQS 32 /* 32 run queues. */ struct smr_entry; +struct lltrace_cpu; /* * Per-CPU scheduler state. */ struct schedstate_percpu { + struct lltrace_cpu *spc_lltrace; struct proc *spc_idleproc; /* idle proc for this cpu */ TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS]; LIST_HEAD(,proc) spc_deadproc; Index: sys/sys/syscall_mi.h =================================================================== RCS file: /cvs/src/sys/sys/syscall_mi.h,v diff -u -p -r1.35 syscall_mi.h --- sys/sys/syscall_mi.h 1 Sep 2024 03:09:01 -0000 1.35 +++ sys/sys/syscall_mi.h 11 Sep 2024 11:29:29 -0000 @@ -157,6 +157,7 @@ mi_syscall(struct proc *p, register_t co KERNEL_UNLOCK(); } #endif + LLTRACE_CPU(p->p_cpu, lltrace_syscall, code, callp->sy_argsize, argp); /* SP must be within MAP_STACK space */ if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p), @@ -190,6 +191,7 @@ static inline void mi_syscall_return(struct proc *p, register_t code, int error, const register_t retval[2]) { + LLTRACE_CPU(p->p_cpu, lltrace_sysret, code, error, retval); #ifdef SYSCALL_DEBUG KERNEL_LOCK(); scdebug_ret(p, code, error, retval); @@ -217,12 +219,13 @@ mi_syscall_return(struct proc *p, regist static inline void mi_child_return(struct proc *p) { -#if defined(SYSCALL_DEBUG) || defined(KTRACE) || NDT > 0 +#if defined(SYSCALL_DEBUG) || defined(KTRACE) || NDT > 0 || NLLT > 0 int code = (p->p_flag & P_THREAD) ? SYS___tfork : (p->p_p->ps_flags & PS_PPWAIT) ? 
SYS_vfork : SYS_fork;
 	const register_t child_retval[2] = { 0, 1 };
 #endif
 
+	LLTRACE_CPU(p->p_cpu, lltrace_sysret, code, 0, child_retval);
 	TRACEPOINT(sched, on__cpu, NULL);
 
 #ifdef SYSCALL_DEBUG
Index: sys/sys/tracepoint.h
===================================================================
RCS file: /cvs/src/sys/sys/tracepoint.h,v
diff -u -p -r1.2 tracepoint.h
--- sys/sys/tracepoint.h	28 Jun 2022 09:32:28 -0000	1.2
+++ sys/sys/tracepoint.h	11 Sep 2024 11:29:29 -0000
@@ -34,5 +34,35 @@
 #define TRACEINDEX(func, index, args...)
 
 #endif /* NDT > 0 */
+
+#include "llt.h"
+#if NLLT > 0
+#include
+
+#define LLTRACE_SPC(_spc, _fn, ...) do { \
+	struct lltrace_cpu *_llt = lltrace_enter_spc((_spc)); \
+	if (_llt != NULL) \
+		(_fn)(_llt, ## __VA_ARGS__); \
+} while (0)
+
+#define LLTRACE_CPU(_ci, _fn, ...) do { \
+	struct lltrace_cpu *_llt = lltrace_enter_cpu((_ci)); \
+	if (_llt != NULL) \
+		(_fn)(_llt, ##__VA_ARGS__); \
+} while (0)
+
+#define LLTRACE(_fn, ...) do { \
+	struct lltrace_cpu *_llt = lltrace_enter(); \
+	if (_llt != NULL) \
+		(_fn)(_llt, ## __VA_ARGS__); \
+} while (0)
+
+#else /* NLLT > 0 */
+
+#define LLTRACE_SPC(_spc, _fn, ...)
+#define LLTRACE_CPU(_ci, _fn, ...)
+#define LLTRACE(_fn, ...)
+
+#endif /* NLLT > 0 */
 #endif /* _KERNEL */
 #endif /* _SYS_TRACEPOINT_H_ */
Index: sys/sys/vmem.h
===================================================================
RCS file: sys/sys/vmem.h
diff -N sys/sys/vmem.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/sys/vmem.h	11 Sep 2024 11:29:29 -0000
@@ -0,0 +1,100 @@
+/*	$NetBSD: vmem.h,v 1.25 2023/12/03 19:34:08 thorpej Exp $	*/
+
+/*-
+ * Copyright (c)2006 YAMAMOTO Takashi,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#ifndef _SYS_VMEM_H_ +#define _SYS_VMEM_H_ + +#include + +#if defined(_KERNEL) +#else /* defined(_KERNEL) */ +#include +#endif /* defined(_KERNEL) */ + +typedef struct vmem vmem_t; + +typedef unsigned int vm_flag_t; + +typedef uintptr_t vmem_addr_t; +typedef size_t vmem_size_t; +#define VMEM_ADDR_MIN 0 +#define VMEM_ADDR_MAX (~(vmem_addr_t)0) + +typedef int (vmem_import_t)(vmem_t *, vmem_size_t, vm_flag_t, + vmem_addr_t *); +typedef void (vmem_release_t)(vmem_t *, vmem_addr_t, vmem_size_t); + +typedef int (vmem_ximport_t)(vmem_t *, vmem_size_t, vmem_size_t *, + vm_flag_t, vmem_addr_t *); + +extern vmem_t *kmem_arena; +extern vmem_t *kmem_meta_arena; +extern vmem_t *kmem_va_arena; + +vmem_t * vmem_create(const char *, vmem_addr_t, vmem_size_t, vmem_size_t, + vmem_import_t *, vmem_release_t *, vmem_t *, vmem_size_t, + vm_flag_t, int); +vmem_t * vmem_xcreate(const char *, vmem_addr_t, vmem_size_t, + vmem_size_t, vmem_ximport_t *, vmem_release_t *, vmem_t *, + vmem_size_t, vm_flag_t, int); +void vmem_destroy(vmem_t *); +int vmem_alloc(vmem_t *, vmem_size_t, vm_flag_t, vmem_addr_t *); +void vmem_free(vmem_t *, vmem_addr_t, vmem_size_t); +int vmem_xalloc(vmem_t *, vmem_size_t, vmem_size_t, vmem_size_t, + vmem_size_t, vmem_addr_t, vmem_addr_t, vm_flag_t, + vmem_addr_t *); +int vmem_xalloc_addr(vmem_t *, vmem_addr_t, vmem_size_t, vm_flag_t); +void vmem_xfree(vmem_t *, vmem_addr_t, vmem_size_t); +void vmem_xfreeall(vmem_t *); +int vmem_add(vmem_t *, vmem_addr_t, vmem_size_t, vm_flag_t); +vmem_size_t vmem_roundup_size(vmem_t *, vmem_size_t); +vmem_size_t vmem_size(vmem_t *, int typemask); +void vmem_rehash_start(void); +void vmem_whatis(uintptr_t, void (*)(const char *, ...) + __attribute__((__format__(__kprintf__,1,2)))); +void vmem_print(uintptr_t, const char *, void (*)(const char *, ...) + __attribute__((__format__(__kprintf__,1,2)))); +void vmem_printall(const char *, void (*)(const char *, ...) 
+ __attribute__((__format__(__kprintf__,1,2)))); + +/* vm_flag_t */ +#define VM_SLEEP 0x00000001 +#define VM_NOSLEEP 0x00000002 +#define VM_INSTANTFIT 0x00001000 +#define VM_BESTFIT 0x00002000 +#define VM_BOOTSTRAP 0x00010000 +#define VM_POPULATING 0x00040000 +#define VM_LARGEIMPORT 0x00080000 +#define VM_XIMPORT 0x00100000 +#define VM_PRIVTAGS 0x00200000 + +/* vmem_size typemask */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 + +#endif /* !_SYS_VMEM_H_ */ Index: sys/uvm/uvm_fault.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_fault.c,v diff -u -p -r1.135 uvm_fault.c --- sys/uvm/uvm_fault.c 5 Sep 2023 05:08:26 -0000 1.135 +++ sys/uvm/uvm_fault.c 11 Sep 2024 11:29:29 -0000 @@ -576,6 +576,8 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad struct vm_page *pages[UVM_MAXRANGE]; int error; + LLTRACE(lltrace_event_start, LLTRACE_EVENT_CLASS_PAGEFAULT); + counters_inc(uvmexp_counters, faults); TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL); @@ -639,6 +641,8 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad } } } + + LLTRACE(lltrace_event_end, LLTRACE_EVENT_CLASS_PAGEFAULT); return error; } Index: usr.bin/lltextract/Makefile =================================================================== RCS file: usr.bin/lltextract/Makefile diff -N usr.bin/lltextract/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/lltextract/Makefile 11 Sep 2024 11:29:30 -0000 @@ -0,0 +1,12 @@ +PROG= lltextract +SRCS= lltextract.c syscallnames.c names.c +SRCS+= heap.c +MAN= + +SYS_DIR= ${.CURDIR}/../../sys +CFLAGS+= -I${SYS_DIR} + +DEBUG= -g +WARNINGS= Yes + +.include Index: usr.bin/lltextract/fxt.h =================================================================== RCS file: usr.bin/lltextract/fxt.h diff -N usr.bin/lltextract/fxt.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/lltextract/fxt.h 11 Sep 2024 11:29:30 -0000 @@ -0,0 +1,30 @@ +#define FXT_T_METADATA 0 /* Metadata */ +#define FXT_T_INIT 1 /* Initialization */ +#define FXT_T_STRING 2 /* String */ +#define FXT_T_THREAD 3 /* Thread */ +#define FXT_T_EVENT 4 /* Event */ +#define FXT_T_BLOB 5 /* Blob */ +#define FXT_T_UOBJ 6 /* Userspace object */ +#define FXT_T_KOBJ 7 /* Kernel object */ +#define FXT_T_SCHED 8 /* Scheduling */ +#define FXT_T_LBLOB 15 /* Large BLOB */ + +#define FXT_H_TYPE_SHIFT 0 +#define FXT_H_TYPE_BITS 4 +#define FXT_H_SIZE_SHIFT 4 +#define FXT_H_SIZE_BITS 12 + +#define FXT_MAX_WORDS (1ULL << 12) + +#define FXT_RECORD(_type, _size) \ + htole64(((_type) << FXT_H_TYPE_SHIFT) | ((_size) << FXT_H_SIZE_SHIFT)) + +#define FXT_H_METADATA_TYPE_SHIFT 16 +#define FXT_H_METADATA_TYPE_BITS 4 + +#define FXT_MD_RECORD(_size, _mdtype) (FXT_RECORD(FXT_T_METADATA, (_size)) | \ + ((_mdtype) << FXT_H_METADATA_TYPE_SHIFT)) + +#define FXT_INIT_MAGIC 0x0016547846040010 +#define FXT_INIT_RECORD(_f) FXT_RECORD(FXT_T_INIT, 2), htole64(_f) + Index: usr.bin/lltextract/heap.c =================================================================== RCS file: usr.bin/lltextract/heap.c diff -N usr.bin/lltextract/heap.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/lltextract/heap.c 11 Sep 2024 11:29:30 -0000 @@ -0,0 +1,204 @@ +/* */ + +/* + * Copyright (c) 2017 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "heap.h" +#include + +static inline struct _heap_entry * +heap_n2e(const struct _heap_type *t, void *node) +{ + unsigned long addr = (unsigned long)node; + + return ((struct _heap_entry *)(addr + t->t_offset)); +} + +static inline void * +heap_e2n(const struct _heap_type *t, struct _heap_entry *he) +{ + unsigned long addr = (unsigned long)he; + + return ((void *)(addr - t->t_offset)); +} + +static struct _heap_entry * +_heap_entry_merge(const struct _heap_type *t, + struct _heap_entry *he1, struct _heap_entry *he2) +{ + struct _heap_entry *hi, *lo; + struct _heap_entry *child; + + if (he1 == NULL) + return (he2); + if (he2 == NULL) + return (he1); + + if (t->t_compare(heap_e2n(t, he1), heap_e2n(t, he2)) >= 0) { + hi = he1; + lo = he2; + } else { + lo = he1; + hi = he2; + } + + child = lo->he_child; + + hi->he_left = lo; + hi->he_nextsibling = child; + if (child != NULL) + child->he_left = hi; + lo->he_child = hi; + lo->he_left = NULL; + lo->he_nextsibling = NULL; + + return (lo); +} + +static inline void +_heap_sibling_remove(struct _heap_entry *he) +{ + if (he->he_left == NULL) + return; + + if (he->he_left->he_child == he) { + if ((he->he_left->he_child = he->he_nextsibling) != NULL) + he->he_nextsibling->he_left = he->he_left; + } else { + if ((he->he_left->he_nextsibling = he->he_nextsibling) != NULL) + he->he_nextsibling->he_left = he->he_left; + } + + he->he_left = NULL; + he->he_nextsibling = NULL; +} + +static inline struct _heap_entry * +_heap_2pass_merge(const struct _heap_type *t, struct _heap_entry *root) +{ + struct _heap_entry *node, *next = NULL; + struct _heap_entry *tmp, *list = NULL; + + node = root->he_child; + if (node == NULL) + return (NULL); + + root->he_child = NULL; + + /* first pass */ + for (next = node->he_nextsibling; next != NULL; + next = (node != NULL ? 
node->he_nextsibling : NULL)) { + tmp = next->he_nextsibling; + node = _heap_entry_merge(t, node, next); + + /* insert head */ + node->he_nextsibling = list; + list = node; + node = tmp; + } + + /* odd child case */ + if (node != NULL) { + node->he_nextsibling = list; + list = node; + } + + /* second pass */ + while (list->he_nextsibling != NULL) { + tmp = list->he_nextsibling->he_nextsibling; + list = _heap_entry_merge(t, list, list->he_nextsibling); + list->he_nextsibling = tmp; + } + + list->he_left = NULL; + list->he_nextsibling = NULL; + + return (list); +} + +void +_heap_insert(const struct _heap_type *t, struct _heap *h, void *node) +{ + struct _heap_entry *he = heap_n2e(t, node); + + he->he_left = NULL; + he->he_child = NULL; + he->he_nextsibling = NULL; + + h->h_root = _heap_entry_merge(t, h->h_root, he); +} + +void +_heap_remove(const struct _heap_type *t, struct _heap *h, void *node) +{ + struct _heap_entry *he = heap_n2e(t, node); + + if (he->he_left == NULL) { + _heap_extract(t, h); + return; + } + + _heap_sibling_remove(he); + h->h_root = _heap_entry_merge(t, h->h_root, _heap_2pass_merge(t, he)); +} + +void +_heap_merge(const struct _heap_type *t, struct _heap *h1, struct _heap *h2) +{ + h1->h_root = _heap_entry_merge(t, h1->h_root, h2->h_root); +} + +void * +_heap_first(const struct _heap_type *t, struct _heap *h) +{ + struct _heap_entry *first = h->h_root; + + if (first == NULL) + return (NULL); + + return (heap_e2n(t, first)); +} + +void * +_heap_extract(const struct _heap_type *t, struct _heap *h) +{ + struct _heap_entry *first = h->h_root; + + if (first == NULL) + return (NULL); + + h->h_root = _heap_2pass_merge(t, first); + + return (heap_e2n(t, first)); +} + +void * +_heap_cextract(const struct _heap_type *t, struct _heap *h, const void *key) +{ + struct _heap_entry *first = h->h_root; + void *node; + + if (first == NULL) + return (NULL); + + node = heap_e2n(t, first); + if (t->t_compare(node, key) > 0) + return (NULL); + + h->h_root = _heap_2pass_merge(t, first); + + return (node); +} Index: usr.bin/lltextract/heap.h =================================================================== RCS file: usr.bin/lltextract/heap.h diff -N usr.bin/lltextract/heap.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/lltextract/heap.h 11 Sep 2024 11:29:30 -0000 @@ -0,0 +1,140 @@ +/* */ + +/* + * Copyright (c) 2017 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifndef _HEAP_H_ +#define _HEAP_H_ + +#include + +struct _heap_type { + int (*t_compare)(const void *, const void *); + unsigned int t_offset; /* offset of heap_entry in type */ +}; + +struct _heap_entry { + struct _heap_entry *he_left; + struct _heap_entry *he_child; + struct _heap_entry *he_nextsibling; +}; +#define HEAP_ENTRY(_entry) struct _heap_entry + +struct _heap { + struct _heap_entry *h_root; +}; + +#define HEAP_HEAD(_name) \ +struct _name { \ + struct _heap heap; \ +} + +static inline void +_heap_init(struct _heap *h) +{ + h->h_root = NULL; +} + +static inline int +_heap_empty(struct _heap *h) +{ + return (h->h_root == NULL); +} + +void _heap_insert(const struct _heap_type *, struct _heap *, void *); +void _heap_remove(const struct _heap_type *, struct _heap *, void *); +void _heap_merge(const struct _heap_type *, struct _heap *, struct _heap *); +void *_heap_first(const struct _heap_type *, struct _heap *); +void *_heap_extract(const struct _heap_type *, struct _heap *); +void *_heap_cextract(const struct _heap_type *, struct _heap *, + const void *); + +#define HEAP_INITIALIZER(_head) { { NULL } } + +#define HEAP_PROTOTYPE(_name, _type) \ +extern const struct _heap_type *const _name##_HEAP_TYPE; \ + \ +__unused static inline void \ +_name##_HEAP_INIT(struct _name *head) \ +{ \ + _heap_init(&head->heap); \ +} \ + \ +__unused static inline void \ +_name##_HEAP_INSERT(struct _name *head, struct _type *elm) \ +{ \ + _heap_insert(_name##_HEAP_TYPE, &head->heap, elm); \ +} \ + \ +__unused static inline void \ +_name##_HEAP_REMOVE(struct _name *head, struct _type *elm) \ +{ \ + _heap_remove(_name##_HEAP_TYPE, &head->heap, elm); \ +} \ + \ +__unused static inline struct _type * \ +_name##_HEAP_FIRST(struct _name *head) \ +{ \ + return _heap_first(_name##_HEAP_TYPE, &head->heap); \ +} \ + \ +__unused static inline void \ +_name##_HEAP_MERGE(struct _name *head1, struct _name *head2) \ +{ \ + _heap_merge(_name##_HEAP_TYPE, &head1->heap, &head2->heap); \ +} \ + \ +__unused static inline struct _type * \ +_name##_HEAP_EXTRACT(struct _name *head) \ +{ \ + return _heap_extract(_name##_HEAP_TYPE, &head->heap); \ +} \ + \ +__unused static inline struct _type * \ +_name##_HEAP_CEXTRACT(struct _name *head, const struct _type *key) \ +{ \ + return _heap_cextract(_name##_HEAP_TYPE, &head->heap, key); \ +} \ + \ +__unused static inline int \ +_name##_HEAP_EMPTY(struct _name *head) \ +{ \ + return _heap_empty(&head->heap); \ +} + +#define HEAP_GENERATE(_name, _type, _field, _cmp) \ +static int \ +_name##_HEAP_COMPARE(const void *lptr, const void *rptr) \ +{ \ + const struct _type *l = lptr, *r = rptr; \ + return _cmp(l, r); \ +} \ +static const struct _heap_type _name##_HEAP_INFO = { \ + _name##_HEAP_COMPARE, \ + offsetof(struct _type, _field), \ +}; \ +const struct _heap_type *const _name##_HEAP_TYPE = &_name##_HEAP_INFO + +#define HEAP_INIT(_name, _h) _name##_HEAP_INIT((_h)) +#define HEAP_INSERT(_name, _h, _e) _name##_HEAP_INSERT((_h), (_e)) +#define HEAP_REMOVE(_name, _h, _e) _name##_HEAP_REMOVE((_h), (_e)) +#define HEAP_FIRST(_name, _h) _name##_HEAP_FIRST((_h)) +#define HEAP_MERGE(_name, _h1, _h2) _name##_HEAP_MERGE((_h1), (_h2)) +#define HEAP_EXTRACT(_name, _h) _name##_HEAP_EXTRACT((_h)) +#define HEAP_CEXTRACT(_name, _h, _k) _name##_HEAP_CEXTRACT((_h), (_k)) +#define HEAP_EMPTY(_name, _h) _name##_HEAP_EMPTY((_h)) + +#endif /* _HEAP_H_ */ Index: usr.bin/lltextract/lltextract.c =================================================================== RCS file: usr.bin/lltextract/lltextract.c 
diff -N usr.bin/lltextract/lltextract.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/lltextract/lltextract.c 11 Sep 2024 11:29:31 -0000 @@ -0,0 +1,1829 @@ +/* $OpenBSD */ + +/* + * Copyright (c) 2022 The University of Queensland + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * + * Copyright 2021 Richard L. Sites + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include /* for SYS_MAXSYSCALL */ +#include /* for _MAXCOMLEN */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "heap.h" + +#include "fxt.h" +#include "lltextract.h" + +#ifndef nitems +#define nitems(_a) (sizeof((_a)) / sizeof((_a)[0])) +#endif + +#ifndef ISSET +#define ISSET(_a, _b) ((_a) & (_b)) +#endif + +#define THREAD_PID_OFFSET 100000 + +struct cytime { + uint64_t base_cy; + uint64_t base_ns; + uint64_t base_cy10; + uint64_t base_ns10; + + double slope; +}; + +struct ring { + uint64_t slots[8192]; +}; + +static void lltextract(size_t, const struct ring *); + +struct llt_pid { + /* this knows a lot about process names in the kernel */ + union { + uint64_t words[3]; + char str[_MAXCOMLEN]; + } _ps_comm; +#define ps_comm _ps_comm.str +#define ps_comm64 _ps_comm.words + unsigned int ps_comm_n; + unsigned int ps_strid; + + unsigned int ps_pid; + unsigned int ps_system; + uint64_t ps_fxtid; + + uint64_t ps_ts; + RBT_ENTRY(llt_pid) ps_entry; +}; + +RBT_HEAD(llt_pid_tree, llt_pid); + +static inline int +llt_pid_cmp(const struct llt_pid *a, const struct llt_pid *b) +{ + if (a->ps_pid > b->ps_pid) + return (1); + if (a->ps_pid < b->ps_pid) + return (-1); + return (0); +} + +struct llt_tid { + struct llt_pid *p_p; + unsigned int p_strid; + //unsigned int p_thrid; + unsigned int p_tid; + uint64_t p_fxtid; + + RBT_ENTRY(llt_tid) p_entry; +}; + +RBT_HEAD(llt_tid_tree, llt_tid); + +static inline int +llt_tid_cmp(const struct llt_tid *a, const struct llt_tid *b) +{ + if (a->p_tid > b->p_tid) + return (1); + if (a->p_tid < b->p_tid) + return (-1); + return (0); +} + +RBT_PROTOTYPE(llt_pid_tree, llt_pid, ps_entry, llt_pid_cmp); +RBT_PROTOTYPE(llt_tid_tree, llt_tid, p_entry, llt_tid_cmp); + +struct lltx_fxt_record { + HEAP_ENTRY(lltx_fxt_record) + entry; + uint64_t ts; + unsigned int n; + + /* followed by n * uint64_ts */ +}; + +HEAP_HEAD(lltx_fxt_heap); + +HEAP_PROTOTYPE(lltx_fxt_heap, lltx_fxt_record); + +__dead static void +usage(void) +{ + extern char *__progname; + + fprintf(stderr, "usage: %s [-v] -i infile -o outfile\n", + __progname); + + exit(1); +} + +static const uint64_t fxt_magic[] = { htole64(FXT_INIT_MAGIC) }; +static const uint64_t fxt_init[2] = { FXT_INIT_RECORD(1000000000ULL) }; + +static FILE *ifile = stdin; +static FILE *ofile = stdout; +static int verbose = 0; + +static struct llt_pid_tree lltx_pids = RBT_INITIALIZER(); +static struct llt_tid_tree lltx_tids = RBT_INITIALIZER(); + +static void lltx_kobj_bsd(void); +static unsigned int lltx_str(const char *); + +static unsigned int lltx_strids; +static unsigned int lltx_strid_process; +static unsigned int lltx_strid_sched; +static unsigned int lltx_strid_wakeup; +static unsigned int lltx_strid_woken; +static unsigned int lltx_strid_unknown; +static unsigned int lltx_strid_acquire; +static unsigned int lltx_strid_symbol; +static unsigned int lltx_strid_offset; +static unsigned int lltx_strid_count; + +static const char str_process[] = "process"; +static const char str_sched[] = "sched"; +static const char str_wakeup[] = "wakeup"; +static const char str_woken[] = "woken"; +static const char str_unknown[] = "unknown"; +static const char str_acquire[] = "acquire"; +static const char str_symbol[] = "symbol"; +static const char str_offset[] = "offset"; +static const char str_count[] = "count"; + +static const char *str_locks[1 << LLTRACE_LK_TYPE_WIDTH] = { + [LLTRACE_LK_RW] = "rwlock", + [LLTRACE_LK_MTX] = "mutex", + [LLTRACE_LK_K] = "kernel", 
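+	/* indexed by the LLTRACE_LK_* lock type values from sys/sys/lltrace.h */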
+}; +static unsigned int lltx_strids_locks[1 << LLTRACE_LK_TYPE_WIDTH]; + +static const char *str_lock_ops[1 << LLTRACE_LK_PHASE_WIDTH] = { + [LLTRACE_LK_I_EXCL] = "instant-exclusive", + [LLTRACE_LK_I_SHARED] = "instant-shared", + [LLTRACE_LK_A_START] = "acquire-start", + [LLTRACE_LK_A_EXCL] = "acquired-exclusive", + [LLTRACE_LK_A_SHARED] = "acquired-shared", + [LLTRACE_LK_A_ABORT] = "acquire-abort", + [LLTRACE_LK_DOWNGRADE] = "downgrade", + [LLTRACE_LK_R_EXCL] = "release-exclusive", + [LLTRACE_LK_R_SHARED] = "release-shared", + [LLTRACE_LK_I_FAIL] = "instant-fail", +}; +static unsigned int lltx_strids_lock_ops[1 << LLTRACE_LK_PHASE_WIDTH]; + +static struct lltx_fxt_heap lltx_records = HEAP_INITIALIZER(); + +static void +fxt_insert(uint64_t ts, const uint64_t *atoms, unsigned int n) +{ + struct lltx_fxt_record *r; + uint64_t *dst; + unsigned int i; + + r = malloc(sizeof(*r) + (sizeof(*atoms) * n)); + if (r == NULL) + err(1, "fxt_insert"); + + r->ts = ts; + r->n = n; + dst = (uint64_t *)(r + 1); + for (i = 0; i < n; i++) + dst[i] = atoms[i]; + + HEAP_INSERT(lltx_fxt_heap, &lltx_records, r); +} + +static struct lltx_fxt_record * +fxt_extract(void) +{ + return (HEAP_EXTRACT(lltx_fxt_heap, &lltx_records)); +} + +static inline size_t +fxt_write(const uint64_t *w, size_t n, FILE *f) +{ + return fwrite(w, sizeof(*w), n, f); +} + +int +main(int argc, char *argv[]) +{ + const char *ifname = NULL; + const char *ofname = NULL; + const char *ofmode = "wx"; + struct ring ring; + size_t block = 0; + size_t rv; + size_t i; + + int ch; + + while ((ch = getopt(argc, argv, "fi:o:v")) != -1) { + switch (ch) { + case 'f': + ofmode = "w"; + break; + case 'i': + ifname = optarg; + break; + case 'o': + ofname = optarg; + break; + case 'v': + verbose++; + break; + default: + usage(); + /* NOTREACHED */ + } + } + + argc -= optind; + argv += optind; + + if (argc != 0) + usage(); + + if (ifname == NULL) + warnx("input file not specified"); + if (ofname == NULL) + warnx("output file not specified"); + if (ifname == NULL || ofname == NULL) + usage(); + + ifile = fopen(ifname, "r"); + if (ifile == NULL) + err(1, "%s", ifname); + + ofile = fopen(ofname, ofmode); + if (ofile == NULL) + err(1, "%s", ofname); + + rv = fxt_write(fxt_magic, nitems(fxt_magic), ofile); + if (rv == 0) + err(1, "%s fxt magic write", ofname); + + rv = fxt_write(fxt_init, nitems(fxt_init), ofile); + if (rv == 0) + err(1, "%s fxt ts write", ofname); + + lltx_kobj_bsd(); + lltx_strid_process = lltx_str(str_process); + lltx_strid_sched = lltx_str(str_sched); + lltx_strid_wakeup = lltx_str(str_wakeup); + lltx_strid_woken = lltx_str(str_woken); + lltx_strid_unknown = lltx_str(str_unknown); + lltx_strid_acquire = lltx_str(str_acquire); + lltx_strid_symbol = lltx_str(str_symbol); + lltx_strid_offset = lltx_str(str_offset); + lltx_strid_count = lltx_str(str_count); + + for (i = 0; i < nitems(str_locks); i++) { + const char *str = str_locks[i]; + if (str == NULL) + continue; + lltx_strids_locks[i] = lltx_str(str); + } + + for (i = 0; i < nitems(str_lock_ops); i++) { + const char *str = str_lock_ops[i]; + if (str == NULL) + continue; + lltx_strids_lock_ops[i] = lltx_str(str); + } + +printf("[\n"); + for (;;) { + size_t nread = fread(&ring, sizeof(ring), 1, ifile); + if (nread == 0) { + if (ferror(ifile)) + errx(1, "error reading %s", ifname); + if (feof(ifile)) + break; + } + + lltextract(block++, &ring); + } + + { + struct llt_tid *p; + + RBT_FOREACH(p, llt_tid_tree, &lltx_tids) { + printf("### pid %u tid %u -> %llu %llu\n", + p->p_p->ps_pid, p->p_tid, + 
p->p_p->ps_fxtid, p->p_fxtid); + } + } + + { + struct lltx_fxt_record *r; + + while ((r = fxt_extract()) != NULL) { + uint64_t *atoms = (uint64_t *)(r + 1); + fxt_write(atoms, r->n, ofile); + free(r); + } + } + + return (0); +} + + +static int +printable(int ch) +{ + if (ch == '\0') + return ('_'); + if (!isprint(ch)) + return ('~'); + + return (ch); +} + +static void +dump_slot(size_t slot, uint64_t v) +{ + uint8_t buf[sizeof(v)]; + size_t i; + + printf("## slot %4zu = 0x%016llx |", slot, v); + + memcpy(buf, &v, sizeof(buf)); + for (i = 0; i < sizeof(buf); i++) + putchar(printable(buf[i])); + + printf("|\n"); +} + +static void +dump_slots(const struct ring *ring, size_t slot, size_t n) +{ + n += slot; + while (slot < n) { + dump_slot(slot, ring->slots[slot]); + slot++; + } +} + +static void +cytime_init(struct cytime *ct, + uint64_t start_cy, uint64_t start_ns, uint64_t stop_cy, uint64_t stop_ns) +{ + uint64_t diff_cy = stop_cy - start_cy; + uint64_t diff_ns = stop_ns - start_ns; + + ct->base_cy = start_cy; + ct->base_ns = start_ns; + + ct->slope = (double)diff_ns / (double)diff_cy; + + if (verbose >= 1) { + printf("SetParams maps %18llucy ==> %18lluns\n", + start_cy, start_ns); + printf("SetParams maps %18llucy ==> %18lluns\n", + stop_cy, stop_ns); + printf(" diff %18llucy ==> %18lluns\n", + diff_cy, diff_ns); + printf("SetParams slope %f ns/cy\n", ct->slope); + } +} + +struct lltstate { + struct cytime ct; + + uint32_t cy32; + int64_t cy; + unsigned int cpu; + unsigned int idletid; + + uint64_t ns; + struct llt_tid *p; + + unsigned int idle; +}; + +#define TS32_SHIFT (32 - (LLTRACE_TS_WIDTH + LLTRACE_TS_SHIFT)) + +struct llevent { + size_t block; + size_t slot; + int64_t cy; + uint32_t cy32; +}; + +#if 0 +static void lltextract_mark(struct lltstate *, struct llevent *, + unsigned int, uint64_t); +static void lltextract_irq(struct lltstate *, struct llevent *, + unsigned int, uint64_t); +static void lltextract_syscall(struct lltstate *, struct llevent *, + unsigned int, uint64_t); +static void lltextract_sysret(struct lltstate *, struct llevent *, + unsigned int, uint64_t); +#endif + +static void lltx_id(struct lltstate *, struct llevent *, uint64_t, + const uint64_t *, unsigned int); +static void lltx_event(struct lltstate *, struct llevent *, uint64_t, + const uint64_t *, unsigned int); +static void lltx_locking(struct lltstate *, struct llevent *, uint64_t, + const uint64_t *, unsigned int); +static void lltx_idle(struct lltstate *, struct llevent *, unsigned int); + +static struct llt_tid * +lltx_tid(unsigned int tid) +{ + struct llt_tid *p; + struct llt_tid key = { .p_tid = tid }; + + p = RBT_FIND(llt_tid_tree, &lltx_tids, &key); + if (p != NULL) + return (p); + + p = malloc(sizeof(*p)); + if (p == NULL) + err(1, "llt tid alloc"); + + p->p_tid = tid; + + p->p_p = NULL; + p->p_strid = 0; + //p->p_thrid = 0; + p->p_fxtid = p->p_tid + THREAD_PID_OFFSET; + + if (RBT_INSERT(llt_tid_tree, &lltx_tids, p) != NULL) + errx(1, "llt tid %d insert failed", tid); + + return (p); +} + +static struct llt_tid * +lltx_tid_pid(unsigned int tid, unsigned int pid, unsigned int sys) +{ + struct llt_tid *p; + struct llt_pid *ps; + + p = lltx_tid(tid); + ps = p->p_p; + if (ps == NULL) { + struct llt_pid key = { .ps_pid = pid }; + + ps = RBT_FIND(llt_pid_tree, &lltx_pids, &key); + if (ps == NULL) { + ps = malloc(sizeof(*ps)); + if (ps == NULL) + err(1, "llt pid alloc"); + + ps->ps_pid = pid; + ps->ps_system = sys; + + ps->ps_strid = 0; + ps->ps_ts = 0; + + /* lie about kernel threads */ + ps->ps_fxtid = 
ps->ps_system ? 0 : ps->ps_pid; + + if (RBT_INSERT(llt_pid_tree, &lltx_pids, ps) != NULL) + errx(1, "llt pid %u insert failed", pid); + } + + p->p_p = ps; + p->p_fxtid = ps->ps_system ? ps->ps_pid : + (p->p_tid + THREAD_PID_OFFSET); + + if (!ps->ps_system) { + uint64_t atoms[4]; + + atoms[0] = htole64(FXT_T_KOBJ); + atoms[0] |= htole64(nitems(atoms) << FXT_H_SIZE_SHIFT); + atoms[0] |= htole64(2ULL << 16); /* ZX_OBJ_TYPE_THREAD */ + atoms[0] |= htole64(1ULL << 40); /* number of args */ + atoms[1] = htole64(p->p_fxtid); + atoms[2] = htole64(8 | (2 << 4)); /* koid */ + atoms[2] |= htole64((uint64_t)lltx_strid_process << 16); + atoms[3] = htole64(ps->ps_fxtid); + + fxt_write(atoms, nitems(atoms), ofile); + } + } else { + if (ps->ps_pid != pid) + errx(1, "tid %u has a new pid %u", tid, pid); + } + + return (p); +} + +static void +lltextract(size_t block, const struct ring *ring) +{ + const struct lltrace_header *llh = (struct lltrace_header *)ring; + struct lltstate state = { + .cpu = llh->h_cpu, + .idletid = llh->h_idletid, + .cy = 0, + .idle = LLTRACE_EVENT_PHASE_END, + }; + struct llevent lle; + unsigned int pid, sys; + + size_t slot, nslot; + uint32_t cy32; + int32_t cydiff; + + if (verbose >= 2) + dump_slots(ring, 0, 8); + + cytime_init(&state.ct, ring->slots[2], ring->slots[3], + ring->slots[4], ring->slots[5]); + + printf("{"); + printf("\"name\":\"cpu%u\",", state.cpu); + printf("\"cat\":\"lltrace\","); + printf("\"ph\":\"b\","); + printf("\"pid\":0,"); + printf("\"tid\":%u,", state.cpu); + printf("\"ts\":%lf,", (double)ring->slots[3] / 1000.0); + printf("\"id\":%zu", block); + printf("},\n"); + + printf("{"); + printf("\"name\":\"cpu%u\",", state.cpu); + printf("\"cat\":\"lltrace\","); + printf("\"ph\":\"e\","); + printf("\"pid\":0,"); + printf("\"tid\":%u,", state.cpu); + printf("\"ts\":%lf,", (double)ring->slots[5] / 1000.0); + printf("\"id\":%zu", block); + printf("},\n"); + + state.cy32 = ring->slots[2] << TS32_SHIFT; + state.ns = state.ct.base_ns; + + sys = llh->h_pid & (1U << 31); + pid = llh->h_pid & ~(1U << 31); + + state.p = lltx_tid_pid(llh->h_tid, pid, sys); + + for (slot = 8; slot < nitems(ring->slots); slot++) { + const uint64_t *slots = ring->slots + slot; + uint64_t record = slots[0]; + unsigned int type, len; + + if (verbose >= 2) + dump_slot(slot, record); + + if (record == 0) + return; + + type = (record >> LLTRACE_TYPE_SHIFT) & LLTRACE_TYPE_MASK; + len = (record >> LLTRACE_LEN_SHIFT) & LLTRACE_LEN_MASK; + + nslot = slot + len; + if (nslot >= nitems(ring->slots)) + errx(1, "slot %zu has %u extra", slot, len); + + if (verbose >= 2) { + dump_slots(ring, slot + 1, len); + printf("slot %4zu+%u type 0x%x\n", slot, len, type); + } + + if (ISSET(LLTRACE_TS_TYPES, 1U << type)) { + cy32 = record & (LLTRACE_TS_MASK << LLTRACE_TS_SHIFT); + cy32 <<= TS32_SHIFT; + cydiff = (cy32 - state.cy32); + cydiff >>= TS32_SHIFT; + + int64_t cy = state.cy + cydiff; + if (cydiff > 0) { + state.cy32 = cy32; + state.cy += cydiff; + } + //lle.cy = state.cy; + state.ns = state.ct.base_ns + (cy * state.ct.slope); + //state.ns = state.ct.base_cy + cy; + + if (verbose >= 2) { + printf("state.cy %llu state.cy32 %u diff %d (%.1f)\n", + state.cy, state.cy32, cydiff, cydiff * state.ct.slope); + } + + if (state.idle == LLTRACE_EVENT_PHASE_START) { + lltx_idle(&state, &lle, + LLTRACE_EVENT_PHASE_END); + } + } + + lle.block = block; + lle.slot = slot; + + switch (type) { + case LLTRACE_TYPE_ID: + lltx_id(&state, &lle, record, slots + 1, len); + break; + case LLTRACE_TYPE_EVENT: + lltx_event(&state, &lle, 
record, slots + 1, len); + break; + case LLTRACE_TYPE_LOCKING: + lltx_locking(&state, &lle, record, slots + 1, len); + break; + default: + warnx("slot %4zu+%u unknown type 0x%x ", + slot, len, type); + break; + } + + slot = nslot; + } +} + +static size_t +strtoatoms(uint64_t *atoms, size_t n, const char *str, size_t len) +{ + size_t natoms = (len + (sizeof(*atoms) - 1)) / sizeof(*atoms); + size_t nn = n + natoms; + size_t i; + + if (nn >= FXT_MAX_WORDS) + errx(1, "too far"); + + for (i = n; i < nn; i++) + atoms[i] = 0; + + memcpy(atoms + n, str, len); + + return (nn); +} + +static int +str64eq(const uint64_t *a, const uint64_t *b, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + if (a[i] != b[i]) + return (0); + } + + return (1); +} + +uint64_t fxt_atoms[128]; + +static void +lltx_id_tid(struct lltstate *state, struct llevent *lle, uint64_t record, + const uint64_t *extra, unsigned int extralen) +{ + unsigned int tid, pid, sys; + struct llt_tid *p; + struct llt_pid *ps; + unsigned int i; + size_t n; + + tid = (record >> LLTRACE_ID_TID_SHIFT) & LLTRACE_ID_TID_MASK; + pid = (record >> LLTRACE_ID_TID_PID_SHIFT) & LLTRACE_ID_TID_PID_MASK; + sys = !!ISSET(record, LLTRACE_ID_TID_SYSTEM); + + printf("#pn %zu[%zu] cpu %u %s pid %u tid %u", + lle->block, lle->slot, state->cpu, + sys ? "kernel" : "user", pid, tid); + if (extralen > 0) { + printf(" %.*s", + (int)(extralen * sizeof(*extra)), (const char *)extra); + } + printf("\n"); + + p = lltx_tid_pid(tid, pid, sys); + ps = p->p_p; + +// state->tid = tid; +// state->p = p; + + if (ps->ps_ts > state->ns) { + /* a later version of the info has already been reported */ + return; + } + ps->ps_ts = state->ns; + + if (extralen > nitems(ps->ps_comm64)) + errx(1, "pid %d name is too long", ps->ps_pid); + + if (ps->ps_comm_n == extralen && + str64eq(ps->ps_comm64, extra, extralen)) + return; + + for (i = 0; i < extralen; i++) + ps->ps_comm64[i] = extra[i]; + while (i < nitems(ps->ps_comm64)) + ps->ps_comm64[i++] = 0; + ps->ps_comm_n = extralen; + + fxt_atoms[0] = htole64(FXT_T_KOBJ); + + n = 1; + if (ps->ps_system) { + fxt_atoms[0] |= htole64(2 << 16); /* ZX_OBJ_TYPE_THREAD */ + fxt_atoms[n++] = htole64(p->p_fxtid); + } else { + fxt_atoms[0] |= htole64(1 << 16); /* ZX_OBJ_TYPE_PROCESS */ + fxt_atoms[n++] = htole64(ps->ps_fxtid); + } + for (i = 0; i < extralen; i++) + fxt_atoms[n++] = extra[i]; + fxt_atoms[0] |= htole64(n << 4); + fxt_atoms[0] |= htole64(((1 << 15) | + strnlen(ps->ps_comm, ps->ps_comm_n * 8)) << 24); + + fxt_write(fxt_atoms, n, ofile); +} + +static void +lltx_kobj_bsd(void) +{ + static const char name[] = "/bsd"; + size_t namelen = sizeof(name) - 1; /* - nul */ + size_t n; + + n = 1; + fxt_atoms[n++] = 0; /* pid 0 is the kernel */ + n = strtoatoms(fxt_atoms, n, name, namelen); + + fxt_atoms[0] = htole64(FXT_T_KOBJ); + fxt_atoms[0] |= htole64(1 << 16); /* ZX_OBJ_TYPE_PROCESS */ + fxt_atoms[0] |= htole64(n << 4); + fxt_atoms[0] |= htole64(((1 << 15) | namelen) << 24); + + fxt_write(fxt_atoms, n, ofile); +} + +static unsigned int +lltx_str(const char *str) +{ + size_t len = strlen(str); + uint64_t strid = ++lltx_strids; + size_t n; + + n = strtoatoms(fxt_atoms, 1, str, len); + + fxt_atoms[0] = htole64(FXT_T_STRING | (n << 4)); + fxt_atoms[0] |= htole64(strid << 16); + fxt_atoms[0] |= htole64((uint64_t)len << 32); + + fxt_write(fxt_atoms, n, ofile); + + return (strid); +} + +static void +lltx_id(struct lltstate *state, struct llevent *lle, uint64_t record, + const uint64_t *extra, unsigned int n) +{ + unsigned int type; + + type = (record 
>> LLTRACE_ID_TYPE_SHIFT) & LLTRACE_ID_TYPE_MASK; + + switch (type) { + case LLTRACE_ID_TYPE_TID: + lltx_id_tid(state, lle, record, extra, n); + break; + default: + warnx("slot %4zu+%u unknown id type 0x%x ", lle->slot, n, + type); + break; + } +} + +static const char *lltrace_event_class_names[] = { + [LLTRACE_EVENT_CLASS_SYSCALL] = "syscall", + [LLTRACE_EVENT_CLASS_IDLE] = "idle", + [LLTRACE_EVENT_CLASS_INTR] = "intr", + [LLTRACE_EVENT_CLASS_SCHED] = "sched", + [LLTRACE_EVENT_CLASS_FUNC] = "function", + [LLTRACE_EVENT_CLASS_PAGEFAULT] = "pagefault", + [LLTRACE_EVENT_CLASS_WAKE] = "wake", + [LLTRACE_EVENT_CLASS_COUNT] = "count", +}; + +static const char *lltrace_event_phase_names[] = { + [LLTRACE_EVENT_PHASE_INSTANT] = "instant", + [LLTRACE_EVENT_PHASE_START] = "start", + [LLTRACE_EVENT_PHASE_STEP] = "step", + [LLTRACE_EVENT_PHASE_END] = "end", +}; + +static const unsigned int lltrace_event_phase_map[] = { + [LLTRACE_EVENT_PHASE_INSTANT] = 0, + [LLTRACE_EVENT_PHASE_START] = 2, + [LLTRACE_EVENT_PHASE_END] = 3, +}; + +static const char *lltrace_intr_type_names[1 << LLTRACE_INTR_T_WIDTH] = { + [LLTRACE_INTR_T_HW] = "hardintr", + [LLTRACE_INTR_T_SW] = "softintr", + [LLTRACE_INTR_T_IPI] = "ipi", + [LLTRACE_INTR_T_CLOCK] = "clockintr", +}; + +static const char *lltrace_count_type_names[] = { + [LLTRACE_COUNT_T_PKTS_IFIQ] = "pkts:ifiq", + [LLTRACE_COUNT_T_PKTS_NETTQ] = "pkts:nettq", + [LLTRACE_COUNT_T_PKTS_IFQ] = "pkts:ifq", + [LLTRACE_COUNT_T_PKTS_QDROP] = "pkts:qdrop", + [LLTRACE_COUNT_T_PKTS_HDROP] = "pkts:hdrop", +}; + +static const char * +syscall_name(unsigned int sc) +{ + extern const char *const syscallnames[]; + + if (sc < SYS_MAXSYSCALL) + return (syscallnames[sc]); + + return (NULL); +} + +#if 0 +static uint64_t +lltx_thrid(struct llt_tid *p) +{ + static unsigned int thrids; + unsigned int thrid = p->p_thrid; + uint64_t atoms[3]; + + if (thrid != 0) + return thrid; + + thrid = ++thrids; + p->p_thrid = thrid; + + /* XXX not the nicest place to do this */ + atoms[0] = htole64(FXT_T_THREAD | (nitems(atoms) << FXT_H_SIZE_SHIFT)); + atoms[0] |= htole64(thrid << 16); + atoms[1] = htole64(p->p_p->ps_fxtid); + atoms[2] = htole64(p->p_fxtid); + + printf("#th 0x%016llx %llu %llu\n", atoms[0], atoms[1], atoms[2]); + + fxt_write(atoms, nitems(atoms), ofile); + + return (thrid); +} +#endif + +static void +lltx_sched(struct lltstate *state, struct llevent *lle, uint64_t record, + const uint64_t *extra, unsigned int extralen) +{ + unsigned int ntid, ostate; + struct llt_tid *op = state->p; + struct llt_tid *np; +// uint64_t oid, nid; + size_t n; + + ntid = (record >> LLTRACE_SCHED_TID_SHIFT) & + LLTRACE_SCHED_TID_MASK; + ostate = (record >> LLTRACE_SCHED_STATE_SHIFT) & + LLTRACE_SCHED_STATE_MASK; + + np = lltx_tid(ntid); + if (np->p_p == NULL) + errx(1, "new thread %u is unknown", ntid); + + if (verbose >= 2) { + printf("#ev %zu[%zu] %llu cpu %u pid %llu tid %llu " + "switch to pid %llu tid %llu\n", + lle->block, lle->slot, state->ns, state->cpu, + op->p_p->ps_fxtid, op->p_fxtid, + np->p_p->ps_fxtid, np->p_fxtid); + } + + if (extralen > 0) { + n = 1; + fxt_atoms[n++] = htole64(state->ns); + fxt_atoms[n++] = htole64(np->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(np->p_fxtid); + + fxt_atoms[0] = htole64(FXT_T_EVENT | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64((uint64_t)0 << 16); /* instant event */ + fxt_atoms[0] |= htole64(0ULL << 20); /* number of args */ + //fxt_atoms[0] |= htole64(nid << 24); + fxt_atoms[0] |= htole64((uint64_t)lltx_strid_sched << 32); + fxt_atoms[0] |= 
htole64((uint64_t)lltx_strid_woken << 48); + + //fxt_write(fxt_atoms, n, ofile); + fxt_insert(state->ns, fxt_atoms, n); + + n = 1; + fxt_atoms[n++] = htole64(state->ns); + fxt_atoms[n++] = htole64(np->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(np->p_fxtid); + fxt_atoms[n++] = htole64(extra[0]); + + fxt_atoms[0] = htole64(FXT_T_EVENT | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64((uint64_t)10 << 16); + fxt_atoms[0] |= htole64(0ULL << 20); /* number of args */ + //fxt_atoms[0] |= htole64(nid << 24); + fxt_atoms[0] |= htole64((uint64_t)lltx_strid_sched << 32); + fxt_atoms[0] |= htole64((uint64_t)lltx_strid_wakeup << 48); + + //fxt_write(fxt_atoms, n, ofile); + fxt_insert(state->ns, fxt_atoms, n); + } + +// oid = lltx_thrid(op); +// nid = lltx_thrid(np); + + n = 1; + fxt_atoms[n++] = htole64(state->ns); + fxt_atoms[n++] = htole64(op->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(op->p_fxtid); + fxt_atoms[n++] = htole64(np->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(np->p_fxtid); + + fxt_atoms[0] = htole64(FXT_T_SCHED | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64((uint64_t)state->cpu << 16); + fxt_atoms[0] |= htole64((uint64_t)ostate << 24); +// fxt_atoms[0] |= htole64(oid << 28); +// fxt_atoms[0] |= htole64(nid << 36); + fxt_atoms[0] |= htole64(1ULL << 44); + fxt_atoms[0] |= htole64(1ULL << 52); + fxt_atoms[0] |= htole64((uint64_t)0 << 60); + + //fxt_write(fxt_atoms, n, ofile); + fxt_insert(state->ns, fxt_atoms, n); + + state->p = np; +} + +static void +lltx_sched_wake(struct lltstate *state, struct llevent *lle, uint64_t record, + const uint64_t *extra, unsigned int extralen) +{ + unsigned int tid; + struct llt_tid *p; + size_t n; + + if (extralen > 0) { + p = state->p; + + n = 1; + fxt_atoms[n++] = htole64(state->ns); + fxt_atoms[n++] = htole64(p->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(p->p_fxtid); + + fxt_atoms[0] = htole64(FXT_T_EVENT | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64((uint64_t)0 << 16); /* instant event */ + fxt_atoms[0] |= htole64(0ULL << 20); /* number of args */ + fxt_atoms[0] |= htole64((uint64_t)lltx_strid_sched << 32); + fxt_atoms[0] |= htole64((uint64_t)lltx_strid_wakeup << 48); + + //fxt_write(fxt_atoms, n, ofile); + fxt_insert(state->ns, fxt_atoms, n); + + n = 1; + fxt_atoms[n++] = htole64(state->ns); + fxt_atoms[n++] = htole64(p->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(p->p_fxtid); + fxt_atoms[n++] = htole64(extra[0]); + + fxt_atoms[0] = htole64(FXT_T_EVENT | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64((uint64_t)8 << 16); + fxt_atoms[0] |= htole64(0ULL << 20); /* number of args */ + fxt_atoms[0] |= htole64((uint64_t)lltx_strid_sched << 32); + fxt_atoms[0] |= htole64((uint64_t)lltx_strid_wakeup << 48); + + //fxt_write(fxt_atoms, n, ofile); + fxt_insert(state->ns, fxt_atoms, n); + } + + tid = (record >> LLTRACE_SCHED_TID_SHIFT) & + LLTRACE_SCHED_TID_MASK; + + p = lltx_tid(tid); + if (p->p_p == NULL) + errx(1, "wakeup thread %u is unknown", tid); + + if (verbose >= 2) { + printf("#ev %zu[%zu] %llu cpu %u pid %llu tid %llu " + "wakeup pid %llu tid %llu\n", + lle->block, lle->slot, state->ns, state->cpu, + state->p->p_p->ps_fxtid, state->p->p_fxtid, + p->p_p->ps_fxtid, p->p_fxtid); + } + + n = 1; + fxt_atoms[n++] = htole64(state->ns); + fxt_atoms[n++] = htole64(p->p_fxtid); + + fxt_atoms[0] = htole64(FXT_T_SCHED | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64((uint64_t)state->cpu << 20); + fxt_atoms[0] |= htole64((uint64_t)2 << 60); + + //fxt_write(fxt_atoms, n, ofile); + //fxt_insert(state->ns, fxt_atoms, n); +} + +static void 
+lltx_idle(struct lltstate *state, struct llevent *lle, unsigned int phase) +{ + struct llt_tid *p = state->p; + uint64_t iprio, oprio; +// uint64_t oid, iid; + size_t n; + + if (state->idle == phase) + return; + + if (state->idletid != p->p_tid) { + errx(1, "idle outside the idle thread %u, in %u", + state->idletid, p->p_tid); + } + if (p->p_p == NULL) + errx(1, "idle thread %u is unknown", state->idletid); + + if (verbose >= 2) { + printf("#ev %zu[%zu] %llu cpu %u pid %llu tid %llu idle %s\n", + lle->block, lle->slot, state->ns, state->cpu, + p->p_p->ps_fxtid, p->p_fxtid, + lltrace_event_phase_names[phase]); + } + + n = 1; + fxt_atoms[n++] = htole64(state->ns); + + switch (phase) { + case LLTRACE_EVENT_PHASE_START: + oprio = 1; + fxt_atoms[n++] = htole64(p->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(p->p_fxtid); + iprio = 0; + fxt_atoms[n++] = htole64(0); + fxt_atoms[n++] = htole64(0); + break; + case LLTRACE_EVENT_PHASE_END: + oprio = 0; + fxt_atoms[n++] = htole64(0); + fxt_atoms[n++] = htole64(0); + iprio = 1; + fxt_atoms[n++] = htole64(p->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(p->p_fxtid); + break; + default: + return; + } + + fxt_atoms[0] = htole64(FXT_T_SCHED | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64((uint64_t)state->cpu << 16); + fxt_atoms[0] |= htole64((uint64_t)3 << 24); + fxt_atoms[0] |= htole64(oprio << 44); + fxt_atoms[0] |= htole64(iprio << 52); + fxt_atoms[0] |= htole64((uint64_t)0 << 60); + + //fxt_write(fxt_atoms, n, ofile); + fxt_insert(state->ns, fxt_atoms, n); + + state->idle = phase; +} + +static void +lltx_event_count(struct lltstate *state, struct llevent *lle, + unsigned int phase, const char *classnm, size_t classnmlen, + uint64_t record) +{ + char tname[128]; + uint32_t t, v; + const char *eventnm; + size_t eventnmlen; + size_t n, an; + + t = (record >> LLTRACE_COUNT_T_SHIFT) & LLTRACE_COUNT_T_MASK; + if (t >= nitems(lltrace_count_type_names) || + (eventnm = lltrace_count_type_names[t]) == NULL) { + int rv; + + warnx("unknown count type class %u", t); + + rv = snprintf(tname, sizeof(tname), "count-type-%u", t); + if (rv == -1) + errx(1, "count event type name snprintf"); + eventnm = tname; + eventnmlen = rv; + if (classnmlen >= sizeof(tname)) + errx(1, "event class name too long"); + } else + eventnmlen = strlen(eventnm); + + v = (record >> LLTRACE_COUNT_V_SHIFT); + + n = 1; + fxt_atoms[n++] = htole64(state->ns); + fxt_atoms[n++] = htole64(state->p->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(state->p->p_fxtid); + n = strtoatoms(fxt_atoms, n, classnm, classnmlen); + n = strtoatoms(fxt_atoms, n, eventnm, eventnmlen); + + an = n++; + fxt_atoms[an] = htole64(2 | (1 << 4)); + fxt_atoms[an] |= htole64(lltx_strid_count << 16); + fxt_atoms[an] |= htole64((uint64_t)v << 32); + + fxt_atoms[0] = htole64(FXT_T_EVENT | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64(lltrace_event_phase_map[phase] << 16); + fxt_atoms[0] |= htole64(1 << 20); /* 1 argument */ + fxt_atoms[0] |= htole64(((1<<15) | classnmlen) << 32); + fxt_atoms[0] |= htole64(((1<<15) | eventnmlen) << 48); + + fxt_write(fxt_atoms, n, ofile); +} + +static void +lltx_event(struct lltstate *state, struct llevent *lle, uint64_t record, + const uint64_t *extra, unsigned int extralen) +{ + char cname[32], ename[128]; + unsigned int phase; + unsigned int class; + const char *classnm; + size_t classnmlen; + const char *eventnm; + size_t eventnmlen; + size_t n; + + phase = (record >> LLTRACE_EVENT_PHASE_SHIFT) & + LLTRACE_EVENT_PHASE_MASK; + class = (record >> LLTRACE_EVENT_CLASS_SHIFT) & + 
LLTRACE_EVENT_CLASS_MASK; + + if (class >= nitems(lltrace_event_class_names) || + (classnm = lltrace_event_class_names[class]) == NULL) { + int rv; + + warnx("unknown event class %u", class); + + rv = snprintf(cname, sizeof(cname), "class-%u", class); + if (rv == -1) + errx(1, "event class name snprintf"); + classnm = cname; + classnmlen = rv; + if (classnmlen >= sizeof(cname)) + errx(1, "event class name too long"); + } else + classnmlen = strlen(classnm); + + switch (class) { + case LLTRACE_EVENT_CLASS_SCHED: + if (verbose >= 2) { + printf("#ev %zu[%zu] %llu cpu %u tid %llu sched\n", + lle->block, lle->slot, state->ns, state->cpu, + state->p->p_fxtid); + } + + if (phase == LLTRACE_EVENT_PHASE_INSTANT) + lltx_sched(state, lle, record, extra, extralen); + return; + case LLTRACE_EVENT_CLASS_WAKE: + if (verbose >= 2) { + printf("#ev %zu[%zu] %llu cpu %u tid %llu wake\n", + lle->block, lle->slot, state->ns, state->cpu, + state->p->p_fxtid); + } + lltx_sched_wake(state, lle, record, extra, extralen); + return; + case LLTRACE_EVENT_CLASS_IDLE: + if (verbose >= 2) { + printf("#ev %zu[%zu] %llu cpu %u tid %llu idle\n", + lle->block, lle->slot, state->ns, state->cpu, + state->p->p_fxtid); + } + lltx_idle(state, lle, phase); + return; + case LLTRACE_EVENT_CLASS_SYSCALL: + { + unsigned int code = (record >> LLTRACE_SYSCALL_SHIFT) & + LLTRACE_SYSCALL_MASK; + eventnm = syscall_name(code); + + switch (code) { + case SYS_exit: + case SYS___threxit: + phase = LLTRACE_EVENT_PHASE_INSTANT; + break; + } + } + eventnmlen = strlen(eventnm); + break; + case LLTRACE_EVENT_CLASS_INTR: + { + unsigned int type = (record >> LLTRACE_INTR_T_SHIFT) & + LLTRACE_INTR_T_MASK; + eventnm = lltrace_intr_type_names[type]; + } + eventnmlen = strlen(eventnm); + break; + case LLTRACE_EVENT_CLASS_FUNC: { + uint32_t addr = record >> 32; + const struct ksym *k = ksym_nfind(addr); + if (k == NULL) { + int rv = snprintf(ename, sizeof(ename), + "?+%x", addr); + if (rv == -1) + errx(1, "func name snprintf"); + eventnm = ename; + eventnmlen = rv; + } else { + uint32_t diff = addr - k->addr; + if (diff != 0) { + int rv = snprintf(ename, sizeof(ename), + "%s+%x", k->name, diff); + if (rv == -1) + errx(1, "func name snprintf"); + eventnm = ename; + eventnmlen = rv; + } else { + eventnm = k->name; + eventnmlen = strlen(eventnm); + } + } + } + break; + case LLTRACE_EVENT_CLASS_COUNT: + lltx_event_count(state, lle, phase, classnm, classnmlen, + record); + return; + + default: + eventnm = classnm; + eventnmlen = classnmlen; + break; + } + + if (verbose >= 2) { + printf("#ev %zu[%zu] %llu cpu %u tid %llu %s:%s %s\n", + lle->block, lle->slot, state->ns, state->cpu, + state->p->p_fxtid, + classnm, eventnm, lltrace_event_phase_names[phase]); + } + + n = 1; + fxt_atoms[n++] = htole64(state->ns); + fxt_atoms[n++] = htole64(state->p->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(state->p->p_fxtid); + n = strtoatoms(fxt_atoms, n, classnm, classnmlen); + n = strtoatoms(fxt_atoms, n, eventnm, eventnmlen); + + fxt_atoms[0] = htole64(FXT_T_EVENT | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64(lltrace_event_phase_map[phase] << 16); + fxt_atoms[0] |= htole64(((1<<15) | classnmlen) << 32); + fxt_atoms[0] |= htole64(((1<<15) | eventnmlen) << 48); + + fxt_write(fxt_atoms, n, ofile); +} + +static void +lltx_locking(struct lltstate *state, struct llevent *lle, uint64_t record, + const uint64_t *extra, unsigned int extralen) +{ + struct llt_tid *p = state->p; + unsigned int ltype; + unsigned int lop; + uint64_t cref; + uint64_t nref; +// uint64_t tref; + 
uint64_t addr; + size_t n; + struct ksym *k; + int durev = -1; + unsigned int nargs = 1; + + ltype = (record >> LLTRACE_LK_TYPE_SHIFT) & LLTRACE_LK_TYPE_MASK; + lop = (record >> LLTRACE_LK_PHASE_SHIFT) & LLTRACE_LK_PHASE_MASK; + addr = record >> LLTRACE_LK_ADDR_SHIFT; + + cref = lltx_strids_locks[ltype]; + if (cref == 0) { + warnx("unknown lock type %u", ltype); + return; + } + nref = lltx_strids_lock_ops[lop]; + if (nref == 0) { + warnx("unknown %s lock op %u", str_locks[ltype], lop); + return; + } + +// tref = lltx_thrid(state->p); + + switch (lop) { + case LLTRACE_LK_A_START: + durev = 2; + break; + case LLTRACE_LK_A_EXCL: + case LLTRACE_LK_A_SHARED: + case LLTRACE_LK_A_ABORT: + durev = 3; + break; + } + + if (0 && ltype == LLTRACE_LK_RW && durev != -1) { + n = 1; + fxt_atoms[n++] = htole64(state->ns); + fxt_atoms[n++] = htole64(p->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(p->p_fxtid); + + fxt_atoms[0] = htole64(FXT_T_EVENT | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64((uint64_t)durev << 16); /* duration begin */ + fxt_atoms[0] |= htole64(cref << 32); + fxt_atoms[0] |= htole64((uint64_t)lltx_strid_acquire << 48); + + //fxt_write(fxt_atoms, n, ofile); + fxt_insert(state->ns, fxt_atoms, n); + } + + k = ksym_nfind(addr); + if (k != NULL && k->ref == 0) { + k->ref = lltx_str(k->name); +#if 0 + + n = 1; + fxt_atoms[n++] = addr; + + fxt_atoms[0] = htole64(FXT_T_KOBJ | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64(0ULL << 16); /* ZX_OBJ_TYPE_NONE */ + fxt_atoms[0] |= htole64(k->ref << 24); /* name */ + fxt_atoms[0] |= htole64(0ULL << 40); /* number of args */ + + fxt_write(fxt_atoms, n, ofile); +#endif + } + + if (verbose >= 2) { + printf("#lk %zu[%zu] %llu cpu %u pid %llu tid %llu " + "%s %s\n", + lle->block, lle->slot, state->ns, state->cpu, + state->p->p_p->ps_fxtid, state->p->p_fxtid, + str_locks[ltype], str_lock_ops[lop]); + } + + n = 1; + fxt_atoms[n++] = htole64(state->ns); + fxt_atoms[n++] = htole64(p->p_p->ps_fxtid); + fxt_atoms[n++] = htole64(p->p_fxtid); + fxt_atoms[n++] = htole64(8 | (2 << 4) | (cref << 16)); + fxt_atoms[n++] = htole64(addr); + if (k != NULL) { + size_t na = n++; + uint32_t diff; + + fxt_atoms[na] = htole64(6 | (2 << 4)); + fxt_atoms[na] |= htole64((uint64_t)lltx_strid_symbol << 16); + fxt_atoms[na] |= htole64((uint64_t)k->ref << 32); + + nargs++; + + diff = addr - k->addr; + if (diff > 0) { + na = n++; + + fxt_atoms[na] = htole64(2 | (1 << 4)); + fxt_atoms[na] |= htole64((uint64_t)lltx_strid_offset << 16); + fxt_atoms[na] |= htole64((uint64_t)diff << 32); + + nargs++; + } + } + + fxt_atoms[0] = htole64(FXT_T_EVENT | (n << FXT_H_SIZE_SHIFT)); + fxt_atoms[0] |= htole64(0 << 16); /* instant event */ + fxt_atoms[0] |= htole64(nargs << 20); +// fxt_atoms[0] |= htole64(tref << 24); + fxt_atoms[0] |= htole64(cref << 32); + fxt_atoms[0] |= htole64(nref << 48); + + fxt_write(fxt_atoms, n, ofile); +} + +#if 0 +static void +lltextract_pc(struct llevent *lle, int event, uint64_t pc) +{ + lle->event = event; + + /* + * XXX The PC sample is generated after the local_timer + * interrupt, but we really want its sample time to be just + * before that interrupt. + */ + + /* + * Put a hash of the PC name into arg, so HTML display can + * choose colors quickly. 
+ */ + lle->arg0 = (pc >> 6) & 0xffff; + + if (event == KUTRACE_PC_K) { + const struct ksym *k; + + k = ksym_nfind(pc); + if (k != NULL) { + if (asprintf(&lle->name, "PC=%s", k->name) == -1) + errx(1, "PC_K name asprintf"); + return; + } + } + + if (asprintf(&lle->name, "PC=%016llx", pc) == -1) + errx(1, "PC asprintf"); +} + +static char * +xstrdup(const char *src) +{ + char *dst; + + dst = strdup(src); + if (dst == NULL) + err(1, "strdup %s", src); + + return (dst); +} + +static void +lltx_event(const char *name, const char *cat, const char *ph, + uint64_t ts, pid_t pid, pid_t tid) +{ + fprintf(ofile, "{"); + fprintf(ofile, "\"name\":\"%s\",\"cat\":\"%s\",\"ph\":\"%s\",", + name, cat, ph); + fprintf(ofile, "\"ts\":%llu.%03llu,\"pid\":%d,\"tid\":%d", + ts / 1000, ts % 1000, pid, tid); + fprintf(ofile, "},\n"); +} + +static char * +trap_name(unsigned int trap) +{ + const char *source; + char *name; + + switch (trap) { + case LLTRACE_TRAP_PAGEFAULT: + source = "page_fault"; + break; + default: + if (asprintf(&name, "trap-%u", trap) == -1) + errx(1, "trap asprintf"); + return (name); + } + + name = xstrdup(source); + + return (name); +} + +static void +lltextract_trap(struct lltstate *state, struct llevent *lle, + unsigned int event, uint64_t v) +{ + unsigned int trap; + + trap = (v >> LLTRACE_ARG32_SHIFT) & LLTRACE_ARG32_MASK; + + lle->pid = state->pid; + lle->tid = state->tid; + lle->event = event + trap; + lle->name = trap_name(trap); + + lltx_event(trap_name(trap), "trap", event == KUTRACE_TRAP ? "B" : "E", + state->ns, lle->pid, lle->tid); +} + +static void +lltextract_sched(struct lltstate *state, struct llevent *lle, + unsigned int event) +{ + lle->pid = state->pid; + lle->tid = state->tid; + lle->event = event; + lle->arg0 = 0; + lle->name = xstrdup("-sched-"); + + lltx_event("sched", "sched", event == 0x9ff ? 
"B" : "E", + state->ns, lle->pid, lle->tid); +} + +static void +lltextract_lock(struct lltstate *state, struct llevent *lle, + unsigned int event, uint64_t v) +{ + unsigned int lock; + + lock = (v >> LLTRACE_ARG32_SHIFT) & LLTRACE_ARG32_MASK; + lock &= 0xffff; + + lle->pid = state->pid; + lle->tid = state->tid; + lle->event = event; + lle->arg0 = lock; + + if (asprintf(&lle->name, "lock.%x", lock) == -1) + errx(1, "lock asprintf"); +} + +static void +lltextract_pkts(struct lltstate *state, struct llevent *lle, uint64_t v) +{ + unsigned int type = v & LLTRACE_PKTS_T_MASK; + const char *name; + + switch (type) { + case LLTRACE_PKTS_T_IFQ: + name = "ifq"; + break; + case LLTRACE_PKTS_T_NETTQ: + name = "process"; + break; + case LLTRACE_PKTS_T_IFIQ: + name = "ifiq"; + break; +#ifdef LLTRACE_PKTS_T_DROP + case LLTRACE_PKTS_T_DROP: + name = "drop"; + break; +#endif + default: + errx(1, "unexpected pkts type %x", + type >> LLTRACE_PKTS_T_SHIFT); + /* NOTREACHED */ + } + + lle->tid = state->tid; + lle->event = KUTRACE_MARKA; /* sure */ + lle->arg0 = v; + + if (asprintf(&lle->name, "%s=%llu", name, + v & LLTRACE_PKTS_V_MASK) == -1) + errx(1, "pkts asprintf"); +} + +static void +lltextract_func(struct lltstate *state, struct llevent *lle, + unsigned int event, const char *evname, uint64_t v) +{ + const struct ksym *k; + + lle->arg0 = (v >> LLTRACE_ARG32_SHIFT) & LLTRACE_ARG32_MASK; + + lle->tid = state->tid; + lle->event = event; + + k = ksym_nfind(lle->arg0); + if (k != NULL) { + uint32_t diff = lle->arg0 - k->addr; + if (diff == 0) { + if (asprintf(&lle->name, "%s=%s", evname, + k->name) == -1) + err(1, "kfunc %s asprintf", evname); + } else { + if (asprintf(&lle->name, "%s=%s+%u", evname, + k->name, diff) == -1) + err(1, "kfunc %s asprintf", evname); + } + } else { + if (asprintf(&lle->name, "%s=0x%x", evname, lle->arg0) == -1) + err(1, "kfunc %s asprintf", evname); + } +} + +static void +lltextract_mark(struct lltstate *state, struct llevent *lle, + unsigned int ev, uint64_t v) +{ + + switch (ev) { + case LLTRACE_EVENT_IDLE: + lle->event = KUTRACE_MWAIT; + lle->arg0 = 255; + + lle->name = xstrdup("mwait"); + break; + + case LLTRACE_EVENT_RUNNABLE: + lle->tid = state->tid; + lle->event = KUTRACE_RUNNABLE; + lle->arg0 = (v >> LLTRACE_ARG32_SHIFT) & LLTRACE_ARG32_MASK; + lle->arg0 &= 0xffff; + + if (asprintf(&lle->name, "runnable.%u", lle->arg0) == -1) + err(1, "runnable asprintf"); + break; + + case LLTRACE_EVENT_IPI: + lle->tid = state->tid; + lle->event = KUTRACE_IPI; + lle->arg0 = (v >> LLTRACE_ARG32_SHIFT) & LLTRACE_ARG32_MASK; + + lle->name = xstrdup("sendipi"); + break; + + case LLTRACE_EVENT_SCHED: + lltextract_sched(state, lle, + KUTRACE_SYSCALL(KUTRACE_SYSCALL_SCHED)); + break; + case LLTRACE_EVENT_SCHEDRET: + lltextract_sched(state, lle, + KUTRACE_SYSRET(KUTRACE_SYSCALL_SCHED)); + break; + + case LLTRACE_EVENT_TRAP: + lltextract_trap(state, lle, KUTRACE_TRAP, v); + break; + case LLTRACE_EVENT_TRAPRET: + lltextract_trap(state, lle, KUTRACE_TRAPRET, v); + break; + + case LLTRACE_EVENT_LOCK(LLTRACE_LOCK_NOACQUIRE): + lltextract_lock(state, lle, KUTRACE_LOCKNOACQUIRE, v); + break; + case LLTRACE_EVENT_LOCK(LLTRACE_LOCK_ACQUIRE): + lltextract_lock(state, lle, KUTRACE_LOCKACQUIRE, v); + break; + case LLTRACE_EVENT_LOCK(LLTRACE_LOCK_WAKEUP): + lltextract_lock(state, lle, KUTRACE_LOCKWAKEUP, v); + break; + + case LLTRACE_EVENT_PKTS: + lltextract_pkts(state, lle, v); + break; + + case LLTRACE_EVENT_MARK: + lle->tid = state->tid; + lle->event = KUTRACE_MARKB; + lle->arg0 = 0; + + lle->name = 
xstrdup("markd=yep"); + break; + + case LLTRACE_EVENT_KFUNC_ENTER: + lltextract_func(state, lle, KUTRACE_MARKD, "enter", v); + break; + + case LLTRACE_EVENT_KFUNC_LEAVE: + lltextract_func(state, lle, KUTRACE_MARKD, "leave", v); + break; + + default: + errx(1, "unexpected mark event 0x%03x", ev); + /* NOTREACHED */ + } +} + +static char * +irq_name(unsigned int type, unsigned int vec) +{ + const char *source; + char *name; + + switch (type) { + case LLTRACE_IRQ_IPI: + source = "ipi"; + break; + case LLTRACE_IRQ_BOTTOM_HALF: + if (vec == 0) + return xstrdup("BH:timer"); + + source = "BH"; + break; + case LLTRACE_IRQ_LOCAL_TIMER: + return xstrdup("local_timer_vector"); + default: + if (asprintf(&name, "irq%u:%u", type, vec) == -1) + errx(1, "irq asprintf"); + return (name); + } + + if (asprintf(&name, "%s:%u", source, vec) == -1) + errx(1, "irq %s asprintf", source); + + return (name); +} + +static void +lltextract_irq(struct lltstate *state, struct llevent *lle, + unsigned int ev, uint64_t v) +{ + unsigned int ret = ev & 0x100; + unsigned int type = ev & 0xff; + unsigned int vec = (v >> LLTRACE_ARG32_SHIFT) & LLTRACE_ARG32_MASK; + + lle->event = (ret ? KUTRACE_IRQRET : KUTRACE_IRQ) | type; + lle->arg0 = vec; + + lle->name = irq_name(type, vec); +} + +static void +lltextract_syscall(struct lltstate *state, struct llevent *lle, + unsigned int ev, uint64_t v) +{ + unsigned int sc = LLTRACE_SYSCALL_MASK(ev); + + lle->pid = state->pid; + lle->tid = state->tid; + lle->event = KUTRACE_SYSCALL(sc); + lle->arg0 = (v >> LLTRACE_ARG0_SHIFT) & LLTRACE_ARG0_MASK; + lle->name = syscall_name(sc); + + lltx_event(syscall_name(sc), "syscall", "B", + state->ns, lle->pid, lle->tid); +} + +static void +lltextract_sysret(struct lltstate *state, struct llevent *lle, + unsigned int ev, uint64_t v) +{ + unsigned int sc = LLTRACE_SYSCALL_MASK(ev); + + lle->pid = state->pid; + lle->tid = state->tid; + lle->event = KUTRACE_SYSRET(sc); + lle->arg0 = (v >> LLTRACE_ARG0_SHIFT) & LLTRACE_ARG0_MASK; + lle->name = syscall_name(sc); + + lltx_event(syscall_name(sc), "syscall", "E", + state->ns, lle->pid, lle->tid); +} +#endif + +RBT_GENERATE(llt_pid_tree, llt_pid, ps_entry, llt_pid_cmp); +RBT_GENERATE(llt_tid_tree, llt_tid, p_entry, llt_tid_cmp); + +static inline int +lltx_fxt_record_cmp(const struct lltx_fxt_record *a, + const struct lltx_fxt_record *b) +{ + if (a->ts > b->ts) + return (1); + if (a->ts < b->ts) + return (-1); + return (0); +} + +HEAP_GENERATE(lltx_fxt_heap, lltx_fxt_record, entry, lltx_fxt_record_cmp); Index: usr.bin/lltextract/lltextract.h =================================================================== RCS file: usr.bin/lltextract/lltextract.h diff -N usr.bin/lltextract/lltextract.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/lltextract/lltextract.h 11 Sep 2024 11:29:31 -0000 @@ -0,0 +1,30 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2022 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +struct ksym { + RBT_ENTRY(ksym) entry; + char *name; + uint32_t addr; + uint32_t len; + unsigned int ref; +}; + +struct ksym *ksym_find(uint32_t); +struct ksym *ksym_nfind(uint32_t); Index: usr.bin/lltextract/names.c =================================================================== RCS file: usr.bin/lltextract/names.c diff -N usr.bin/lltextract/names.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/lltextract/names.c 11 Sep 2024 11:29:31 -0000 @@ -0,0 +1,133 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2022 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lltextract.h" + +#define DBNAME "/var/db/kvm_bsd.db" + +HASHINFO openinfo = { + 4096, /* bsize */ + 128, /* ffactor */ + 1024, /* nelem */ + 2048 * 1024, /* cachesize */ + NULL, /* hash() */ + 0 /* lorder */ +}; + +RBT_HEAD(ksyms, ksym); + +RBT_PROTOTYPE(ksyms, ksym, entry, ksym_cmp); + +static struct ksyms _ksyms = RBT_INITIALIZER(ksyms); + +static void +knames_load(struct ksyms *ksyms) +{ + DB *db; + DBT key, data; + struct nlist n; + struct ksym *k; + + db = dbopen(DBNAME, O_RDONLY, 0, DB_HASH, NULL); + if (db == NULL) + err(1, "%s", DBNAME); + + for (;;) { + int rv = db->seq(db, &key, &data, R_NEXT); + if (rv == -1) + errx(1, "%s seq", DBNAME); + + if (rv != 0) + break; + + if (key.size < 2 || *(const char *)key.data != '_') + continue; + if (data.size != sizeof(n)) + continue; + + memcpy(&n, data.data, sizeof(n)); + //if (n.n_type != N_TEXT) + // continue; + + k = malloc(sizeof(*k) + key.size); + if (k == NULL) + err(1, "%s ksym", __func__); + + k->addr = n.n_value; + k->len = 0; + k->name = (char *)(k + 1); + k->ref = 0; + + memcpy(k->name, (const char *)key.data + 1, key.size - 1); + k->name[key.size - 1] = '\0'; + + if (RBT_INSERT(ksyms, ksyms, k) != NULL) + free(k); + } + + db->close(db); +} + +struct ksym * +ksym_find(uint32_t addr) +{ + struct ksyms *ksyms = &_ksyms; + struct ksym key = { .addr = addr }; + + if (RBT_EMPTY(ksyms, ksyms)) + knames_load(ksyms); + + return (RBT_FIND(ksyms, ksyms, &key)); +} + +struct ksym * +ksym_nfind(uint32_t addr) +{ + struct ksyms *ksyms = &_ksyms; + struct ksym key = { .addr = addr }; + + if (RBT_EMPTY(ksyms, ksyms)) + knames_load(ksyms); + + return (RBT_NFIND(ksyms, ksyms, &key)); +} + +static inline int +ksym_cmp(const struct ksym *a, const struct ksym *b) +{ + if (a->addr > b->addr) + return (-1); + if (a->addr < 
b->addr) + return (1); + return (0); +} + +RBT_GENERATE(ksyms, ksym, entry, ksym_cmp); Index: usr.bin/lltextract/syscallnames.c =================================================================== RCS file: usr.bin/lltextract/syscallnames.c diff -N usr.bin/lltextract/syscallnames.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.bin/lltextract/syscallnames.c 11 Sep 2024 11:29:31 -0000 @@ -0,0 +1,26 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2022 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#define ACCOUNTING +#define KTRACE +#define PTRACE +#define SYSVMSG +#define SYSVSEM +#define SYSVSHM + +#include Index: usr.sbin/lltrace/Makefile =================================================================== RCS file: usr.sbin/lltrace/Makefile diff -N usr.sbin/lltrace/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.sbin/lltrace/Makefile 11 Sep 2024 11:29:33 -0000 @@ -0,0 +1,13 @@ +# $OpenBSD$ + +PROG= lltrace +SRCS= lltrace.c +MAN= + +LDADD= -levent +DPADD= ${LIBEVENT} + +WARNINGS= Yes +DEBUG= -g + +.include Index: usr.sbin/lltrace/lltrace.c =================================================================== RCS file: usr.sbin/lltrace/lltrace.c diff -N usr.sbin/lltrace/lltrace.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.sbin/lltrace/lltrace.c 11 Sep 2024 11:29:33 -0000 @@ -0,0 +1,678 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2022 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "/sys/sys/lltrace.h" + +#ifndef nitems +#define nitems(_a) (sizeof((_a)) / sizeof((_a)[0])) +#endif + +#define DEV_KUTRACE "/dev/lltrace" + +#define NRINGS_DEFAULT 256 /* 256 * 8192 * 8 is 16MB */ + +struct lltrace; + +struct mode { + const char *name; + void *(*setup)(struct lltrace *, int, char **); + int (*run)(struct lltrace *); +}; + +static void *mode_kill_setup(struct lltrace *, int, char **); +static int mode_kill_run(struct lltrace *); + +static const struct mode mode_kill = { + "kill", mode_kill_setup, mode_kill_run +}; + +static void *mode_wait_setup(struct lltrace *, int, char **); +static int mode_wait_run(struct lltrace *); +static void *mode_exec_setup(struct lltrace *, int, char **); +static int mode_exec_run(struct lltrace *); + +static const struct mode modes[] = { + { "wait", mode_wait_setup, mode_wait_run }, + { "exec", mode_exec_setup, mode_exec_run }, +}; + +static const struct mode * + mode_lookup(const char *); +static const char *outfile_default(void); + +__dead static void +usage(void) +{ + extern char *__progname; + + fprintf(stderr, "usage: %s [-v] [-m blen] [-o output] [command]\n", + __progname); + fprintf(stderr, " %s wait seconds\n", __progname); + fprintf(stderr, " %s exec program ...\n", __progname); + + exit(-1); +} + +struct lltrace { + const char *outfile; + int dv; /* /dev/lltrace fd */ + int of; /* outfile fd */ + void *mode; + + struct event dv_ev; /* handle reading from the kernel */ + + unsigned int blen; + size_t nbuffers; + struct lltrace_buffer + *buffers; + size_t buffer_idx; + + uint64_t nsec_first; + uint64_t nsec_last; + uint64_t count_buffers; + uint64_t count_slots; + uint64_t count_drops; +}; + +static void lltrace_start(struct lltrace *); +static void lltrace_stop(struct lltrace *); + +static void lltrace_read(int, short, void *); +static void lltrace_flush(struct lltrace *); + +int +main(int argc, char *argv[]) +{ + const struct mode *mode = &mode_kill; + int ch; + const char *errstr; + int verbose = 0; + int prio; + + struct lltrace lltrace = { + .outfile = NULL, + .blen = 0, + .nbuffers = NRINGS_DEFAULT, + + .nsec_first = ~0, + .nsec_last = 0, + .count_buffers = 0, + .count_slots = 0, + .count_drops = 0, + }; + struct lltrace *llt = &lltrace; + + while ((ch = getopt(argc, argv, "m:n:o:v")) != -1) { + switch (ch) { + case 'm': + llt->blen = strtonum(optarg, + LLTRACE_BLEN_MIN, LLTRACE_BLEN_MAX, &errstr); + if (errstr != NULL) { + errx(1, "kernel buffer len %s: %s", + optarg, errstr); + } + break; + case 'n': + llt->nbuffers = strtonum(optarg, 4, 4096, &errstr); + if (errstr != NULL) { + errx(1, "number of buffers %s: %s", + optarg, errstr); + } + break; + case 'o': + llt->outfile = optarg; + break; + case 'v': + verbose = 1; + break; + default: + usage(); + /* NOTREACHED */ + } + } + + argc -= optind; + argv += optind; + + optreset = optind = opterr = 1; /* kill mode has to be careful */ + + if (argc > 0) { + mode = mode_lookup(argv[0]); + if (mode == NULL) + errx(1, "unknown mode %s", argv[0]); + } + + if (llt->outfile == NULL) + llt->outfile = outfile_default(); + + event_init(); + + llt->mode = (*mode->setup)(llt, argc, argv); + + llt->dv = open(DEV_KUTRACE, O_NONBLOCK|O_RDWR|O_CLOEXEC); + if (llt->dv == -1) + err(1, "%s", DEV_KUTRACE); + + if (llt->blen != 0) { + if (ioctl(llt->dv, LLTIOCSBLEN, &llt->blen) == -1) + err(1, "set kernel buffer len %u", llt->blen); 
+ } + + event_set(&llt->dv_ev, llt->dv, EV_READ|EV_PERSIST, + lltrace_read, llt); + + llt->of = open(llt->outfile, O_WRONLY|O_CREAT|O_CLOEXEC|O_TRUNC, 0640); + if (llt->of == -1) + err(1, "open %s", llt->outfile); + + llt->buffers = calloc(llt->nbuffers, sizeof(*llt->buffers)); + if (llt->buffers == NULL) + err(1, "unable to allocate %zu buffers", llt->nbuffers); + + llt->buffer_idx = 0; + + if ((*mode->run)(llt) == -1) + exit(1); + + prio = getpriority(PRIO_PROCESS, 0); + if (setpriority(PRIO_PROCESS, 0, -20) == -1) + err(1, "setpriority -20"); + + lltrace_start(llt); + + event_dispatch(); + + if (setpriority(PRIO_PROCESS, 0, prio) == -1) + err(1, "setpriority %d", prio); + + if (llt->buffer_idx != 0) + lltrace_flush(llt); + + if (verbose) { + uint64_t diff = llt->nsec_last - llt->nsec_first; + double interval = (double)diff / 1000000000.0; + int mib[] = { CTL_HW, HW_NCPU }; + int ncpus; + size_t ncpuslen = sizeof(ncpus); + + if (sysctl(mib, nitems(mib), &ncpus, &ncpuslen, NULL, 0) == -1) + err(1, "sysctl hw.ncpus"); + + printf("output file: %s\n", llt->outfile); + printf("interval: %.03lfs, ncpus: %d\n", interval, ncpus); + printf("buffers: %llu (%.01lf/cpu/s), " + "slots: %llu (%.01lf/cpu/s)\n", + llt->count_buffers, llt->count_buffers / interval / ncpus, + llt->count_slots, llt->count_slots / interval / ncpus); + printf("drops: %llu (%.01lf/cpu/s)\n", + llt->count_drops, llt->count_drops / interval / ncpus); + } + + return (0); +} + +static void +lltrace_start(struct lltrace *llt) +{ + event_add(&llt->dv_ev, NULL); + + if (ioctl(llt->dv, LLTIOCSTART) == -1) + err(1, "lltrace start"); +} + +static void +lltrace_flush(struct lltrace *llt) +{ + size_t len; + ssize_t rv; + + len = llt->buffer_idx * sizeof(*llt->buffers); + rv = write(llt->of, llt->buffers, len); + if (rv == -1) + err(1, "%s write", llt->outfile); + + if ((size_t)rv < len) { + errx(1, "%s write short (%zd/%zu bytes)", + llt->outfile, rv, len); + } +} + +static int +lltrace_read_one(struct lltrace *llt) +{ + struct lltrace_buffer *buffer; + ssize_t rv; + uint64_t nsec; + + if (llt->buffer_idx >= llt->nbuffers) { + size_t i, j; + + lltrace_flush(llt); + + /* reset */ + llt->buffer_idx = 0; + + /* + * memset(llt->buffers, 0, + * llt->nbuffers * sizeof(*llt->buffers)); + */ + for (i = 0; i < llt->nbuffers; i++) { + buffer = llt->buffers + i; + + for (j = 0; j < nitems(buffer->llt_slots); j++) + buffer->llt_slots[j] = 0; + } + } + + buffer = llt->buffers + llt->buffer_idx; + rv = read(llt->dv, buffer, sizeof(*buffer)); + if (rv == -1) { + switch (errno) { + case EAGAIN: + /* try again later */ + return (EAGAIN); + case ENOENT: + /* we're done */ + event_del(&llt->dv_ev); + return (ENOENT); + default: + err(1, "%s read", DEV_KUTRACE); + /* NOTREACHED */ + } + } + + if (rv == 0) { + /* we're done */ + event_del(&llt->dv_ev); + return (ENOENT); + } + + llt->buffer_idx++; + + nsec = buffer->llt_slots[3]; + if (nsec < llt->nsec_first) + llt->nsec_first = nsec; + + nsec = buffer->llt_slots[5]; + if (nsec > llt->nsec_last) + llt->nsec_last = nsec; + + llt->count_buffers++; + llt->count_slots += rv / sizeof(uint64_t); + //llt->count_drops += buffer->slots[7]; + + return (0); +} + +static void +lltrace_read(int dv, short events, void *arg) +{ + struct lltrace *llt = arg; + + lltrace_read_one(llt); +} + +static void +lltrace_stop(struct lltrace *llt) +{ + int error; + + if (ioctl(llt->dv, LLTIOCSTOP) == -1) { + if (errno != EALREADY) + err(1, "lltrace stop"); + } + + do { + error = lltrace_read_one(llt); + } while (error == 0); + + 
event_del(&llt->dv_ev); +} + +static const char * +outfile_default(void) +{ + extern char *__progname; + char host[MAXHOSTNAMELEN]; + time_t now; + struct tm *tm; + char *outfile; + + if (gethostname(host, sizeof(host)) == -1) + err(1, "gethostname"); + + now = time(NULL); + + tm = localtime(&now); + + if (asprintf(&outfile, "%s_%04d%02d%02d_%02d%02d%02d_%s.lltrace", + __progname, + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec, + host) == -1) + errx(1, "error generating default output filename"); + + return (outfile); +} + +#if 0 +static int +printable(int ch) +{ + if (ch == '\0') + return ('_'); + if (!isprint(ch)) + return ('~'); + + return (ch); +} + +static void +hexdump(const void *d, size_t datalen) +{ + const uint8_t *data = d; + size_t i, j = 0; + + for (i = 0; i < datalen; i += j) { +#if 0 + printf("%04zu: ", i); + for (j = 0; j < 16 && i+j < datalen; j++) + printf("%02x ", data[i + j]); + while (j++ < 16) + printf(" "); +#endif + printf("|"); + + for (j = 0; j < 16 && i+j < datalen; j++) + putchar(printable(data[i + j])); + printf("|\n"); + } +} +#endif + +static const struct mode * +mode_lookup(const char *name) +{ + size_t i; + + for (i = 0; i < nitems(modes); i++) { + const struct mode *mode = &modes[i]; + + if (strcmp(mode->name, name) == 0) + return (mode); + } + + return (NULL); +} + +static void +mode_kill_event(int nil, short events, void *arg) +{ + struct lltrace *llt = arg; + struct event *ev = llt->mode; + + fprintf(stdout, "lltrace stopped\n"); + fflush(stdout); + + event_del(ev); + + lltrace_stop(llt); +} + +static void * +mode_kill_setup(struct lltrace *llt, int argc, char *argv[]) +{ + struct event *ev; + + if (argc != 0) + usage(); + + ev = malloc(sizeof(*ev)); + if (ev == NULL) + err(1, NULL); + + signal_set(ev, SIGINT, mode_kill_event, llt); + return (ev); +} + +static int +mode_kill_run(struct lltrace *llt) +{ + struct event *ev = llt->mode; + + signal_add(ev, NULL); + + fprintf(stdout, "lltrace starting, press Ctrl-C to end...\n"); + fflush(stdout); + + return (0); +} + +/* + * lltrace for specified number of seconds. 
+ */ + +struct mode_wait_state { + struct lltrace *llt; + struct timeval tv; + struct event tmo; + struct event sig; +}; + +static void +mode_wait_tmo(int wat, short events, void *arg) +{ + struct mode_wait_state *state = arg; + struct lltrace *llt = state->llt; + + signal_del(&state->sig); + lltrace_stop(llt); +} + +static void +mode_wait_sig(int wat, short events, void *arg) +{ + struct mode_wait_state *state = arg; + struct lltrace *llt = state->llt; + + evtimer_del(&state->tmo); + signal_del(&state->sig); + lltrace_stop(llt); +} + +static void * +mode_wait_setup(struct lltrace *llt, int argc, char *argv[]) +{ + struct mode_wait_state *state; + const char *errstr; + + if (argc != 2) + usage(); + + state = malloc(sizeof(*state)); + if (state == NULL) + err(1, NULL); + + state->llt = llt; + + state->tv.tv_sec = strtonum(argv[1], 1, 600, &errstr); + if (errstr != NULL) + errx(1, "wait time %s: %s", argv[1], errstr); + + state->tv.tv_usec = 0; + + evtimer_set(&state->tmo, mode_wait_tmo, state); + signal_set(&state->sig, SIGINT, mode_wait_sig, state); + + return (state); +} + +static int +mode_wait_run(struct lltrace *llt) +{ + struct mode_wait_state *state = llt->mode; + + evtimer_add(&state->tmo, &state->tv); + signal_add(&state->sig, NULL); + + return (0); +} + +/* + * trace the execution of a (child) program + */ + +struct mode_exec_state { + struct lltrace *llt; + + char **argv; + + pid_t pid; + struct event sigchld; + struct event sigint; + + uid_t uid; + gid_t gid; + gid_t groups[NGROUPS_MAX]; + int ngroups; +}; + +static void +mode_exec_sig(int wat, short events, void *arg) +{ + struct mode_exec_state *state = arg; + struct lltrace *llt = state->llt; + + /* do we check the pid? */ + + signal_del(&state->sigchld); + signal_del(&state->sigint); + lltrace_stop(llt); +} + +static void * +mode_exec_setup(struct lltrace *llt, int argc, char *argv[]) +{ + struct mode_exec_state *state; + const char *user = NULL; + int ch; + + while ((ch = getopt(argc, argv, "u:")) != -1) { + switch (ch) { + case 'u': + user = optarg; + break; + default: + usage(); + /* NOTREACHED */ + } + } + + argc -= optind; + argv += optind; + + if (argc == 0) { + warnx("no command specified"); + usage(); + } + + state = malloc(sizeof(*state)); + if (state == NULL) + err(1, NULL); + + state->llt = llt; + state->argv = argv; + state->uid = 0; + state->pid = -1; /* not yet */ + signal_set(&state->sigchld, SIGCHLD, mode_exec_sig, state); + signal_set(&state->sigint, SIGINT, mode_exec_sig, state); + + if (user != NULL) { + struct passwd *pw; + + pw = getpwnam(user); + if (pw == NULL) + errx(1, "unable to lookup user %s", user); + + state->uid = pw->pw_uid; + state->gid = pw->pw_gid; + + endpwent(); + + state->ngroups = nitems(state->groups); + if (getgrouplist(user, pw->pw_gid, + state->groups, &state->ngroups) == -1) + errx(1, "unable to get groups for user %s", user); + } + + return (state); +} + +static int +mode_exec_run(struct lltrace *llt) +{ + struct mode_exec_state *state = llt->mode; + + signal_add(&state->sigchld, NULL); + signal_add(&state->sigint, NULL); + + state->pid = fork(); + switch (state->pid) { + case -1: + err(1, "unable to fork"); + /* NOTREACHED */ + case 0: /* child */ + break; + default: /* parent */ + return (0); + } + + if (state->uid != 0) { + if (setresgid(state->gid, state->gid, state->gid) == -1) + err(1, "setresgid %d", state->gid); + + if (setgroups(state->ngroups, state->groups) == -1) + err(1, "setgroups"); + + if (setresuid(state->uid, state->uid, state->uid) == -1) + err(1, "setresuid 
%d", state->uid); + } + + execvp(state->argv[0], state->argv); + + err(1, "exec %s", state->argv[0]); + return (-1); +}
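
Note on the FXT record packing used by lltextract: the shifted-OR expressions in lltx_str() and strtoatoms() above can be hard to read on their own. The sketch below is illustrative only and not part of the diff; it shows how a string record is assembled as a 64-bit header atom (record type, total size in 64-bit words, string index, unpadded length) followed by the name padded out to 8-byte atoms. The FXT_T_STRING value, the helper name fxt_string_record(), and the omission of the htole64() byte-swapping that the diff applies before fxt_write() are assumptions made for this example.

/* minimal sketch, assuming the header layout implied by lltx_str() above */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FXT_T_STRING		2	/* assumed record type value */
#define FXT_H_SIZE_SHIFT	4	/* record size field, in 64-bit words */

static size_t
fxt_string_record(uint64_t *atoms, size_t maxatoms, uint16_t strid,
    const char *str)
{
	size_t len = strlen(str);
	size_t natoms = 1 + (len + 7) / 8;	/* header + zero-padded payload */
	size_t i;

	if (natoms > maxatoms)
		return (0);

	/* zero the payload atoms so the string is padded with NULs */
	for (i = 1; i < natoms; i++)
		atoms[i] = 0;
	memcpy(atoms + 1, str, len);

	atoms[0] = FXT_T_STRING;
	atoms[0] |= (uint64_t)natoms << FXT_H_SIZE_SHIFT;
	atoms[0] |= (uint64_t)strid << 16;	/* string index */
	atoms[0] |= (uint64_t)len << 32;	/* unpadded string length */

	return (natoms);
}

int
main(void)
{
	uint64_t atoms[8];
	size_t n = fxt_string_record(atoms, 8, 1, "syscall");

	printf("%zu words, header 0x%016llx\n", n,
	    (unsigned long long)atoms[0]);
	return (0);
}

In use, a capture taken with something like "lltrace wait 10" ends up in a file named from the program name, timestamp and hostname (see outfile_default() above), and lltextract then turns the per-CPU slot records from that file into FXT records of the kind sketched here; with -v, lltrace also prints per-CPU buffer, slot and drop rates for the run.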