Index: arch/amd64/amd64/conf.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/conf.c,v diff -u -p -r1.81 conf.c --- arch/amd64/amd64/conf.c 12 Jun 2024 12:54:54 -0000 1.81 +++ arch/amd64/amd64/conf.c 6 Sep 2024 11:18:12 -0000 @@ -137,6 +137,7 @@ cdev_decl(cy); #include "bktr.h" #include "ksyms.h" #include "kstat.h" +#include "llt.h" #include "usb.h" #include "uhid.h" #include "fido.h" @@ -215,7 +216,8 @@ struct cdevsw cdevsw[] = cdev_notdef(), /* 28 was LKM */ cdev_notdef(), /* 29 */ cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ - cdev_notdef(), /* 31 */ + cdev_lltrace_init(NLLT,lltrace), + /* 31: lltrace */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ cdev_notdef(), /* 34 */ Index: arch/amd64/amd64/intr.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/intr.c,v diff -u -p -r1.61 intr.c --- arch/amd64/amd64/intr.c 25 Jun 2024 12:02:48 -0000 1.61 +++ arch/amd64/amd64/intr.c 6 Sep 2024 11:18:12 -0000 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -543,6 +544,9 @@ intr_handler(struct intrframe *frame, st return 0; } + LLTRACE_CPU(ci, lltrace_intr_enter, LLTRACE_INTR_T_HW, + ci->ci_isources[ih->ih_slot]->is_idtvec); + #ifdef MULTIPROCESSOR if (ih->ih_flags & IPL_MPSAFE) need_lock = 0; @@ -552,14 +556,22 @@ intr_handler(struct intrframe *frame, st if (need_lock) __mp_lock(&kernel_lock); #endif + floor = ci->ci_handled_intr_level; ci->ci_handled_intr_level = ih->ih_level; + + LLTRACE_CPU(ci, lltrace_fn_enter, ih->ih_fun); rc = (*ih->ih_fun)(ih->ih_arg ? ih->ih_arg : frame); + LLTRACE_CPU(ci, lltrace_fn_leave, ih->ih_fun); + ci->ci_handled_intr_level = floor; + #ifdef MULTIPROCESSOR if (need_lock) __mp_unlock(&kernel_lock); #endif + LLTRACE_CPU(ci, lltrace_intr_leave, LLTRACE_INTR_T_HW, + ci->ci_isources[ih->ih_slot]->is_idtvec); return rc; } Index: arch/amd64/amd64/ipi.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/ipi.c,v diff -u -p -r1.18 ipi.c --- arch/amd64/amd64/ipi.c 10 Nov 2022 08:26:54 -0000 1.18 +++ arch/amd64/amd64/ipi.c 6 Sep 2024 11:18:12 -0000 @@ -35,9 +35,10 @@ #include #include #include +#include +#include #include -#include #include #include #include @@ -45,6 +46,8 @@ void x86_send_ipi(struct cpu_info *ci, int ipimask) { + LLTRACE(lltrace_ipi, ci->ci_cpuid); + x86_atomic_setbits_u32(&ci->ci_ipis, ipimask); /* Don't send IPI to cpu which isn't (yet) running. 
*/ @@ -57,6 +60,8 @@ x86_send_ipi(struct cpu_info *ci, int ip int x86_fast_ipi(struct cpu_info *ci, int ipi) { + LLTRACE(lltrace_ipi, ci->ci_cpuid); + if (!(ci->ci_flags & CPUF_RUNNING)) return (ENOENT); @@ -72,6 +77,8 @@ x86_broadcast_ipi(int ipimask) int count = 0; CPU_INFO_ITERATOR cii; + LLTRACE_CPU(self, lltrace_ipi, ~0); + CPU_INFO_FOREACH(cii, ci) { if (ci == self) continue; @@ -95,17 +102,22 @@ x86_ipi_handler(void) int bit; int floor; + LLTRACE_CPU(ci, lltrace_intr_enter, LLTRACE_INTR_T_IPI, 0); + floor = ci->ci_handled_intr_level; ci->ci_handled_intr_level = ci->ci_ilevel; pending = atomic_swap_uint(&ci->ci_ipis, 0); for (bit = 0; bit < X86_NIPI && pending; bit++) { if (pending & (1 << bit)) { - pending &= ~(1 << bit); + LLTRACE_CPU(ci, lltrace_fn_enter, ipifunc[bit]); (*ipifunc[bit])(ci); + LLTRACE_CPU(ci, lltrace_fn_leave, ipifunc[bit]); evcount_inc(&ipi_count); } } ci->ci_handled_intr_level = floor; + + LLTRACE_CPU(ci, lltrace_intr_leave, LLTRACE_INTR_T_IPI, 0); } Index: arch/amd64/amd64/lapic.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v diff -u -p -r1.72 lapic.c --- arch/amd64/amd64/lapic.c 3 Apr 2024 02:01:21 -0000 1.72 +++ arch/amd64/amd64/lapic.c 6 Sep 2024 11:18:12 -0000 @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -476,12 +477,16 @@ lapic_clockintr(void *arg, struct intrfr struct cpu_info *ci = curcpu(); int floor; + LLTRACE_CPU(ci, lltrace_intr_enter, LLTRACE_INTR_T_CLOCK, 0); + floor = ci->ci_handled_intr_level; ci->ci_handled_intr_level = ci->ci_ilevel; clockintr_dispatch(&frame); ci->ci_handled_intr_level = floor; evcount_inc(&clk_count); + + LLTRACE_CPU(ci, lltrace_intr_leave, LLTRACE_INTR_T_CLOCK, 0); } void Index: arch/amd64/amd64/softintr.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/softintr.c,v diff -u -p -r1.10 softintr.c --- arch/amd64/amd64/softintr.c 11 Sep 2020 09:27:09 -0000 1.10 +++ arch/amd64/amd64/softintr.c 6 Sep 2024 11:18:12 -0000 @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -82,6 +83,8 @@ softintr_dispatch(int which) struct x86_soft_intrhand *sih; int floor; + LLTRACE_CPU(ci, lltrace_intr_enter, LLTRACE_INTR_T_SW, which); + floor = ci->ci_handled_intr_level; ci->ci_handled_intr_level = ci->ci_ilevel; @@ -99,12 +102,15 @@ softintr_dispatch(int which) uvmexp.softs++; mtx_leave(&si->softintr_lock); - + LLTRACE_CPU(ci, lltrace_fn_enter, sih->sih_fn); (*sih->sih_fn)(sih->sih_arg); + LLTRACE_CPU(ci, lltrace_fn_leave, sih->sih_fn); } KERNEL_UNLOCK(); ci->ci_handled_intr_level = floor; + + LLTRACE_CPU(ci, lltrace_intr_leave, LLTRACE_INTR_T_SW, which); } /* Index: arch/arm64/arm64/conf.c =================================================================== RCS file: /cvs/src/sys/arch/arm64/arm64/conf.c,v diff -u -p -r1.24 conf.c --- arch/arm64/arm64/conf.c 12 Jun 2024 02:50:25 -0000 1.24 +++ arch/arm64/arm64/conf.c 6 Sep 2024 11:18:12 -0000 @@ -91,6 +91,7 @@ cdev_decl(lpt); #include "bktr.h" #include "ksyms.h" #include "kstat.h" +#include "llt.h" #include "usb.h" #include "uhid.h" #include "fido.h" @@ -156,7 +157,8 @@ struct cdevsw cdevsw[] = cdev_notdef(), /* 28 was LKM */ cdev_notdef(), /* 29 */ cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ - cdev_notdef(), /* 31 */ + cdev_lltrace_init(NLLT,lltrace), + /* 31: lltrace */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ cdev_notdef(), /* 34 */ Index: arch/arm64/dev/agintc.c 
=================================================================== RCS file: /cvs/src/sys/arch/arm64/dev/agintc.c,v diff -u -p -r1.59 agintc.c --- arch/arm64/dev/agintc.c 3 Jul 2024 22:37:00 -0000 1.59 +++ arch/arm64/dev/agintc.c 6 Sep 2024 11:18:12 -0000 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1121,7 +1122,11 @@ agintc_run_handler(struct intrhand *ih, else arg = frame; + LLTRACE(lltrace_irq, ih->ih_ipl == IPL_IPI ? LLTRACE_IRQ_IPI : 0, + ih->ih_irq); handled = ih->ih_func(arg); + LLTRACE(lltrace_irqret, ih->ih_ipl == IPL_IPI ? LLTRACE_IRQ_IPI : 0, + ih->ih_irq); if (handled) ih->ih_count.ec_count++; @@ -1466,6 +1471,8 @@ agintc_send_ipi(struct cpu_info *ci, int { struct agintc_softc *sc = agintc_sc; uint64_t sendmask; + + LLTRACE(lltrace_ipi, ci->ci_cpuid); if (ci == curcpu() && id == ARM_IPI_NOP) return; Index: arch/sparc64/conf/files.sparc64 =================================================================== RCS file: /cvs/src/sys/arch/sparc64/conf/files.sparc64,v diff -u -p -r1.156 files.sparc64 --- arch/sparc64/conf/files.sparc64 29 Mar 2024 21:11:31 -0000 1.156 +++ arch/sparc64/conf/files.sparc64 6 Sep 2024 11:18:14 -0000 @@ -108,19 +108,19 @@ file arch/sparc64/dev/gfb.c gfb include "dev/pci/files.pci" major {wd = 12} -device psycho: pcibus, iommu +device psycho: pcibus, iommu, vmem attach psycho at mainbus file arch/sparc64/dev/psycho.c psycho -device schizo: pcibus, iommu +device schizo: pcibus, iommu, vmem attach schizo at mainbus file arch/sparc64/dev/schizo.c schizo -device pyro: pcibus, iommu, msi +device pyro: pcibus, iommu, vmem, msi attach pyro at mainbus file arch/sparc64/dev/pyro.c pyro -device vpci: pcibus, viommu, msi +device vpci: pcibus, viommu, vmem, msi attach vpci at mainbus file arch/sparc64/dev/vpci.c vpci Index: arch/sparc64/dev/ebus.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/ebus.c,v diff -u -p -r1.27 ebus.c --- arch/sparc64/dev/ebus.c 14 May 2024 08:26:13 -0000 1.27 +++ arch/sparc64/dev/ebus.c 6 Sep 2024 11:18:14 -0000 @@ -53,7 +53,6 @@ int ebus_debug = 0x0; #include #include #include -#include #include #include #include Index: arch/sparc64/dev/ebus_mainbus.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/ebus_mainbus.c,v diff -u -p -r1.13 ebus_mainbus.c --- arch/sparc64/dev/ebus_mainbus.c 29 Mar 2024 21:29:33 -0000 1.13 +++ arch/sparc64/dev/ebus_mainbus.c 6 Sep 2024 11:18:14 -0000 @@ -33,7 +33,6 @@ extern int ebus_debug; #include #include #include -#include #include #include #include Index: arch/sparc64/dev/iommu.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/iommu.c,v diff -u -p -r1.83 iommu.c --- arch/sparc64/dev/iommu.c 18 Oct 2023 14:24:29 -0000 1.83 +++ arch/sparc64/dev/iommu.c 6 Sep 2024 11:18:14 -0000 @@ -35,7 +35,6 @@ * UltraSPARC IOMMU support; used by both the sbus and pci code. 
*/ #include -#include #include #include #include @@ -249,10 +248,11 @@ iommu_init(char *name, const struct iomm (unsigned long long)is->is_ptsb, (unsigned long long)(is->is_ptsb + size)); #endif - is->is_dvmamap = extent_create(name, - is->is_dvmabase, (u_long)is->is_dvmaend + 1, - M_DEVBUF, NULL, 0, EX_NOCOALESCE); - mtx_init(&is->is_mtx, IPL_HIGH); + is->is_dvmamap = vmem_create(name, + is->is_dvmabase, ((u_long)is->is_dvmaend + 1) - is->is_dvmabase, + PAGE_SIZE, + /* allocfn = */ NULL, /* freefn = */ NULL, /* backend = */ NULL, + 0, VM_NOSLEEP, IPL_VM); /* * Now actually start up the IOMMU. @@ -341,7 +341,7 @@ iommu_enter(struct iommu_state *is, stru if (tte & IOTTE_V) { printf("Overwriting valid tte entry (dva %lx pa %lx " "&tte %p tte %llx)\n", va, pa, tte_ptr, tte); - extent_print(is->is_dvmamap); + // extent_print(is->is_dvmamap); panic("IOMMU overwrite"); } #endif @@ -407,7 +407,7 @@ iommu_remove(struct iommu_state *is, str if ((tte & IOTTE_V) == 0) { printf("Removing invalid tte entry (dva %lx &tte %p " "tte %llx)\n", va, tte_ptr, tte); - extent_print(is->is_dvmamap); + // extent_print(is->is_dvmamap); panic("IOMMU remove overwrite"); } #endif @@ -679,7 +679,8 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_ { int err = 0; bus_size_t sgsize; - u_long dvmaddr, sgstart, sgend; + vmem_addr_t dvmaddr; + u_long sgstart, sgend; bus_size_t align, boundary; struct iommu_state *is; struct iommu_map_state *ims; @@ -765,27 +766,25 @@ iommu_dvmamap_load(bus_dma_tag_t t, bus_ } sgsize = ims->ims_map.ipm_pagecnt * PAGE_SIZE; - mtx_enter(&is->is_mtx); if (flags & BUS_DMA_24BIT) { - sgstart = MAX(is->is_dvmamap->ex_start, 0xff000000); - sgend = MIN(is->is_dvmamap->ex_end, 0xffffffff); + sgstart = 0x000000; + sgend = 0xffffff; } else { - sgstart = is->is_dvmamap->ex_start; - sgend = is->is_dvmamap->ex_end; + sgstart = VMEM_ADDR_MIN; + sgend = VMEM_ADDR_MAX; } /* * If our segment size is larger than the boundary we need to * split the transfer up into little pieces ourselves. */ - err = extent_alloc_subregion_with_descr(is->is_dvmamap, sgstart, sgend, - sgsize, align, 0, (sgsize > boundary) ? 0 : boundary, - EX_NOWAIT | EX_BOUNDZERO, &ims->ims_er, (u_long *)&dvmaddr); - mtx_leave(&is->is_mtx); + err = vmem_xalloc(is->is_dvmamap, sgsize, align, 0, + (sgsize > boundary) ? 0 : boundary, + sgstart, sgend, VM_NOSLEEP | VM_INSTANTFIT, &dvmaddr); #ifdef DEBUG if (err || (dvmaddr == (bus_addr_t)-1)) { - printf("iommu_dvmamap_load(): extent_alloc(%d, %x) failed!\n", + printf("iommu_dvmamap_load(): vmem_xalloc(%d, %x) failed!\n", (int)sgsize, flags); #ifdef DDB if (iommudebug & IDB_BREAK) @@ -889,7 +888,8 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t, int err = 0; bus_size_t sgsize; bus_size_t boundary, align; - u_long dvmaddr, sgstart, sgend; + vmem_addr_t dvmaddr; + u_long sgstart, sgend; struct iommu_state *is; struct iommu_map_state *ims; @@ -986,23 +986,21 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t, } sgsize = ims->ims_map.ipm_pagecnt * PAGE_SIZE; - mtx_enter(&is->is_mtx); if (flags & BUS_DMA_24BIT) { - sgstart = MAX(is->is_dvmamap->ex_start, 0xff000000); - sgend = MIN(is->is_dvmamap->ex_end, 0xffffffff); + sgstart = 0x000000; + sgend = 0xffffff; } else { - sgstart = is->is_dvmamap->ex_start; - sgend = is->is_dvmamap->ex_end; + sgstart = VMEM_ADDR_MIN; + sgend = VMEM_ADDR_MAX; } /* * If our segment size is larger than the boundary we need to * split the transfer up into little pieces ourselves. */ - err = extent_alloc_subregion_with_descr(is->is_dvmamap, sgstart, sgend, - sgsize, align, 0, (sgsize > boundary) ? 
0 : boundary, - EX_NOWAIT | EX_BOUNDZERO, &ims->ims_er, (u_long *)&dvmaddr); - mtx_leave(&is->is_mtx); + err = vmem_xalloc(is->is_dvmamap, sgsize, align, 0, + (sgsize > boundary) ? 0 : boundary, + sgstart, sgend, VM_NOSLEEP | VM_INSTANTFIT, &dvmaddr); if (err != 0) { iommu_iomap_clear_pages(ims); @@ -1011,7 +1009,7 @@ iommu_dvmamap_load_raw(bus_dma_tag_t t, #ifdef DEBUG if (dvmaddr == (bus_addr_t)-1) { - printf("iommu_dvmamap_load_raw(): extent_alloc(%d, %x) " + printf("iommu_dvmamap_load_raw(): vmem_xalloc(%d, %x) " "failed!\n", (int)sgsize, flags); #ifdef DDB if (iommudebug & IDB_BREAK) @@ -1326,7 +1324,6 @@ iommu_dvmamap_unload(bus_dma_tag_t t, bu struct iommu_map_state *ims; bus_addr_t dvmaddr = map->_dm_dvmastart; bus_size_t sgsize = map->_dm_dvmasize; - int error; if (ISSET(map->_dm_flags, BUS_DMA_64BIT)) { bus_dmamap_unload(t->_parent, map); @@ -1365,13 +1362,9 @@ iommu_dvmamap_unload(bus_dma_tag_t t, bu map->dm_mapsize = 0; map->dm_nsegs = 0; - mtx_enter(&is->is_mtx); - error = extent_free(is->is_dvmamap, dvmaddr, sgsize, EX_NOWAIT); + vmem_xfree(is->is_dvmamap, dvmaddr, sgsize); map->_dm_dvmastart = 0; map->_dm_dvmasize = 0; - mtx_leave(&is->is_mtx); - if (error != 0) - printf("warning: %ld of DVMA space lost\n", sgsize); } #ifdef DEBUG Index: arch/sparc64/dev/iommuvar.h =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/iommuvar.h,v diff -u -p -r1.19 iommuvar.h --- arch/sparc64/dev/iommuvar.h 11 Mar 2021 11:17:00 -0000 1.19 +++ arch/sparc64/dev/iommuvar.h 6 Sep 2024 11:18:14 -0000 @@ -37,7 +37,7 @@ #include #endif -#include +#include #include /* @@ -95,7 +95,6 @@ struct iommu_map_state { struct strbuf_ctl *ims_sb; /* Link to parent */ struct iommu_state *ims_iommu; int ims_flags; - struct extent_region ims_er; struct iommu_page_map ims_map; /* map must be last (array at end) */ }; #define IOMMU_MAP_STREAM 1 @@ -125,8 +124,7 @@ struct iommu_state { u_int is_dvmabase; u_int is_dvmaend; int64_t is_cr; /* Control register value */ - struct mutex is_mtx; - struct extent *is_dvmamap; /* DVMA map for this instance */ + vmem_t *is_dvmamap; /* DVMA map for this instance */ const struct iommu_hw *is_hw; struct strbuf_ctl *is_sb[2]; /* Streaming buffers if any */ Index: arch/sparc64/dev/psycho.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/psycho.c,v diff -u -p -r1.84 psycho.c --- arch/sparc64/dev/psycho.c 29 Mar 2024 21:29:33 -0000 1.84 +++ arch/sparc64/dev/psycho.c 6 Sep 2024 11:18:14 -0000 @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include Index: arch/sparc64/dev/sbus.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/sbus.c,v diff -u -p -r1.47 sbus.c --- arch/sparc64/dev/sbus.c 29 Mar 2024 21:29:33 -0000 1.47 +++ arch/sparc64/dev/sbus.c 6 Sep 2024 11:18:14 -0000 @@ -101,7 +101,6 @@ #include #include #include -#include #include #include #include @@ -374,15 +373,9 @@ sbus_mb_attach(struct device *parent, st * NULL DMA pointer will be translated by the first page of the IOTSB. * To avoid bugs we'll alloc and ignore the first entry in the IOTSB. 
*/ - { - u_long dummy; - - if (extent_alloc_subregion(sc->sc_is.is_dvmamap, - sc->sc_is.is_dvmabase, sc->sc_is.is_dvmabase + NBPG, NBPG, - NBPG, 0, 0, EX_NOWAIT | EX_BOUNDZERO, - (u_long *)&dummy) != 0) - panic("sbus iommu: can't toss first dvma page"); - } + if (vmem_xalloc_addr(sc->sc_is.is_dvmamap, 0x0, NBPG, + VM_NOSLEEP) != 0) + panic("sbus iommu: can't toss first dvma page"); sc->sc_dmatag = sbus_alloc_dma_tag(sc, ma->ma_dmatag); Index: arch/sparc64/dev/schizo.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/schizo.c,v diff -u -p -r1.70 schizo.c --- arch/sparc64/dev/schizo.c 29 Mar 2024 21:29:33 -0000 1.70 +++ arch/sparc64/dev/schizo.c 6 Sep 2024 11:18:14 -0000 @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include Index: arch/sparc64/dev/stp_sbus.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/stp_sbus.c,v diff -u -p -r1.13 stp_sbus.c --- arch/sparc64/dev/stp_sbus.c 16 Oct 2022 01:22:39 -0000 1.13 +++ arch/sparc64/dev/stp_sbus.c 6 Sep 2024 11:18:14 -0000 @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include Index: arch/sparc64/dev/viommu.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/dev/viommu.c,v diff -u -p -r1.20 viommu.c --- arch/sparc64/dev/viommu.c 16 May 2021 15:10:19 -0000 1.20 +++ arch/sparc64/dev/viommu.c 6 Sep 2024 11:18:14 -0000 @@ -37,12 +37,13 @@ */ #include -#include +#include #include #include #include #include #include +#include #include @@ -76,8 +77,8 @@ extern int iommudebug; #define DPRINTF(l, s) #endif -void viommu_enter(struct iommu_state *, struct strbuf_ctl *, bus_addr_t, - paddr_t, int); +void viommu_enter(struct iommu_state *, struct iommu_map_state *, + struct strbuf_ctl *, bus_addr_t, paddr_t, int); void viommu_remove(struct iommu_state *, struct strbuf_ctl *, bus_addr_t); int viommu_dvmamap_load_seg(bus_dma_tag_t, struct iommu_state *, bus_dmamap_t, bus_dma_segment_t *, int, int, bus_size_t, bus_size_t); @@ -135,10 +136,11 @@ viommu_init(char *name, struct iommu_sta * Allocate a dvma map. */ printf("dvma map %x-%x", is->is_dvmabase, is->is_dvmaend); - is->is_dvmamap = extent_create(name, - is->is_dvmabase, (u_long)is->is_dvmaend + 1, - M_DEVBUF, NULL, 0, EX_NOCOALESCE); - mtx_init(&is->is_mtx, IPL_HIGH); + is->is_dvmamap = vmem_create(name, + is->is_dvmabase, ((u_long)is->is_dvmaend + 1) - is->is_dvmabase, + PAGE_SIZE, + /* allocfn = */ NULL, /* freefn = */ NULL, /* backend = */ NULL, + 0, VM_NOSLEEP, IPL_VM); printf("\n"); } @@ -147,11 +149,12 @@ viommu_init(char *name, struct iommu_sta * Add an entry to the IOMMU table. 
*/ void -viommu_enter(struct iommu_state *is, struct strbuf_ctl *sb, bus_addr_t va, - paddr_t pa, int flags) +viommu_enter(struct iommu_state *is, struct iommu_map_state *ims, + struct strbuf_ctl *sb, bus_addr_t va, paddr_t pa, int flags) { u_int64_t tsbid = IOTSBSLOT(va, is->is_tsbsize); - paddr_t page_list[1], addr; + struct strbuf_flush *sbf = &ims->ims_flush; + paddr_t *page_list = (paddr_t *)&sbf->sbf_area; u_int64_t attr, nmapped; int err; @@ -162,17 +165,18 @@ viommu_enter(struct iommu_state *is, str panic("viommu_enter: va %#lx not in DVMA space", va); #endif + page_list[0] = trunc_page(pa); + attr = PCI_MAP_ATTR_READ | PCI_MAP_ATTR_WRITE; if (flags & BUS_DMA_READ) attr &= ~PCI_MAP_ATTR_READ; if (flags & BUS_DMA_WRITE) attr &= ~PCI_MAP_ATTR_WRITE; - page_list[0] = trunc_page(pa); - if (!pmap_extract(pmap_kernel(), (vaddr_t)page_list, &addr)) - panic("viommu_enter: pmap_extract failed"); + LLTRACE(lltrace_fn_enter, hv_pci_iommu_map); err = hv_pci_iommu_map(is->is_devhandle, tsbid, 1, attr, - addr, &nmapped); + sbf->sbf_flushpa, &nmapped); + LLTRACE(lltrace_fn_leave, hv_pci_iommu_map); if (err != H_EOK || nmapped != 1) panic("hv_pci_iommu_map: err=%d", err); } @@ -198,7 +202,9 @@ viommu_remove(struct iommu_state *is, st } #endif + LLTRACE(lltrace_fn_enter, hv_pci_iommu_demap); err = hv_pci_iommu_demap(is->is_devhandle, tsbid, 1, &ndemapped); + LLTRACE(lltrace_fn_leave, hv_pci_iommu_demap); if (err != H_EOK || ndemapped != 1) panic("hv_pci_iommu_unmap: err=%d", err); } @@ -256,8 +262,8 @@ viommu_dvmamap_destroy(bus_dma_tag_t t, if (map->dm_nsegs) bus_dmamap_unload(t0, map); - if (map->_dm_cookie) - iommu_iomap_destroy(map->_dm_cookie); + if (map->_dm_cookie) + iommu_iomap_destroy(map->_dm_cookie); map->_dm_cookie = NULL; BUS_DMA_FIND_PARENT(t, _dmamap_destroy); @@ -279,7 +285,7 @@ viommu_dvmamap_load(bus_dma_tag_t t, bus { int err = 0; bus_size_t sgsize; - u_long dvmaddr, sgstart, sgend; + vmem_addr_t dvmaddr, sgstart, sgend; bus_size_t align, boundary; struct iommu_state *is; struct iommu_map_state *ims = map->_dm_cookie; @@ -360,27 +366,25 @@ viommu_dvmamap_load(bus_dma_tag_t t, bus } sgsize = ims->ims_map.ipm_pagecnt * PAGE_SIZE; - mtx_enter(&is->is_mtx); if (flags & BUS_DMA_24BIT) { - sgstart = MAX(is->is_dvmamap->ex_start, 0xff000000); - sgend = MIN(is->is_dvmamap->ex_end, 0xffffffff); + sgstart = 0xff000000; + sgend = 0xffffffff; } else { - sgstart = is->is_dvmamap->ex_start; - sgend = is->is_dvmamap->ex_end; + sgstart = VMEM_ADDR_MIN; + sgend = VMEM_ADDR_MAX; } - /* - * If our segment size is larger than the boundary we need to - * split the transfer up into little pieces ourselves. - */ - err = extent_alloc_subregion_with_descr(is->is_dvmamap, sgstart, sgend, - sgsize, align, 0, (sgsize > boundary) ? 0 : boundary, - EX_NOWAIT | EX_BOUNDZERO, &ims->ims_er, (u_long *)&dvmaddr); - mtx_leave(&is->is_mtx); - +//printf("%s[%u] size %lu align %lu boundary 0x%lx\n", __func__, __LINE__, +// sgsize, align, boundary); +// err = extent_alloc_subregion_with_descr(is->is_dvmamap, sgstart, sgend, +// sgsize, align, 0, (sgsize > boundary) ? 0 : boundary, +// EX_NOWAIT | EX_BOUNDZERO, &ims->ims_er, (u_long *)&dvmaddr); + err = vmem_xalloc(is->is_dvmamap, sgsize, align, 0, + (sgsize > boundary) ? 
0 : boundary, + sgstart, sgend, VM_NOSLEEP | VM_INSTANTFIT, &dvmaddr); #ifdef DEBUG if (err || (dvmaddr == (bus_addr_t)-1)) { - printf("iommu_dvmamap_load(): extent_alloc(%d, %x) failed!\n", + printf("iommu_dvmamap_load(): vmem_xalloc(%d, %x) failed!\n", (int)sgsize, flags); #ifdef DDB if (iommudebug & IDB_BREAK) @@ -392,6 +396,7 @@ viommu_dvmamap_load(bus_dma_tag_t t, bus iommu_iomap_clear_pages(ims); return (err); } +//printf("%s[%u] addr 0x%lx size %lu\n", __func__, __LINE__, dvmaddr, sgsize); /* Set the active DVMA map */ map->_dm_dvmastart = dvmaddr; @@ -466,10 +471,18 @@ viommu_dvmamap_load_raw(bus_dma_tag_t t, int err = 0; bus_size_t sgsize; bus_size_t boundary, align; - u_long dvmaddr, sgstart, sgend; + vmem_addr_t dvmaddr, sgstart, sgend; struct iommu_state *is; struct iommu_map_state *ims = map->_dm_cookie; + if (ISSET(flags, BUS_DMA_BUS4)) { + for (i = 0; i < nsegs; i++) { + printf("%d: %llu @ %llx\n", i, + (unsigned long long)segs[i].ds_len, + (unsigned long long)segs[i].ds_addr); + } + } + #ifdef DIAGNOSTIC if (ims == NULL) panic("viommu_dvmamap_load_raw: null map state"); @@ -550,32 +563,32 @@ viommu_dvmamap_load_raw(bus_dma_tag_t t, } sgsize = ims->ims_map.ipm_pagecnt * PAGE_SIZE; - mtx_enter(&is->is_mtx); if (flags & BUS_DMA_24BIT) { - sgstart = MAX(is->is_dvmamap->ex_start, 0xff000000); - sgend = MIN(is->is_dvmamap->ex_end, 0xffffffff); + sgstart = 0xff000000; + sgend = 0xffffffff; } else { - sgstart = is->is_dvmamap->ex_start; - sgend = is->is_dvmamap->ex_end; + sgstart = VMEM_ADDR_MIN; + sgend = VMEM_ADDR_MAX; } /* * If our segment size is larger than the boundary we need to * split the transfer up into little pieces ourselves. */ - err = extent_alloc_subregion_with_descr(is->is_dvmamap, sgstart, sgend, - sgsize, align, 0, (sgsize > boundary) ? 0 : boundary, - EX_NOWAIT | EX_BOUNDZERO, &ims->ims_er, (u_long *)&dvmaddr); - mtx_leave(&is->is_mtx); - +//printf("%s[%u] size %lu align %lu boundary 0x%lx\n", __func__, __LINE__, +// sgsize, align, boundary); + err = vmem_xalloc(is->is_dvmamap, sgsize, align, 0, + (sgsize > boundary) ? 
0 : boundary, + sgstart, sgend, VM_NOSLEEP | VM_INSTANTFIT, &dvmaddr); if (err != 0) { iommu_iomap_clear_pages(ims); return (err); } +//printf("%s[%u] addr 0x%lx size %lu\n", __func__, __LINE__, dvmaddr, sgsize); #ifdef DEBUG if (dvmaddr == (bus_addr_t)-1) { - printf("iommu_dvmamap_load_raw(): extent_alloc(%d, %x) " + printf("iommu_dvmamap_load_raw(): vmem_xalloc(%d, %x) " "failed!\n", (int)sgsize, flags); #ifdef DDB if (iommudebug & IDB_BREAK) @@ -836,7 +849,6 @@ viommu_dvmamap_unload(bus_dma_tag_t t, b struct iommu_map_state *ims = map->_dm_cookie; bus_addr_t dvmaddr = map->_dm_dvmastart; bus_size_t sgsize = map->_dm_dvmasize; - int error; #ifdef DEBUG if (ims == NULL) @@ -859,13 +871,10 @@ viommu_dvmamap_unload(bus_dma_tag_t t, b map->dm_mapsize = 0; map->dm_nsegs = 0; - mtx_enter(&is->is_mtx); - error = extent_free(is->is_dvmamap, dvmaddr, sgsize, EX_NOWAIT); +//printf("%s[%u] addr 0x%lx size %lu\n", __func__, __LINE__, dvmaddr, sgsize); + vmem_xfree(is->is_dvmamap, dvmaddr, sgsize); map->_dm_dvmastart = 0; map->_dm_dvmasize = 0; - mtx_leave(&is->is_mtx); - if (error != 0) - printf("warning: %ld of DVMA space lost\n", sgsize); } void @@ -883,16 +892,15 @@ viommu_dvmamap_sync(bus_dma_tag_t t, bus if (len == 0) return; - if (ops & BUS_DMASYNC_PREWRITE) + if (ISSET(ops, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_PREWRITE)) __membar("#MemIssue"); - #if 0 if (ops & (BUS_DMASYNC_POSTREAD | BUS_DMASYNC_PREWRITE)) _viommu_dvmamap_sync(t, t0, map, offset, len, ops); -#endif if (ops & BUS_DMASYNC_POSTREAD) __membar("#MemIssue"); +#endif } int @@ -928,6 +936,7 @@ struct iommu_map_state * viommu_iomap_create(int n) { struct iommu_map_state *ims; + struct strbuf_flush *sbf; /* Safety for heavily fragmented data, such as mbufs */ n += 4; @@ -943,6 +952,10 @@ viommu_iomap_create(int n) ims->ims_map.ipm_maxpage = n; SPLAY_INIT(&ims->ims_map.ipm_tree); + /* (Ab)use the flush area for use with the pci_iommu_map hypercall */ + sbf = &ims->ims_flush; + pmap_extract(pmap_kernel(), (vaddr_t)sbf->sbf_area, &sbf->sbf_flushpa); + return (ims); } @@ -960,7 +973,7 @@ viommu_iomap_load_map(struct iommu_state for (i = 0, e = ipm->ipm_map; i < ipm->ipm_pagecnt; ++i, ++e) { e->ipe_va = vmaddr; - viommu_enter(is, NULL, e->ipe_va, e->ipe_pa, flags); + viommu_enter(is, ims, NULL, e->ipe_va, e->ipe_pa, flags); vmaddr += PAGE_SIZE; } } Index: arch/sparc64/include/bus.h =================================================================== RCS file: /cvs/src/sys/arch/sparc64/include/bus.h,v diff -u -p -r1.37 bus.h --- arch/sparc64/include/bus.h 24 Dec 2023 11:12:34 -0000 1.37 +++ arch/sparc64/include/bus.h 6 Sep 2024 11:18:14 -0000 @@ -70,6 +70,8 @@ #ifdef _KERNEL +#include + /* * Debug hooks */ @@ -477,10 +479,12 @@ struct sparc_bus_dma_tag { #define _BD_PRECALL(t,f) \ while (t->f == NULL) { \ t = t->_parent; \ - } + } \ + LLTRACE(lltrace_fn_enter, t->f) #define _BD_CALL(t,f) \ (*(t)->f) -#define _BD_POSTCALL +#define _BD_POSTCALL(t,f) \ + LLTRACE(lltrace_fn_leave, t->f) static inline int bus_dmamap_create(bus_dma_tag_t t, bus_size_t s, int n, bus_size_t m, @@ -490,7 +494,7 @@ bus_dmamap_create(bus_dma_tag_t t, bus_s const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamap_create); r = _BD_CALL(t, _dmamap_create)(t, t0, s, n, m, b, f, p); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_create); return (r); } static inline void @@ -499,7 +503,7 @@ bus_dmamap_destroy(bus_dma_tag_t t, bus_ const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamap_destroy); _BD_CALL(t, _dmamap_destroy)(t, t0, p); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_destroy); } 
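For readers following the sparc64 bus.h hunks, every inline bus_dma wrapper changes the same way, so one hand-expanded example may save some squinting. The sketch below is not part of the diff; it only spells out what _BD_PRECALL/_BD_CALL/_BD_POSTCALL now produce for bus_dmamap_load(), on the assumption that LLTRACE() expands to nothing in kernels without the llt pseudo-device (that assumption comes from the usual needs-flag pattern, not from this excerpt).

static inline int
bus_dmamap_load(bus_dma_tag_t t, bus_dmamap_t m, void *b, bus_size_t s,
    struct proc *p, int f)
{
	const bus_dma_tag_t t0 = t;
	int r;

	/* _BD_PRECALL(t, _dmamap_load): walk up to the tag that implements it */
	while (t->_dmamap_load == NULL)
		t = t->_parent;
	/* ...and record which implementation is about to run */
	LLTRACE(lltrace_fn_enter, t->_dmamap_load);

	/* _BD_CALL(t, _dmamap_load)(t, t0, m, b, s, p, f) */
	r = (*t->_dmamap_load)(t, t0, m, b, s, p, f);

	/* _BD_POSTCALL(t, _dmamap_load): pair the entry with a leave record */
	LLTRACE(lltrace_fn_leave, t->_dmamap_load);

	return (r);
}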
static inline int bus_dmamap_load(bus_dma_tag_t t, bus_dmamap_t m, void *b, bus_size_t s, @@ -509,7 +513,7 @@ bus_dmamap_load(bus_dma_tag_t t, bus_dma int r; _BD_PRECALL(t, _dmamap_load); r = _BD_CALL(t, _dmamap_load)(t, t0, m, b, s, p, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_load); return (r); } static inline int @@ -520,7 +524,7 @@ bus_dmamap_load_mbuf(bus_dma_tag_t t, bu int r; _BD_PRECALL(t, _dmamap_load_mbuf); r = _BD_CALL(t, _dmamap_load_mbuf)(t, t0, m, b, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_load_mbuf); return (r); } static inline int @@ -530,7 +534,7 @@ bus_dmamap_load_uio(bus_dma_tag_t t, bus int r; _BD_PRECALL(t, _dmamap_load_uio); r = _BD_CALL(t, _dmamap_load_uio)(t, t0, m, u, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_load_uio); return (r); } static inline int @@ -541,7 +545,7 @@ bus_dmamap_load_raw(bus_dma_tag_t t, bus int r; _BD_PRECALL(t, _dmamap_load_raw); r = _BD_CALL(t, _dmamap_load_raw)(t, t0, m, sg, n, s, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_load_raw); return (r); } static inline void @@ -550,7 +554,7 @@ bus_dmamap_unload(bus_dma_tag_t t, bus_d const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamap_unload); _BD_CALL(t, _dmamap_unload)(t, t0, p); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_unload); } static inline void bus_dmamap_sync(bus_dma_tag_t t, bus_dmamap_t p, bus_addr_t o, bus_size_t l, @@ -559,7 +563,7 @@ bus_dmamap_sync(bus_dma_tag_t t, bus_dma const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamap_sync); _BD_CALL(t, _dmamap_sync)(t, t0, p, o, l, ops); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamap_sync); } static inline int bus_dmamem_alloc(bus_dma_tag_t t, bus_size_t s, bus_size_t a, bus_size_t b, @@ -569,7 +573,7 @@ bus_dmamem_alloc(bus_dma_tag_t t, bus_si int ret; _BD_PRECALL(t, _dmamem_alloc); ret = _BD_CALL(t, _dmamem_alloc)(t, t0, s, a, b, sg, n, r, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamem_alloc); return (ret); } static inline void @@ -578,7 +582,7 @@ bus_dmamem_free(bus_dma_tag_t t, bus_dma const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamem_free); _BD_CALL(t, _dmamem_free)(t, t0, sg, n); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamem_free); } static inline int bus_dmamem_map(bus_dma_tag_t t, bus_dma_segment_t *sg, int n, size_t s, @@ -588,7 +592,7 @@ bus_dmamem_map(bus_dma_tag_t t, bus_dma_ int r; _BD_PRECALL(t, _dmamem_map); r = _BD_CALL(t, _dmamem_map)(t, t0, sg, n, s, k, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamem_map); return (r); } static inline void @@ -597,7 +601,7 @@ bus_dmamem_unmap(bus_dma_tag_t t, caddr_ const bus_dma_tag_t t0 = t; _BD_PRECALL(t, _dmamem_unmap); _BD_CALL(t, _dmamem_unmap)(t, t0, k, s); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamem_unmap); } static inline paddr_t bus_dmamem_mmap(bus_dma_tag_t t, bus_dma_segment_t *sg, int n, off_t o, int p, @@ -607,7 +611,7 @@ bus_dmamem_mmap(bus_dma_tag_t t, bus_dma int r; _BD_PRECALL(t, _dmamem_mmap); r = _BD_CALL(t, _dmamem_mmap)(t, t0, sg, n, o, p, f); - _BD_POSTCALL; + _BD_POSTCALL(t, _dmamem_mmap); return (r); } Index: arch/sparc64/sparc64/conf.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/sparc64/conf.c,v diff -u -p -r1.90 conf.c --- arch/sparc64/sparc64/conf.c 11 Jun 2024 09:21:32 -0000 1.90 +++ arch/sparc64/sparc64/conf.c 6 Sep 2024 11:18:14 -0000 @@ -110,6 +110,7 @@ cdev_decl(pci); #include "ksyms.h" #include "kstat.h" +#include "llt.h" #include "hotplug.h" #include "vscsi.h" @@ -180,7 +181,8 @@ struct cdevsw cdevsw[] = cdev_notdef(), /* 28 */ cdev_notdef(), /* 29 */ cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ - 
cdev_notdef(), /* 31 */ + cdev_lltrace_init(NLLT,lltrace), + /* 31: lltrace */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ cdev_notdef(), /* 34 */ Index: arch/sparc64/sparc64/intr.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/sparc64/intr.c,v diff -u -p -r1.67 intr.c --- arch/sparc64/sparc64/intr.c 29 Mar 2024 21:29:34 -0000 1.67 +++ arch/sparc64/sparc64/intr.c 6 Sep 2024 11:18:14 -0000 @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -78,6 +79,8 @@ intr_handler(struct trapframe *tf, struc #ifdef MULTIPROCESSOR int need_lock; + LLTRACE(lltrace_intr_enter, LLTRACE_INTR_T_HW, ih->ih_number); + if (ih->ih_mpsafe) need_lock = 0; else @@ -86,11 +89,16 @@ intr_handler(struct trapframe *tf, struc if (need_lock) KERNEL_LOCK(); #endif + LLTRACE(lltrace_fn_enter, ih->ih_fun); rc = (*ih->ih_fun)(ih->ih_arg ? ih->ih_arg : tf); + LLTRACE(lltrace_fn_leave, ih->ih_fun); #ifdef MULTIPROCESSOR if (need_lock) KERNEL_UNLOCK(); #endif + + LLTRACE(lltrace_intr_leave, LLTRACE_INTR_T_HW, ih->ih_number); + return rc; } @@ -109,7 +117,9 @@ intr_list_handler(void *arg) sparc_wrpr(pil, ih->ih_pil, 0); ci->ci_handled_intr_level = ih->ih_pil; + LLTRACE_CPU(ci, lltrace_fn_enter, ih->ih_fun); rv = ih->ih_fun(ih->ih_arg); + LLTRACE_CPU(ci, lltrace_fn_leave, ih->ih_fun); if (rv) { ih->ih_count.ec_count++; claimed = 1; Index: arch/sparc64/sparc64/ipifuncs.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/sparc64/ipifuncs.c,v diff -u -p -r1.22 ipifuncs.c --- arch/sparc64/sparc64/ipifuncs.c 14 Apr 2024 19:08:09 -0000 1.22 +++ arch/sparc64/sparc64/ipifuncs.c 6 Sep 2024 11:18:14 -0000 @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -74,6 +75,8 @@ sun4u_send_ipi(int itid, void (*func)(vo KASSERT((u_int64_t)func > MAXINTNUM); + LLTRACE(lltrace_ipi, itid); + /* * UltraSPARC-IIIi CPUs select the BUSY/NACK pair based on the * lower two bits of the ITID. 
@@ -127,6 +130,8 @@ sun4v_send_ipi(int itid, void (*func)(vo u_int64_t s; int err, i; + LLTRACE(lltrace_ipi, itid); + s = intr_disable(); stha(ci->ci_cpuset, ASI_PHYS_CACHED, itid); @@ -154,6 +159,8 @@ sun4v_send_ipi(int itid, void (*func)(vo void sparc64_broadcast_ipi(void (*func)(void), u_int64_t arg0, u_int64_t arg1) { + LLTRACE(lltrace_ipi, ~0x0); + if (CPU_ISSUN4V) sun4v_broadcast_ipi(func, arg0, arg1); else @@ -180,6 +187,8 @@ sun4v_broadcast_ipi(void (*func)(void), struct cpu_info *ci = curcpu(); paddr_t cpuset = ci->ci_cpuset; int err, i, ncpus = 0; + + LLTRACE(lltrace_ipi, ~0x0); for (ci = cpus; ci != NULL; ci = ci->ci_next) { if (ci->ci_cpuid == cpu_number()) Index: arch/sparc64/sparc64/machdep.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/sparc64/machdep.c,v diff -u -p -r1.218 machdep.c --- arch/sparc64/sparc64/machdep.c 22 May 2024 05:51:49 -0000 1.218 +++ arch/sparc64/sparc64/machdep.c 6 Sep 2024 11:18:14 -0000 @@ -991,12 +991,20 @@ _bus_dmamap_load_mbuf(bus_dma_tag_t t, b buflen -= incr; vaddr += incr; - if (i > 0 && pa == (segs[i - 1].ds_addr + - segs[i - 1].ds_len) && ((segs[i - 1].ds_len + incr) - < map->_dm_maxsegsz)) { - /* Hey, waddyaknow, they're contiguous */ - segs[i - 1].ds_len += incr; - continue; + if (i > 0) { + bus_dma_segment_t *pseg = &segs[i - 1]; + if (pa == pseg->ds_addr + pseg->ds_len) { + /* waddyaknow, they're contiguous */ + long nlen = pseg->ds_len + incr; + if (nlen <= map->_dm_maxsegsz) { + pseg->ds_len = nlen; + continue; + } + pseg->ds_len = map->_dm_maxsegsz; + + pa = pseg->ds_addr + map->_dm_maxsegsz; + incr = nlen - map->_dm_maxsegsz; + } } segs[i].ds_addr = pa; segs[i].ds_len = incr; Index: conf/GENERIC =================================================================== RCS file: /cvs/src/sys/conf/GENERIC,v diff -u -p -r1.297 GENERIC --- conf/GENERIC 31 Aug 2024 04:17:14 -0000 1.297 +++ conf/GENERIC 6 Sep 2024 11:18:14 -0000 @@ -81,6 +81,7 @@ pseudo-device endrun 1 # EndRun line dis pseudo-device vnd 4 # vnode disk devices pseudo-device ksyms 1 # kernel symbols device pseudo-device kstat # kernel statistics device +pseudo-device llt # low-level tracing device # clonable devices pseudo-device bpfilter # packet filter Index: conf/files =================================================================== RCS file: /cvs/src/sys/conf/files,v diff -u -p -r1.736 files --- conf/files 31 Aug 2024 04:17:14 -0000 1.736 +++ conf/files 6 Sep 2024 11:18:14 -0000 @@ -24,6 +24,8 @@ define video {} define intrmap {} define fdt {[early = 0]} +define vmem + # filesystem firmware loading attribute define firmload @@ -602,6 +604,9 @@ file dev/ksyms.c ksyms needs-flag pseudo-device kstat file dev/kstat.c kstat needs-flag +pseudo-device llt +file dev/lltrace.c llt needs-flag + pseudo-device fuse file miscfs/fuse/fuse_device.c fuse needs-flag file miscfs/fuse/fuse_file.c fuse @@ -744,6 +749,7 @@ file kern/subr_blist.c file kern/subr_disk.c file kern/subr_evcount.c file kern/subr_extent.c +file kern/subr_vmem.c vmem file kern/subr_suspend.c suspend file kern/subr_hibernate.c hibernate file kern/subr_kubsan.c kubsan Index: dev/lltrace.c =================================================================== RCS file: dev/lltrace.c diff -N dev/lltrace.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ dev/lltrace.c 6 Sep 2024 11:18:14 -0000 @@ -0,0 +1,1104 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2022 The University of Queensland + * + * Permission to use, copy, modify, and distribute this software for any + * purpose 
with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * This code was written by David Gwynne as part
+ * of the Information Technology Infrastructure Group (ITIG) in the
+ * Faculty of Engineering, Architecture and Information Technology
+ * (EAIT).
+ *
+ * It was heavily inspired by the KUTrace (kernel/userland tracing)
+ * framework by Richard L. Sites.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#if defined(__amd64__) || defined(__i386__)
+
+static inline unsigned int
+lltrace_cas(unsigned int *p, unsigned int e, unsigned int n)
+{
+	__asm volatile("cmpxchgl %2, %1"
+	    : "=a" (e), "=m" (*p)
+	    : "r" (n), "a" (e), "m" (*p));
+
+	return (e);
+}
+
+static inline uint64_t
+lltrace_ts(void)
+{
+	unsigned int hi, lo;
+
+	__asm volatile("lfence; rdtsc" : "=d" (hi), "=a" (lo));
+
+	return (lo & (LLTRACE_TS_MASK << LLTRACE_TS_SHIFT));
+}
+
+static inline uint64_t
+lltrace_ts_long(void)
+{
+	return (rdtsc_lfence() & ~LLTRACE_MASK(LLTRACE_TS_SHIFT));
+}
+
+#elif defined(__aarch64__)
+
+#define lltrace_cas(_p, _e, _n) atomic_cas_uint((_p), (_e), (_n))
+
+static inline uint64_t
+lltrace_ts_long(void)
+{
+	uint64_t ts;
+
+	__asm volatile("mrs %x0, cntvct_el0" : "=r" (ts));
+
+	return (ts << LLTRACE_TS_SHIFT);
+}
+
+static inline uint64_t
+lltrace_ts(void)
+{
+	uint64_t ts = lltrace_ts_long();
+
+	return (ts & (LLTRACE_TS_MASK << LLTRACE_TS_SHIFT));
+}
+
+#elif defined(__sparc64__)
+
+#define lltrace_cas(_p, _e, _n) atomic_cas_uint((_p), (_e), (_n))
+
+static inline uint64_t
+lltrace_ts_long(void)
+{
+	uint64_t ts;
+
+	ts = sys_tick();
+
+	return (ts << LLTRACE_TS_SHIFT);
+}
+
+static inline uint64_t
+lltrace_ts(void)
+{
+	uint64_t ts = lltrace_ts_long();
+
+	return (ts & (LLTRACE_TS_MASK << LLTRACE_TS_SHIFT));
+}
+
+#else /* not x86, arm64, or sparc64 */
+
+#error not supported (yet)
+
+static unsigned int
+lltrace_cas(unsigned int *p, unsigned int e, unsigned int n)
+{
+	unsigned int o;
+	int s;
+
+	s = intr_disable();
+	o = *p;
+	if (o == e)
+		*p = n;
+	intr_restore(s);
+
+	return (o);
+}
+
+static inline uint64_t
+lltrace_ts(void)
+{
+	return (countertime());
+}
+
+static inline uint64_t
+lltrace_ts_long(void)
+{
+	return (countertime());
+}
+
+#endif
+
+#define LLTRACE_MB2NBUF(_mb) \
+	(((_mb) * (1U << 20)) / sizeof(struct lltrace_buffer))
+#define LLTRACE_NBUF2MB(_nbuf) \
+	(((_nbuf) * sizeof(struct lltrace_buffer)) / (1U << 20))
+
+#define LLTRACE_BLEN_DEFAULT 16
+
+struct lltrace_cpu {
+	SIMPLEQ_ENTRY(lltrace_cpu)
+				llt_entry;
+	struct lltrace_buffer	llt_buffer;
+	unsigned int		llt_slot;
+	unsigned int		llt_pid;
+	unsigned int		llt_tid;
+	uint64_t		llt_wakeid;
+};
+
+SIMPLEQ_HEAD(lltrace_cpu_list, lltrace_cpu);
+
+struct lltrace_softc {
+	unsigned int		sc_running;
+	unsigned int		sc_mode;
+	struct rwlock		sc_lock;
+	unsigned int		sc_nbuffers;
+
+	unsigned int		sc_free;
+	unsigned int		sc_used;
+	struct lltrace_cpu	**sc_ring;
+
struct lltrace_cpu *sc_buffers; + + unsigned int sc_read; + unsigned int sc_reading; + struct selinfo sc_sel; + + uint64_t sc_boottime; + uint64_t sc_monotime; +}; + +static int lltrace_start(struct lltrace_softc *, struct proc *); +static int lltrace_stop(struct lltrace_softc *, struct proc *); +static int lltrace_flush(struct lltrace_softc *); + +static struct lltrace_softc *lltrace_sc; + +int +lltattach(int num) +{ + return (0); +} + +int +lltraceopen(dev_t dev, int flag, int mode, struct proc *p) +{ + struct lltrace_softc *sc; + int error; + + if (minor(dev) != 0) + return (ENXIO); + + error = suser(p); + if (error != 0) + return (error); + + if (lltrace_sc != NULL) + return (EBUSY); + + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO); + if (sc == NULL) + return (ENOMEM); + + sc->sc_running = 0; + sc->sc_nbuffers = LLTRACE_MB2NBUF(LLTRACE_BLEN_DEFAULT); + + rw_init(&sc->sc_lock, "lltlk"); + + sc->sc_read = 0; + sc->sc_reading = 0; + klist_init_rwlock(&sc->sc_sel.si_note, &sc->sc_lock); + + /* commit */ + if (atomic_cas_ptr(&lltrace_sc, NULL, sc) != NULL) { + free(sc, M_DEVBUF, sizeof(*sc)); + return (EBUSY); + } + + return (0); +} + +int +lltraceclose(dev_t dev, int flag, int mode, struct proc *p) +{ + struct lltrace_softc *sc = lltrace_sc; + + rw_enter_write(&sc->sc_lock); + lltrace_stop(sc, p); + lltrace_flush(sc); + rw_exit_write(&sc->sc_lock); + + lltrace_sc = NULL; + membar_sync(); + + free(sc, M_DEVBUF, sizeof(*sc)); + + return (0); +} + +static int +lltrace_fionread(struct lltrace_softc *sc) +{ + int canread; + + rw_enter_read(&sc->sc_lock); + canread = !sc->sc_running && (sc->sc_buffers != NULL) && + (sc->sc_read < sc->sc_nbuffers); + rw_exit_read(&sc->sc_lock); + + return (canread ? sizeof(struct lltrace_buffer) : 0); +} + +static void +lltrace_cpu_init(struct lltrace_cpu *llt, struct lltrace_softc *sc, + struct cpu_info *ci, unsigned int pid, unsigned int tid, uint64_t wakeid) +{ + struct lltrace_header *llh; + + llh = (struct lltrace_header *)&llt->llt_buffer; + llh->h_cpu = cpu_number(); + llh->h_idletid = ci->ci_schedstate.spc_idleproc->p_tid; + llh->h_boottime = sc->sc_boottime; + llh->h_start_cy = lltrace_ts_long(); + llh->h_start_ns = nsecuptime() - sc->sc_monotime; + llh->h_end_cy = 0; + llh->h_end_ns = 0; + llh->h_pid = pid; + llh->h_tid = tid; + llh->h_zero = 0; + + llt->llt_pid = pid; + llt->llt_tid = tid; + llt->llt_slot = 8; + llt->llt_wakeid = wakeid; +} + +static void +lltrace_cpu_fini(struct lltrace_cpu *llt, struct lltrace_softc *sc) +{ + struct lltrace_header *llh; + + llh = (struct lltrace_header *)&llt->llt_buffer; + llh->h_end_cy = lltrace_ts_long(); + llh->h_end_ns = nsecuptime() - sc->sc_monotime; +} + +static int +lltrace_set_mode(struct lltrace_softc *sc, unsigned int mode) +{ + int error; + + if (mode >= LLTRACE_MODE_COUNT) + return (EINVAL); + + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + return (error); + + if (sc->sc_running) + error = EBUSY; + else + sc->sc_mode = mode; + + rw_exit(&sc->sc_lock); + return (error); +} + +static int +lltrace_set_blen(struct lltrace_softc *sc, unsigned int blen) +{ + int error; + unsigned int nbuffers; + + if (blen < LLTRACE_BLEN_MIN || blen > LLTRACE_BLEN_MAX) + return (EINVAL); + + /* convert megabytes to the number of buffers */ + nbuffers = LLTRACE_MB2NBUF(blen); + if (nbuffers <= ncpus) + return (EINVAL); + + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + return (error); + + if (sc->sc_buffers != NULL) + error = EBUSY; + else + sc->sc_nbuffers = 
nbuffers; + + rw_exit(&sc->sc_lock); + return (error); +} + +static int +lltrace_start(struct lltrace_softc *sc, struct proc *p) +{ + struct process *ps = p->p_p; + struct bintime boottime; + unsigned int i; + size_t sz; + struct lltrace_cpu_list l = SIMPLEQ_HEAD_INITIALIZER(l); + struct lltrace_cpu *llt; + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + unsigned int pid, tid; + + if (sc->sc_running) + return EINVAL; + + if (sc->sc_nbuffers <= (ncpus * 2 + 1)) + return (EINVAL); + + lltrace_flush(sc); + + sc->sc_monotime = nsecuptime(); + + binboottime(&boottime); + sc->sc_boottime = BINTIME_TO_NSEC(&boottime) + sc->sc_monotime; + + sz = roundup(sc->sc_nbuffers * sizeof(*sc->sc_buffers), PAGE_SIZE); + sc->sc_buffers = km_alloc(sz, &kv_any, &kp_dirty, &kd_waitok); + if (sc->sc_buffers == NULL) + return (ENOMEM); + sc->sc_ring = mallocarray(sc->sc_nbuffers, sizeof(*sc->sc_ring), + M_DEVBUF, M_WAITOK); + for (i = 0; i < sc->sc_nbuffers; i++) { + llt = &sc->sc_buffers[i]; + llt->llt_slot = 0; + sc->sc_ring[i] = llt; + } + + sc->sc_free = 0; /* next slot to pull a free buffer from */ + sc->sc_used = 0; /* next slot to put a used buffer in */ + + CPU_INFO_FOREACH(cii, ci) { + i = sc->sc_free++; /* can't wrap yet */ + + llt = sc->sc_ring[i]; + sc->sc_ring[i] = NULL; + + SIMPLEQ_INSERT_HEAD(&l, llt, llt_entry); + } + + tid = p->p_tid; + pid = ps->ps_pid; + if (ISSET(ps->ps_flags, PS_SYSTEM)) + pid |= (1U << 31); + + CPU_INFO_FOREACH(cii, ci) { + sched_peg_curproc(ci); + + llt = SIMPLEQ_FIRST(&l); + SIMPLEQ_REMOVE_HEAD(&l, llt_entry); + + lltrace_cpu_init(llt, sc, ci, pid, tid, 0x1); + lltrace_pidname(llt, p); + + membar_producer(); + ci->ci_schedstate.spc_lltrace = llt; + } + atomic_clearbits_int(&p->p_flag, P_CPUPEG); + + sc->sc_running = 1; + + return (0); +} + +static int +lltrace_stop(struct lltrace_softc *sc, struct proc *p) +{ + struct lltrace_cpu *llt; + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + unsigned long s; + + if (!sc->sc_running) + return (EALREADY); + + sc->sc_running = 0; + + /* visit each cpu to take llt away safely */ + CPU_INFO_FOREACH(cii, ci) { + sched_peg_curproc(ci); + + s = intr_disable(); + llt = ci->ci_schedstate.spc_lltrace; + ci->ci_schedstate.spc_lltrace = NULL; + intr_restore(s); + + lltrace_cpu_fini(llt, sc); + } + atomic_clearbits_int(&p->p_flag, P_CPUPEG); + + return (0); +} + +static int +lltrace_flush(struct lltrace_softc *sc) +{ + size_t sz; + + rw_assert_wrlock(&sc->sc_lock); + if (sc->sc_running) + return (EBUSY); + + if (sc->sc_buffers == NULL) + return (0); + + sz = roundup(sc->sc_nbuffers * sizeof(*sc->sc_buffers), PAGE_SIZE); + km_free(sc->sc_buffers, sz, &kv_any, &kp_dirty); + free(sc->sc_ring, M_DEVBUF, sc->sc_nbuffers * sizeof(*sc->sc_ring)); + + sc->sc_buffers = NULL; + sc->sc_ring = NULL; + sc->sc_read = 0; + + return (0); +} + +int +lltraceioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + struct lltrace_softc *sc = lltrace_sc; + int error = 0; + + KERNEL_UNLOCK(); + + switch (cmd) { + case FIONREAD: + *(int *)data = lltrace_fionread(sc); + break; + case FIONBIO: + /* vfs tracks this for us if we let it */ + break; + + case LLTIOCSTART: + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + break; + error = lltrace_start(sc, p); + rw_exit(&sc->sc_lock); + break; + case LLTIOCSTOP: + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + break; + error = lltrace_stop(sc, p); + rw_exit(&sc->sc_lock); + break; + case LLTIOCFLUSH: + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error 
!= 0) + break; + error = lltrace_flush(sc); + rw_exit(&sc->sc_lock); + break; + + case LLTIOCSBLEN: + error = lltrace_set_blen(sc, *(unsigned int *)data); + break; + case LLTIOCGBLEN: + *(unsigned int *)data = LLTRACE_NBUF2MB(sc->sc_nbuffers); + break; + + case LLTIOCSMODE: + error = lltrace_set_mode(sc, *(unsigned int *)data); + break; + case LLTIOCGMODE: + *(unsigned int *)data = sc->sc_mode; + break; + + default: + error = ENOTTY; + break; + } + + KERNEL_LOCK(); + + return (error); +} + +int +lltraceread(dev_t dev, struct uio *uio, int ioflag) +{ + struct lltrace_softc *sc = lltrace_sc; + struct lltrace_cpu *llt; + unsigned int slot; + int error; + + KERNEL_UNLOCK(); + + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + goto lock; + + if (sc->sc_running) { + if (ISSET(ioflag, IO_NDELAY)) { + error = EWOULDBLOCK; + goto unlock; + } + + do { + sc->sc_reading++; + error = rwsleep_nsec(&sc->sc_reading, &sc->sc_lock, + PRIBIO|PCATCH, "lltread", INFSLP); + sc->sc_reading--; + if (error != 0) + goto unlock; + } while (sc->sc_running); + } + + if (sc->sc_buffers == NULL) { + error = 0; + goto unlock; + } + + slot = sc->sc_read; + for (;;) { + if (slot >= sc->sc_nbuffers) { + error = 0; + goto unlock; + } + + llt = &sc->sc_buffers[slot]; + KASSERT(llt->llt_slot <= nitems(llt->llt_buffer.llt_slots)); + if (llt->llt_slot > 0) + break; + + slot++; + } + + error = uiomove(&llt->llt_buffer, + llt->llt_slot * sizeof(llt->llt_buffer.llt_slots[0]), uio); + if (error != 0) + goto unlock; + + sc->sc_read = slot + 1; + +unlock: + rw_exit(&sc->sc_lock); +lock: + KERNEL_LOCK(); + return (error); +} + +static void +lltrace_filt_detach(struct knote *kn) +{ + struct lltrace_softc *sc = kn->kn_hook; + + klist_remove(&sc->sc_sel.si_note, kn); +} + +static int +lltrace_filt_event(struct knote *kn, long hint) +{ + struct lltrace_softc *sc = kn->kn_hook; + int canread; + + canread = !sc->sc_running && (sc->sc_buffers != NULL) && + (sc->sc_read < sc->sc_nbuffers); + + kn->kn_data = canread ? 
sizeof(struct lltrace_buffer) : 0; + + return (canread); +} + +static int +lltrace_filt_modify(struct kevent *kev, struct knote *kn) +{ + struct lltrace_softc *sc = kn->kn_hook; + int active; + + rw_enter_write(&sc->sc_lock); + active = knote_modify_fn(kev, kn, lltrace_filt_event); + rw_exit_write(&sc->sc_lock); + + return (active); +} + +static int +lltrace_filt_process(struct knote *kn, struct kevent *kev) +{ + struct lltrace_softc *sc = kn->kn_hook; + int active; + + rw_enter_write(&sc->sc_lock); + active = knote_process_fn(kn, kev, lltrace_filt_event); + rw_exit_write(&sc->sc_lock); + + return (active); +} + +static const struct filterops lltrace_filtops = { + .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, + .f_attach = NULL, + .f_detach = lltrace_filt_detach, + .f_event = lltrace_filt_event, + .f_modify = lltrace_filt_modify, + .f_process = lltrace_filt_process, +}; + +int +lltracekqfilter(dev_t dev, struct knote *kn) +{ + struct lltrace_softc *sc = lltrace_sc; + struct klist *klist; + + switch (kn->kn_filter) { + case EVFILT_READ: + klist = &sc->sc_sel.si_note; + kn->kn_fop = &lltrace_filtops; + break; + default: + return (EINVAL); + } + + kn->kn_hook = sc; + klist_insert(klist, kn); + + return (0); +} + +static struct lltrace_cpu * +lltrace_next(struct lltrace_cpu *llt) +{ + struct lltrace_softc *sc = lltrace_sc; + struct cpu_info *ci = curcpu(); + struct lltrace_cpu *nllt; + unsigned int slot, oslot, nslot; + + /* check if we were preempted */ + nllt = ci->ci_schedstate.spc_lltrace; + if (nllt != llt) { + /* something preempted us and swapped buffers already */ + return (nllt); + } + + slot = sc->sc_free; + for (;;) { + nslot = slot + 1; + if (nslot > sc->sc_nbuffers) { + if (sc->sc_mode == LLTRACE_MODE_HEAD) + return (NULL); + } + + oslot = atomic_cas_uint(&sc->sc_free, slot, nslot); + if (slot == oslot) + break; + + slot = oslot; + } + + slot %= sc->sc_nbuffers; + nllt = sc->sc_ring[slot]; + sc->sc_ring[slot] = NULL; + + slot = sc->sc_used; + for (;;) { + nslot = slot + 1; + + oslot = atomic_cas_uint(&sc->sc_used, slot, nslot); + if (slot == oslot) + break; + + slot = oslot; + } + + lltrace_cpu_init(nllt, sc, ci, llt->llt_pid, llt->llt_tid, + llt->llt_wakeid); + lltrace_cpu_fini(llt, sc); + + slot %= sc->sc_nbuffers; + sc->sc_ring[slot] = llt; + + ci->ci_schedstate.spc_lltrace = nllt; + + return (nllt); +} + +static struct lltrace_cpu * +lltrace_insert_record(struct lltrace_cpu *llt, uint64_t type, uint64_t record, + const uint64_t *extra, unsigned int n) +{ + unsigned int slot, oslot, nslot; + uint64_t *slots; + + record |= type << LLTRACE_TYPE_SHIFT; + record |= n++ << LLTRACE_LEN_SHIFT; + + slot = llt->llt_slot; + for (;;) { + nslot = slot + n; + if (nslot > nitems(llt->llt_buffer.llt_slots)) { + unsigned long s; + + s = intr_disable(); + llt = lltrace_next(llt); + intr_restore(s); + + if (llt == NULL) + return (NULL); + + slot = llt->llt_slot; + continue; + } + + oslot = lltrace_cas(&llt->llt_slot, slot, nslot); + if (slot == oslot) + break; + + slot = oslot; + } + + slots = llt->llt_buffer.llt_slots + slot; + *slots = record; + while (n > 1) { + *(++slots) = *(extra++); + n--; + } + + return (llt); +} + +static struct lltrace_cpu * +lltrace_insert(struct lltrace_cpu *llt, uint64_t type, uint64_t record, + const uint64_t *extra, unsigned int n) +{ + record |= lltrace_ts(); + return (lltrace_insert_record(llt, type, record, extra, n)); +} + +void +lltrace_statclock(struct lltrace_cpu *llt, int usermode, unsigned long pc) +{ +#if 0 + uint64_t event = usermode ? 
LLTRACE_EVENT_PC_U : LLTRACE_EVENT_PC_K; + uint64_t extra[1] = { pc }; + + lltrace_insert(llt, (event | nitems(extra)) << LLTRACE_EVENT_SHIFT, + extra, nitems(extra)); +#endif +} + +void +lltrace_syscall(struct lltrace_cpu *llt, register_t code, + size_t argsize, const register_t *args) +{ + uint64_t record = LLTRACE_EVENT_PHASE_START << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_SYSCALL << + LLTRACE_EVENT_CLASS_SHIFT; + record |= ((uint64_t)code & LLTRACE_SYSCALL_MASK) << + LLTRACE_SYSCALL_SHIFT; + + if (argsize > 0) + record |= (uint64_t)args[0] << LLTRACE_SYSCALL_V_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_sysret(struct lltrace_cpu *llt, register_t code, + int error, const register_t retvals[2]) +{ + uint64_t record; + + record = LLTRACE_EVENT_PHASE_END << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_SYSCALL << + LLTRACE_EVENT_CLASS_SHIFT; + record |= ((uint64_t)code & LLTRACE_SYSCALL_MASK) << + LLTRACE_SYSCALL_SHIFT; + record |= (uint64_t)error << LLTRACE_SYSCALL_V_SHIFT; + + llt = lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); + if (llt == NULL) { + struct lltrace_softc *sc = lltrace_sc; + + rw_enter_write(&sc->sc_lock); + if (sc->sc_running) + lltrace_stop(sc, curproc); + + knote_locked(&sc->sc_sel.si_note, 0); + if (sc->sc_reading) + wakeup(&sc->sc_reading); + rw_exit_write(&sc->sc_lock); + } +} + +struct lltrace_cpu * +lltrace_pidname(struct lltrace_cpu *llt, struct proc *p) +{ + struct process *ps = p->p_p; + uint64_t record; + uint64_t extra[3]; + unsigned int l, n; + + CTASSERT(sizeof(extra) == sizeof(ps->ps_comm)); + + record = LLTRACE_ID_TYPE_TID << LLTRACE_ID_TYPE_SHIFT; + record |= (uint64_t)p->p_tid << LLTRACE_ID_TID_SHIFT; + record |= (uint64_t)ps->ps_pid << LLTRACE_ID_TID_PID_SHIFT; + if (ISSET(ps->ps_flags, PS_SYSTEM)) + record |= LLTRACE_ID_TID_SYSTEM; + + extra[0] = extra[1] = extra[2] = 0; /* memset */ + l = strlcpy((char *)extra, p->p_p->ps_comm, sizeof(extra)); + + /* turn the string length into the number of slots we need */ + n = howmany(l, sizeof(uint64_t)); + + return (lltrace_insert_record(llt, LLTRACE_TYPE_ID, record, extra, n)); +} + +void +lltrace_switch(struct lltrace_cpu *llt, struct proc *op, struct proc *np) +{ + struct process *nps = np->p_p; + uint64_t state; + uint64_t record; + unsigned int pid; + unsigned int wake; + + llt = lltrace_pidname(llt, np); + if (llt == NULL) + return; + + record = LLTRACE_EVENT_PHASE_INSTANT << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_SCHED << + LLTRACE_EVENT_CLASS_SHIFT; + record |= (uint64_t)np->p_tid << LLTRACE_EVENT_DATA_SHIFT; + + /* record what we think the state of the outgoing thread is */ + if (op == NULL) + state = LLTRACE_SCHED_STATE_DEAD; + else if (ISSET(op->p_flag, P_WEXIT)) + state = LLTRACE_SCHED_STATE_DYING; + else if (ISSET(op->p_flag, P_WSLEEP)) + state = LLTRACE_SCHED_STATE_SUSPENDED; + else + state = LLTRACE_SCHED_STATE_BLOCKED; + + record |= (state << LLTRACE_SCHED_STATE_SHIFT); + + pid = nps->ps_pid; + if (ISSET(nps->ps_flags, PS_SYSTEM)) + pid |= (1U << 31); + + llt->llt_pid = pid; + llt->llt_tid = np->p_tid; + + wake = np->p_wakeid != 0; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, &np->p_wakeid, wake); + + if (wake) + np->p_wakeid = 0; +} + +void +lltrace_runnable(struct lltrace_cpu *llt, struct proc *p) +{ + uint64_t record; + uint64_t wakeid; + + llt = lltrace_pidname(llt, p); + if (llt == NULL) + return; + + record = LLTRACE_EVENT_PHASE_INSTANT << + LLTRACE_EVENT_PHASE_SHIFT; 
+ record |= LLTRACE_EVENT_CLASS_WAKE << + LLTRACE_EVENT_CLASS_SHIFT; + record |= (uint64_t)p->p_tid << LLTRACE_EVENT_DATA_SHIFT; + + wakeid = (uint64_t)cpu_number() << 48; + wakeid |= (llt->llt_wakeid += 2) & LLTRACE_MASK(48); + p->p_wakeid = wakeid; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, &p->p_wakeid, 1); +} + +void +lltrace_sched_enter(struct lltrace_cpu *llt) +{ + uint64_t record = LLTRACE_EVENT_PHASE_START << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_SCHED << + LLTRACE_EVENT_CLASS_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_sched_leave(struct lltrace_cpu *llt) +{ + uint64_t record = LLTRACE_EVENT_PHASE_END << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_SCHED << + LLTRACE_EVENT_CLASS_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_idle(struct lltrace_cpu *llt, unsigned int idle) +{ + uint64_t record = + (idle ? LLTRACE_EVENT_PHASE_START : LLTRACE_EVENT_PHASE_END) << + LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_IDLE << LLTRACE_EVENT_CLASS_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_event_start(struct lltrace_cpu *llt, unsigned int class) +{ + uint64_t record = LLTRACE_EVENT_PHASE_START << + LLTRACE_EVENT_PHASE_SHIFT; + record |= class << LLTRACE_EVENT_CLASS_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_event_end(struct lltrace_cpu *llt, unsigned int class) +{ + uint64_t record = LLTRACE_EVENT_PHASE_END << + LLTRACE_EVENT_PHASE_SHIFT; + record |= class << LLTRACE_EVENT_CLASS_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +static inline void +lltrace_intr(struct lltrace_cpu *llt, uint64_t phase, + uint64_t type, uint64_t data) +{ + uint64_t record = phase << LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_INTR << LLTRACE_EVENT_CLASS_SHIFT; + record |= type << LLTRACE_INTR_T_SHIFT; + record |= data << LLTRACE_INTR_DATA_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_ipi(struct lltrace_cpu *llt, unsigned int cpu) +{ + lltrace_intr(llt, LLTRACE_EVENT_PHASE_INSTANT, + LLTRACE_INTR_T_IPI, cpu); +} + +void +lltrace_intr_enter(struct lltrace_cpu *llt, unsigned int type, unsigned int vec) +{ + lltrace_intr(llt, LLTRACE_EVENT_PHASE_START, type, vec); +} + +void +lltrace_intr_leave(struct lltrace_cpu *llt, unsigned int type, unsigned int vec) +{ + lltrace_intr(llt, LLTRACE_EVENT_PHASE_END, type, vec); +} + +void +lltrace_lock(struct lltrace_cpu *llt, void *lock, + unsigned int type, unsigned int step) +{ + uint64_t record = (uint64_t)type << LLTRACE_LK_TYPE_SHIFT; + record |= (uint64_t)step << LLTRACE_LK_PHASE_SHIFT; + record |= (uint64_t)lock << LLTRACE_LK_ADDR_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_LOCKING, record, NULL, 0); +} + +void +lltrace_count(struct lltrace_cpu *llt, unsigned int t, unsigned int v) +{ + uint64_t record; + + record = LLTRACE_EVENT_PHASE_INSTANT << LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_COUNT << LLTRACE_EVENT_CLASS_SHIFT; + record |= (uint64_t)t << LLTRACE_COUNT_T_SHIFT; + record |= (uint64_t)v << LLTRACE_COUNT_V_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_mark(struct lltrace_cpu *llt) +{ +#if 0 + uint64_t record = LLTRACE_EVENT_MARK << LLTRACE_EVENT_SHIFT; + + lltrace_insert(llt, record, NULL, 0); +#endif +} + +static void +lltrace_fn(struct lltrace_cpu *llt, unsigned int phase, void *fn) +{ + 
uint64_t record = (uint64_t)phase << LLTRACE_EVENT_PHASE_SHIFT; + record |= LLTRACE_EVENT_CLASS_FUNC << LLTRACE_EVENT_CLASS_SHIFT; + /* 32 bits is enough to identify most symbols */ + record |= (uint64_t)fn << LLTRACE_EVENT_DATA_SHIFT; + + lltrace_insert(llt, LLTRACE_TYPE_EVENT, record, NULL, 0); +} + +void +lltrace_fn_enter(struct lltrace_cpu *llt, void *fn) +{ + lltrace_fn(llt, LLTRACE_EVENT_PHASE_START, fn); +} + +void +lltrace_fn_leave(struct lltrace_cpu *llt, void *fn) +{ + lltrace_fn(llt, LLTRACE_EVENT_PHASE_END, fn); +} + +void +__cyg_profile_func_enter(void *fn, void *pc) +{ + struct lltrace_cpu *llt; + + llt = lltrace_enter(); + if (llt == NULL) + return; + + lltrace_fn_enter(llt, fn); +} + +void +__cyg_profile_func_exit(void *fn, void *pc) +{ + struct lltrace_cpu *llt; + + llt = lltrace_enter(); + if (llt == NULL) + return; + + lltrace_fn_leave(llt, fn); +} Index: dev/pci/if_em.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_em.c,v diff -u -p -r1.378 if_em.c --- dev/pci/if_em.c 31 Aug 2024 16:23:09 -0000 1.378 +++ dev/pci/if_em.c 6 Sep 2024 11:18:14 -0000 @@ -452,7 +452,7 @@ em_attach(struct device *parent, struct sc->hw.wait_autoneg_complete = WAIT_FOR_AUTO_NEG_DEFAULT; sc->hw.autoneg_advertised = AUTONEG_ADV_DEFAULT; sc->hw.tbi_compatibility_en = TRUE; - sc->sc_rx_buffer_len = EM_RXBUFFER_2048; + sc->sc_rx_buffer_len = EM_MCLBYTES; sc->hw.phy_init_script = 1; sc->hw.phy_reset_disable = FALSE; @@ -2712,16 +2712,17 @@ em_get_buf(struct em_queue *que, int i) KASSERT(pkt->pkt_m == NULL); - m = MCLGETL(NULL, M_DONTWAIT, EM_MCLBYTES); + m = MCLGETL(NULL, M_DONTWAIT, EM_MCLBYTES + ETHER_ALIGN); if (m == NULL) { sc->mbuf_cluster_failed++; return (ENOBUFS); } - m->m_len = m->m_pkthdr.len = EM_MCLBYTES; + m->m_len = m->m_pkthdr.len = EM_MCLBYTES + ETHER_ALIGN; m_adj(m, ETHER_ALIGN); error = bus_dmamap_load_mbuf(sc->sc_dmat, pkt->pkt_map, - m, BUS_DMA_NOWAIT); + m, BUS_DMA_NOWAIT | + (ISSET(sc->sc_ac.ac_if.if_flags, IFF_LINK0) ? 
BUS_DMA_BUS4 : 0)); if (error) { m_freem(m); return (error); Index: dev/pci/if_em.h =================================================================== RCS file: /cvs/src/sys/dev/pci/if_em.h,v diff -u -p -r1.83 if_em.h --- dev/pci/if_em.h 16 Feb 2024 22:30:54 -0000 1.83 +++ dev/pci/if_em.h 6 Sep 2024 11:18:14 -0000 @@ -268,7 +268,7 @@ typedef int boolean_t; #define EM_RXBUFFER_8192 8192 #define EM_RXBUFFER_16384 16384 -#define EM_MCLBYTES (EM_RXBUFFER_2048 + ETHER_ALIGN) +#define EM_MCLBYTES EM_RXBUFFER_2048 #define EM_MAX_SCATTER 64 #define EM_TSO_SIZE 65535 Index: dev/pci/if_ix.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_ix.c,v diff -u -p -r1.216 if_ix.c --- dev/pci/if_ix.c 31 Aug 2024 16:23:09 -0000 1.216 +++ dev/pci/if_ix.c 6 Sep 2024 11:18:14 -0000 @@ -764,7 +764,7 @@ ixgbe_init(void *arg) ixgbe_initialize_transmit_units(sc); /* Use 2k clusters, even for jumbo frames */ - sc->rx_mbuf_sz = MCLBYTES + ETHER_ALIGN; + sc->rx_mbuf_sz = MCLBYTES; /* Prepare receive descriptors and buffers */ if (ixgbe_setup_receive_structures(sc)) { @@ -1475,10 +1475,10 @@ ixgbe_encap(struct ix_txring *txr, struc for (j = 0; j < map->dm_nsegs; j++) { txd = &txr->tx_base[i]; - txd->read.buffer_addr = htole64(map->dm_segs[j].ds_addr); - txd->read.cmd_type_len = htole32(txr->txd_cmd | + htolem64(&txd->read.buffer_addr, map->dm_segs[j].ds_addr); + htolem32(&txd->read.cmd_type_len, txr->txd_cmd | cmd_type_len | map->dm_segs[j].ds_len); - txd->read.olinfo_status = htole32(olinfo_status); + htolem32(&txd->read.olinfo_status, olinfo_status); last = i; /* descriptor that will get completion IRQ */ if (++i == sc->num_tx_desc) @@ -2620,9 +2620,6 @@ ixgbe_txeof(struct ix_txring *txr) struct ixgbe_tx_buf *tx_buffer; struct ixgbe_legacy_tx_desc *tx_desc; - if (!ISSET(ifp->if_flags, IFF_RUNNING)) - return FALSE; - head = txr->next_avail_desc; tail = txr->next_to_clean; @@ -2698,15 +2695,16 @@ ixgbe_get_buf(struct ix_rxring *rxr, int } /* needed in any case so prealocate since this one will fail for sure */ - mp = MCLGETL(NULL, M_DONTWAIT, sc->rx_mbuf_sz); + mp = MCLGETL(NULL, M_DONTWAIT, sc->rx_mbuf_sz + ETHER_ALIGN); if (!mp) return (ENOBUFS); - mp->m_data += (mp->m_ext.ext_size - sc->rx_mbuf_sz); + mp->m_data += ETHER_ALIGN; mp->m_len = mp->m_pkthdr.len = sc->rx_mbuf_sz; error = bus_dmamap_load_mbuf(rxr->rxdma.dma_tag, rxbuf->map, - mp, BUS_DMA_NOWAIT); + mp, BUS_DMA_NOWAIT | + (ISSET(sc->arpcom.ac_if.if_flags, IFF_LINK0) ? 
BUS_DMA_BUS4 : 0)); if (error) { m_freem(mp); return (error); @@ -2716,7 +2714,7 @@ ixgbe_get_buf(struct ix_rxring *rxr, int 0, rxbuf->map->dm_mapsize, BUS_DMASYNC_PREREAD); rxbuf->buf = mp; - rxdesc->read.pkt_addr = htole64(rxbuf->map->dm_segs[0].ds_addr); + htolem64(&rxdesc->read.pkt_addr, rxbuf->map->dm_segs[0].ds_addr); return (0); } @@ -2747,8 +2745,9 @@ ixgbe_allocate_receive_buffers(struct ix rxbuf = rxr->rx_buffers; for (i = 0; i < sc->num_rx_desc; i++, rxbuf++) { - error = bus_dmamap_create(rxr->rxdma.dma_tag, 16 * 1024, 1, - 16 * 1024, 0, BUS_DMA_NOWAIT, &rxbuf->map); + error = bus_dmamap_create(rxr->rxdma.dma_tag, + sc->rx_mbuf_sz, 1, sc->rx_mbuf_sz, 0, + BUS_DMA_NOWAIT, &rxbuf->map); if (error) { printf("%s: Unable to create Pack DMA map\n", ifp->if_xname); @@ -2789,7 +2788,8 @@ ixgbe_setup_receive_ring(struct ix_rxrin rxr->next_to_check = 0; rxr->last_desc_filled = sc->num_rx_desc - 1; - if_rxr_init(&rxr->rx_ring, 2 * ((ifp->if_hardmtu / MCLBYTES) + 1), + if_rxr_init(&rxr->rx_ring, + 2 * howmany(ifp->if_hardmtu, MCLBYTES) + 1, sc->num_rx_desc - 1); ixgbe_rxfill(rxr); @@ -2924,7 +2924,7 @@ ixgbe_initialize_receive_units(struct ix IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl); } - bufsz = (sc->rx_mbuf_sz - ETHER_ALIGN) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; + bufsz = sc->rx_mbuf_sz >> IXGBE_SRRCTL_BSIZEPKT_SHIFT; for (i = 0; i < sc->num_queues; i++, rxr++) { uint64_t rdba = rxr->rxdma.dma_map->dm_segs[0].ds_addr; @@ -3139,7 +3139,7 @@ ixgbe_rxeof(struct ix_rxring *rxr) dsize * i, dsize, BUS_DMASYNC_POSTREAD); rxdesc = &rxr->rx_base[i]; - staterr = letoh32(rxdesc->wb.upper.status_error); + staterr = lemtoh32(&rxdesc->wb.upper.status_error); if (!ISSET(staterr, IXGBE_RXD_STAT_DD)) { bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, dsize * i, dsize, @@ -3157,8 +3157,8 @@ ixgbe_rxeof(struct ix_rxring *rxr) bus_dmamap_unload(rxr->rxdma.dma_tag, rxbuf->map); mp = rxbuf->buf; - len = letoh16(rxdesc->wb.upper.length); - vtag = letoh16(rxdesc->wb.upper.vlan); + len = lemtoh16(&rxdesc->wb.upper.length); + vtag = lemtoh16(&rxdesc->wb.upper.vlan); eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0); hash = lemtoh32(&rxdesc->wb.lower.hi_dword.rss); hashtype = Index: dev/pci/if_ix.h =================================================================== RCS file: /cvs/src/sys/dev/pci/if_ix.h,v diff -u -p -r1.47 if_ix.h --- dev/pci/if_ix.h 21 May 2024 11:19:39 -0000 1.47 +++ dev/pci/if_ix.h 6 Sep 2024 11:18:14 -0000 @@ -63,7 +63,7 @@ * against the system mbuf pool limit, you can tune nmbclusters * to adjust for this. 
*/ -#define DEFAULT_RXD 256 +#define DEFAULT_RXD 2048 #define PERFORM_RXD 2048 #define MAX_RXD 4096 #define MIN_RXD 64 Index: kern/kern_clockintr.c =================================================================== RCS file: /cvs/src/sys/kern/kern_clockintr.c,v diff -u -p -r1.70 kern_clockintr.c --- kern/kern_clockintr.c 25 Feb 2024 19:15:50 -0000 1.70 +++ kern/kern_clockintr.c 6 Sep 2024 11:18:16 -0000 @@ -30,6 +30,7 @@ #include #include #include +#include void clockintr_cancel_locked(struct clockintr *); void clockintr_hardclock(struct clockrequest *, void *, void *); @@ -209,7 +210,9 @@ clockintr_dispatch(void *frame) cq->cq_running = cl; mtx_leave(&cq->cq_mtx); + LLTRACE_CPU(ci, lltrace_fn_enter, func); func(request, frame, arg); + LLTRACE_CPU(ci, lltrace_fn_leave, func); mtx_enter(&cq->cq_mtx); cq->cq_running = NULL; Index: kern/kern_exec.c =================================================================== RCS file: /cvs/src/sys/kern/kern_exec.c,v diff -u -p -r1.258 kern_exec.c --- kern/kern_exec.c 21 Aug 2024 03:07:45 -0000 1.258 +++ kern/kern_exec.c 6 Sep 2024 11:18:16 -0000 @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -539,6 +540,8 @@ sys_execve(struct proc *p, void *v, regi memset(pr->ps_comm, 0, sizeof(pr->ps_comm)); strlcpy(pr->ps_comm, nid.ni_cnd.cn_nameptr, sizeof(pr->ps_comm)); pr->ps_acflag &= ~AFORK; + + LLTRACE(lltrace_pidname, p); /* record proc's vnode, for use by sysctl */ otvp = pr->ps_textvp; Index: kern/kern_intrmap.c =================================================================== RCS file: /cvs/src/sys/kern/kern_intrmap.c,v diff -u -p -r1.3 kern_intrmap.c --- kern/kern_intrmap.c 23 Jun 2020 01:40:03 -0000 1.3 +++ kern/kern_intrmap.c 6 Sep 2024 11:18:16 -0000 @@ -103,6 +103,8 @@ intrmap_cpus_get(void) M_DEVBUF, M_WAITOK); CPU_INFO_FOREACH(cii, ci) { + if (icpus > 0) + continue; #ifdef __HAVE_CPU_TOPOLOGY if (ci->ci_smt_id > 0) continue; Index: kern/kern_lock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_lock.c,v diff -u -p -r1.75 kern_lock.c --- kern/kern_lock.c 3 Jul 2024 01:36:50 -0000 1.75 +++ kern/kern_lock.c 6 Sep 2024 11:18:16 -0000 @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -129,6 +130,7 @@ __mp_lock(struct __mp_lock *mpl) { struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[cpu_number()]; unsigned long s; + unsigned int depth; #ifdef WITNESS if (!__mp_lock_held(mpl, curcpu())) @@ -136,15 +138,22 @@ __mp_lock(struct __mp_lock *mpl) LOP_EXCLUSIVE | LOP_NEWORDER, NULL); #endif + s = intr_disable(); - if (cpu->mplc_depth++ == 0) + depth = cpu->mplc_depth++; + if (depth == 0) { + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_A_START); cpu->mplc_ticket = atomic_inc_int_nv(&mpl->mpl_users); + } intr_restore(s); __mp_lock_spin(mpl, cpu->mplc_ticket); membar_enter_after_atomic(); WITNESS_LOCK(&mpl->mpl_lock_obj, LOP_EXCLUSIVE); + + if (depth == 0) + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_A_EXCL); } void @@ -164,6 +173,7 @@ __mp_unlock(struct __mp_lock *mpl) s = intr_disable(); if (--cpu->mplc_depth == 0) { + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_R_EXCL); membar_exit(); mpl->mpl_ticket++; } @@ -180,6 +190,8 @@ __mp_release_all(struct __mp_lock *mpl) int i; #endif + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_R_EXCL); + s = intr_disable(); rv = cpu->mplc_depth; #ifdef WITNESS @@ -227,29 +239,60 @@ __mtx_init(struct mutex *mtx, int wantip void mtx_enter(struct mutex *mtx) { - struct schedstate_percpu *spc = 
&curcpu()->ci_schedstate; + struct cpu_info *owner, *ci = curcpu(); + struct schedstate_percpu *spc = &ci->ci_schedstate; + int s; #ifdef MP_LOCKDEBUG int nticks = __mp_lock_spinout; #endif +#if NLLT > 0 + unsigned int lltev = LLTRACE_LK_I_EXCL; +#endif WITNESS_CHECKORDER(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE | LOP_NEWORDER, NULL); - spc->spc_spinning++; - while (mtx_enter_try(mtx) == 0) { + if (mtx->mtx_wantipl != IPL_NONE) + s = splraise(mtx->mtx_wantipl); + + owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci); +#ifdef DIAGNOSTIC + if (__predict_false(owner == ci)) + panic("mtx %p: locking against myself", mtx); +#endif + if (owner != NULL) { + LLTRACE_SPC(spc, lltrace_lock, mtx, LLTRACE_LK_MTX, + LLTRACE_LK_A_START); + + spc->spc_spinning++; do { - CPU_BUSY_CYCLE(); + do { + CPU_BUSY_CYCLE(); #ifdef MP_LOCKDEBUG - if (--nticks == 0) { - db_printf("%s: %p lock spun out\n", - __func__, mtx); - db_enter(); - nticks = __mp_lock_spinout; - } + if (--nticks == 0) { + db_printf("%s: %p lock spun out\n", + __func__, mtx); + db_enter(); + nticks = __mp_lock_spinout; + } +#endif + } while (mtx->mtx_owner != NULL); + } while (atomic_cas_ptr(&mtx->mtx_owner, NULL, ci) != NULL); + spc->spc_spinning--; + +#if NLLT > 0 + lltev = LLTRACE_LK_A_EXCL; #endif - } while (mtx->mtx_owner != NULL); } - spc->spc_spinning--; + + membar_enter_after_atomic(); + if (mtx->mtx_wantipl != IPL_NONE) + mtx->mtx_oldipl = s; +#ifdef DIAGNOSTIC + ci->ci_mutex_level++; +#endif + WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); + LLTRACE_SPC(spc, lltrace_lock, mtx, LLTRACE_LK_MTX, lltev); } int @@ -278,12 +321,15 @@ mtx_enter_try(struct mutex *mtx) ci->ci_mutex_level++; #endif WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); + LLTRACE_CPU(ci, lltrace_lock, mtx, LLTRACE_LK_MTX, + LLTRACE_LK_I_EXCL); return (1); } if (mtx->mtx_wantipl != IPL_NONE) splx(s); + LLTRACE_CPU(ci, lltrace_lock, mtx, LLTRACE_LK_MTX, LLTRACE_LK_I_FAIL); return (0); } #else @@ -313,6 +359,7 @@ mtx_enter(struct mutex *mtx) ci->ci_mutex_level++; #endif WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); + LLTRACE(lltrace_lock, mtx, LLTRACE_LK_MTX, LLTRACE_LK_I_EXCL); } int @@ -333,6 +380,7 @@ mtx_leave(struct mutex *mtx) return; MUTEX_ASSERT_LOCKED(mtx); + LLTRACE(lltrace_lock, mtx, LLTRACE_LK_MTX, LLTRACE_LK_R_EXCL); WITNESS_UNLOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); #ifdef DIAGNOSTIC Index: kern/kern_rwlock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_rwlock.c,v diff -u -p -r1.50 kern_rwlock.c --- kern/kern_rwlock.c 14 Jul 2023 07:07:08 -0000 1.50 +++ kern/kern_rwlock.c 6 Sep 2024 11:18:16 -0000 @@ -25,6 +25,7 @@ #include #include #include +#include void rw_do_exit(struct rwlock *, unsigned long); @@ -110,6 +111,7 @@ rw_enter_read(struct rwlock *rwl) membar_enter_after_atomic(); WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, NULL); WITNESS_LOCK(&rwl->rwl_lock_obj, 0); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_I_SHARED); } } @@ -126,6 +128,7 @@ rw_enter_write(struct rwlock *rwl) WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_EXCLUSIVE | LOP_NEWORDER, NULL); WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_I_EXCL); } } @@ -135,6 +138,7 @@ rw_exit_read(struct rwlock *rwl) unsigned long owner; rw_assert_rdlock(rwl); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_R_SHARED); WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0); membar_exit_before_atomic(); @@ -150,6 +154,7 @@ rw_exit_write(struct rwlock *rwl) unsigned long 
owner; rw_assert_wrlock(rwl); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_R_EXCL); WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE); membar_exit_before_atomic(); @@ -249,6 +254,7 @@ rw_enter(struct rwlock *rwl, int flags) op = &rw_ops[(flags & RW_OPMASK) - 1]; inc = op->inc + RW_PROC(curproc) * op->proc_mult; + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_A_START); retry: while (__predict_false(((o = rwl->rwl_owner) & op->check) != 0)) { unsigned long set = o | op->wait_set; @@ -272,8 +278,10 @@ retry: rw_enter_diag(rwl, flags); - if (flags & RW_NOSLEEP) - return (EBUSY); + if (flags & RW_NOSLEEP) { + error = EBUSY; + goto abort; + } prio = op->wait_prio; if (flags & RW_INTR) @@ -285,15 +293,28 @@ retry: error = sleep_finish(0, do_sleep); if ((flags & RW_INTR) && (error != 0)) - return (error); - if (flags & RW_SLEEPFAIL) - return (EAGAIN); + goto abort; + if (flags & RW_SLEEPFAIL) { + error = EAGAIN; + goto abort; + } } if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc))) goto retry; membar_enter_after_atomic(); + if (flags & RW_DOWNGRADE) { + WITNESS_DOWNGRADE(&rwl->rwl_lock_obj, lop_flags); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, + LLTRACE_LK_DOWNGRADE); + } else { + WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, + ISSET(flags, RW_WRITE) ? + LLTRACE_LK_A_EXCL : LLTRACE_LK_A_SHARED); + } + /* * If old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we * downgraded a write lock and had possible read waiter, wake them @@ -303,12 +324,10 @@ retry: (RWLOCK_WRLOCK|RWLOCK_WAIT))) wakeup(rwl); - if (flags & RW_DOWNGRADE) - WITNESS_DOWNGRADE(&rwl->rwl_lock_obj, lop_flags); - else - WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags); - return (0); +abort: + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, LLTRACE_LK_A_ABORT); + return (error); } void @@ -325,6 +344,8 @@ rw_exit(struct rwlock *rwl) rw_assert_wrlock(rwl); else rw_assert_rdlock(rwl); + LLTRACE(lltrace_lock, rwl, LLTRACE_LK_RW, + wrlock ? LLTRACE_LK_R_EXCL : LLTRACE_LK_R_SHARED); WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0); membar_exit_before_atomic(); Index: kern/kern_sched.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sched.c,v diff -u -p -r1.100 kern_sched.c --- kern/kern_sched.c 9 Jul 2024 08:44:36 -0000 1.100 +++ kern/kern_sched.c 6 Sep 2024 11:18:16 -0000 @@ -191,7 +191,10 @@ sched_idle(void *v) wakeup(spc); } #endif + + LLTRACE(lltrace_idle, 1); cpu_idle_cycle(); + LLTRACE(lltrace_idle, 0); } cpu_idle_leave(); cpuset_del(&sched_idle_cpus, ci); @@ -609,6 +612,7 @@ sched_proc_to_cpu_cost(struct cpu_info * if (cpuset_isset(&sched_queued_cpus, ci)) cost += spc->spc_nrun * sched_cost_runnable; +#if 0 /* * Try to avoid the primary cpu as it handles hardware interrupts.
* @@ -617,6 +621,7 @@ sched_proc_to_cpu_cost(struct cpu_info * */ if (CPU_IS_PRIMARY(ci)) cost += sched_cost_runnable; +#endif /* * If the proc is on this cpu already, lower the cost by how much Index: kern/kern_sensors.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sensors.c,v diff -u -p -r1.40 kern_sensors.c --- kern/kern_sensors.c 5 Dec 2022 23:18:37 -0000 1.40 +++ kern/kern_sensors.c 6 Sep 2024 11:18:16 -0000 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "hotplug.h" @@ -260,8 +261,11 @@ sensor_task_work(void *xst) atomic_inc_int(&sensors_running); rw_enter_write(&st->lock); period = st->period; - if (period > 0 && !sensors_quiesced) + if (period > 0 && !sensors_quiesced) { + LLTRACE(lltrace_fn_enter, st->func); st->func(st->arg); + LLTRACE(lltrace_fn_leave, st->func); + } rw_exit_write(&st->lock); if (atomic_dec_int_nv(&sensors_running) == 0) { if (sensors_quiesced) Index: kern/kern_synch.c =================================================================== RCS file: /cvs/src/sys/kern/kern_synch.c,v diff -u -p -r1.206 kern_synch.c --- kern/kern_synch.c 23 Jul 2024 08:38:02 -0000 1.206 +++ kern/kern_synch.c 6 Sep 2024 11:18:16 -0000 @@ -37,6 +37,8 @@ * @(#)kern_synch.c 8.6 (Berkeley) 1/21/94 */ +#include "llt.h" + #include #include #include @@ -521,6 +523,7 @@ unsleep(struct proc *p) p->p_wmesg = NULL; TRACEPOINT(sched, unsleep, p->p_tid + THREAD_PID_OFFSET, p->p_p->ps_pid); + LLTRACE(lltrace_runnable, p); } } @@ -557,6 +560,7 @@ wakeup_n(const volatile void *ident, int TAILQ_REMOVE(&wakeq, p, p_runq); TRACEPOINT(sched, unsleep, p->p_tid + THREAD_PID_OFFSET, p->p_p->ps_pid); + LLTRACE(lltrace_runnable, p); if (p->p_stat == SSLEEP) setrunnable(p); } Index: kern/kern_task.c =================================================================== RCS file: /cvs/src/sys/kern/kern_task.c,v diff -u -p -r1.35 kern_task.c --- kern/kern_task.c 14 May 2024 08:26:13 -0000 1.35 +++ kern/kern_task.c 6 Sep 2024 11:18:16 -0000 @@ -24,6 +24,7 @@ #include #include #include +#include #include "kcov.h" #if NKCOV > 0 @@ -443,7 +444,9 @@ taskq_thread(void *xtq) #if NKCOV > 0 kcov_remote_enter(KCOV_REMOTE_COMMON, work.t_process); #endif + LLTRACE(lltrace_fn_enter, work.t_func); (*work.t_func)(work.t_arg); + LLTRACE(lltrace_fn_leave, work.t_func); #if NKCOV > 0 kcov_remote_leave(KCOV_REMOTE_COMMON, work.t_process); #endif Index: kern/kern_timeout.c =================================================================== RCS file: /cvs/src/sys/kern/kern_timeout.c,v diff -u -p -r1.99 kern_timeout.c --- kern/kern_timeout.c 11 Aug 2024 00:49:34 -0000 1.99 +++ kern/kern_timeout.c 6 Sep 2024 11:18:16 -0000 @@ -35,6 +35,7 @@ #include /* _Q_INVALIDATE */ #include #include +#include #ifdef DDB #include @@ -658,7 +659,9 @@ timeout_run(struct timeout *to) #if NKCOV > 0 kcov_remote_enter(KCOV_REMOTE_COMMON, kcov_process); #endif + LLTRACE(lltrace_fn_enter, fn); fn(arg); + LLTRACE(lltrace_fn_leave, fn); #if NKCOV > 0 kcov_remote_leave(KCOV_REMOTE_COMMON, kcov_process); #endif @@ -736,6 +739,8 @@ softclock(void *arg) int need_proc_mp; #endif + //LLTRACE(lltrace_irq, LLTRACE_IRQ_BOTTOM_HALF, 0); + first_new = NULL; new = 0; @@ -770,6 +775,8 @@ softclock(void *arg) if (need_proc_mp) wakeup(&timeout_proc_mp); #endif + + //LLTRACE(lltrace_irqret, LLTRACE_IRQ_BOTTOM_HALF, 0); } void Index: kern/sched_bsd.c =================================================================== RCS file: /cvs/src/sys/kern/sched_bsd.c,v diff -u -p -r1.94 sched_bsd.c --- 
kern/sched_bsd.c 8 Jul 2024 13:17:12 -0000 1.94 +++ kern/sched_bsd.c 6 Sep 2024 11:18:16 -0000 @@ -350,6 +350,8 @@ mi_switch(void) int hold_count; #endif + LLTRACE(lltrace_sched_enter); + KASSERT(p->p_stat != SONPROC); SCHED_ASSERT_LOCKED(); @@ -410,14 +412,19 @@ mi_switch(void) uvmexp.swtch++; TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET, nextproc->p_p->ps_pid); + LLTRACE(lltrace_switch, p, nextproc); cpu_switchto(p, nextproc); TRACEPOINT(sched, on__cpu, NULL); + + //LLTRACE(lltrace_pidname, p); } else { TRACEPOINT(sched, remain__cpu, NULL); p->p_stat = SONPROC; } clear_resched(curcpu()); + + LLTRACE(lltrace_sched_leave); SCHED_ASSERT_LOCKED(); Index: kern/subr_vmem.c =================================================================== RCS file: kern/subr_vmem.c diff -N kern/subr_vmem.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ kern/subr_vmem.c 6 Sep 2024 11:18:16 -0000 @@ -0,0 +1,1668 @@ +/* $NetBSD: subr_vmem.c,v 1.116 2024/04/24 02:08:03 thorpej Exp $ */ + +/*- + * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi, + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * reference: + * - Magazines and Vmem: Extending the Slab Allocator + * to Many CPUs and Arbitrary Resources + * http://www.usenix.org/event/usenix01/bonwick.html + * + * locking & the boundary tag pool: + * - A pool(9) is used for vmem boundary tags + * - During a pool get call the global vmem_btag_refill_lock is taken, + * to serialize access to the allocation reserve, but no other + * vmem arena locks. + * - During pool_put calls no vmem mutexes are locked. + * - pool_drain doesn't hold the pool's mutex while releasing memory to + * its backing therefore no interference with any vmem mutexes. + * - The boundary tag pool is forced to put page headers into pool pages + * (PR_PHINPAGE) and not off page to avoid pool recursion. 
+ * (due to sizeof(bt_t) it should be the case anyway) + */ + +#include +#include +#include + +#include +#include /* hz */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define VMEM_MAXORDER (sizeof(vmem_size_t) * CHAR_BIT) + +typedef struct vmem_btag bt_t; + +TAILQ_HEAD(vmem_seglist, vmem_btag); +LIST_HEAD(vmem_freelist, vmem_btag); +LIST_HEAD(vmem_hashlist, vmem_btag); + +#define VMEM_NAME_MAX 16 + +/* vmem arena */ +struct vmem { + struct mutex vm_lock; + + vm_flag_t vm_flags; + vmem_import_t *vm_importfn; + vmem_release_t *vm_releasefn; + size_t vm_nfreetags; + LIST_HEAD(, vmem_btag) vm_freetags; + void *vm_arg; + struct vmem_seglist vm_seglist; + struct vmem_freelist vm_freelist[VMEM_MAXORDER]; + size_t vm_hashsize; + size_t vm_hashmask; + size_t vm_nbusytag; + size_t vm_maxbusytag; + struct vmem_hashlist *vm_hashlist; + struct vmem_hashlist vm_hash0; + size_t vm_quantum_mask; + int vm_quantum_shift; + size_t vm_size; + size_t vm_inuse; + char vm_name[VMEM_NAME_MAX+1]; + LIST_ENTRY(vmem) vm_alllist; +}; + +/* boundary tag */ +struct vmem_btag { + TAILQ_ENTRY(vmem_btag) bt_seglist; + union { + LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */ + LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */ + } bt_u; +#define bt_hashlist bt_u.u_hashlist +#define bt_freelist bt_u.u_freelist + vmem_addr_t bt_start; + vmem_size_t bt_size; + short bt_type; + short bt_flags; +}; + +#define BT_TYPE_SPAN 1 +#define BT_TYPE_SPAN_STATIC 2 +#define BT_TYPE_FREE 3 +#define BT_TYPE_BUSY 4 +#define BT_ISSPAN_P(bt) ((bt)->bt_type <= BT_TYPE_SPAN_STATIC) + +#define BT_F_PRIVATE 0x0001 + +#define BT_END(bt) ((bt)->bt_start + (bt)->bt_size - 1) + +/* + * Provide an estimated number of boundary tags needed for a given + * number of allocations from the vmem arena. This estimate is + * based on 2 boundary tags per allocation (see vmem_xalloc()) and + * 2 boundary tags per added span (one to describe the span, one to + * describe the span's free space; see vmem_add1()). 
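+ *
+ * e.g. an arena expected to carry one span and at most four live
+ * allocations would reserve VMEM_EST_BTCOUNT(1, 4) == (1 * 2) + (4 * 2)
+ * == 10 boundary tags (figures picked purely for illustration).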
+ */ +#define VMEM_EST_BTCOUNT(ns, na) (((ns) * 2) + ((na) * 2)) + +vmem_t * vmem_init(vmem_t *, const char *, vmem_addr_t, vmem_size_t, + vmem_size_t, vmem_import_t *, vmem_release_t *, vmem_t *, + vmem_size_t, vm_flag_t, int); +void vmem_add_bts(vmem_t *, struct vmem_btag *, unsigned int); + +#if defined(VMEM_SANITY) +static void vmem_check(vmem_t *); +#else /* defined(VMEM_SANITY) */ +#define vmem_check(vm) /* nothing */ +#endif /* defined(VMEM_SANITY) */ + +#define VMEM_HASHSIZE_MIN 1 /* XXX */ +#define VMEM_HASHSIZE_MAX 65536 /* XXX */ +#define VMEM_HASHSIZE_INIT 1 + +#define VM_FITMASK (VM_BESTFIT | VM_INSTANTFIT) + +static bool vmem_bootstrapped; +struct rwlock vmem_list_lock = RWLOCK_INITIALIZER("vmemlist"); +static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list); + +/* ---- misc */ + +#define VMEM_LOCK(vm) mtx_enter(&(vm)->vm_lock) +#define VMEM_TRYLOCK(vm) mtx_tryenter(&(vm)->vm_lock) +#define VMEM_UNLOCK(vm) mtx_leave(&(vm)->vm_lock) +#define VMEM_LOCK_INIT(vm, ipl) mtx_init(&(vm)->vm_lock, (ipl)) +#define VMEM_LOCK_DESTROY(vm) /* nop */ +#define VMEM_ASSERT_LOCKED(vm) MUTEX_ASSERT_LOCKED(&(vm)->vm_lock) + +#define VMEM_ALIGNUP(addr, align) \ + (-(-(addr) & -(align))) + +#define VMEM_CROSS_P(addr1, addr2, boundary) \ + ((((addr1) ^ (addr2)) & -(boundary)) != 0) + +#define ORDER2SIZE(order) ((vmem_size_t)1 << (order)) +#define SIZE2ORDER(size) ((int)fls(size) - 1) + +static void +vmem_kick_pdaemon(void) +{ + printf("%s\n", __func__); +#if defined(_KERNEL) + //uvm_kick_pdaemon(); +#endif +} + +static void vmem_xfree_bt(vmem_t *, bt_t *); + +/* + * This reserve is 4 for each arena involved in allocating vmems memory. + * BT_MAXFREE: don't cache excessive counts of bts in arenas + */ +#define BT_MINRESERVE 4 +#define BT_MAXFREE 64 + +static struct pool vmem_btag_pool; + +/* ---- boundary tag */ + +static int bt_refill(vmem_t *vm); +static int bt_refill_locked(vmem_t *vm); + +static int +bt_refill_locked(vmem_t *vm) +{ + bt_t *bt; + + VMEM_ASSERT_LOCKED(vm); + + if (vm->vm_nfreetags > BT_MINRESERVE) { + return 0; + } + + while (vm->vm_nfreetags <= BT_MINRESERVE) { + VMEM_UNLOCK(vm); + KASSERT(vmem_btag_pool.pr_size); + bt = pool_get(&vmem_btag_pool, PR_NOWAIT); + VMEM_LOCK(vm); + if (bt == NULL) + break; + bt->bt_flags = 0; + LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); + vm->vm_nfreetags++; + } + + if (vm->vm_nfreetags <= BT_MINRESERVE) { + return ENOMEM; + } + + return 0; +} + +static int +bt_refill(vmem_t *vm) +{ + int rv; + + VMEM_LOCK(vm); + rv = bt_refill_locked(vm); + VMEM_UNLOCK(vm); + return rv; +} + +static bt_t * +bt_alloc(vmem_t *vm, vm_flag_t flags) +{ + bt_t *bt; + + VMEM_ASSERT_LOCKED(vm); + + while (vm->vm_nfreetags <= BT_MINRESERVE && (flags & VM_POPULATING) == 0) { + if (bt_refill_locked(vm)) { + if ((flags & VM_NOSLEEP) != 0) { + return NULL; + } + + /* + * It would be nice to wait for something specific here + * but there are multiple ways that a retry could + * succeed and we can't wait for multiple things + * simultaneously. So we'll just sleep for an arbitrary + * short period of time and retry regardless. + * This should be a very rare case. 
+ */ + + vmem_kick_pdaemon(); + msleep_nsec(&vm->vm_freetags, &vm->vm_lock, PWAIT, + "btalloc", 1); + } + } + bt = LIST_FIRST(&vm->vm_freetags); + LIST_REMOVE(bt, bt_freelist); + vm->vm_nfreetags--; + + return bt; +} + +static void +bt_free(vmem_t *vm, bt_t *bt) +{ + + VMEM_ASSERT_LOCKED(vm); + + LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); + vm->vm_nfreetags++; +} + +static void +bt_freetrim(vmem_t *vm, int freelimit) +{ + bt_t *bt, *next_bt; + LIST_HEAD(, vmem_btag) tofree; + + VMEM_ASSERT_LOCKED(vm); + + LIST_INIT(&tofree); + + LIST_FOREACH_SAFE(bt, &vm->vm_freetags, bt_freelist, next_bt) { + if (vm->vm_nfreetags <= freelimit) { + break; + } + if (bt->bt_flags & BT_F_PRIVATE) { + continue; + } + LIST_REMOVE(bt, bt_freelist); + vm->vm_nfreetags--; + LIST_INSERT_HEAD(&tofree, bt, bt_freelist); + } + + VMEM_UNLOCK(vm); + while (!LIST_EMPTY(&tofree)) { + bt = LIST_FIRST(&tofree); + LIST_REMOVE(bt, bt_freelist); + pool_put(&vmem_btag_pool, bt); + } +} + +/* + * Add private boundary tags (statically-allocated by the caller) + * to a vmem arena's free tag list. + */ +void +vmem_add_bts(vmem_t *vm, struct vmem_btag *bts, unsigned int nbts) +{ + VMEM_LOCK(vm); + while (nbts != 0) { + bts->bt_flags = BT_F_PRIVATE; + LIST_INSERT_HEAD(&vm->vm_freetags, bts, bt_freelist); + vm->vm_nfreetags++; + bts++; + nbts--; + } + VMEM_UNLOCK(vm); +} + +/* + * freelist[0] ... [1, 1] + * freelist[1] ... [2, 3] + * freelist[2] ... [4, 7] + * freelist[3] ... [8, 15] + * : + * freelist[n] ... [(1 << n), (1 << (n + 1)) - 1] + * : + */ + +static struct vmem_freelist * +bt_freehead_tofree(vmem_t *vm, vmem_size_t size) +{ + const vmem_size_t qsize = size >> vm->vm_quantum_shift; + const int idx = SIZE2ORDER(qsize); + + KASSERT(size != 0); + KASSERT(qsize != 0); + KASSERT((size & vm->vm_quantum_mask) == 0); + KASSERT(idx >= 0); + KASSERT(idx < VMEM_MAXORDER); + + return &vm->vm_freelist[idx]; +} + +/* + * bt_freehead_toalloc: return the freelist for the given size and allocation + * strategy. + * + * for VM_INSTANTFIT, return the list in which any blocks are large enough + * for the requested size. otherwise, return the list which can have blocks + * large enough for the requested size. + */ + +static struct vmem_freelist * +bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, vm_flag_t strat) +{ + const vmem_size_t qsize = size >> vm->vm_quantum_shift; + int idx = SIZE2ORDER(qsize); + + KASSERT(size != 0); + KASSERT(qsize != 0); + KASSERT((size & vm->vm_quantum_mask) == 0); + + if (strat == VM_INSTANTFIT && ORDER2SIZE(idx) != qsize) { + idx++; + /* check too large request? 
*/ + } + KASSERT(idx >= 0); + KASSERT(idx < VMEM_MAXORDER); + + return &vm->vm_freelist[idx]; +} + +/* ---- boundary tag hash */ + +static struct vmem_hashlist * +bt_hashhead(vmem_t *vm, vmem_addr_t addr) +{ + struct vmem_hashlist *list; + unsigned long hash; + + hash = addr >> vm->vm_quantum_shift; + list = &vm->vm_hashlist[hash & vm->vm_hashmask]; + + return list; +} + +static bt_t * +bt_lookupbusy(vmem_t *vm, vmem_addr_t addr) +{ + struct vmem_hashlist *list; + bt_t *bt; + + list = bt_hashhead(vm, addr); + LIST_FOREACH(bt, list, bt_hashlist) { + if (bt->bt_start == addr) { + break; + } + } + + return bt; +} + +static void +bt_rembusy(vmem_t *vm, bt_t *bt) +{ + + KASSERT(vm->vm_nbusytag > 0); + vm->vm_inuse -= bt->bt_size; + vm->vm_nbusytag--; + LIST_REMOVE(bt, bt_hashlist); +} + +static void +bt_insbusy(vmem_t *vm, bt_t *bt) +{ + struct vmem_hashlist *list; + + KASSERT(bt->bt_type == BT_TYPE_BUSY); + + list = bt_hashhead(vm, bt->bt_start); + LIST_INSERT_HEAD(list, bt, bt_hashlist); + if (++vm->vm_nbusytag > vm->vm_maxbusytag) { + vm->vm_maxbusytag = vm->vm_nbusytag; + } + vm->vm_inuse += bt->bt_size; +} + +/* ---- boundary tag list */ + +static void +bt_remseg(vmem_t *vm, bt_t *bt) +{ + + TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist); +} + +static void +bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev) +{ + + TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist); +} + +static void +bt_insseg_tail(vmem_t *vm, bt_t *bt) +{ + + TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist); +} + +static void +bt_remfree(vmem_t *vm, bt_t *bt) +{ + + KASSERT(bt->bt_type == BT_TYPE_FREE); + + LIST_REMOVE(bt, bt_freelist); +} + +static void +bt_insfree(vmem_t *vm, bt_t *bt) +{ + struct vmem_freelist *list; + + list = bt_freehead_tofree(vm, bt->bt_size); + LIST_INSERT_HEAD(list, bt, bt_freelist); +} + +/* ---- vmem internal functions */ + +static void +vmem_bootstrap(void) +{ + pool_init(&vmem_btag_pool, sizeof(bt_t), CACHELINESIZE, IPL_VM, 0, + "vmembt", NULL); + pool_setlowat(&vmem_btag_pool, 200); + pool_prime(&vmem_btag_pool, 200); +#if 0 +#ifdef MULTIPROCESSOR + pool_cache_init(&vmem_btag_pool); +#endif +#endif + + vmem_rehash_start(); +} + +static int +vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, vm_flag_t flags, + int spanbttype) +{ + bt_t *btspan; + bt_t *btfree; + + VMEM_ASSERT_LOCKED(vm); + KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + KASSERT(spanbttype == BT_TYPE_SPAN || + spanbttype == BT_TYPE_SPAN_STATIC); + + btspan = bt_alloc(vm, flags); + if (btspan == NULL) { + return ENOMEM; + } + btfree = bt_alloc(vm, flags); + if (btfree == NULL) { + bt_free(vm, btspan); + return ENOMEM; + } + + btspan->bt_type = spanbttype; + btspan->bt_start = addr; + btspan->bt_size = size; + + btfree->bt_type = BT_TYPE_FREE; + btfree->bt_start = addr; + btfree->bt_size = size; + + bt_insseg_tail(vm, btspan); + bt_insseg(vm, btfree, btspan); + bt_insfree(vm, btfree); + vm->vm_size += size; + + return 0; +} + +static void +vmem_destroy1(vmem_t *vm) +{ + +#if defined(QCACHE) + qc_destroy(vm); +#endif /* defined(QCACHE) */ + VMEM_LOCK(vm); + + for (int i = 0; i < vm->vm_hashsize; i++) { + bt_t *bt; + + while ((bt = LIST_FIRST(&vm->vm_hashlist[i])) != NULL) { + KASSERT(bt->bt_type == BT_TYPE_SPAN_STATIC); + LIST_REMOVE(bt, bt_hashlist); + bt_free(vm, bt); + } + } + + /* bt_freetrim() drops the lock. 
*/ + bt_freetrim(vm, 0); + if (vm->vm_hashlist != &vm->vm_hash0) { + free(vm->vm_hashlist, M_DEVBUF, + sizeof(*vm->vm_hashlist) * vm->vm_hashsize); + } + + VMEM_LOCK_DESTROY(vm); + free(vm, M_DEVBUF, sizeof(*vm)); +} + +static int +vmem_import(vmem_t *vm, vmem_size_t size, vm_flag_t flags) +{ + vmem_addr_t addr; + int rc; + + VMEM_ASSERT_LOCKED(vm); + + if (vm->vm_importfn == NULL) { + return EINVAL; + } + + if (vm->vm_flags & VM_LARGEIMPORT) { + size *= 16; + } + + VMEM_UNLOCK(vm); + rc = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr); + VMEM_LOCK(vm); + + if (rc) { + return ENOMEM; + } + + if (vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN) != 0) { + VMEM_UNLOCK(vm); + (*vm->vm_releasefn)(vm->vm_arg, addr, size); + VMEM_LOCK(vm); + return ENOMEM; + } + + return 0; +} + +#if defined(_KERNEL) +static int +vmem_rehash(vmem_t *vm, size_t newhashsize, vm_flag_t flags) +{ + bt_t *bt; + int i; + struct vmem_hashlist *newhashlist; + struct vmem_hashlist *oldhashlist; + size_t oldhashsize; + + KASSERT(newhashsize > 0); + + /* Round hash size up to a power of 2. */ + newhashsize = 1 << fls(newhashsize); + + newhashlist = mallocarray(newhashsize, sizeof(*newhashlist), + M_DEVBUF, ISSET(flags, VM_SLEEP) ? M_WAITOK : M_NOWAIT); + if (newhashlist == NULL) { + return ENOMEM; + } + for (i = 0; i < newhashsize; i++) { + LIST_INIT(&newhashlist[i]); + } + + VMEM_LOCK(vm); + /* Decay back to a small hash slowly. */ + if (vm->vm_maxbusytag >= 2) { + vm->vm_maxbusytag = vm->vm_maxbusytag / 2 - 1; + if (vm->vm_nbusytag > vm->vm_maxbusytag) { + vm->vm_maxbusytag = vm->vm_nbusytag; + } + } else { + vm->vm_maxbusytag = vm->vm_nbusytag; + } + oldhashlist = vm->vm_hashlist; + oldhashsize = vm->vm_hashsize; + vm->vm_hashlist = newhashlist; + vm->vm_hashsize = newhashsize; + vm->vm_hashmask = newhashsize - 1; + if (oldhashlist == NULL) { + VMEM_UNLOCK(vm); + return 0; + } + for (i = 0; i < oldhashsize; i++) { + while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) { + bt_rembusy(vm, bt); /* XXX */ + bt_insbusy(vm, bt); + } + } + VMEM_UNLOCK(vm); + + if (oldhashlist != &vm->vm_hash0) { + free(oldhashlist, M_DEVBUF, + sizeof(*oldhashlist) * oldhashsize); + } + + return 0; +} +#endif /* _KERNEL */ + +/* + * vmem_fit: check if a bt can satisfy the given restrictions. + * + * it's a caller's responsibility to ensure the region is big enough + * before calling us. + */ + +static int +vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align, + vmem_size_t phase, vmem_size_t nocross, + vmem_addr_t minaddr, vmem_addr_t maxaddr, vmem_addr_t *addrp) +{ + vmem_addr_t start; + vmem_addr_t end; + + KASSERT(size > 0); + KASSERT(bt->bt_size >= size); /* caller's responsibility */ + + /* + * XXX assumption: vmem_addr_t and vmem_size_t are + * unsigned integer of the same size. 
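+ *
+ * e.g. with align = 0x10 and phase = 0x4, a candidate start of 0x1003
+ * is rounded below to VMEM_ALIGNUP(0x1003 - 0x4, 0x10) + 0x4 =
+ * 0x1000 + 0x4 = 0x1004, so (start & (align - 1)) == phase holds
+ * (values chosen purely for illustration).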
+ */ + + start = bt->bt_start; + if (start < minaddr) { + start = minaddr; + } + end = BT_END(bt); + if (end > maxaddr) { + end = maxaddr; + } + if (start > end) { + return ENOMEM; + } + + start = VMEM_ALIGNUP(start - phase, align) + phase; + if (start < bt->bt_start) { + start += align; + } + if (VMEM_CROSS_P(start, start + size - 1, nocross)) { + KASSERT(align < nocross); + start = VMEM_ALIGNUP(start - phase, nocross) + phase; + } + if (start <= end && end - start >= size - 1) { + KASSERT((start & (align - 1)) == phase); + KASSERT(!VMEM_CROSS_P(start, start + size - 1, nocross)); + KASSERT(minaddr <= start); + KASSERT(maxaddr == 0 || start + size - 1 <= maxaddr); + KASSERT(bt->bt_start <= start); + KASSERT(BT_END(bt) - start >= size - 1); + *addrp = start; + return 0; + } + return ENOMEM; +} + +/* ---- vmem API */ + +/* + * vmem_init: creates a vmem arena. + */ + +vmem_t * +vmem_init(vmem_t *vm, const char *name, + vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, + vmem_import_t *importfn, vmem_release_t *releasefn, + vmem_t *arg, vmem_size_t qcache_max, vm_flag_t flags, int ipl) +{ + int i; + KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + KASSERT(quantum > 0); + KASSERT(powerof2(quantum)); + + /* + * If private tags are going to be used, they must + * be added to the arena before the first span is + * added. + */ + KASSERT((flags & VM_PRIVTAGS) == 0 || size == 0); + +#if defined(_KERNEL) + /* XXX: SMP, we get called early... */ + if (!vmem_bootstrapped) { + vmem_bootstrap(); + vmem_bootstrapped = 1; + } +#endif /* defined(_KERNEL) */ + + if (vm == NULL) { + vm = malloc(sizeof(*vm), M_DEVBUF, M_WAITOK|M_CANFAIL); + } + if (vm == NULL) { + return NULL; + } + + VMEM_LOCK_INIT(vm, ipl); + vm->vm_flags = flags; + vm->vm_nfreetags = 0; + LIST_INIT(&vm->vm_freetags); + strlcpy(vm->vm_name, name, sizeof(vm->vm_name)); + vm->vm_quantum_mask = quantum - 1; + vm->vm_quantum_shift = SIZE2ORDER(quantum); + KASSERT(ORDER2SIZE(vm->vm_quantum_shift) == quantum); + vm->vm_importfn = importfn; + vm->vm_releasefn = releasefn; + vm->vm_arg = arg; + vm->vm_nbusytag = 0; + vm->vm_maxbusytag = 0; + vm->vm_size = 0; + vm->vm_inuse = 0; +#if defined(QCACHE) + qc_init(vm, qcache_max, ipl); +#endif /* defined(QCACHE) */ + + TAILQ_INIT(&vm->vm_seglist); + for (i = 0; i < VMEM_MAXORDER; i++) { + LIST_INIT(&vm->vm_freelist[i]); + } + memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0)); + vm->vm_hashsize = 1; + vm->vm_hashmask = vm->vm_hashsize - 1; + vm->vm_hashlist = &vm->vm_hash0; + + if (size != 0) { + if (vmem_add(vm, base, size, flags) != 0) { + vmem_destroy1(vm); + return NULL; + } + } + +#if defined(_KERNEL) + if (flags & VM_BOOTSTRAP) { + bt_refill(vm); + } + + rw_enter_write(&vmem_list_lock); + LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist); + rw_exit_write(&vmem_list_lock); +#endif /* defined(_KERNEL) */ + + return vm; +} + + + +/* + * vmem_create: create an arena. + * + * => must not be called from interrupt context. 
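+ *
+ * => minimal usage sketch (mirrors the UNITTEST code at the bottom of
+ *    this file; names and sizes are purely illustrative):
+ *
+ *	vm = vmem_create("example", 0, 0, 1, NULL, NULL, NULL, 0,
+ *	    VM_SLEEP, IPL_NONE);
+ *	vmem_add(vm, base, size, VM_SLEEP);
+ *	vmem_alloc(vm, 64, VM_INSTANTFIT | VM_SLEEP, &addr);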
+ */ + +vmem_t * +vmem_create(const char *name, vmem_addr_t base, vmem_size_t size, + vmem_size_t quantum, vmem_import_t *importfn, vmem_release_t *releasefn, + vmem_t *source, vmem_size_t qcache_max, vm_flag_t flags, int ipl) +{ + return vmem_init(NULL, name, base, size, quantum, + importfn, releasefn, source, qcache_max, flags, ipl); +} + +void +vmem_destroy(vmem_t *vm) +{ + +#if defined(_KERNEL) + rw_enter_write(&vmem_list_lock); + LIST_REMOVE(vm, vm_alllist); + rw_exit_write(&vmem_list_lock); +#endif /* defined(_KERNEL) */ + + vmem_destroy1(vm); +} + +vmem_size_t +vmem_roundup_size(vmem_t *vm, vmem_size_t size) +{ + + return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask; +} + +/* + * vmem_alloc: allocate resource from the arena. + */ + +int +vmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags, vmem_addr_t *addrp) +{ + int error; + + KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0); + + KASSERT(size > 0); + KASSERT(!ISSET(flags, VM_BESTFIT) != !ISSET(flags, VM_INSTANTFIT)); +#if 0 + if ((flags & VM_SLEEP) != 0) { + ASSERT_SLEEPABLE(); + } +#endif + +#if defined(QCACHE) + if (size <= vm->vm_qcache_max) { + void *p; + int qidx = (size + vm->vm_quantum_mask) >> vm->vm_quantum_shift; + qcache_t *qc = vm->vm_qcache[qidx - 1]; + + p = pool_cache_get(qc->qc_cache, vmf_to_prf(flags)); + if (addrp != NULL) + *addrp = (vmem_addr_t)p; + error = (p == NULL) ? ENOMEM : 0; + goto out; + } +#endif /* defined(QCACHE) */ + + error = vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, + flags, addrp); +#if defined(QCACHE) + out: +#endif /* defined(QCACHE) */ + KASSERTMSG(error || addrp == NULL || + (*addrp & vm->vm_quantum_mask) == 0, + "vmem %s mask=0x%llx addr=0x%llx", vm->vm_name, + (unsigned long long)vm->vm_quantum_mask, + (unsigned long long)*addrp); + KASSERT(error == 0 || (flags & VM_SLEEP) == 0); + return error; +} + +int +vmem_xalloc_addr(vmem_t *vm, const vmem_addr_t addr, const vmem_size_t size, + vm_flag_t flags) +{ + vmem_addr_t result; + int error; + + KASSERT((addr & vm->vm_quantum_mask) == 0); + KASSERT(size != 0); + + flags = (flags & ~VM_INSTANTFIT) | VM_BESTFIT; + + error = vmem_xalloc(vm, size, 0, 0, 0, addr, addr + size - 1, + flags, &result); + + KASSERT(error || result == addr); + KASSERT(error == 0 || (flags & VM_SLEEP) == 0); + return error; +} + +int +vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align, + const vmem_size_t phase, const vmem_size_t nocross, + const vmem_addr_t minaddr, const vmem_addr_t maxaddr, const vm_flag_t flags, + vmem_addr_t *addrp) +{ + struct vmem_freelist *list; + struct vmem_freelist *first; + struct vmem_freelist *end; + bt_t *bt; + bt_t *btnew; + bt_t *btnew2; + const vmem_size_t size = vmem_roundup_size(vm, size0); + vm_flag_t strat = flags & VM_FITMASK; + vmem_addr_t start; + int rc; + + KASSERT(size0 > 0); + KASSERT(size > 0); + KASSERT(!ISSET(flags, VM_BESTFIT) != !ISSET(flags, VM_INSTANTFIT)); +#if 0 + if ((flags & VM_SLEEP) != 0) { + ASSERT_SLEEPABLE(); + } +#endif + KASSERT((align & vm->vm_quantum_mask) == 0); + KASSERT((align & (align - 1)) == 0); + KASSERT((phase & vm->vm_quantum_mask) == 0); + KASSERT((nocross & vm->vm_quantum_mask) == 0); + KASSERT((nocross & (nocross - 1)) == 0); + KASSERT(align == 0 || phase < align); + KASSERT(phase == 0 || phase < align); + KASSERT(nocross == 0 || nocross >= size); + KASSERT(minaddr <= maxaddr); + KASSERT(!VMEM_CROSS_P(phase, phase + size - 1, nocross)); + + if (align == 0) { + align = vm->vm_quantum_mask + 1; + } + + /* 
+ * allocate boundary tags before acquiring the vmem lock. + */ + VMEM_LOCK(vm); + btnew = bt_alloc(vm, flags); + if (btnew == NULL) { + VMEM_UNLOCK(vm); + return ENOMEM; + } + btnew2 = bt_alloc(vm, flags); /* XXX not necessary if no restrictions */ + if (btnew2 == NULL) { + bt_free(vm, btnew); + VMEM_UNLOCK(vm); + return ENOMEM; + } + + /* + * choose a free block from which we allocate. + */ +retry_strat: + first = bt_freehead_toalloc(vm, size, strat); + end = &vm->vm_freelist[VMEM_MAXORDER]; +retry: + bt = NULL; + vmem_check(vm); + if (strat == VM_INSTANTFIT) { + /* + * just choose the first block which satisfies our restrictions. + * + * note that we don't need to check the size of the blocks + * because any blocks found on these list should be larger than + * the given size. + */ + for (list = first; list < end; list++) { + bt = LIST_FIRST(list); + if (bt != NULL) { + rc = vmem_fit(bt, size, align, phase, + nocross, minaddr, maxaddr, &start); + if (rc == 0) { + goto gotit; + } + /* + * don't bother to follow the bt_freelist link + * here. the list can be very long and we are + * told to run fast. blocks from the later free + * lists are larger and have better chances to + * satisfy our restrictions. + */ + } + } + } else { /* VM_BESTFIT */ + /* + * we assume that, for space efficiency, it's better to + * allocate from a smaller block. thus we will start searching + * from the lower-order list than VM_INSTANTFIT. + * however, don't bother to find the smallest block in a free + * list because the list can be very long. we can revisit it + * if/when it turns out to be a problem. + * + * note that the 'first' list can contain blocks smaller than + * the requested size. thus we need to check bt_size. + */ + for (list = first; list < end; list++) { + LIST_FOREACH(bt, list, bt_freelist) { + if (bt->bt_size >= size) { + rc = vmem_fit(bt, size, align, phase, + nocross, minaddr, maxaddr, &start); + if (rc == 0) { + goto gotit; + } + } + } + } + } +#if 1 + if (strat == VM_INSTANTFIT) { + strat = VM_BESTFIT; + goto retry_strat; + } +#endif + if (align != vm->vm_quantum_mask + 1 || phase != 0 || nocross != 0) { + + /* + * XXX should try to import a region large enough to + * satisfy restrictions? 
+ */ + + goto fail; + } + /* XXX eeek, minaddr & maxaddr not respected */ + if (vmem_import(vm, size, flags) == 0) { + goto retry; + } + /* XXX */ +#if 0 + if ((flags & VM_SLEEP) != 0) { + vmem_kick_pdaemon(); + VMEM_CONDVAR_WAIT(vm); + goto retry; + } +#endif +fail: + bt_free(vm, btnew); + bt_free(vm, btnew2); + VMEM_UNLOCK(vm); + return ENOMEM; + +gotit: + KASSERT(bt->bt_type == BT_TYPE_FREE); + KASSERT(bt->bt_size >= size); + bt_remfree(vm, bt); + vmem_check(vm); + if (bt->bt_start != start) { + btnew2->bt_type = BT_TYPE_FREE; + btnew2->bt_start = bt->bt_start; + btnew2->bt_size = start - bt->bt_start; + bt->bt_start = start; + bt->bt_size -= btnew2->bt_size; + bt_insfree(vm, btnew2); + bt_insseg(vm, btnew2, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); + btnew2 = NULL; + vmem_check(vm); + } + KASSERT(bt->bt_start == start); + if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) { + /* split */ + btnew->bt_type = BT_TYPE_BUSY; + btnew->bt_start = bt->bt_start; + btnew->bt_size = size; + bt->bt_start = bt->bt_start + size; + bt->bt_size -= size; + bt_insfree(vm, bt); + bt_insseg(vm, btnew, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); + bt_insbusy(vm, btnew); + vmem_check(vm); + } else { + bt->bt_type = BT_TYPE_BUSY; + bt_insbusy(vm, bt); + vmem_check(vm); + bt_free(vm, btnew); + btnew = bt; + } + if (btnew2 != NULL) { + bt_free(vm, btnew2); + } + KASSERT(btnew->bt_size >= size); + btnew->bt_type = BT_TYPE_BUSY; + if (addrp != NULL) + *addrp = btnew->bt_start; + VMEM_UNLOCK(vm); + KASSERTMSG(addrp == NULL || + (*addrp & vm->vm_quantum_mask) == 0, + "vmem %s mask=0x%llx addr=0x%llx", vm->vm_name, + (unsigned long long)vm->vm_quantum_mask, + (unsigned long long)*addrp); + return 0; +} + +/* + * vmem_free: free the resource to the arena. + */ + +void +vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) +{ + + KASSERT(size > 0); + KASSERTMSG((addr & vm->vm_quantum_mask) == 0, + "vmem %s mask=0x%llx addr=0x%llx", vm->vm_name, + (unsigned long long)vm->vm_quantum_mask, + (unsigned long long)addr); + +#if defined(QCACHE) + if (size <= vm->vm_qcache_max) { + int qidx = (size + vm->vm_quantum_mask) >> vm->vm_quantum_shift; + qcache_t *qc = vm->vm_qcache[qidx - 1]; + + pool_cache_put(qc->qc_cache, (void *)addr); + return; + } +#endif /* defined(QCACHE) */ + + vmem_xfree(vm, addr, size); +} + +void +vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) +{ + bt_t *bt; + + KASSERT(size > 0); + KASSERTMSG((addr & vm->vm_quantum_mask) == 0, + "vmem %s mask=0x%llx addr=0x%llx", vm->vm_name, + (unsigned long long)vm->vm_quantum_mask, + (unsigned long long)addr); + + VMEM_LOCK(vm); + + bt = bt_lookupbusy(vm, addr); + KASSERTMSG(bt != NULL, "vmem %s addr 0x%llx size 0x%llx", vm->vm_name, + (unsigned long long)addr, (unsigned long long)size); + KASSERT(bt->bt_start == addr); + KASSERT(bt->bt_size == vmem_roundup_size(vm, size) || + bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask); + + /* vmem_xfree_bt() drops the lock. */ + vmem_xfree_bt(vm, bt); +} + +void +vmem_xfreeall(vmem_t *vm) +{ + bt_t *bt; + +#if defined(QCACHE) + /* This can't be used if the arena has a quantum cache. */ + KASSERT(vm->vm_qcache_max == 0); +#endif /* defined(QCACHE) */ + + for (;;) { + VMEM_LOCK(vm); + TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { + if (bt->bt_type == BT_TYPE_BUSY) + break; + } + if (bt != NULL) { + /* vmem_xfree_bt() drops the lock. 
*/ + vmem_xfree_bt(vm, bt); + } else { + VMEM_UNLOCK(vm); + return; + } + } +} + +static void +vmem_xfree_bt(vmem_t *vm, bt_t *bt) +{ + bt_t *t; + + VMEM_ASSERT_LOCKED(vm); + + KASSERT(bt->bt_type == BT_TYPE_BUSY); + bt_rembusy(vm, bt); + bt->bt_type = BT_TYPE_FREE; + + /* coalesce */ + t = TAILQ_NEXT(bt, bt_seglist); + if (t != NULL && t->bt_type == BT_TYPE_FREE) { + KASSERT(BT_END(bt) < t->bt_start); /* YYY */ + bt_remfree(vm, t); + bt_remseg(vm, t); + bt->bt_size += t->bt_size; + bt_free(vm, t); + } + t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); + if (t != NULL && t->bt_type == BT_TYPE_FREE) { + KASSERT(BT_END(t) < bt->bt_start); /* YYY */ + bt_remfree(vm, t); + bt_remseg(vm, t); + bt->bt_size += t->bt_size; + bt->bt_start = t->bt_start; + bt_free(vm, t); + } + + t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); + KASSERT(t != NULL); + KASSERT(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY); + if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN && + t->bt_size == bt->bt_size) { + vmem_addr_t spanaddr; + vmem_size_t spansize; + + KASSERT(t->bt_start == bt->bt_start); + spanaddr = bt->bt_start; + spansize = bt->bt_size; + bt_remseg(vm, bt); + bt_free(vm, bt); + bt_remseg(vm, t); + bt_free(vm, t); + vm->vm_size -= spansize; + //VMEM_CONDVAR_BROADCAST(vm); + /* bt_freetrim() drops the lock. */ + bt_freetrim(vm, BT_MAXFREE); + (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize); + } else { + bt_insfree(vm, bt); + //VMEM_CONDVAR_BROADCAST(vm); + /* bt_freetrim() drops the lock. */ + bt_freetrim(vm, BT_MAXFREE); + } +} + +/* + * vmem_add: + * + * => caller must ensure appropriate spl, + * if the arena can be accessed from interrupt context. + */ + +int +vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, vm_flag_t flags) +{ + int rv; + + VMEM_LOCK(vm); + rv = vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN_STATIC); + VMEM_UNLOCK(vm); + + return rv; +} + +/* + * vmem_size: information about arenas size + * + * => return free/allocated size in arena + */ +vmem_size_t +vmem_size(vmem_t *vm, int typemask) +{ + + switch (typemask) { + case VMEM_ALLOC: + return vm->vm_inuse; + case VMEM_FREE: + return vm->vm_size - vm->vm_inuse; + case VMEM_FREE|VMEM_ALLOC: + return vm->vm_size; + default: + panic("vmem_size"); + } +} + +/* ---- rehash */ + +#if defined(_KERNEL) +static struct timeout vmem_rehash_tick; +static struct task vmem_rehash_task; +static int vmem_rehash_interval; + +static void +vmem_rehash_all(void *arg) +{ + vmem_t *vm; + + rw_enter_read(&vmem_list_lock); + LIST_FOREACH(vm, &vmem_list, vm_alllist) { + size_t desired; + size_t current; + + desired = READ_ONCE(vm->vm_maxbusytag); + current = READ_ONCE(vm->vm_hashsize); + + if (desired > VMEM_HASHSIZE_MAX) { + desired = VMEM_HASHSIZE_MAX; + } else if (desired < VMEM_HASHSIZE_MIN) { + desired = VMEM_HASHSIZE_MIN; + } + if (desired > current * 2 || desired * 2 < current) { + vmem_rehash(vm, desired, VM_NOSLEEP); + } + } + rw_exit_read(&vmem_list_lock); + +} + +static void +vmem_rehash_add(void *arg) +{ + timeout_add_sec(&vmem_rehash_tick, vmem_rehash_interval); + task_add(systqmp, &vmem_rehash_task); +} + +void +vmem_rehash_start(void) +{ + timeout_set(&vmem_rehash_tick, vmem_rehash_add, NULL); + task_set(&vmem_rehash_task, vmem_rehash_all, NULL); + + vmem_rehash_interval = 10; + timeout_add_sec(&vmem_rehash_tick, vmem_rehash_interval); +} +#endif /* defined(_KERNEL) */ + +/* ---- debug */ + +#if defined(DDB) || defined(UNITTEST) || defined(VMEM_SANITY) + +static void bt_dump(const bt_t *, void (*)(const char *, ...) 
+ __attribute__((__format__(__kprintf__,1,2)))); + +static const char * +bt_type_string(int type) +{ + static const char * const table[] = { + [BT_TYPE_BUSY] = "busy", + [BT_TYPE_FREE] = "free", + [BT_TYPE_SPAN] = "span", + [BT_TYPE_SPAN_STATIC] = "static span", + }; + + if (type >= nitems(table)) { + return "BOGUS"; + } + return table[type]; +} + +static void +bt_dump(const bt_t *bt, void (*pr)(const char *, ...)) +{ + + (*pr)("\t%p: %llu, %llu, %d(%s)\n", + bt, (uint64_t)bt->bt_start, (uint64_t)bt->bt_size, + bt->bt_type, bt_type_string(bt->bt_type)); +} + +static void +vmem_dump(const vmem_t *vm , void (*pr)(const char *, ...)) +{ + const bt_t *bt; + int i; + + (*pr)("vmem %p '%s'\n", vm, vm->vm_name); + TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { + bt_dump(bt, pr); + } + + for (i = 0; i < VMEM_MAXORDER; i++) { + const struct vmem_freelist *fl = &vm->vm_freelist[i]; + + if (LIST_EMPTY(fl)) { + continue; + } + + (*pr)("freelist[%d]\n", i); + LIST_FOREACH(bt, fl, bt_freelist) { + bt_dump(bt, pr); + } + } +} + +#endif /* defined(DDB) || defined(UNITTEST) || defined(VMEM_SANITY) */ + +#if defined(DDB) +static bt_t * +vmem_whatis_lookup(vmem_t *vm, uintptr_t addr) +{ + bt_t *bt; + + TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { + if (BT_ISSPAN_P(bt)) { + continue; + } + if (bt->bt_start <= addr && addr <= BT_END(bt)) { + return bt; + } + } + + return NULL; +} + +void +vmem_whatis(uintptr_t addr, void (*pr)(const char *, ...)) +{ + vmem_t *vm; + + LIST_FOREACH(vm, &vmem_list, vm_alllist) { + bt_t *bt; + + bt = vmem_whatis_lookup(vm, addr); + if (bt == NULL) { + continue; + } + (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n", + (void *)addr, (void *)bt->bt_start, + (size_t)(addr - bt->bt_start), vm->vm_name, + (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free"); + } +} + +void +vmem_printall(const char *modif, void (*pr)(const char *, ...)) +{ + const vmem_t *vm; + + LIST_FOREACH(vm, &vmem_list, vm_alllist) { + vmem_dump(vm, pr); + } +} + +void +vmem_print(uintptr_t addr, const char *modif, void (*pr)(const char *, ...)) +{ + const vmem_t *vm = (const void *)addr; + + vmem_dump(vm, pr); +} +#endif /* defined(DDB) */ + +#if defined(_KERNEL) +#define vmem_printf printf +#else +#include +#include + +static void +vmem_printf(const char *fmt, ...) 
+{ + va_list ap; + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} +#endif + +#if defined(VMEM_SANITY) + +static bool +vmem_check_sanity(vmem_t *vm) +{ + const bt_t *bt, *bt2; + + KASSERT(vm != NULL); + + TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { + if (bt->bt_start > BT_END(bt)) { + printf("corrupted tag\n"); + bt_dump(bt, vmem_printf); + return false; + } + } + TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { + TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) { + if (bt == bt2) { + continue; + } + if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) { + continue; + } + if (bt->bt_start <= BT_END(bt2) && + bt2->bt_start <= BT_END(bt)) { + printf("overwrapped tags\n"); + bt_dump(bt, vmem_printf); + bt_dump(bt2, vmem_printf); + return false; + } + } + } + + return true; +} + +static void +vmem_check(vmem_t *vm) +{ + + if (!vmem_check_sanity(vm)) { + panic("insanity vmem %p", vm); + } +} + +#endif /* defined(VMEM_SANITY) */ + +#if defined(UNITTEST) +int +main(void) +{ + int rc; + vmem_t *vm; + vmem_addr_t p; + struct reg { + vmem_addr_t p; + vmem_size_t sz; + bool x; + } *reg = NULL; + int nreg = 0; + int nalloc = 0; + int nfree = 0; + vmem_size_t total = 0; +#if 1 + vm_flag_t strat = VM_INSTANTFIT; +#else + vm_flag_t strat = VM_BESTFIT; +#endif + + vm = vmem_create("test", 0, 0, 1, NULL, NULL, NULL, 0, VM_SLEEP, +#ifdef _KERNEL + IPL_NONE +#else + 0 +#endif + ); + if (vm == NULL) { + printf("vmem_create\n"); + exit(EXIT_FAILURE); + } + vmem_dump(vm, vmem_printf); + + rc = vmem_add(vm, 0, 50, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 100, 200, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 2000, 1, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 40000, 65536, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 10000, 10000, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 500, 1000, VM_SLEEP); + assert(rc == 0); + rc = vmem_add(vm, 0xffffff00, 0x100, VM_SLEEP); + assert(rc == 0); + rc = vmem_xalloc(vm, 0x101, 0, 0, 0, + 0xffffff00, 0xffffffff, strat|VM_SLEEP, &p); + assert(rc != 0); + rc = vmem_xalloc(vm, 50, 0, 0, 0, 0, 49, strat|VM_SLEEP, &p); + assert(rc == 0 && p == 0); + vmem_xfree(vm, p, 50); + rc = vmem_xalloc(vm, 25, 0, 0, 0, 0, 24, strat|VM_SLEEP, &p); + assert(rc == 0 && p == 0); + rc = vmem_xalloc(vm, 0x100, 0, 0, 0, + 0xffffff01, 0xffffffff, strat|VM_SLEEP, &p); + assert(rc != 0); + rc = vmem_xalloc(vm, 0x100, 0, 0, 0, + 0xffffff00, 0xfffffffe, strat|VM_SLEEP, &p); + assert(rc != 0); + rc = vmem_xalloc(vm, 0x100, 0, 0, 0, + 0xffffff00, 0xffffffff, strat|VM_SLEEP, &p); + assert(rc == 0); + vmem_dump(vm, vmem_printf); + for (;;) { + struct reg *r; + int t = rand() % 100; + + if (t > 45) { + /* alloc */ + vmem_size_t sz = rand() % 500 + 1; + bool x; + vmem_size_t align, phase, nocross; + vmem_addr_t minaddr, maxaddr; + + if (t > 70) { + x = true; + /* XXX */ + align = 1 << (rand() % 15); + phase = rand() % 65536; + nocross = 1 << (rand() % 15); + if (align <= phase) { + phase = 0; + } + if (VMEM_CROSS_P(phase, phase + sz - 1, + nocross)) { + nocross = 0; + } + do { + minaddr = rand() % 50000; + maxaddr = rand() % 70000; + } while (minaddr > maxaddr); + printf("=== xalloc %" PRIu64 + " align=%" PRIu64 ", phase=%" PRIu64 + ", nocross=%" PRIu64 ", min=%" PRIu64 + ", max=%" PRIu64 "\n", + (uint64_t)sz, + (uint64_t)align, + (uint64_t)phase, + (uint64_t)nocross, + (uint64_t)minaddr, + (uint64_t)maxaddr); + rc = vmem_xalloc(vm, sz, align, phase, nocross, + minaddr, maxaddr, strat|VM_SLEEP, &p); + } else { + x = false; + printf("=== alloc %" PRIu64 "\n", (uint64_t)sz); + rc = 
vmem_alloc(vm, sz, strat|VM_SLEEP, &p);
+			}
+			printf("-> %" PRIu64 "\n", (uint64_t)p);
+			vmem_dump(vm, vmem_printf);
+			if (rc != 0) {
+				if (x) {
+					continue;
+				}
+				break;
+			}
+			nreg++;
+			reg = realloc(reg, sizeof(*reg) * nreg);
+			r = &reg[nreg - 1];
+			r->p = p;
+			r->sz = sz;
+			r->x = x;
+			total += sz;
+			nalloc++;
+		} else if (nreg != 0) {
+			/* free */
+			r = &reg[rand() % nreg];
+			printf("=== free %" PRIu64 ", %" PRIu64 "\n",
+			    (uint64_t)r->p, (uint64_t)r->sz);
+			if (r->x) {
+				vmem_xfree(vm, r->p, r->sz);
+			} else {
+				vmem_free(vm, r->p, r->sz);
+			}
+			total -= r->sz;
+			vmem_dump(vm, vmem_printf);
+			*r = reg[nreg - 1];
+			nreg--;
+			nfree++;
+		}
+		printf("total=%" PRIu64 "\n", (uint64_t)total);
+	}
+	fprintf(stderr, "total=%" PRIu64 ", nalloc=%d, nfree=%d\n",
+	    (uint64_t)total, nalloc, nfree);
+	exit(EXIT_SUCCESS);
+}
+#endif /* defined(UNITTEST) */
Index: sys/conf.h
===================================================================
RCS file: /cvs/src/sys/sys/conf.h,v
diff -u -p -r1.163 conf.h
--- sys/conf.h	11 Jun 2024 01:49:17 -0000	1.163
+++ sys/conf.h	6 Sep 2024 11:18:17 -0000
@@ -326,6 +326,21 @@ extern struct cdevsw cdevsw[];
 	(dev_type_stop((*))) enodev, 0, \
 	(dev_type_mmap((*))) enodev }
 
+/* open, close, read, ioctl, poll, kqfilter */
+#define cdev_lltrace_init(c,n) { \
+	.d_open = dev_init(c,n,open), \
+	.d_close = dev_init(c,n,close), \
+	.d_read = dev_init(c,n,read), \
+	.d_write = (dev_type_write((*))) enodev, \
+	.d_ioctl = dev_init(c,n,ioctl), \
+	.d_stop = (dev_type_stop((*))) enodev, \
+	.d_tty = NULL, \
+	.d_mmap = (dev_type_mmap((*))) enodev, \
+	.d_type = 0, \
+	.d_flags = 0, \
+	.d_kqfilter = dev_init(c,n,kqfilter), \
+}
+
 /* open, close, read, write, ioctl, stop, tty, mmap, kqfilter */
 #define cdev_wsdisplay_init(c,n) { \
 	dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \
@@ -615,6 +630,7 @@ cdev_decl(wsmux);
 
 cdev_decl(ksyms);
 cdev_decl(kstat);
+cdev_decl(lltrace);
 
 cdev_decl(bio);
 cdev_decl(vscsi);
Index: sys/lltrace.h
===================================================================
RCS file: sys/lltrace.h
diff -N sys/lltrace.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/lltrace.h	6 Sep 2024 11:18:17 -0000
@@ -0,0 +1,297 @@
+/*	$OpenBSD$ */
+
+/*
+ * Copyright (c) 2022 David Gwynne <dlg@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _SYS_LLTRACE_H_
+#define _SYS_LLTRACE_H_
+
+/*
+ * lltrace is heavily based on KUTrace (kernel/userland tracing) by
+ * Richard L. Sites.
+ */
+
+#define LLTRACE_NSLOTS		8192
+
+struct lltrace_buffer {
+	uint64_t		llt_slots[LLTRACE_NSLOTS];
+};
+
+#define LLTIOCSTART		_IO('t',128)
+#define LLTIOCSTOP		_IO('t',129)
+#define LLTIOCFLUSH		_IO('t',130)
+
+/*
+ * trace until all the buffers are used, or trace and reuse buffers.
+ */ +#define LLTRACE_MODE_HEAD 0 +#define LLTRACE_MODE_TAIL 1 +#define LLTRACE_MODE_COUNT 2 + +#define LLTIOCSMODE _IOW('t', 131, unsigned int) +#define LLTIOCGMODE _IOR('t', 131, unsigned int) + +/* + * how much memory in MB to allocate for lltrace_buffer structs + * during tracing. + */ + +#define LLTRACE_BLEN_MIN 1 +#define LLTRACE_BLEN_MAX 128 + +#define LLTIOCSBLEN _IOW('t', 132, unsigned int) +#define LLTIOCGBLEN _IOR('t', 132, unsigned int) + +/* + * lltrace collects kernel events in per-CPU buffers. + */ + +/* + * The first 8 words of the per-CPU buffer are dedicated to metadata + * about the CPU and the period of time over which events were + * collected. + */ + +struct lltrace_header { + /* slots[0] */ + uint32_t h_cpu; + uint32_t h_idletid; + + /* slots[1] */ + uint64_t h_boottime; + + /* slots[2] */ + uint64_t h_start_cy; + /* slots[3] */ + uint64_t h_start_ns; + /* slots[4] */ + uint64_t h_end_cy; + /* slots[5] */ + uint64_t h_end_ns; + + /* slots[6] */ + uint32_t h_pid; + uint32_t h_tid; + /* slots[7] */ + uint64_t h_zero; +}; + +#define LLTRACE_MASK(_w) ((1ULL << (_w)) - 1) + +#define LLTRACE_TYPE_SHIFT 0 +#define LLTRACE_TYPE_WIDTH 3 +#define LLTRACE_TYPE_MASK LLTRACE_MASK(LLTRACE_TYPE_WIDTH) + +#define LLTRACE_TYPE_ID 0x0ULL +#define LLTRACE_TYPE_EVENT 0x1ULL +#define LLTRACE_TYPE_LOCKING 0x2ULL + +#define LLTRACE_LEN_SHIFT (LLTRACE_TYPE_SHIFT + LLTRACE_TYPE_WIDTH) +#define LLTRACE_LEN_WIDTH 3 +#define LLTRACE_LEN_MASK LLTRACE_MASK(LLTRACE_LEN_WIDTH) + +/* most records have a timestamp */ +#define LLTRACE_TS_TYPES ( \ + (1 << LLTRACE_TYPE_EVENT) | \ + (1 << LLTRACE_TYPE_LOCKING) \ + ) + +#define LLTRACE_TS_SHIFT (LLTRACE_LEN_SHIFT + LLTRACE_LEN_WIDTH) +#define LLTRACE_TS_WIDTH 20 +#define LLTRACE_TS_MASK LLTRACE_MASK(20) + +/* + * id records + */ + +/* tid record contains pid and kthread flag, followed by proc name */ +#define LLTRACE_ID_TYPE_SHIFT (LLTRACE_LEN_SHIFT + LLTRACE_LEN_WIDTH) +#define LLTRACE_ID_TYPE_WIDTH 6 +#define LLTRACE_ID_TYPE_MASK LLTRACE_MASK(3) +#define LLTRACE_ID_TYPE_TID 0x0 + +#define LLTRACE_ID_TID_SHIFT (LLTRACE_ID_TYPE_SHIFT + LLTRACE_ID_TYPE_WIDTH) +#define LLTRACE_ID_TID_WIDTH 20 /* >= than 19 bit TID_MASK */ +#define LLTRACE_ID_TID_MASK LLTRACE_MASK(LLTRACE_ID_TID_WIDTH) + +#define LLTRACE_ID_TID_PID_SHIFT 32 +#define LLTRACE_ID_TID_PID_WIDTH 20 /* >= whatever kernel pid range is */ +#define LLTRACE_ID_TID_PID_MASK LLTRACE_MASK(LLTRACE_ID_TID_PID_WIDTH) +#define LLTRACE_ID_TID_SYSTEM (1ULL << 63) /* kernel thread */ + +/* + * event records + */ + +#define LLTRACE_EVENT_PHASE_SHIFT (LLTRACE_TS_SHIFT + LLTRACE_TS_WIDTH) +#define LLTRACE_EVENT_PHASE_WIDTH 2 +#define LLTRACE_EVENT_PHASE_MASK LLTRACE_MASK(LLTRACE_EVENT_PHASE_WIDTH) +#define LLTRACE_EVENT_PHASE_INSTANT 0x0 +#define LLTRACE_EVENT_PHASE_START 0x1 +#define LLTRACE_EVENT_PHASE_STEP 0x2 +#define LLTRACE_EVENT_PHASE_END 0x3 + +#define LLTRACE_EVENT_CLASS_WIDTH 4 +#define LLTRACE_EVENT_CLASS_SHIFT \ + (LLTRACE_EVENT_PHASE_SHIFT + LLTRACE_EVENT_PHASE_WIDTH) +#define LLTRACE_EVENT_CLASS_MASK LLTRACE_MASK(LLTRACE_EVENT_CLASS_WIDTH) +#define LLTRACE_EVENT_CLASS_SYSCALL 0 +#define LLTRACE_EVENT_CLASS_IDLE 1 +#define LLTRACE_EVENT_CLASS_PAGEFAULT 2 +#define LLTRACE_EVENT_CLASS_INTR 3 +#define LLTRACE_EVENT_CLASS_SCHED 4 +#define LLTRACE_EVENT_CLASS_FUNC 5 +#define LLTRACE_EVENT_CLASS_WAKE 6 +#define LLTRACE_EVENT_CLASS_COUNT 7 + +#define LLTRACE_EVENT_DATA_SHIFT \ + (LLTRACE_EVENT_CLASS_SHIFT + LLTRACE_EVENT_CLASS_WIDTH) +#define LLTRACE_EVENT_DATA_SHIFT_CHECK 32 + +#define 
LLTRACE_SYSCALL_SHIFT LLTRACE_EVENT_DATA_SHIFT +#define LLTRACE_SYSCALL_WIDTH 10 +#define LLTRACE_SYSCALL_MASK LLTRACE_MASK(LLTRACE_SYSCALL_WIDTH) + +#define LLTRACE_SCHED_TID_SHIFT LLTRACE_EVENT_DATA_SHIFT +#define LLTRACE_SCHED_TID_WIDTH LLTRACE_ID_TID_WIDTH +#define LLTRACE_SCHED_TID_MASK LLTRACE_MASK(LLTRACE_SCHED_TID_WIDTH) +#define LLTRACE_SCHED_STATE_SHIFT \ + (LLTRACE_EVENT_DATA_SHIFT + LLTRACE_ID_TID_WIDTH) +#define LLTRACE_SCHED_STATE_WIDTH 4 +#define LLTRACE_SCHED_STATE_MASK LLTRACE_MASK(LLTRACE_SCHED_STATE_WIDTH) +#define LLTRACE_SCHED_STATE_NEW 0 +#define LLTRACE_SCHED_STATE_RUNNING 1 +#define LLTRACE_SCHED_STATE_SUSPENDED 2 +#define LLTRACE_SCHED_STATE_BLOCKED 3 +#define LLTRACE_SCHED_STATE_DYING 4 +#define LLTRACE_SCHED_STATE_DEAD 5 + +#define LLTRACE_SYSCALL_V_SHIFT \ + (LLTRACE_SYSCALL_SHIFT + LLTRACE_SYSCALL_WIDTH) + +#define LLTRACE_INTR_T_SHIFT LLTRACE_EVENT_DATA_SHIFT +#define LLTRACE_INTR_T_WIDTH 2 +#define LLTRACE_INTR_T_MASK LLTRACE_MASK(LLTRACE_INTR_T_WIDTH) +#define LLTRACE_INTR_T_HW 0ULL +#define LLTRACE_INTR_T_SW 1ULL +#define LLTRACE_INTR_T_IPI 2ULL +#define LLTRACE_INTR_T_CLOCK 3ULL + +#define LLTRACE_INTR_DATA_SHIFT \ + (LLTRACE_INTR_T_SHIFT + LLTRACE_INTR_T_WIDTH) + +/* record a count of something */ +#define LLTRACE_COUNT_T_SHIFT LLTRACE_EVENT_DATA_SHIFT +#define LLTRACE_COUNT_T_WIDTH 8 +#define LLTRACE_COUNT_T_MASK LLTRACE_MASK(LLTRACE_COUNT_T_WIDTH) + +#define LLTRACE_COUNT_T_PKTS_IFIQ 0 +#define LLTRACE_COUNT_T_PKTS_NETTQ 1 +#define LLTRACE_COUNT_T_PKTS_IFQ 2 +#define LLTRACE_COUNT_T_PKTS_QDROP 3 +#define LLTRACE_COUNT_T_PKTS_HDROP 4 + +#define LLTRACE_COUNT_V_SHIFT \ + (LLTRACE_COUNT_T_SHIFT + LLTRACE_COUNT_T_WIDTH) + +/* + * locking records + */ + +#define LLTRACE_LK_TYPE_SHIFT (LLTRACE_TS_SHIFT + LLTRACE_TS_WIDTH) +#define LLTRACE_LK_TYPE_WIDTH 2 +#define LLTRACE_LK_TYPE_MASK LLTRACE_MASK(LLTRACE_LK_TYPE_WIDTH) +#define LLTRACE_LK_RW 0x0 +#define LLTRACE_LK_MTX 0x1 +#define LLTRACE_LK_K 0x2 + +#define LLTRACE_LK_PHASE_SHIFT \ + (LLTRACE_LK_TYPE_SHIFT + LLTRACE_LK_TYPE_WIDTH) +#define LLTRACE_LK_PHASE_WIDTH 4 +#define LLTRACE_LK_PHASE_MASK LLTRACE_MASK(LLTRACE_LK_PHASE_WIDTH) +#define LLTRACE_LK_I_EXCL 0x0 /* instantly got wr lock */ +#define LLTRACE_LK_I_SHARED 0x1 /* instantly got rd lock */ +#define LLTRACE_LK_A_START 0x2 /* acquiring lock */ +#define LLTRACE_LK_A_EXCL 0x3 /* acquired wr lock */ +#define LLTRACE_LK_A_SHARED 0x4 /* acquired rd lock */ +#define LLTRACE_LK_A_ABORT 0x5 /* acquire aborted */ +#define LLTRACE_LK_DOWNGRADE 0x6 /* wr to rd lock */ +#define LLTRACE_LK_R_EXCL 0x7 /* released wr lock */ +#define LLTRACE_LK_R_SHARED 0x8 /* released rd lock */ +#define LLTRACE_LK_I_FAIL 0x9 /* try failed */ + +#define LLTRACE_LK_ADDR_SHIFT \ + (LLTRACE_LK_PHASE_SHIFT + LLTRACE_LK_PHASE_WIDTH) + +#ifdef _KERNEL + +struct lltrace_cpu; + +static inline struct lltrace_cpu * +lltrace_enter_spc(struct schedstate_percpu *spc) +{ + return (READ_ONCE(spc->spc_lltrace)); +} + +static inline struct lltrace_cpu * +lltrace_enter_cpu(struct cpu_info *ci) +{ + return lltrace_enter_spc(&ci->ci_schedstate); +} + +static inline struct lltrace_cpu * +lltrace_enter(void) +{ + return lltrace_enter_cpu(curcpu()); +} + +void lltrace_idle(struct lltrace_cpu *, unsigned int); +void lltrace_statclock(struct lltrace_cpu *, int, unsigned long); + +void lltrace_syscall(struct lltrace_cpu *, register_t, + size_t, const register_t *); +void lltrace_sysret(struct lltrace_cpu *, register_t, + int, const register_t [2]); +struct lltrace_cpu * + lltrace_pidname(struct lltrace_cpu 
*, struct proc *); +void lltrace_switch(struct lltrace_cpu *, struct proc *, struct proc *); +void lltrace_sched_enter(struct lltrace_cpu *); +void lltrace_sched_leave(struct lltrace_cpu *); +void lltrace_runnable(struct lltrace_cpu *, struct proc *); + +void lltrace_event_start(struct lltrace_cpu *, unsigned int); +void lltrace_event_end(struct lltrace_cpu *, unsigned int); +void lltrace_count(struct lltrace_cpu *, unsigned int, unsigned int); + +void lltrace_lock(struct lltrace_cpu *, void *, unsigned int, unsigned int); + +void lltrace_pkts(struct lltrace_cpu *, unsigned int, unsigned int); +void lltrace_mark(struct lltrace_cpu *); + +void lltrace_fn_enter(struct lltrace_cpu *, void *); +void lltrace_fn_leave(struct lltrace_cpu *, void *); + +/* MD bits */ + +void lltrace_ipi(struct lltrace_cpu *, unsigned int); +#define lltrace_ipi_bcast(_llt) lltrace_ipi((_llt), ~0U); + +void lltrace_intr_enter(struct lltrace_cpu *, unsigned int, unsigned int); +void lltrace_intr_leave(struct lltrace_cpu *, unsigned int, unsigned int); + +#endif /* _KERNEL */ + +#endif /* _SYS_LLTRACE_H_ */ Index: sys/proc.h =================================================================== RCS file: /cvs/src/sys/sys/proc.h,v diff -u -p -r1.371 proc.h --- sys/proc.h 1 Sep 2024 03:09:00 -0000 1.371 +++ sys/proc.h 6 Sep 2024 11:18:17 -0000 @@ -363,6 +363,7 @@ struct proc { /* scheduling */ int p_cpticks; /* Ticks of cpu time. */ + uint64_t p_wakeid; /* [S] */ const volatile void *p_wchan; /* [S] Sleep address. */ struct timeout p_sleep_to;/* timeout for tsleep() */ const char *p_wmesg; /* [S] Reason for sleep. */ Index: sys/sched.h =================================================================== RCS file: /cvs/src/sys/sys/sched.h,v diff -u -p -r1.73 sched.h --- sys/sched.h 8 Jul 2024 14:46:47 -0000 1.73 +++ sys/sched.h 6 Sep 2024 11:18:17 -0000 @@ -101,11 +101,13 @@ struct cpustats { #define SCHED_NQS 32 /* 32 run queues. */ struct smr_entry; +struct lltrace_cpu; /* * Per-CPU scheduler state. */ struct schedstate_percpu { + struct lltrace_cpu *spc_lltrace; struct proc *spc_idleproc; /* idle proc for this cpu */ TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS]; LIST_HEAD(,proc) spc_deadproc; Index: sys/syscall_mi.h =================================================================== RCS file: /cvs/src/sys/sys/syscall_mi.h,v diff -u -p -r1.35 syscall_mi.h --- sys/syscall_mi.h 1 Sep 2024 03:09:01 -0000 1.35 +++ sys/syscall_mi.h 6 Sep 2024 11:18:17 -0000 @@ -157,6 +157,7 @@ mi_syscall(struct proc *p, register_t co KERNEL_UNLOCK(); } #endif + LLTRACE_CPU(p->p_cpu, lltrace_syscall, code, callp->sy_argsize, argp); /* SP must be within MAP_STACK space */ if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p), @@ -190,6 +191,7 @@ static inline void mi_syscall_return(struct proc *p, register_t code, int error, const register_t retval[2]) { + LLTRACE_CPU(p->p_cpu, lltrace_sysret, code, error, retval); #ifdef SYSCALL_DEBUG KERNEL_LOCK(); scdebug_ret(p, code, error, retval); @@ -217,12 +219,13 @@ mi_syscall_return(struct proc *p, regist static inline void mi_child_return(struct proc *p) { -#if defined(SYSCALL_DEBUG) || defined(KTRACE) || NDT > 0 +#if defined(SYSCALL_DEBUG) || defined(KTRACE) || NDT > 0 || NLLT > 0 int code = (p->p_flag & P_THREAD) ? SYS___tfork : (p->p_p->ps_flags & PS_PPWAIT) ? 
SYS_vfork : SYS_fork;
 	const register_t child_retval[2] = { 0, 1 };
 #endif
+	LLTRACE_CPU(p->p_cpu, lltrace_sysret, code, 0, child_retval);
 
 	TRACEPOINT(sched, on__cpu, NULL);
 
 #ifdef SYSCALL_DEBUG
Index: sys/tracepoint.h
===================================================================
RCS file: /cvs/src/sys/sys/tracepoint.h,v
diff -u -p -r1.2 tracepoint.h
--- sys/tracepoint.h	28 Jun 2022 09:32:28 -0000	1.2
+++ sys/tracepoint.h	6 Sep 2024 11:18:17 -0000
@@ -34,5 +34,35 @@
 #define TRACEINDEX(func, index, args...)
 
 #endif /* NDT > 0 */
+
+#include "llt.h"
+#if NLLT > 0
+#include <sys/lltrace.h>
+
+#define LLTRACE_SPC(_spc, _fn, ...) do { \
+	struct lltrace_cpu *_llt = lltrace_enter_spc((_spc)); \
+	if (_llt != NULL) \
+		(_fn)(_llt, ## __VA_ARGS__); \
+} while (0)
+
+#define LLTRACE_CPU(_ci, _fn, ...) do { \
+	struct lltrace_cpu *_llt = lltrace_enter_cpu((_ci)); \
+	if (_llt != NULL) \
+		(_fn)(_llt, ##__VA_ARGS__); \
+} while (0)
+
+#define LLTRACE(_fn, ...) do { \
+	struct lltrace_cpu *_llt = lltrace_enter(); \
+	if (_llt != NULL) \
+		(_fn)(_llt, ## __VA_ARGS__); \
+} while (0)
+
+#else /* NLLT > 0 */
+
+#define LLTRACE_SPC(_spc, _fn, ...)
+#define LLTRACE_CPU(_ci, _fn, ...)
+#define LLTRACE(_fn, ...)
+
+#endif /* NLLT > 0 */
 #endif /* _KERNEL */
 #endif /* _SYS_TRACEPOINT_H_ */
Index: sys/vmem.h
===================================================================
RCS file: sys/vmem.h
diff -N sys/vmem.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/vmem.h	6 Sep 2024 11:18:17 -0000
@@ -0,0 +1,100 @@
+/*	$NetBSD: vmem.h,v 1.25 2023/12/03 19:34:08 thorpej Exp $	*/
+
+/*-
+ * Copyright (c)2006 YAMAMOTO Takashi,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _SYS_VMEM_H_
+#define _SYS_VMEM_H_
+
+#include <sys/types.h>
+
+#if defined(_KERNEL)
+#else /* defined(_KERNEL) */
+#include <stdbool.h>
+#endif /* defined(_KERNEL) */
+
+typedef struct vmem vmem_t;
+
+typedef unsigned int vm_flag_t;
+
+typedef uintptr_t vmem_addr_t;
+typedef size_t vmem_size_t;
+#define VMEM_ADDR_MIN	0
+#define VMEM_ADDR_MAX	(~(vmem_addr_t)0)
+
+typedef int (vmem_import_t)(vmem_t *, vmem_size_t, vm_flag_t,
+    vmem_addr_t *);
+typedef void (vmem_release_t)(vmem_t *, vmem_addr_t, vmem_size_t);
+
+typedef int (vmem_ximport_t)(vmem_t *, vmem_size_t, vmem_size_t *,
+    vm_flag_t, vmem_addr_t *);
+
+extern vmem_t *kmem_arena;
+extern vmem_t *kmem_meta_arena;
+extern vmem_t *kmem_va_arena;
+
+vmem_t *	vmem_create(const char *, vmem_addr_t, vmem_size_t, vmem_size_t,
+		    vmem_import_t *, vmem_release_t *, vmem_t *, vmem_size_t,
+		    vm_flag_t, int);
+vmem_t *	vmem_xcreate(const char *, vmem_addr_t, vmem_size_t,
+		    vmem_size_t, vmem_ximport_t *, vmem_release_t *, vmem_t *,
+		    vmem_size_t, vm_flag_t, int);
+void		vmem_destroy(vmem_t *);
+int		vmem_alloc(vmem_t *, vmem_size_t, vm_flag_t, vmem_addr_t *);
+void		vmem_free(vmem_t *, vmem_addr_t, vmem_size_t);
+int		vmem_xalloc(vmem_t *, vmem_size_t, vmem_size_t, vmem_size_t,
+		    vmem_size_t, vmem_addr_t, vmem_addr_t, vm_flag_t,
+		    vmem_addr_t *);
+int		vmem_xalloc_addr(vmem_t *, vmem_addr_t, vmem_size_t, vm_flag_t);
+void		vmem_xfree(vmem_t *, vmem_addr_t, vmem_size_t);
+void		vmem_xfreeall(vmem_t *);
+int		vmem_add(vmem_t *, vmem_addr_t, vmem_size_t, vm_flag_t);
+vmem_size_t	vmem_roundup_size(vmem_t *, vmem_size_t);
+vmem_size_t	vmem_size(vmem_t *, int typemask);
+void		vmem_rehash_start(void);
+void		vmem_whatis(uintptr_t, void (*)(const char *, ...)
+		    __attribute__((__format__(__kprintf__,1,2))));
+void		vmem_print(uintptr_t, const char *, void (*)(const char *, ...)
+		    __attribute__((__format__(__kprintf__,1,2))));
+void		vmem_printall(const char *, void (*)(const char *, ...)
+		    __attribute__((__format__(__kprintf__,1,2))));
+
+/* vm_flag_t */
+#define VM_SLEEP	0x00000001
+#define VM_NOSLEEP	0x00000002
+#define VM_INSTANTFIT	0x00001000
+#define VM_BESTFIT	0x00002000
+#define VM_BOOTSTRAP	0x00010000
+#define VM_POPULATING	0x00040000
+#define VM_LARGEIMPORT	0x00080000
+#define VM_XIMPORT	0x00100000
+#define VM_PRIVTAGS	0x00200000
+
+/* vmem_size typemask */
+#define VMEM_ALLOC	0x01
+#define VMEM_FREE	0x02
+
+#endif /* !_SYS_VMEM_H_ */
Index: uvm/uvm_fault.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
diff -u -p -r1.135 uvm_fault.c
--- uvm/uvm_fault.c	5 Sep 2023 05:08:26 -0000	1.135
+++ uvm/uvm_fault.c	6 Sep 2024 11:18:17 -0000
@@ -576,6 +576,8 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
 	struct vm_page *pages[UVM_MAXRANGE];
 	int error;
 
+	LLTRACE(lltrace_event_start, LLTRACE_EVENT_CLASS_PAGEFAULT);
+
 	counters_inc(uvmexp_counters, faults);
 	TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL);
 
@@ -639,6 +641,8 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad
 			}
 		}
 	}
+
+	LLTRACE(lltrace_event_end, LLTRACE_EVENT_CLASS_PAGEFAULT);
 
 	return error;
 }
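
As a rough illustration of how the new lltrace character device is meant to be driven from userland, here is a small sketch. It is not part of the diff and makes a couple of assumptions: that a device node for the new cdevsw entry exists at /dev/lltrace, and that read(2) hands back whole struct lltrace_buffer records once tracing has been stopped. Only the ioctls and slot-layout macros from sys/lltrace.h above are used.

/*
 * Hypothetical userland example, not part of the diff.  The /dev/lltrace
 * path, the read(2)-returns-one-buffer behaviour, and the zero-slot
 * termination are assumptions; the ioctls, the 8-slot header, and the
 * TYPE bit field come from sys/lltrace.h.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/lltrace.h>

#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct lltrace_buffer buf;
	unsigned int mode = LLTRACE_MODE_HEAD;	/* stop when buffers fill */
	unsigned int mb = LLTRACE_BLEN_MIN;	/* MB of trace buffers */
	ssize_t rv;
	int fd;

	fd = open("/dev/lltrace", O_RDWR);	/* assumed device node */
	if (fd == -1)
		err(1, "/dev/lltrace");

	if (ioctl(fd, LLTIOCSMODE, &mode) == -1)
		err(1, "LLTIOCSMODE");
	if (ioctl(fd, LLTIOCSBLEN, &mb) == -1)
		err(1, "LLTIOCSBLEN");

	if (ioctl(fd, LLTIOCSTART) == -1)
		err(1, "LLTIOCSTART");
	sleep(1);				/* trace for a second */
	if (ioctl(fd, LLTIOCSTOP) == -1)
		err(1, "LLTIOCSTOP");

	/* each read is assumed to return one per-CPU buffer */
	while ((rv = read(fd, &buf, sizeof(buf))) == sizeof(buf)) {
		/* slots 0..7 carry the lltrace_header metadata */
		const struct lltrace_header *h =
		    (const struct lltrace_header *)buf.llt_slots;
		size_t i;

		printf("cpu%u pid %u tid %u\n", h->h_cpu, h->h_pid, h->h_tid);

		for (i = 8; i < LLTRACE_NSLOTS; i++) {
			uint64_t slot = buf.llt_slots[i];
			uint64_t type;

			if (slot == 0)		/* assume 0 ends the buffer */
				break;

			type = (slot >> LLTRACE_TYPE_SHIFT) &
			    LLTRACE_TYPE_MASK;
			printf("  slot %zu: type %llu\n", i,
			    (unsigned long long)type);
		}
	}
	if (rv == -1)
		err(1, "read");

	close(fd);
	return (0);
}

The point is only to show the ioctl ordering (mode and buffer size before LLTIOCSTART) and how a record's TYPE field is pulled out of a slot word; a real consumer would go on to decode the EVENT, LOCKING, and ID layouts the same way.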