Index: dev/dt/dt_dev.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
diff -u -p -r1.42 dt_dev.c
--- dev/dt/dt_dev.c	4 Dec 2024 09:37:33 -0000	1.42
+++ dev/dt/dt_dev.c	1 May 2025 08:01:23 -0000
@@ -252,7 +252,7 @@ dtread(dev_t dev, struct uio *uio, int f
 
 	while (!atomic_load_int(&sc->ds_evtcnt)) {
 		sleep_setup(sc, PWAIT | PCATCH, "dtread");
-		error = sleep_finish(0, !atomic_load_int(&sc->ds_evtcnt));
+		error = sleep_finish(INFSLP, !atomic_load_int(&sc->ds_evtcnt));
 		if (error == EINTR || error == ERESTART)
 			break;
 	}
Index: dev/pci/if_myx.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_myx.c,v
diff -u -p -r1.120 if_myx.c
--- dev/pci/if_myx.c	24 May 2024 06:02:56 -0000	1.120
+++ dev/pci/if_myx.c	1 May 2025 08:01:23 -0000
@@ -1395,7 +1395,7 @@ myx_down(struct myx_softc *sc)
 	while (sc->sc_state != MYX_S_OFF) {
 		sleep_setup(sts, PWAIT, "myxdown");
 		membar_consumer();
-		sleep_finish(0, sc->sc_state != MYX_S_OFF);
+		sleep_finish(INFSLP, sc->sc_state != MYX_S_OFF);
 	}
 
 	s = splnet();
Index: kern/kern_exit.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_exit.c,v
diff -u -p -r1.244 kern_exit.c
--- kern/kern_exit.c	14 Apr 2025 09:15:24 -0000	1.244
+++ kern/kern_exit.c	1 May 2025 08:01:27 -0000
@@ -648,7 +651,7 @@ loop:
 		return (0);
 	}
 	sleep_setup(q->p_p, PWAIT | PCATCH, "wait");
-	if ((error = sleep_finish(0,
+	if ((error = sleep_finish(INFSLP,
 	    !ISSET(atomic_load_int(&q->p_p->ps_flags), PS_WAITEVENT))) != 0)
 		return (error);
 	goto loop;
Index: kern/kern_rwlock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
diff -u -p -r1.55 kern_rwlock.c
--- kern/kern_rwlock.c	29 Jan 2025 15:10:09 -0000	1.55
+++ kern/kern_rwlock.c	1 May 2025 08:01:27 -0000
@@ -27,10 +27,9 @@
 #include
 
 #ifdef RWDIAG
-#include <sys/kernel.h>	/* for hz */
-#define RW_SLEEP_TMO	10 * hz
+#define RW_SLEEP_TMO	10000000000ULL	/* 10 seconds */
 #else
-#define RW_SLEEP_TMO	0
+#define RW_SLEEP_TMO	INFSLP
 #endif
 
 /*
Index: kern/kern_sched.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sched.c,v
diff -u -p -r1.104 kern_sched.c
--- kern/kern_sched.c	10 Mar 2025 09:28:56 -0000	1.104
+++ kern/kern_sched.c	1 May 2025 08:01:27 -0000
@@ -692,7 +692,7 @@ sched_stop_secondary_cpus(void)
 			continue;
 		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
 			sleep_setup(spc, PZERO, "schedstate");
-			sleep_finish(0,
+			sleep_finish(INFSLP,
 			    (spc->spc_schedflags & SPCF_HALTED) == 0);
 		}
 	}
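The driver and subsystem changes above are mechanical: every sleep_finish() caller that passed a timeout of 0 to mean "no timeout" now passes INFSLP. They all follow the same wait-loop idiom; a minimal sketch of that idiom, with made-up sc/sc_done names and a hypothetical "scdone" wait message:

	/* wait until another context sets sc->sc_done and calls wakeup(sc) */
	while (!atomic_load_int(&sc->sc_done)) {
		sleep_setup(sc, PWAIT, "scdone");
		/*
		 * The second argument re-checks the condition after
		 * sleep_setup(); if the wakeup already happened,
		 * sleep_finish() returns without sleeping.  INFSLP means
		 * no timeout is armed.
		 */
		sleep_finish(INFSLP, !atomic_load_int(&sc->sc_done));
	}
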
Index: kern/kern_synch.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_synch.c,v
diff -u -p -r1.223 kern_synch.c
--- kern/kern_synch.c	1 May 2025 06:58:21 -0000	1.223
+++ kern/kern_synch.c	1 May 2025 08:01:27 -0000
@@ -111,17 +111,27 @@ extern int safepri;
  * call should be interrupted by the signal (return EINTR).
  */
 int
-tsleep(const volatile void *ident, int priority, const char *wmesg, int timo)
+tsleep_nsec(const volatile void *ident, int priority, const char *wmesg,
+    uint64_t nsecs)
 {
 #ifdef MULTIPROCESSOR
 	int hold_count;
 #endif
 
+#ifdef DIAGNOSTIC
+	if (nsecs == 0) {
+		log(LOG_WARNING,
+		    "%s: %s[%d]: %s: trying to sleep zero nanoseconds\n",
+		    __func__, curproc->p_p->ps_comm, curproc->p_p->ps_pid,
+		    wmesg);
+	}
+#endif
+
 	KASSERT((priority & ~(PRIMASK | PCATCH)) == 0);
-	KASSERT(ident != &nowake || ISSET(priority, PCATCH) || timo != 0);
+	KASSERT(ident != &nowake || ISSET(priority, PCATCH) || nsecs != INFSLP);
 #ifdef MULTIPROCESSOR
-	KASSERT(ident == &nowake || timo || _kernel_lock_held());
+	KASSERT(ident == &nowake || nsecs == INFSLP || _kernel_lock_held());
 #endif
 
 #ifdef DDB
@@ -149,50 +159,21 @@ tsleep(const volatile void *ident, int p
 	}
 
 	sleep_setup(ident, priority, wmesg);
-	return sleep_finish(timo, 1);
+	return sleep_finish(nsecs, 1);
 }
 
 int
-tsleep_nsec(const volatile void *ident, int priority, const char *wmesg,
-    uint64_t nsecs)
+tsleep(const volatile void *ident, int priority, const char *wmesg,
+    int timo)
 {
-	uint64_t to_ticks;
+	uint64_t nsecs = INFSLP;
 
-	if (nsecs == INFSLP)
-		return tsleep(ident, priority, wmesg, 0);
-#ifdef DIAGNOSTIC
-	if (nsecs == 0) {
-		log(LOG_WARNING,
-		    "%s: %s[%d]: %s: trying to sleep zero nanoseconds\n",
-		    __func__, curproc->p_p->ps_comm, curproc->p_p->ps_pid,
-		    wmesg);
-	}
-#endif
-	/*
-	 * We want to sleep at least nsecs nanoseconds worth of ticks.
-	 *
-	 *  - Clamp nsecs to prevent arithmetic overflow.
-	 *
-	 *  - Round nsecs up to account for any nanoseconds that do not
-	 *    divide evenly into tick_nsec, otherwise we'll lose them to
-	 *    integer division in the next step.  We add (tick_nsec - 1)
-	 *    to keep from introducing a spurious tick if there are no
-	 *    such nanoseconds, i.e. nsecs % tick_nsec == 0.
-	 *
-	 *  - Divide the rounded value to a count of ticks.  We divide
-	 *    by (tick_nsec + 1) to discard the extra tick introduced if,
-	 *    before rounding, nsecs % tick_nsec == 1.
-	 *
-	 *  - Finally, add a tick to the result.  We need to wait out
-	 *    the current tick before we can begin counting our interval,
-	 *    as we do not know how much time has elapsed since the
-	 *    current tick began.
-	 */
-	nsecs = MIN(nsecs, UINT64_MAX - tick_nsec);
-	to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1;
-	if (to_ticks > INT_MAX)
-		to_ticks = INT_MAX;
-	return tsleep(ident, priority, wmesg, (int)to_ticks);
+	if (timo < 0)
+		panic("%s: negative timo %d", __func__, timo);
+	if (timo > 0)
+		nsecs = timo * tick_nsec;
+
+	return tsleep_nsec(ident, priority, wmesg, nsecs);
 }
 
 /*
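tsleep_nsec() is now the native interface and tsleep() is reduced to a wrapper that scales its tick count by tick_nsec. Call sites that still think in ticks can be converted with the usual <sys/time.h> helpers; a sketch (sc and "hdwait" are made-up names):

	/* old style: about one second, expressed in ticks */
	error = tsleep(sc, PWAIT | PCATCH, "hdwait", hz);

	/* same thing through the nanosecond interface */
	error = tsleep_nsec(sc, PWAIT | PCATCH, "hdwait", SEC_TO_NSEC(1));

	/* sub-tick waits no longer get rounded up to whole ticks */
	error = tsleep_nsec(sc, PWAIT | PCATCH, "hdwait", MSEC_TO_NSEC(2));
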
@@ -200,8 +181,8 @@ tsleep_nsec(const volatile void *ident,
  * entered the sleep queue we drop the mutex.  After sleeping we re-lock.
  */
 int
-msleep(const volatile void *ident, struct mutex *mtx, int priority,
-    const char *wmesg, int timo)
+msleep_nsec(const volatile void *ident, struct mutex *mtx, int priority,
+    const char *wmesg, uint64_t nsecs)
 {
 	int error, spl;
 #ifdef MULTIPROCESSOR
@@ -209,7 +190,7 @@ msleep(const volatile void *ident, struc
 #endif
 
 	KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
-	KASSERT(ident != &nowake || ISSET(priority, PCATCH) || timo != 0);
+	KASSERT(ident != &nowake || ISSET(priority, PCATCH) || nsecs != INFSLP);
 	KASSERT(mtx != NULL);
 
 #ifdef DDB
@@ -244,7 +225,7 @@ msleep(const volatile void *ident, struc
 		mtx_leave(mtx);
 
 	/* signal may stop the process, release mutex before that */
-	error = sleep_finish(timo, 1);
+	error = sleep_finish(nsecs, 1);
 
 	if ((priority & PNORELOCK) == 0)
 		mtx_enter(mtx);
@@ -253,26 +234,17 @@ msleep(const volatile void *ident, struc
 }
 
 int
-msleep_nsec(const volatile void *ident, struct mutex *mtx, int priority,
-    const char *wmesg, uint64_t nsecs)
+msleep(const volatile void *ident, struct mutex *mtx, int priority,
+    const char *wmesg, int timo)
 {
-	uint64_t to_ticks;
+	uint64_t nsecs = INFSLP;
 
-	if (nsecs == INFSLP)
-		return msleep(ident, mtx, priority, wmesg, 0);
-#ifdef DIAGNOSTIC
-	if (nsecs == 0) {
-		log(LOG_WARNING,
-		    "%s: %s[%d]: %s: trying to sleep zero nanoseconds\n",
-		    __func__, curproc->p_p->ps_comm, curproc->p_p->ps_pid,
-		    wmesg);
-	}
-#endif
-	nsecs = MIN(nsecs, UINT64_MAX - tick_nsec);
-	to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1;
-	if (to_ticks > INT_MAX)
-		to_ticks = INT_MAX;
-	return msleep(ident, mtx, priority, wmesg, (int)to_ticks);
+	if (timo < 0)
+		panic("%s: negative timo %d", __func__, timo);
+	if (timo > 0)
+		nsecs = timo * tick_nsec;
+
+	return msleep_nsec(ident, mtx, priority, wmesg, nsecs);
 }
 
 /*
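msleep_nsec() keeps the old msleep() contract: the mutex is dropped once the thread is queued and re-taken on return unless PNORELOCK was given, and a timeout still shows up as EWOULDBLOCK. A bounded-wait sketch with hypothetical sc_mtx/sc_flags/SCF_READY names:

	mtx_enter(&sc->sc_mtx);
	while (!ISSET(sc->sc_flags, SCF_READY)) {
		/* give up after 500ms instead of waiting forever */
		error = msleep_nsec(&sc->sc_flags, &sc->sc_mtx,
		    PWAIT | PCATCH, "scrdy", MSEC_TO_NSEC(500));
		if (error != 0)
			break;	/* EWOULDBLOCK on timeout, EINTR/ERESTART on signal */
	}
	mtx_leave(&sc->sc_mtx);
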
@@ -280,13 +252,13 @@ msleep_nsec(const volatile void *ident,
  * entered the sleep queue we drop the it.  After sleeping we re-lock.
  */
 int
-rwsleep(const volatile void *ident, struct rwlock *rwl, int priority,
-    const char *wmesg, int timo)
+rwsleep_nsec(const volatile void *ident, struct rwlock *rwl, int priority,
+    const char *wmesg, uint64_t nsecs)
 {
 	int error, status;
 
 	KASSERT((priority & ~(PRIMASK | PCATCH | PNORELOCK)) == 0);
-	KASSERT(ident != &nowake || ISSET(priority, PCATCH) || timo != 0);
+	KASSERT(ident != &nowake || ISSET(priority, PCATCH) || nsecs != INFSLP);
 	KASSERT(ident != rwl);
 	rw_assert_anylock(rwl);
 	status = rw_status(rwl);
@@ -295,7 +267,7 @@ rwsleep(const volatile void *ident, stru
 	rw_exit(rwl);
 
 	/* signal may stop the process, release rwlock before that */
-	error = sleep_finish(timo, 1);
+	error = sleep_finish(nsecs, 1);
 
 	if ((priority & PNORELOCK) == 0)
 		rw_enter(rwl, status);
@@ -304,26 +276,17 @@ rwsleep(const volatile void *ident, stru
 }
 
 int
-rwsleep_nsec(const volatile void *ident, struct rwlock *rwl, int priority,
-    const char *wmesg, uint64_t nsecs)
+rwsleep(const volatile void *ident, struct rwlock *rwl, int priority,
+    const char *wmesg, int timo)
 {
-	uint64_t to_ticks;
+	uint64_t nsecs = INFSLP;
 
-	if (nsecs == INFSLP)
-		return rwsleep(ident, rwl, priority, wmesg, 0);
-#ifdef DIAGNOSTIC
-	if (nsecs == 0) {
-		log(LOG_WARNING,
-		    "%s: %s[%d]: %s: trying to sleep zero nanoseconds\n",
-		    __func__, curproc->p_p->ps_comm, curproc->p_p->ps_pid,
-		    wmesg);
-	}
-#endif
-	nsecs = MIN(nsecs, UINT64_MAX - tick_nsec);
-	to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1;
-	if (to_ticks > INT_MAX)
-		to_ticks = INT_MAX;
-	return rwsleep(ident, rwl, priority, wmesg, (int)to_ticks);
+	if (timo < 0)
+		panic("%s: negative timo %d", __func__, timo);
+	if (timo > 0)
+		nsecs = timo * tick_nsec;
+
+	return rwsleep_nsec(ident, rwl, priority, wmesg, nsecs);
 }
 
 void
@@ -361,16 +324,16 @@ sleep_setup(const volatile void *ident,
 }
 
 int
-sleep_finish(int timo, int do_sleep)
+sleep_finish(uint64_t nsecs, int do_sleep)
 {
 	struct proc *p = curproc;
 	int catch, error = 0, error1 = 0;
 
 	catch = p->p_flag & P_SINTR;
 
-	if (timo != 0) {
-		KASSERT(!ISSET(p->p_flag, P_TIMEOUT|P_TIMEOUTRAN));
-		timeout_add(&p->p_sleep_to, timo);
+	KASSERT(!ISSET(p->p_flag, P_TIMEOUT|P_TIMEOUTRAN));
+	if (nsecs != INFSLP) {
+		timeout_add_nsec(&p->p_sleep_to, nsecs);
 	}
 
 	if (catch != 0) {
@@ -445,7 +408,7 @@ sleep_finish(int timo, int do_sleep)
 	 * to sleep to wait for endtsleep to run, we'd also have to
 	 * take the sched lock, so we'd be spinning against it anyway.
 	 */
-	if (timo != 0 && !timeout_del(&p->p_sleep_to)) {
+	if (nsecs != INFSLP && !timeout_del(&p->p_sleep_to)) {
 		int flag;
 
 		/* Wait for endtsleep timeout to finish running */
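sleep_finish() now takes the duration directly in nanoseconds: INFSLP arms no timeout at all, any other value is handed to timeout_add_nsec() and torn down with timeout_del() afterwards. From a caller's side the two cases look like this (rwsleep_nsec() shown, names hypothetical):

	rw_enter_write(&sc->sc_lock);
	/* unbounded: only wakeup(9) or, with PCATCH, a signal ends the sleep */
	error = rwsleep_nsec(&sc->sc_state, &sc->sc_lock, PWAIT | PCATCH,
	    "scstate", INFSLP);

	/* bounded: returns EWOULDBLOCK if two seconds pass first */
	error = rwsleep_nsec(&sc->sc_state, &sc->sc_lock, PWAIT,
	    "scstate", SEC_TO_NSEC(2));
	rw_exit_write(&sc->sc_lock);
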
@@ -753,14 +716,13 @@ thrsleep(struct proc *p, struct sys___th
 	void *lock = SCARG(uap, lock);
 	const uint32_t *abortp = SCARG(uap, abort);
 	clockid_t clock_id = SCARG(uap, clock_id);
-	uint64_t to_ticks = 0;
+	uint64_t nsecs = INFSLP;
 	int error = 0;
 
 	if (ident == 0)
 		return (EINVAL);
 	if (tsp != NULL) {
 		struct timespec now;
-		uint64_t nsecs;
 
 		if ((error = clock_gettime(p, clock_id, &now)))
 			return (error);
@@ -777,10 +739,7 @@ thrsleep(struct proc *p, struct sys___th
 		}
 
 		timespecsub(tsp, &now, tsp);
-		nsecs = MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP);
-		to_ticks = (nsecs + tick_nsec - 1) / (tick_nsec + 1) + 1;
-		if (to_ticks > INT_MAX)
-			to_ticks = INT_MAX;
+		nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(tsp), MAXTSLP));
 	}
 
 	tsb = (ident == -1) ?
 	    &tsb_shared : thrsleep_bucket(ident);
@@ -810,7 +769,7 @@ thrsleep(struct proc *p, struct sys___th
 	}
 
 	sleep_setup(&entry, PWAIT|PCATCH, "thrsleep");
-	error = sleep_finish(to_ticks, entry.tslp_p != NULL);
+	error = sleep_finish(nsecs, entry.tslp_p != NULL);
 	if (error != 0 || entry.tslp_p != NULL) {
 		mtx_enter(&tsb->tsb_lock);
 		if (entry.tslp_p != NULL)
@@ -997,7 +956,7 @@ refcnt_finalize(struct refcnt *r, const
 	while (refs) {
 		sleep_setup(r, PWAIT, wmesg);
 		refs = atomic_load_int(&r->r_refs);
-		sleep_finish(0, refs);
+		sleep_finish(INFSLP, refs);
 	}
 	TRACEINDEX(refcnt, r->r_traceidx, r, refs, 0);
 	/* Order subsequent loads and stores after refs == 0 load. */
@@ -1047,6 +1006,6 @@ cond_wait(struct cond *c, const char *wm
 	while (wait) {
 		sleep_setup(c, PWAIT, wmesg);
 		wait = atomic_load_int(&c->c_wait);
-		sleep_finish(0, wait);
+		sleep_finish(INFSLP, wait);
 	}
 }
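__thrsleep(2) takes an absolute timespec; the conversion to a relative sleep now stays in nanoseconds and is clamped to the 1..MAXTSLP range instead of being rounded to ticks. Roughly the same computation in isolation, assuming the surrounding thrsleep() context (p, clock_id, tsp, error):

	struct timespec now, ts = *tsp;		/* absolute deadline */
	uint64_t nsecs = INFSLP;

	if ((error = clock_gettime(p, clock_id, &now)) != 0)
		return error;
	if (timespeccmp(&ts, &now, <=))
		return EWOULDBLOCK;		/* deadline already passed */
	timespecsub(&ts, &now, &ts);
	/* never ask for a zero sleep (DIAGNOSTIC logs it), cap at MAXTSLP */
	nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(&ts), MAXTSLP));
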
Index: kern/kern_timeout.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_timeout.c,v
diff -u -p -r1.102 kern_timeout.c
--- kern/kern_timeout.c	1 May 2025 01:43:10 -0000	1.102
+++ kern/kern_timeout.c	1 May 2025 08:01:27 -0000
@@ -1,4 +1,4 @@
-/*	$OpenBSD: kern_timeout.c,v 1.102 2025/05/01 01:43:10 dlg Exp $	*/
+/*	$OpenBSD: kern_timeout.c,v 1.101 2025/01/13 03:21:10 mvs Exp $	*/
 /*
  * Copyright (c) 2001 Thomas Nordin
  * Copyright (c) 2000-2001 Artur Grabowski
@@ -48,6 +48,11 @@
 #include
 #endif
 
+struct timeout_ctx {
+	struct circq *tctx_todo;
+	struct timeout *tctx_running;
+};
+
 /*
  * Locks used to protect global variables in this file:
  *
@@ -74,9 +79,21 @@ struct circq timeout_wheel[BUCKETS];	/*
 struct circq timeout_wheel_kc[BUCKETS];	/* [T] Clock-based timeouts */
 struct circq timeout_new;		/* [T] New, unscheduled timeouts */
 struct circq timeout_todo;		/* [T] Due or needs rescheduling */
+static struct timeout_ctx timeout_ctx_si = {
+	.tctx_todo = &timeout_todo,	/* [I] */
+	.tctx_running = NULL,		/* [T] */
+};
 struct circq timeout_proc;		/* [T] Due + needs process context */
+static struct timeout_ctx timeout_ctx_proc = {
+	.tctx_todo = &timeout_proc,	/* [I] */
+	.tctx_running = NULL,		/* [T] */
+};
 #ifdef MULTIPROCESSOR
 struct circq timeout_proc_mp;		/* [T] Process ctx + no kernel lock */
+static struct timeout_ctx timeout_ctx_proc_mp = {
+	.tctx_todo = &timeout_proc_mp,	/* [I] */
+	.tctx_running = NULL,		/* [T] */
+};
 #endif
 
 time_t timeout_level_width[WHEELCOUNT];	/* [I] Wheel level width (seconds) */
@@ -113,6 +130,14 @@ struct kclock {
 	(elem)->prev = (elem);						\
 } while (0)
 
+#define CIRCQ_INSERT_HEAD(list, elem) do {				\
+	(elem)->next = (list)->next;					\
+	(list)->next->prev = (elem);					\
+	(list)->next = (elem);						\
+	(elem)->prev = (list);						\
+	tostat.tos_pending++;						\
+} while (0)
+
 #define CIRCQ_INSERT_TAIL(list, elem) do {				\
 	(elem)->prev = (list)->prev;					\
 	(elem)->next = (list);						\
@@ -180,7 +205,7 @@ void softclock_thread_mp(void *);
 void timeout_barrier_timeout(void *);
 uint32_t timeout_bucket(const struct timeout *);
 uint32_t timeout_maskwheel(uint32_t, const struct timespec *);
-void timeout_run(struct timeout *);
+void timeout_run(struct timeout_ctx *, struct timeout *);
 
 /*
  * The first thing in a struct timeout is its struct circq, so we
@@ -465,6 +490,7 @@ timeout_del_barrier(struct timeout *to)
 void
 timeout_barrier(struct timeout *to)
 {
+	struct timeout_ctx *tctx;
 	struct timeout barrier;
 	struct cond c;
 	int flags;
@@ -478,30 +504,31 @@ timeout_barrier(struct timeout *to)
 	cond_init(&c);
 
 	mtx_enter(&timeout_mutex);
-
-	barrier.to_time = ticks;
-	SET(barrier.to_flags, TIMEOUT_ONQUEUE);
 	if (ISSET(flags, TIMEOUT_PROC)) {
 #ifdef MULTIPROCESSOR
 		if (ISSET(flags, TIMEOUT_MPSAFE))
-			CIRCQ_INSERT_TAIL(&timeout_proc_mp, &barrier.to_list);
+			tctx = &timeout_ctx_proc_mp;
 		else
 #endif
-			CIRCQ_INSERT_TAIL(&timeout_proc, &barrier.to_list);
+			tctx = &timeout_ctx_proc;
 	} else
-		CIRCQ_INSERT_TAIL(&timeout_todo, &barrier.to_list);
+		tctx = &timeout_ctx_si;
+
+	if (tctx->tctx_running != to) {
+		mtx_leave(&timeout_mutex);
+		return;
+	}
+	barrier.to_time = ticks;
+	SET(barrier.to_flags, TIMEOUT_ONQUEUE);
+	CIRCQ_INSERT_HEAD(tctx->tctx_todo, &barrier.to_list);
 	mtx_leave(&timeout_mutex);
 
-	if (ISSET(flags, TIMEOUT_PROC)) {
-#ifdef MULTIPROCESSOR
-		if (ISSET(flags, TIMEOUT_MPSAFE))
-			wakeup_one(&timeout_proc_mp);
-		else
-#endif
-			wakeup_one(&timeout_proc);
-	} else
-		softintr_schedule(softclock_si);
+	/*
+	 * We know the relevant timeout context was running something
+	 * and now also has the barrier to run, so we just have to
	 * wait for it to pick up the barrier task now.
+	 */
 
 	cond_wait(&c, "tmobar");
 }
@@ -634,7 +661,7 @@ timeout_hardclock_update(void)
 }
 
 void
-timeout_run(struct timeout *to)
+timeout_run(struct timeout_ctx *tctx, struct timeout *to)
 {
 	void (*fn)(void *);
 	void *arg;
@@ -652,6 +679,7 @@ timeout_run(struct timeout *to)
 	struct process *kcov_process = to->to_process;
 #endif
 
+	tctx->tctx_running = to;
 	mtx_leave(&timeout_mutex);
 	timeout_sync_enter(needsproc);
 #if NKCOV > 0
@@ -663,6 +691,7 @@ timeout_run(struct timeout *to)
 #endif
 	timeout_sync_leave(needsproc);
 	mtx_enter(&timeout_mutex);
+	tctx->tctx_running = NULL;
 }
 
 void
@@ -689,7 +718,7 @@ softclock_process_kclock_timeout(struct
 		CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
 		return;
 	}
-	timeout_run(to);
+	timeout_run(&timeout_ctx_si, to);
 	tostat.tos_run_softclock++;
 }
 
@@ -716,7 +745,7 @@ softclock_process_tick_timeout(struct ti
 		CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
 		return;
 	}
-	timeout_run(to);
+	timeout_run(&timeout_ctx_si, to);
 	tostat.tos_run_softclock++;
 }
 
@@ -783,8 +812,9 @@ softclock_create_thread(void *arg)
 }
 
 static void
-softclock_thread_run(struct circq *todo)
+softclock_thread_run(struct timeout_ctx *tctx)
 {
+	struct circq *todo = tctx->tctx_todo;
 	struct timeout *to;
 
 	for (;;) {
@@ -793,14 +823,14 @@ softclock_thread_run(struct circq *todo)
 		 * at the same time.
 		 */
 		sleep_setup(todo, PSWP, "tmoslp");
-		sleep_finish(0, CIRCQ_EMPTY(todo));
+		sleep_finish(INFSLP, CIRCQ_EMPTY(tctx->tctx_todo));
 
 		mtx_enter(&timeout_mutex);
 		tostat.tos_thread_wakeups++;
 		while (!CIRCQ_EMPTY(todo)) {
 			to = timeout_from_circq(CIRCQ_FIRST(todo));
 			CIRCQ_REMOVE(&to->to_list);
-			timeout_run(to);
+			timeout_run(tctx, to);
 			tostat.tos_run_thread++;
 		}
 		mtx_leave(&timeout_mutex);
@@ -825,7 +855,7 @@ softclock_thread(void *arg)
 	sched_peg_curproc(ci);
 
 	s = splsoftclock();
-	softclock_thread_run(&timeout_proc);
+	softclock_thread_run(&timeout_ctx_proc);
 	splx(s);
 }
 
@@ -836,7 +866,7 @@ softclock_thread_mp(void *arg)
 	KERNEL_ASSERT_LOCKED();
 	KERNEL_UNLOCK();
 
-	softclock_thread_run(&timeout_proc_mp);
+	softclock_thread_run(&timeout_ctx_proc_mp);
 }
 
 #endif /* MULTIPROCESSOR */
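The timeout_barrier() change is behavioural: each softclock context now records which timeout it is executing, and the barrier only queues itself (at the head of that context's todo list) and waits when the handler of the target timeout is running right there; otherwise it returns immediately. The usual teardown pattern does not change, it just stops sleeping needlessly (sketch, hypothetical softc):

	/*
	 * Stop a periodic timeout and make sure its handler is no longer
	 * running before freeing the softc.  timeout_del_barrier() does
	 * timeout_del() and then timeout_barrier(), which with this diff
	 * only blocks if sc_tick_tmo's handler is executing right now.
	 */
	timeout_del_barrier(&sc->sc_tick_tmo);
	free(sc, M_DEVBUF, sizeof(*sc));
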
Index: kern/subr_log.c
===================================================================
RCS file: /cvs/src/sys/kern/subr_log.c,v
diff -u -p -r1.80 subr_log.c
--- kern/subr_log.c	30 Dec 2024 02:46:00 -0000	1.80
+++ kern/subr_log.c	1 May 2025 08:01:27 -0000
@@ -261,7 +261,7 @@ logread(dev_t dev, struct uio *uio, int
 		 * to keep log_mtx as a leaf lock.
 		 */
 		sleep_setup(mbp, LOG_RDPRI | PCATCH, "klog");
-		error = sleep_finish(0, logsoftc.sc_state & LOG_RDWAIT);
+		error = sleep_finish(INFSLP, logsoftc.sc_state & LOG_RDWAIT);
 		mtx_enter(&log_mtx);
 		if (error)
 			goto out;
Index: kern/sys_futex.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_futex.c,v
diff -u -p -r1.22 sys_futex.c
--- kern/sys_futex.c	14 Aug 2023 07:42:34 -0000	1.22
+++ kern/sys_futex.c	1 May 2025 08:01:27 -0000
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include	/* CACHELINESIZE */
 #include
 
 #ifdef KTRACE
@@ -36,41 +37,65 @@
  * Kernel representation of a futex.
  */
 struct futex {
-	LIST_ENTRY(futex)	 ft_list;	/* list of all futexes */
-	TAILQ_HEAD(, proc)	 ft_threads;	/* sleeping queue */
+	TAILQ_ENTRY(futex)	 ft_entry;	/* list of all futexes */
+	struct process		*ft_ps;
 	struct uvm_object	*ft_obj;	/* UVM object */
 	struct vm_amap		*ft_amap;	/* UVM amap */
-	voff_t			 ft_off;	/* UVM offset */
-	unsigned int		 ft_refcnt;	/* # of references */
+	volatile voff_t		 ft_off;	/* UVM offset */
+
+	struct proc * volatile	 ft_proc;
 };
 
-/* Syscall helpers. */
-int	 futex_wait(uint32_t *, uint32_t, const struct timespec *, int);
-int	 futex_wake(uint32_t *, uint32_t, int);
-int	 futex_requeue(uint32_t *, uint32_t, uint32_t *, uint32_t, int);
-
-/* Flags for futex_get(). */
-#define FT_CREATE	0x1	/* Create a futex if it doesn't exist. */
-#define FT_PRIVATE	0x2	/* Futex is process-private. */
+static int
+futex_is_eq(const struct futex *a, const struct futex *b)
+{
+	return (a->ft_off == b->ft_off &&
+	    a->ft_ps == b->ft_ps &&
+	    a->ft_obj == b->ft_obj &&
+	    a->ft_amap == b->ft_amap);
+}
 
-struct futex *futex_get(uint32_t *, int);
-void	 futex_put(struct futex *);
+TAILQ_HEAD(futexen, futex);
 
-/*
- * The global futex lock serializes futex(2) calls so that no wakeup
- * event is lost, and protects all futex lists and futex states.
- */
-struct rwlock ftlock = RWLOCK_INITIALIZER("futex");
-static struct futex_list ftlist_shared =
-	LIST_HEAD_INITIALIZER(ftlist_shared);
-struct pool ftpool;
+struct futex_bucket {
+	struct futexen		fb_list;
+	struct rwlock		fb_lock;
+	uint32_t		fb_id;		/* for lock ordering */
+} __aligned(CACHELINESIZE);
 
+/* Syscall helpers. */
+static int	 futex_wait(struct proc *, uint32_t *, uint32_t,
+		     const struct timespec *, int);
+static int	 futex_wake(struct proc *, uint32_t *, uint32_t, int,
+		     register_t *);
+static int	 futex_requeue(struct proc *, uint32_t *, uint32_t,
+		     uint32_t *, uint32_t, int, register_t *);
+
+/* Flags for futex_get(). kernel private flags sit in FUTEX_OP_MASK space */
+#define FT_PRIVATE	FUTEX_PRIVATE_FLAG	/* Futex is process-private. */
+
+#define FUTEX_BUCKET_BITS	6
+#define FUTEX_BUCKET_SIZE	(1U << FUTEX_BUCKET_BITS)
+#define FUTEX_BUCKET_MASK	(FUTEX_BUCKET_SIZE - 1)
+
+static struct futex_bucket futex_hash[FUTEX_BUCKET_SIZE];
 
 void
 futex_init(void)
 {
-	pool_init(&ftpool, sizeof(struct futex), 0, IPL_NONE,
-	    PR_WAITOK | PR_RWLOCK, "futexpl", NULL);
+	struct futex_bucket *fb;
+	unsigned int i;
+
+	for (i = 0; i < nitems(futex_hash); i++) {
+		fb = &futex_hash[i];
+
+		TAILQ_INIT(&fb->fb_list);
+		rw_init(&fb->fb_lock, "futexlk");
+
+		fb->fb_id = arc4random();
+		fb->fb_id &= ~FUTEX_BUCKET_MASK;
+		fb->fb_id |= i;
+	}
 }
 
 int
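The single global ftlock and the per-process/shared futex lists are replaced by a fixed hash of 64 buckets, each with its own rwlock and padded to a cache line so the locks do not false-share. fb_id keeps the bucket index in its low FUTEX_BUCKET_BITS bits under random upper bits, which gives every bucket a distinct id usable as a lock-ordering key. Illustrative only (the random upper bits differ per boot):

	/* after futex_init(), for any bucket i: */
	KASSERT((futex_hash[i].fb_id & FUTEX_BUCKET_MASK) == i);
	/* e.g. bucket 5 might get 0x93a1f7c5 while bucket 6 gets 0x04482346 */
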
@@ -88,65 +113,51 @@ sys_futex(struct proc *p, void *v, regis
 	uint32_t val = SCARG(uap, val);
 	const struct timespec *timeout = SCARG(uap, timeout);
 	void *g = SCARG(uap, g);
-	int flags = 0;
+	int flags = op & FUTEX_FLAG_MASK;
 	int error = 0;
 
-	if (op & FUTEX_PRIVATE_FLAG)
-		flags |= FT_PRIVATE;
-
-	rw_enter_write(&ftlock);
-	switch (op) {
+	switch (op & FUTEX_OP_MASK) {
 	case FUTEX_WAIT:
-	case FUTEX_WAIT_PRIVATE:
-		error = futex_wait(uaddr, val, timeout, flags);
+		error = futex_wait(p, uaddr, val, timeout, flags);
 		break;
 	case FUTEX_WAKE:
-	case FUTEX_WAKE_PRIVATE:
-		*retval = futex_wake(uaddr, val, flags);
+		error = futex_wake(p, uaddr, val, flags, retval);
 		break;
 	case FUTEX_REQUEUE:
-	case FUTEX_REQUEUE_PRIVATE:
-		*retval = futex_requeue(uaddr, val, g, (u_long)timeout, flags);
+		error = futex_requeue(p, uaddr, val, g,
+		    (u_long)timeout, flags, retval);
 		break;
 	default:
 		error = ENOSYS;
 		break;
 	}
-	rw_exit_write(&ftlock);
 
 	return error;
 }
 
-/*
- * Return an existing futex matching userspace address ``uaddr''.
- *
- * If such futex does not exist and FT_CREATE is given, create it.
- */
-struct futex *
-futex_get(uint32_t *uaddr, int flags)
+static void
+futex_addrs(struct proc *p, struct futex *f, uint32_t *uaddr, int flags)
 {
-	struct proc *p = curproc;
 	vm_map_t map = &p->p_vmspace->vm_map;
 	vm_map_entry_t entry;
 	struct uvm_object *obj = NULL;
 	struct vm_amap *amap = NULL;
 	voff_t off = (vaddr_t)uaddr;
-	struct futex *f;
-	struct futex_list *ftlist = &p->p_p->ps_ftlist;
+	struct process *ps;
 
-	rw_assert_wrlock(&ftlock);
+	if (ISSET(flags, FT_PRIVATE))
+		ps = p->p_p;
+	else {
+		ps = NULL;
 
-	if (!(flags & FT_PRIVATE)) {
 		vm_map_lock_read(map);
 		if (uvm_map_lookup_entry(map, (vaddr_t)uaddr, &entry) &&
 		    entry->inheritance == MAP_INHERIT_SHARE) {
 			if (UVM_ET_ISOBJ(entry)) {
-				ftlist = &ftlist_shared;
 				obj = entry->object.uvm_obj;
 				off = entry->offset +
 				    ((vaddr_t)uaddr - entry->start);
 			} else if (entry->aref.ar_amap) {
-				ftlist = &ftlist_shared;
 				amap = entry->aref.ar_amap;
 				off = ptoa(entry->aref.ar_pageoff) +
 				    ((vaddr_t)uaddr - entry->start);
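sys_futex() now decodes the op word once instead of listing the *_PRIVATE variants in the switch: the low bits (FUTEX_OP_MASK) select the operation, the high bits (FUTEX_FLAG_MASK) travel as flags, and FUTEX_PRIVATE_FLAG doubles as the kernel's FT_PRIVATE. A sketch of the decoding for a private wait:

	int op = FUTEX_WAIT_PRIVATE;		/* FUTEX_WAIT | 0x0080 */
	int flags = op & FUTEX_FLAG_MASK;	/* 0x0080, i.e. FT_PRIVATE */

	switch (op & FUTEX_OP_MASK) {
	case FUTEX_WAIT:	/* hit for both WAIT and WAIT_PRIVATE */
		/* futex_wait(p, uaddr, val, timeout, flags) */
		break;
	}
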
@@ -155,47 +166,47 @@ futex_get(uint32_t *uaddr, int flags)
 		vm_map_unlock_read(map);
 	}
 
-	LIST_FOREACH(f, ftlist, ft_list) {
-		if (f->ft_obj == obj && f->ft_amap == amap &&
-		    f->ft_off == off) {
-			f->ft_refcnt++;
-			break;
-		}
-	}
+	f->ft_ps = ps;
+	f->ft_obj = obj;
+	f->ft_amap = amap;
+	f->ft_off = off;
+}
 
-	if ((f == NULL) && (flags & FT_CREATE)) {
-		/*
-		 * We rely on the rwlock to ensure that no other thread
-		 * create the same futex.
-		 */
-		f = pool_get(&ftpool, PR_WAITOK);
-		TAILQ_INIT(&f->ft_threads);
-		f->ft_obj = obj;
-		f->ft_amap = amap;
-		f->ft_off = off;
-		f->ft_refcnt = 1;
-		LIST_INSERT_HEAD(ftlist, f, ft_list);
-	}
+static inline struct futex_bucket *
+futex_get_bucket(struct futex *f)
+{
+	uint32_t key = f->ft_off >> 3; /* watevs */
+	key ^= key >> FUTEX_BUCKET_BITS;
 
-	return f;
+	return (&futex_hash[key & FUTEX_BUCKET_MASK]);
 }
 
-/*
- * Release a given futex.
- */
-void
-futex_put(struct futex *f)
+static int
+futex_remove(struct futex_bucket *ofb, struct futex *f)
 {
-	rw_assert_wrlock(&ftlock);
+	struct futex_bucket *fb;
+	int rv;
+
+	/*
+	 * REQUEUE can move a futex between buckets, so follow it if needed.
+	 */
 
-	KASSERT(f->ft_refcnt > 0);
+	for (;;) {
+		rw_enter_write(&ofb->fb_lock);
+		fb = futex_get_bucket(f);
+		if (ofb == fb)
+			break;
 
-	--f->ft_refcnt;
-	if (f->ft_refcnt == 0) {
-		KASSERT(TAILQ_EMPTY(&f->ft_threads));
-		LIST_REMOVE(f, ft_list);
-		pool_put(&ftpool, f);
+		rw_exit_write(&ofb->fb_lock);
+		ofb = fb;
 	}
+
+	rv = f->ft_proc != NULL;
+	if (rv)
+		TAILQ_REMOVE(&fb->fb_list, f, ft_entry);
+	rw_exit_write(&fb->fb_lock);
+
+	return (rv);
 }
 
 /*
@@ -203,32 +214,16 @@ futex_put(struct futex *f)
  * ``uaddr''.  Let it sleep for the specified ``timeout'' time, or
  * indefinitely if the argument is NULL.
  */
-int
-futex_wait(uint32_t *uaddr, uint32_t val, const struct timespec *timeout,
-    int flags)
+static int
+futex_wait(struct proc *p, uint32_t *uaddr, uint32_t val,
+    const struct timespec *timeout, int flags)
 {
-	struct proc *p = curproc;
-	struct futex *f;
+	struct futex f;
+	struct futex_bucket *fb;
 	uint64_t nsecs = INFSLP;
 	uint32_t cval;
 	int error;
 
-	/*
-	 * After reading the value a race is still possible but
-	 * we deal with it by serializing all futex syscalls.
-	 */
-	rw_assert_wrlock(&ftlock);
-
-	/*
-	 * Read user space futex value
-	 */
-	if ((error = copyin32(uaddr, &cval)))
-		return error;
-
-	/* If the value changed, stop here. */
-	if (cval != val)
-		return EAGAIN;
-
 	if (timeout != NULL) {
 		struct timespec ts;
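Because futex_requeue() can rewrite a queued futex's key and move it to another bucket, a sleeper that wants to take itself off a list cannot trust the bucket it originally hashed to. futex_remove() therefore locks a bucket, re-derives the bucket from the possibly-updated key and retries until they agree, and its return value tells the caller whether it was still queued. The caller-side meaning in futex_wait(), paraphrased:

	if (error != 0 || f.ft_proc != NULL) {
		/*
		 * We think we timed out or caught a signal, but if
		 * futex_remove() finds ft_proc already cleared, a wake
		 * or requeue beat us to it and the wait counts as woken.
		 */
		if (futex_remove(fb, &f) == 0)
			error = 0;
	}
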
@@ -240,32 +235,74 @@ futex_wait(uint32_t *uaddr, uint32_t val
 #endif
 		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
 			return EINVAL;
+		nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(&ts), MAXTSLP));
 	}
 
-	f = futex_get(uaddr, flags | FT_CREATE);
-	TAILQ_INSERT_TAIL(&f->ft_threads, p, p_fut_link);
-	p->p_futex = f;
-
-	error = rwsleep_nsec(p, &ftlock, PWAIT|PCATCH, "fsleep", nsecs);
-	if (error == ERESTART)
-		error = ECANCELED;
-	else if (error == EWOULDBLOCK) {
-		/* A race occurred between a wakeup and a timeout. */
-		if (p->p_futex == NULL)
-			error = 0;
-		else
-			error = ETIMEDOUT;
+	futex_addrs(p, &f, uaddr, flags);
+	fb = futex_get_bucket(&f);
+
+	f.ft_proc = p;
+	rw_enter_write(&fb->fb_lock);
+	TAILQ_INSERT_TAIL(&fb->fb_list, &f, ft_entry);
+	rw_exit_write(&fb->fb_lock);
+
+	/*
+	 * Read user space futex value
+	 */
+	if ((error = copyin32(uaddr, &cval)) != 0)
+		goto exit;
+
+	/* If the value changed, stop here. */
+	if (cval != val) {
+		error = EAGAIN;
+		goto exit;
 	}
 
+	sleep_setup(&f, PWAIT|PCATCH, "fsleep");
+	error = sleep_finish(nsecs, f.ft_proc != NULL);
 	/* Remove ourself if we haven't been awaken. */
-	if ((f = p->p_futex) != NULL) {
-		p->p_futex = NULL;
-		TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
-		futex_put(f);
+	if (error != 0 || f.ft_proc != NULL) {
+		if (futex_remove(fb, &f) == 0)
+			error = 0;
+
+		switch (error) {
+		case ERESTART:
+			error = ECANCELED;
+			break;
+		case EWOULDBLOCK:
+			error = ETIMEDOUT;
+			break;
+		}
 	}
 
 	return error;
+exit:
+	if (f.ft_proc != NULL)
+		futex_remove(fb, &f);
+	return error;
+}
+
+static void
+futexen_wakeup(struct futexen *fl)
+{
+	struct futex *f, *nf;
+	struct proc *p;
+
+	/*
+	 * take care to avoid referencing f after we set ft_proc
+	 * to NULL (and wake the associated thread up). f is on the
+	 * stack of the thread we're trying to let out of the kernel,
+	 * so it can go away.
+	 */
+
+	SCHED_LOCK();
+	TAILQ_FOREACH_SAFE(f, fl, ft_entry, nf) {
+		p = f->ft_proc;
+		f->ft_proc = NULL;
+		wakeup_proc(p);
+	}
+	SCHED_UNLOCK();
 }
 
 /*
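futex_wait() sleeps on a struct futex that lives on its own kernel stack, so a waker must never touch that memory once the wakeup is published. futexen_wakeup() takes the list private first, then under SCHED_LOCK() reads ft_proc, clears it and wakes the thread; the same loop with the ordering spelled out:

	SCHED_LOCK();
	TAILQ_FOREACH_SAFE(f, fl, ft_entry, nf) {
		p = f->ft_proc;		/* read before publishing the wakeup */
		f->ft_proc = NULL;	/* sleeper's "still queued" test goes false */
		wakeup_proc(p);
		/*
		 * Do not dereference f past this point: it sits on the
		 * woken thread's stack and that thread may already be
		 * returning from futex_wait().
		 */
	}
	SCHED_UNLOCK();
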
@@ -273,46 +310,135 @@ futex_wait(uint32_t *uaddr, uint32_t val
  * ``uaddr'' and requeue at most ``m'' sibling threads on a futex at
  * address ``uaddr2''.
  */
-int
-futex_requeue(uint32_t *uaddr, uint32_t n, uint32_t *uaddr2, uint32_t m,
-    int flags)
+static int
+futex_requeue(struct proc *p, uint32_t *uaddr, uint32_t n,
+    uint32_t *uaddr2, uint32_t m, int flags, register_t *retval)
 {
-	struct futex *f, *g;
-	struct proc *p;
+	struct futexen fl = TAILQ_HEAD_INITIALIZER(fl);
+	struct futex okey, nkey;
+	struct futex *f, *nf, *mf = NULL;
+	struct futex_bucket *ofb, *nfb;
 	uint32_t count = 0;
 
-	rw_assert_wrlock(&ftlock);
+	if (m == 0)
+		return futex_wake(p, uaddr, n, flags, retval);
 
-	f = futex_get(uaddr, flags);
-	if (f == NULL)
-		return 0;
+	futex_addrs(p, &okey, uaddr, flags);
+	ofb = futex_get_bucket(&okey);
+	futex_addrs(p, &nkey, uaddr2, flags);
+	nfb = futex_get_bucket(&nkey);
+
+	if (ofb->fb_id < nfb->fb_id) {
+		rw_enter_write(&ofb->fb_lock);
+		rw_enter_write(&nfb->fb_lock);
+	} else if (ofb->fb_id > nfb->fb_id) {
+		rw_enter_write(&nfb->fb_lock);
+		rw_enter_write(&ofb->fb_lock);
+	} else
+		rw_enter_write(&ofb->fb_lock);
+
+	TAILQ_FOREACH_SAFE(f, &ofb->fb_list, ft_entry, nf) {
+		/* __builtin_prefetch(nf, 1); */
+		KASSERT(f->ft_proc != NULL);
 
-	while ((p = TAILQ_FIRST(&f->ft_threads)) != NULL && (count < (n + m))) {
-		p->p_futex = NULL;
-		TAILQ_REMOVE(&f->ft_threads, p, p_fut_link);
-		futex_put(f);
-
-		if (count < n) {
-			wakeup_one(p);
-		} else if (uaddr2 != NULL) {
-			g = futex_get(uaddr2, FT_CREATE);
-			TAILQ_INSERT_TAIL(&g->ft_threads, p, p_fut_link);
-			p->p_futex = g;
+		if (!futex_is_eq(f, &okey))
+			continue;
+
+		TAILQ_REMOVE(&ofb->fb_list, f, ft_entry);
+		TAILQ_INSERT_TAIL(&fl, f, ft_entry);
+
+		if (++count == n) {
+			mf = nf;
+			break;
 		}
-		count++;
 	}
-	futex_put(f);
+	if (!TAILQ_EMPTY(&fl))
+		futexen_wakeup(&fl);
+
+	/* update matching futexes */
+	if (mf != NULL) {
+		/*
+		 * only iterate from the current entry to the tail
+		 * of the list as it is now in case we're requeueing
+		 * on the end of the same list.
+		 */
+		nf = TAILQ_LAST(&ofb->fb_list, futexen);
+		do {
+			f = mf;
+			mf = TAILQ_NEXT(f, ft_entry);
+			/* __builtin_prefetch(mf, 1); */
+
+			KASSERT(f->ft_proc != NULL);
+
+			if (!futex_is_eq(f, &okey))
+				continue;
+
+			TAILQ_REMOVE(&ofb->fb_list, f, ft_entry);
+			/* it should only be ft_off that changes, but eh */
+			f->ft_ps = nkey.ft_ps;
+			f->ft_obj = nkey.ft_obj;
+			f->ft_amap = nkey.ft_amap;
+			f->ft_off = nkey.ft_off;
+
+			TAILQ_INSERT_TAIL(&nfb->fb_list, f, ft_entry);
+
+			if (--m == 0)
+				break;
+		} while (f != nf);
+	}
+
+	if (ofb->fb_id != nfb->fb_id)
+		rw_exit_write(&nfb->fb_lock);
+	rw_exit_write(&ofb->fb_lock);
 
-	return count;
+	*retval = count;
+	return 0;
 }
 
 /*
  * Wakeup at most ``n'' sibling threads sleeping on a futex at address
  * ``uaddr''.
  */
-int
-futex_wake(uint32_t *uaddr, uint32_t n, int flags)
+static int
+futex_wake(struct proc *p, uint32_t *uaddr, uint32_t n, int flags,
+    register_t *retval)
 {
-	return futex_requeue(uaddr, n, NULL, 0, flags);
+	struct futexen fl = TAILQ_HEAD_INITIALIZER(fl);
+	struct futex key;
+	struct futex *f, *nf;
+	struct futex_bucket *fb;
+	int count = 0;
+
+	if (n == 0) {
+		*retval = 0;
+		return 0;
+	}
+
+	futex_addrs(p, &key, uaddr, flags);
+	fb = futex_get_bucket(&key);
+
+	rw_enter_write(&fb->fb_lock);
+
+	TAILQ_FOREACH_SAFE(f, &fb->fb_list, ft_entry, nf) {
+		/* __builtin_prefetch(nf, 1); */
+		KASSERT(f->ft_proc != NULL);
+
+		if (!futex_is_eq(f, &key))
+			continue;
+
+		TAILQ_REMOVE(&fb->fb_list, f, ft_entry);
+		TAILQ_INSERT_TAIL(&fl, f, ft_entry);
+
+		if (++count == n)
+			break;
+	}
+
+	if (!TAILQ_EMPTY(&fl))
+		futexen_wakeup(&fl);
+
+	rw_exit_write(&fb->fb_lock);
+
+	*retval = count;
+	return 0;
 }
Index: sys/futex.h
===================================================================
RCS file: /cvs/src/sys/sys/futex.h,v
diff -u -p -r1.2 futex.h
--- sys/futex.h	3 Jun 2018 15:09:26 -0000	1.2
+++ sys/futex.h	1 May 2025 08:01:27 -0000
@@ -28,11 +28,15 @@ int futex(volatile uint32_t *, int, int,
 __END_DECLS
 #endif /* ! _KERNEL */
 
+#define FUTEX_OP_MASK		0x007f
+
 #define FUTEX_WAIT		1
 #define FUTEX_WAKE		2
 #define FUTEX_REQUEUE		3
-#define FUTEX_PRIVATE_FLAG	128
+#define FUTEX_FLAG_MASK		0xff80
+
+#define FUTEX_PRIVATE_FLAG	0x0080
 
 #define FUTEX_WAIT_PRIVATE	(FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_PRIVATE	(FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
Index: sys/systm.h
===================================================================
RCS file: /cvs/src/sys/sys/systm.h,v
diff -u -p -r1.171 systm.h
--- sys/systm.h	28 May 2024 12:50:23 -0000	1.171
+++ sys/systm.h	1 May 2025 08:01:27 -0000
@@ -256,7 +256,7 @@ void	start_periodic_resettodr(void);
 void	stop_periodic_resettodr(void);
 
 void	sleep_setup(const volatile void *, int, const char *);
-int	sleep_finish(int, int);
+int	sleep_finish(uint64_t, int);
 void	sleep_queue_init(void);
 
 struct cond;
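When the wait and requeue addresses hash to different buckets, futex_requeue() must hold both bucket locks; taking them in increasing fb_id order (or only once when they are the same bucket) keeps the order globally consistent, so two requeues running in opposite directions cannot deadlock. In isolation, with the names from the diff:

	ofb = futex_get_bucket(&okey);		/* bucket for uaddr */
	nfb = futex_get_bucket(&nkey);		/* bucket for uaddr2 */

	if (ofb->fb_id < nfb->fb_id) {
		rw_enter_write(&ofb->fb_lock);
		rw_enter_write(&nfb->fb_lock);
	} else if (ofb->fb_id > nfb->fb_id) {
		rw_enter_write(&nfb->fb_lock);
		rw_enter_write(&ofb->fb_lock);
	} else
		rw_enter_write(&ofb->fb_lock);	/* same bucket: lock it once */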