Index: sys/futex.h =================================================================== RCS file: /cvs/src/sys/sys/futex.h,v diff -u -p -r1.2 futex.h --- sys/futex.h 3 Jun 2018 15:09:26 -0000 1.2 +++ sys/futex.h 2 May 2025 02:19:46 -0000 @@ -28,11 +28,15 @@ int futex(volatile uint32_t *, int, int, __END_DECLS #endif /* ! _KERNEL */ +#define FUTEX_OP_MASK 0x007f + #define FUTEX_WAIT 1 #define FUTEX_WAKE 2 #define FUTEX_REQUEUE 3 -#define FUTEX_PRIVATE_FLAG 128 +#define FUTEX_FLAG_MASK 0xff80 + +#define FUTEX_PRIVATE_FLAG 0x0080 #define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) #define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) Index: sys/proc.h =================================================================== RCS file: /cvs/src/sys/sys/proc.h,v diff -u -p -r1.386 proc.h --- sys/proc.h 1 May 2025 01:16:42 -0000 1.386 +++ sys/proc.h 2 May 2025 02:19:47 -0000 @@ -116,8 +125,6 @@ struct tusage { * run-time information needed by threads. */ #ifdef __need_process -struct futex; -LIST_HEAD(futex_list, futex); struct proc; struct tslpentry; TAILQ_HEAD(tslpqueue, tslpentry); @@ -178,7 +185,6 @@ struct process { struct vmspace *ps_vmspace; /* Address space */ pid_t ps_pid; /* [I] Process identifier. */ - struct futex_list ps_ftlist; /* futexes attached to this process */ struct tslpqueue ps_tslpqueue; /* [p] queue of threads in thrsleep */ struct rwlock ps_lock; /* per-process rwlock */ struct mutex ps_mtx; /* per-process mutex */ @@ -343,9 +349,6 @@ struct proc { struct process *p_p; /* [I] The process of this thread. */ TAILQ_ENTRY(proc) p_thr_link; /* [K|m] Threads in a process linkage. */ - - TAILQ_ENTRY(proc) p_fut_link; /* Threads in a futex linkage. */ - struct futex *p_futex; /* Current sleeping futex. 
*/ /* substructures: */ struct filedesc *p_fd; /* copy of p_p->ps_fd */ Index: kern/kern_fork.c =================================================================== RCS file: /cvs/src/sys/kern/kern_fork.c,v diff -u -p -r1.270 kern_fork.c --- kern/kern_fork.c 14 Apr 2025 09:15:24 -0000 1.270 +++ kern/kern_fork.c 2 May 2025 02:19:47 -0000 @@ -196,7 +196,6 @@ process_initialize(struct process *pr, s LIST_INIT(&pr->ps_children); LIST_INIT(&pr->ps_orphans); - LIST_INIT(&pr->ps_ftlist); LIST_INIT(&pr->ps_sigiolst); TAILQ_INIT(&pr->ps_tslpqueue); Index: kern/sys_futex.c =================================================================== RCS file: /cvs/src/sys/kern/sys_futex.c,v diff -u -p -r1.22 sys_futex.c --- kern/sys_futex.c 14 Aug 2023 07:42:34 -0000 1.22 +++ kern/sys_futex.c 2 May 2025 02:19:47 -0000 @@ -24,6 +24,8 @@ #include #include #include +#include /* CACHELINESIZE */ +#include /* tick_nsec */ #include #ifdef KTRACE @@ -33,44 +35,98 @@ #include /* + * Locks used to protect variables in this file: + * + * I immutable after initialization + * F futex_slpque fsq_lock + */ + +/* * Kernel representation of a futex. + * + * The userland address that the futex is waiting on is represented by + * ft_ps, ft_obj, ft_amap, and ft_off. + * + * Whether the futex is waiting or woken up is represented by the + * ft_proc pointer being set (ie, not NULL) or not (ie, NULL) respectively. + * When the futex is waiting it is referenced by the list in a + * futex_slpque. When the futex gets woken up, it is removed from the + * list and the ft_proc pointer is cleared to indicate that the reference + * held by the list has been released. Only a thread holding the lock + * may remove the futex from the list and clear ft_proc. This is true + * even for futex_wait(). + * + * However, futex_wait() may read ft_proc without the lock so it can + * avoid contending with the thread that just woke it up. 
This means + * that once ft_proc is cleared, futex_wait() may return, the struct + * futex will no longer exist, and it is no longer safe to access it + * from the wakeup side. + * + * tl;dr: the thread holding the slpque lock "owns" the references + * to the futexes on the list until it clears ft_proc. */ + struct futex { - LIST_ENTRY(futex) ft_list; /* list of all futexes */ - TAILQ_HEAD(, proc) ft_threads; /* sleeping queue */ - struct uvm_object *ft_obj; /* UVM object */ - struct vm_amap *ft_amap; /* UVM amap */ - voff_t ft_off; /* UVM offset */ - unsigned int ft_refcnt; /* # of references */ + TAILQ_ENTRY(futex) ft_entry; /* [F] entry on futex_slpque */ + + struct process *ft_ps; /* [I] for private futexes */ + struct uvm_object *ft_obj; /* [F] UVM object */ + struct vm_amap *ft_amap; /* [F] UVM amap */ + volatile voff_t ft_off; /* [F] UVM offset */ + + struct proc * volatile ft_proc; /* [F] waiting thread */ }; -/* Syscall helpers. */ -int futex_wait(uint32_t *, uint32_t, const struct timespec *, int); -int futex_wake(uint32_t *, uint32_t, int); -int futex_requeue(uint32_t *, uint32_t, uint32_t *, uint32_t, int); - -/* Flags for futex_get(). */ -#define FT_CREATE 0x1 /* Create a futex if it doesn't exist. */ -#define FT_PRIVATE 0x2 /* Futex is process-private. */ +static int +futex_is_eq(const struct futex *a, const struct futex *b) +{ + return (a->ft_off == b->ft_off && + a->ft_ps == b->ft_ps && + a->ft_obj == b->ft_obj && + a->ft_amap == b->ft_amap); +} -struct futex *futex_get(uint32_t *, int); -void futex_put(struct futex *); +TAILQ_HEAD(futex_list, futex); -/* - * The global futex lock serializes futex(2) calls so that no wakeup - * event is lost, and protects all futex lists and futex states. 
- */ -struct rwlock ftlock = RWLOCK_INITIALIZER("futex"); -static struct futex_list ftlist_shared = - LIST_HEAD_INITIALIZER(ftlist_shared); -struct pool ftpool; +struct futex_slpque { + struct futex_list fsq_list; /* [F] */ + struct rwlock fsq_lock; + uint32_t fsq_id; /* [I] for lock ordering */ +} __aligned(CACHELINESIZE); + +/* Syscall helpers. */ +static int futex_wait(struct proc *, uint32_t *, uint32_t, + const struct timespec *, int); +static int futex_wake(struct proc *, uint32_t *, uint32_t, int, + register_t *); +static int futex_requeue(struct proc *, uint32_t *, uint32_t, + uint32_t *, uint32_t, int, register_t *); + +/* Flags for futex_get(). kernel private flags sit in FUTEX_OP_MASK space */ +#define FT_PRIVATE FUTEX_PRIVATE_FLAG /* Futex is process-private. */ + +#define FUTEX_SLPQUES_BITS 6 +#define FUTEX_SLPQUES_SIZE (1U << FUTEX_SLPQUES_BITS) +#define FUTEX_SLPQUES_MASK (FUTEX_SLPQUES_SIZE - 1) +static struct futex_slpque futex_slpques[FUTEX_SLPQUES_SIZE]; void futex_init(void) { - pool_init(&ftpool, sizeof(struct futex), 0, IPL_NONE, - PR_WAITOK | PR_RWLOCK, "futexpl", NULL); + struct futex_slpque *fsq; + unsigned int i; + + for (i = 0; i < nitems(futex_slpques); i++) { + fsq = &futex_slpques[i]; + + TAILQ_INIT(&fsq->fsq_list); + rw_init(&fsq->fsq_lock, "futexlk"); + + fsq->fsq_id = arc4random(); + fsq->fsq_id &= ~FUTEX_SLPQUES_MASK; + fsq->fsq_id |= i; + } } int @@ -88,65 +144,51 @@ sys_futex(struct proc *p, void *v, regis uint32_t val = SCARG(uap, val); const struct timespec *timeout = SCARG(uap, timeout); void *g = SCARG(uap, g); - int flags = 0; + int flags = op & FUTEX_FLAG_MASK; int error = 0; - if (op & FUTEX_PRIVATE_FLAG) - flags |= FT_PRIVATE; - - rw_enter_write(&ftlock); - switch (op) { + switch (op & FUTEX_OP_MASK) { case FUTEX_WAIT: - case FUTEX_WAIT_PRIVATE: - error = futex_wait(uaddr, val, timeout, flags); + error = futex_wait(p, uaddr, val, timeout, flags); break; case FUTEX_WAKE: - case FUTEX_WAKE_PRIVATE: - *retval = 
futex_wake(uaddr, val, flags); + error = futex_wake(p, uaddr, val, flags, retval); break; case FUTEX_REQUEUE: - case FUTEX_REQUEUE_PRIVATE: - *retval = futex_requeue(uaddr, val, g, (u_long)timeout, flags); + error = futex_requeue(p, uaddr, val, g, + (u_long)timeout, flags, retval); break; default: error = ENOSYS; break; } - rw_exit_write(&ftlock); return error; } -/* - * Return an existing futex matching userspace address ``uaddr''. - * - * If such futex does not exist and FT_CREATE is given, create it. - */ -struct futex * -futex_get(uint32_t *uaddr, int flags) +static void +futex_addrs(struct proc *p, struct futex *f, uint32_t *uaddr, int flags) { - struct proc *p = curproc; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; struct uvm_object *obj = NULL; struct vm_amap *amap = NULL; voff_t off = (vaddr_t)uaddr; - struct futex *f; - struct futex_list *ftlist = &p->p_p->ps_ftlist; + struct process *ps; - rw_assert_wrlock(&ftlock); + if (ISSET(flags, FT_PRIVATE)) + ps = p->p_p; + else { + ps = NULL; - if (!(flags & FT_PRIVATE)) { vm_map_lock_read(map); if (uvm_map_lookup_entry(map, (vaddr_t)uaddr, &entry) && entry->inheritance == MAP_INHERIT_SHARE) { if (UVM_ET_ISOBJ(entry)) { - ftlist = &ftlist_shared; obj = entry->object.uvm_obj; off = entry->offset + ((vaddr_t)uaddr - entry->start); } else if (entry->aref.ar_amap) { - ftlist = &ftlist_shared; amap = entry->aref.ar_amap; off = ptoa(entry->aref.ar_pageoff) + ((vaddr_t)uaddr - entry->start); @@ -155,47 +197,47 @@ futex_get(uint32_t *uaddr, int flags) vm_map_unlock_read(map); } - LIST_FOREACH(f, ftlist, ft_list) { - if (f->ft_obj == obj && f->ft_amap == amap && - f->ft_off == off) { - f->ft_refcnt++; - break; - } - } + f->ft_ps = ps; + f->ft_obj = obj; + f->ft_amap = amap; + f->ft_off = off; +} - if ((f == NULL) && (flags & FT_CREATE)) { - /* - * We rely on the rwlock to ensure that no other thread - * create the same futex. 
- */ - f = pool_get(&ftpool, PR_WAITOK); - TAILQ_INIT(&f->ft_threads); - f->ft_obj = obj; - f->ft_amap = amap; - f->ft_off = off; - f->ft_refcnt = 1; - LIST_INSERT_HEAD(ftlist, f, ft_list); - } +static inline struct futex_slpque * +futex_get_slpque(struct futex *f) +{ + uint32_t key = f->ft_off >> 3; /* watevs */ + key ^= key >> FUTEX_SLPQUES_BITS; - return f; + return (&futex_slpques[key & FUTEX_SLPQUES_MASK]); } -/* - * Release a given futex. - */ -void -futex_put(struct futex *f) +static int +futex_unwait(struct futex_slpque *ofsq, struct futex *f) { - rw_assert_wrlock(&ftlock); + struct futex_slpque *fsq; + int rv; - KASSERT(f->ft_refcnt > 0); + /* + * REQUEUE can move a futex between buckets, so follow it if needed. + */ - --f->ft_refcnt; - if (f->ft_refcnt == 0) { - KASSERT(TAILQ_EMPTY(&f->ft_threads)); - LIST_REMOVE(f, ft_list); - pool_put(&ftpool, f); + for (;;) { + rw_enter_write(&ofsq->fsq_lock); + fsq = futex_get_slpque(f); + if (ofsq == fsq) + break; + + rw_exit_write(&ofsq->fsq_lock); + ofsq = fsq; } + + rv = f->ft_proc != NULL; + if (rv) + TAILQ_REMOVE(&fsq->fsq_list, f, ft_entry); + rw_exit_write(&fsq->fsq_lock); + + return (rv); } /* @@ -203,34 +245,19 @@ futex_put(struct futex *f) * ``uaddr''. Let it sleep for the specified ``timeout'' time, or * indefinitely if the argument is NULL. */ -int -futex_wait(uint32_t *uaddr, uint32_t val, const struct timespec *timeout, - int flags) +static int +futex_wait(struct proc *p, uint32_t *uaddr, uint32_t val, + const struct timespec *timeout, int flags) { - struct proc *p = curproc; - struct futex *f; - uint64_t nsecs = INFSLP; + struct futex f; + struct futex_slpque *fsq; + uint64_t to_ticks = 0; uint32_t cval; int error; - /* - * After reading the value a race is still possible but - * we deal with it by serializing all futex syscalls. - */ - rw_assert_wrlock(&ftlock); - - /* - * Read user space futex value - */ - if ((error = copyin32(uaddr, &cval))) - return error; - - /* If the value changed, stop here. 
*/ - if (cval != val) - return EAGAIN; - if (timeout != NULL) { struct timespec ts; + uint64_t nsecs; if ((error = copyin(timeout, &ts, sizeof(ts)))) return error; @@ -240,32 +267,85 @@ futex_wait(uint32_t *uaddr, uint32_t val #endif if (ts.tv_sec < 0 || !timespecisvalid(&ts)) return EINVAL; + nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(&ts), MAXTSLP)); + to_ticks = (nsecs + tick_nsec - 1) / tick_nsec + 1; + if (to_ticks > INT_MAX) + to_ticks = INT_MAX; } - f = futex_get(uaddr, flags | FT_CREATE); - TAILQ_INSERT_TAIL(&f->ft_threads, p, p_fut_link); - p->p_futex = f; - - error = rwsleep_nsec(p, &ftlock, PWAIT|PCATCH, "fsleep", nsecs); - if (error == ERESTART) - error = ECANCELED; - else if (error == EWOULDBLOCK) { - /* A race occurred between a wakeup and a timeout. */ - if (p->p_futex == NULL) - error = 0; - else - error = ETIMEDOUT; + futex_addrs(p, &f, uaddr, flags); + fsq = futex_get_slpque(&f); + + /* Mark futex as waiting. */ + f.ft_proc = p; + rw_enter_write(&fsq->fsq_lock); + /* Make the waiting futex visible to wake/requeue */ + TAILQ_INSERT_TAIL(&fsq->fsq_list, &f, ft_entry); + rw_exit_write(&fsq->fsq_lock); + + /* + * Do not return before f has been removed from the slpque! + */ + + /* + * Read user space futex value + */ + if ((error = copyin32(uaddr, &cval)) != 0) + goto exit; + + /* If the value changed, stop here. */ + if (cval != val) { + error = EAGAIN; + goto exit; } + sleep_setup(&f, PWAIT|PCATCH, "fsleep"); + error = sleep_finish(to_ticks, f.ft_proc != NULL); /* Remove ourself if we haven't been awaken. 
*/ - if ((f = p->p_futex) != NULL) { - p->p_futex = NULL; - TAILQ_REMOVE(&f->ft_threads, p, p_fut_link); - futex_put(f); + if (error != 0 || f.ft_proc != NULL) { + if (futex_unwait(fsq, &f) == 0) + error = 0; + + switch (error) { + case ERESTART: + error = ECANCELED; + break; + case EWOULDBLOCK: + error = ETIMEDOUT; + break; + default: + break; + } } return error; +exit: + if (f.ft_proc != NULL) + futex_unwait(fsq, &f); + return error; +} + +static void +futex_list_wakeup(struct futex_list *fl) +{ + struct futex *f, *nf; + struct proc *p; + + /* + * Setting ft_proc to NULL releases the futex reference + * currently held via the slpque lock. + * + * SCHED_LOCK is only needed to call wakeup_proc. + */ + + SCHED_LOCK(); + TAILQ_FOREACH_SAFE(f, fl, ft_entry, nf) { + p = f->ft_proc; + f->ft_proc = NULL; + wakeup_proc(p); + } + SCHED_UNLOCK(); } /* @@ -273,46 +353,133 @@ futex_wait(uint32_t *uaddr, uint32_t val * ``uaddr'' and requeue at most ``m'' sibling threads on a futex at * address ``uaddr2''. 
*/ -int -futex_requeue(uint32_t *uaddr, uint32_t n, uint32_t *uaddr2, uint32_t m, - int flags) +static int +futex_requeue(struct proc *p, uint32_t *uaddr, uint32_t n, + uint32_t *uaddr2, uint32_t m, int flags, register_t *retval) { - struct futex *f, *g; - struct proc *p; + struct futex_list fl = TAILQ_HEAD_INITIALIZER(fl); + struct futex okey, nkey; + struct futex *f, *nf, *mf = NULL; + struct futex_slpque *ofsq, *nfsq; uint32_t count = 0; - rw_assert_wrlock(&ftlock); + if (m == 0) + return futex_wake(p, uaddr, n, flags, retval); - f = futex_get(uaddr, flags); - if (f == NULL) - return 0; + futex_addrs(p, &okey, uaddr, flags); + ofsq = futex_get_slpque(&okey); + futex_addrs(p, &nkey, uaddr2, flags); + nfsq = futex_get_slpque(&nkey); + + if (ofsq->fsq_id < nfsq->fsq_id) { + rw_enter_write(&ofsq->fsq_lock); + rw_enter_write(&nfsq->fsq_lock); + } else if (ofsq->fsq_id > nfsq->fsq_id) { + rw_enter_write(&nfsq->fsq_lock); + rw_enter_write(&ofsq->fsq_lock); + } else + rw_enter_write(&ofsq->fsq_lock); + + TAILQ_FOREACH_SAFE(f, &ofsq->fsq_list, ft_entry, nf) { + /* __builtin_prefetch(nf, 1); */ + KASSERT(f->ft_proc != NULL); - while ((p = TAILQ_FIRST(&f->ft_threads)) != NULL && (count < (n + m))) { - p->p_futex = NULL; - TAILQ_REMOVE(&f->ft_threads, p, p_fut_link); - futex_put(f); - - if (count < n) { - wakeup_one(p); - } else if (uaddr2 != NULL) { - g = futex_get(uaddr2, FT_CREATE); - TAILQ_INSERT_TAIL(&g->ft_threads, p, p_fut_link); - p->p_futex = g; + if (!futex_is_eq(f, &okey)) + continue; + + TAILQ_REMOVE(&ofsq->fsq_list, f, ft_entry); + TAILQ_INSERT_TAIL(&fl, f, ft_entry); + + if (++count == n) { + mf = nf; + break; } - count++; } - futex_put(f); + if (!TAILQ_EMPTY(&fl)) + futex_list_wakeup(&fl); - return count; + /* update matching futexes */ + if (mf != NULL) { + /* + * only iterate from the current entry to the tail + * of the list as it is now in case we're requeueing + * on the end of the same list. 
+ */ + nf = TAILQ_LAST(&ofsq->fsq_list, futex_list); + do { + f = mf; + mf = TAILQ_NEXT(f, ft_entry); + /* __builtin_prefetch(mf, 1); */ + + KASSERT(f->ft_proc != NULL); + + if (!futex_is_eq(f, &okey)) + continue; + + TAILQ_REMOVE(&ofsq->fsq_list, f, ft_entry); + f->ft_ps = nkey.ft_ps; + f->ft_obj = nkey.ft_obj; + f->ft_amap = nkey.ft_amap; + f->ft_off = nkey.ft_off; + TAILQ_INSERT_TAIL(&nfsq->fsq_list, f, ft_entry); + + if (--m == 0) + break; + } while (f != nf); + } + + if (ofsq->fsq_id != nfsq->fsq_id) + rw_exit_write(&nfsq->fsq_lock); + rw_exit_write(&ofsq->fsq_lock); + + *retval = count; + return 0; } /* * Wakeup at most ``n'' sibling threads sleeping on a futex at address * ``uaddr''. */ -int -futex_wake(uint32_t *uaddr, uint32_t n, int flags) +static int +futex_wake(struct proc *p, uint32_t *uaddr, uint32_t n, int flags, + register_t *retval) { - return futex_requeue(uaddr, n, NULL, 0, flags); + struct futex_list fl = TAILQ_HEAD_INITIALIZER(fl); + struct futex key; + struct futex *f, *nf; + struct futex_slpque *fsq; + int count = 0; + + if (n == 0) { + *retval = 0; + return 0; + } + + futex_addrs(p, &key, uaddr, flags); + fsq = futex_get_slpque(&key); + + rw_enter_write(&fsq->fsq_lock); + + TAILQ_FOREACH_SAFE(f, &fsq->fsq_list, ft_entry, nf) { + /* __builtin_prefetch(nf, 1); */ + KASSERT(f->ft_proc != NULL); + + if (!futex_is_eq(f, &key)) + continue; + + TAILQ_REMOVE(&fsq->fsq_list, f, ft_entry); + TAILQ_INSERT_TAIL(&fl, f, ft_entry); + + if (++count == n) + break; + } + + if (!TAILQ_EMPTY(&fl)) + futex_list_wakeup(&fl); + + rw_exit_write(&fsq->fsq_lock); + + *retval = count; + return 0; }