Index: sys/mutex.h
===================================================================
RCS file: /cvs/src/sys/sys/mutex.h,v
diff -u -p -r1.23 mutex.h
--- sys/mutex.h	2 Jul 2025 14:36:56 -0000	1.23
+++ sys/mutex.h	7 Jul 2025 20:30:44 -0000
@@ -54,7 +54,7 @@
 #include
 
 struct mutex {
-	void *volatile mtx_owner;
+	volatile unsigned long mtx_owner;
 	int mtx_wantipl;
 	int mtx_oldipl;
 #ifdef WITNESS
@@ -64,23 +64,26 @@ struct mutex {
 
 #ifdef WITNESS
 #define MUTEX_INITIALIZER_FLAGS(ipl, name, flags) \
-	{ NULL, __MUTEX_IPL((ipl)), IPL_NONE, MTX_LO_INITIALIZER(name, flags) }
+	{ 0, __MUTEX_IPL((ipl)), IPL_NONE, MTX_LO_INITIALIZER(name, flags) }
 #else
 #define MUTEX_INITIALIZER_FLAGS(ipl, name, flags) \
-	{ NULL, __MUTEX_IPL((ipl)), IPL_NONE }
+	{ 0, __MUTEX_IPL((ipl)), IPL_NONE }
 #endif
 
 void __mtx_init(struct mutex *, int);
 #define _mtx_init(mtx, ipl) __mtx_init((mtx), __MUTEX_IPL((ipl)))
 
+#define mtx_curcpu()	(unsigned long)curcpu()
+#define mtx_owner(mtx)	((mtx)->mtx_owner & ~1UL)
+
 #ifdef DIAGNOSTIC
 #define MUTEX_ASSERT_LOCKED(mtx) do { \
-	if (((mtx)->mtx_owner != curcpu()) && !(panicstr || db_active)) \
+	if (mtx_owner(mtx) != mtx_curcpu() && !(panicstr || db_active)) \
 		panic("mutex %p not held in %s", (mtx), __func__); \
 } while (0)
 
 #define MUTEX_ASSERT_UNLOCKED(mtx) do { \
-	if (((mtx)->mtx_owner == curcpu()) && !(panicstr || db_active)) \
+	if (mtx_owner(mtx) == mtx_curcpu() && !(panicstr || db_active)) \
 		panic("mutex %p held in %s", (mtx), __func__); \
 } while (0)
 #else
@@ -128,7 +131,7 @@ void mtx_leave(struct mutex *);
 #define mtx_init(m, ipl)	mtx_init_flags(m, ipl, NULL, 0)
 
 #define mtx_owned(mtx) \
-	(((mtx)->mtx_owner == curcpu()) || panicstr || db_active)
+	((mtx_owner(mtx) == mtx_curcpu()) || panicstr || db_active)
 
 #ifdef WITNESS
Index: kern/kern_lock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_lock.c,v
diff -u -p -r1.81 kern_lock.c
--- kern/kern_lock.c	24 Jun 2025 15:37:43 -0000	1.81
+++ kern/kern_lock.c	7 Jul 2025 20:30:44 -0000
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -59,9 +60,12 @@ extern int ncpusfound;
 
 #ifdef MULTIPROCESSOR
+#include	/* CACHELINESIZE */
 #include
 
 struct __mp_lock kernel_lock;
 
+static void	mtx_init_parking(void);
+
 /*
  * Functions for manipulating the kernel_lock.  We put them here
  * so that they show up in profiles.
@@ -71,6 +75,7 @@ void
 _kernel_lock_init(void)
 {
 	__mp_lock_init(&kernel_lock);
+	mtx_init_parking();
 }
 
 /*
@@ -239,17 +244,133 @@ __mp_lock_held(struct __mp_lock *mpl, st
 void
 __mtx_init(struct mutex *mtx, int wantipl)
 {
-	mtx->mtx_owner = NULL;
+	mtx->mtx_owner = 0;
 	mtx->mtx_wantipl = wantipl;
 	mtx->mtx_oldipl = IPL_NONE;
 }
 
 #ifdef MULTIPROCESSOR
+struct mtx_waiter {
+	struct mutex *volatile mtx;
+	unsigned long self;
+	unsigned int spins;
+	TAILQ_ENTRY(mtx_waiter) entry;
+};
+
+TAILQ_HEAD(mtx_waitlist, mtx_waiter);
+
+struct mtx_park {
+	struct cpu_info *volatile lock;
+	struct mtx_waitlist waiters;
+} __aligned(CACHELINESIZE);
+
+#define MTX_PARKING_BITS	7
+#define MTX_PARKING_LOTS	(1 << MTX_PARKING_BITS)
+#define MTX_PARKING_MASK	(MTX_PARKING_LOTS - 1)
+
+static struct mtx_park mtx_parking[MTX_PARKING_LOTS];
+
+static void
+mtx_init_parking(void)
+{
+	size_t i;
+
+	for (i = 0; i < nitems(mtx_parking); i++) {
+		struct mtx_park *p = &mtx_parking[i];
+
+		p->lock = NULL;
+		TAILQ_INIT(&p->waiters);
+	}
+}
+
+static struct mtx_park *
+mtx_park(struct mutex *mtx)
+{
+	unsigned long addr = (unsigned long)mtx;
+	addr >>= 6;
+	addr ^= addr >> MTX_PARKING_BITS;
+	addr &= MTX_PARKING_MASK;
+
+	return &mtx_parking[addr];
+}
+
+static unsigned long
+mtx_enter_park(struct mtx_park *p)
+{
+	struct cpu_info *ci = curcpu();
+	struct cpu_info *owner;
+	unsigned long m;
+
+	m = intr_disable();
+	while ((owner = atomic_cas_ptr(&p->lock, NULL, ci)) != NULL)
+		CPU_BUSY_CYCLE();
+	membar_enter_after_atomic();
+
+	return (m);
+}
+
+static void
+mtx_leave_park(struct mtx_park *p, unsigned long m)
+{
+	membar_exit();
+	p->lock = NULL;
+	intr_restore(m);
+}
+
+static inline unsigned long
+mtx_cas(struct mutex *mtx, unsigned long e, unsigned long v)
+{
+	return atomic_cas_ulong(&mtx->mtx_owner, e, v);
+}
+
+int
+mtx_enter_try(struct mutex *mtx)
+{
+	struct cpu_info *ci = curcpu();
+	unsigned long owner, self = (unsigned long)ci;
+	int s;
+
+	/* Avoid deadlocks after panic or in DDB */
+	if (panicstr || db_active)
+		return (1);
+
+	if (mtx->mtx_wantipl != IPL_NONE)
+		s = splraise(mtx->mtx_wantipl);
+
+	owner = mtx_cas(mtx, 0, self);
+	if (owner == 0) {
+		membar_enter_after_atomic();
+		if (mtx->mtx_wantipl != IPL_NONE)
+			mtx->mtx_oldipl = s;
+#ifdef DIAGNOSTIC
+		ci->ci_mutex_level++;
+#endif
+		WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE);
+		return (1);
+	}
+
+	if (mtx->mtx_wantipl != IPL_NONE)
+		splx(s);
+
+#ifdef DIAGNOSTIC
+	if (__predict_false((owner & ~1UL) == self))
+		panic("mtx %p: locking against myself", mtx);
+#endif
+
+	return (0);
+}
+
 void
 mtx_enter(struct mutex *mtx)
 {
-	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
-	unsigned int i, ncycle = CPU_MIN_BUSY_CYCLES;
+	struct cpu_info *ci = curcpu();
+	struct schedstate_percpu *spc = &ci->ci_schedstate;
+	unsigned long owner, self = (unsigned long)ci;
+	struct mtx_park *p;
+	struct mtx_waiter w, *n;
+	unsigned long m;
+	int spins = 0;
+	int s;
 #ifdef MP_LOCKDEBUG
 	long nticks = __mp_lock_spinout;
 #endif
@@ -257,72 +378,171 @@ mtx_enter(struct mutex *mtx)
 	WITNESS_CHECKORDER(MUTEX_LOCK_OBJECT(mtx),
 	    LOP_EXCLUSIVE | LOP_NEWORDER, NULL);
 
+	if (mtx->mtx_wantipl != IPL_NONE)
+		s = splraise(mtx->mtx_wantipl);
+
+	owner = mtx_cas(mtx, 0, self);
+	if (owner == 0) {
+		/* we got the lock first go. this is the fast path */
+		goto locked;
+	}
+
+#ifdef DIAGNOSTIC
+	if (__predict_false((owner & ~1ULL) == self))
+		panic("mtx %p: locking against myself", mtx);
+#endif
+
+	/* we're going to have to spin for it now */
 	spc->spc_spinning++;
-	while (mtx_enter_try(mtx) == 0) {
-		do {
-			/* Busy loop with exponential backoff. */
-			for (i = ncycle; i > 0; i--)
+
+	for (spins = 0; spins < 40; spins++) {
+		if (ISSET(owner, 1)) {
+			/* don't spin if cpus are already parked */
+			break;
+		}
+		CPU_BUSY_CYCLE();
+		owner = mtx->mtx_owner;
+		if (owner == 0) {
+			owner = mtx_cas(mtx, 0, self);
+			if (owner == 0)
+				goto spinlocked;
+		}
+	}
+
+	/* take the really slow path */
+	p = mtx_park(mtx);
+
+	w.self = self;
+	w.spins = 0;
+
+	/* publish our existence in the parking lot */
+	w.mtx = mtx;
+	m = mtx_enter_park(p);
+	TAILQ_INSERT_TAIL(&p->waiters, &w, entry);
+	mtx_leave_park(p, m);
+
+	for (;;) {
+		unsigned long n, o;
+
+		n = owner | 1;
+		o = mtx_cas(mtx, owner, n);
+		if (o == owner)
+			o = n;
+		if ((o | 1) == (self | 1))
+			break;
+
+		if (ISSET(o, 1)) {
+			while (w.mtx != NULL)
 				CPU_BUSY_CYCLE();
 #ifdef MP_LOCKDEBUG
-			if ((nticks -= ncycle) <= 0) {
-				db_printf("%s: %p lock spun out\n", __func__, mtx);
+			if (--nticks <= 0) {
+				db_printf("%s: %p lock spun out\n",
+				    __func__, mtx);
 				db_enter();
 				nticks = __mp_lock_spinout;
 			}
 #endif
-			if (ncycle < CPU_MAX_BUSY_CYCLES)
-				ncycle += ncycle;
-		} while (mtx->mtx_owner != NULL);
+			w.spins++;
+		}
+
+		owner = mtx_cas(mtx, 0, self);
+		if (owner == 0)
+			break;
+
+		w.mtx = mtx;
+	}
+
+	m = mtx_enter_park(p);
+	TAILQ_REMOVE(&p->waiters, &w, entry);
+	TAILQ_FOREACH(n, &p->waiters, entry) {
+		if (n->mtx == mtx) {
+			mtx->mtx_owner = self | 1;
+			break;
+		}
 	}
+	mtx_leave_park(p, m);
+
+spinlocked:
 	spc->spc_spinning--;
+locked:
+	membar_enter_after_atomic();
+	if (mtx->mtx_wantipl != IPL_NONE)
+		mtx->mtx_oldipl = s;
+#ifdef DIAGNOSTIC
+	ci->ci_mutex_level++;
+#endif
+	WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE);
 }
 
-int
-mtx_enter_try(struct mutex *mtx)
+/*
+ * this is the number of times mtx_enter waits and gets woken up
+ * before mtx_leave decides to move ownership of the mutex rather
+ * than release it. a cpu that has woken up a lot has lost a lot of
+ * races with "barging" cpus.
+ *
+ * XXX make this runtime tweakable via sysctl
+ */
+unsigned int mtx_fair_prio = 8;
+
+static inline void
+mtx_wakeup(struct mutex *mtx, struct mtx_park *p)
+{
+	struct mtx_waiter *w;
+
+	TAILQ_FOREACH(w, &p->waiters, entry) {
+		if (w->mtx == mtx) {
+			mtx->mtx_owner = (w->spins > mtx_fair_prio) ?
+			    w->self : 0;
+			w->mtx = NULL;
+			return;
+		}
+	}
+
+	mtx->mtx_owner = 0;
+}
+
+void
+mtx_leave(struct mutex *mtx)
 {
-	struct cpu_info *owner, *ci = curcpu();
+	struct cpu_info *ci = curcpu();
+	unsigned long owner, self = (unsigned long)ci;
 	int s;
 
 	/* Avoid deadlocks after panic or in DDB */
 	if (panicstr || db_active)
-		return (1);
+		return;
 
-	if (mtx->mtx_wantipl != IPL_NONE)
-		s = splraise(mtx->mtx_wantipl);
+	WITNESS_UNLOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE);
 
-	/*
-	 * Avoid unconditional atomic operation to prevent cache line
-	 * contention.
-	 */
-	owner = mtx->mtx_owner;
 #ifdef DIAGNOSTIC
-	if (__predict_false(owner == ci))
-		panic("mtx %p: locking against myself", mtx);
+	curcpu()->ci_mutex_level--;
 #endif
-	if (owner == NULL) {
-		owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci);
-		if (owner == NULL) {
-			membar_enter_after_atomic();
-			if (mtx->mtx_wantipl != IPL_NONE)
-				mtx->mtx_oldipl = s;
+
+	s = mtx->mtx_oldipl;
+	membar_exit_before_atomic();
+	owner = atomic_cas_ulong(&mtx->mtx_owner, self, 0);
+	if (owner != self) {
+		struct mtx_park *p;
+		unsigned long m;
+
 #ifdef DIAGNOSTIC
-			ci->ci_mutex_level++;
+		if (__predict_false((owner & ~1ULL) != self))
+			panic("mtx %p: not held", mtx);
 #endif
-			WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE);
-			return (1);
-		}
+
+		p = mtx_park(mtx);
+		m = mtx_enter_park(p);
+		mtx_wakeup(mtx, p);
+		mtx_leave_park(p, m);
 	}
 
 	if (mtx->mtx_wantipl != IPL_NONE)
 		splx(s);
-
-	return (0);
 }
 #else
 void
 mtx_enter(struct mutex *mtx)
 {
-	struct cpu_info *ci = curcpu();
+	unsigned long self = mtx_curcpu();
 
 	/* Avoid deadlocks after panic or in DDB */
 	if (panicstr || db_active)
@@ -332,14 +552,14 @@ mtx_enter(struct mutex *mtx)
 	    LOP_EXCLUSIVE | LOP_NEWORDER, NULL);
 
 #ifdef DIAGNOSTIC
-	if (__predict_false(mtx->mtx_owner == ci))
+	if (__predict_false(mtx_owner(mtx) == self))
 		panic("mtx %p: locking against myself", mtx);
 #endif
 
 	if (mtx->mtx_wantipl != IPL_NONE)
 		mtx->mtx_oldipl = splraise(mtx->mtx_wantipl);
 
-	mtx->mtx_owner = ci;
+	mtx->mtx_owner = self;
 
 #ifdef DIAGNOSTIC
 	ci->ci_mutex_level++;
@@ -353,7 +573,6 @@ mtx_enter_try(struct mutex *mtx)
 	mtx_enter(mtx);
 	return (1);
 }
-#endif
 
 void
 mtx_leave(struct mutex *mtx)
@@ -372,13 +591,11 @@ mtx_leave(struct mutex *mtx)
 #endif
 
 	s = mtx->mtx_oldipl;
-#ifdef MULTIPROCESSOR
-	membar_exit();
-#endif
-	mtx->mtx_owner = NULL;
+	mtx->mtx_owner = 0;
 	if (mtx->mtx_wantipl != IPL_NONE)
 		splx(s);
 }
+#endif
 
 #ifdef DDB
 void
Index: kern/subr_witness.c
===================================================================
RCS file: /cvs/src/sys/kern/subr_witness.c,v
diff -u -p -r1.55 subr_witness.c
--- kern/subr_witness.c	14 Apr 2025 09:14:51 -0000	1.55
+++ kern/subr_witness.c	7 Jul 2025 20:30:44 -0000
@@ -1677,7 +1677,7 @@ _isitmyx(struct witness *w1, struct witn
 	if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
 	    (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
 		/* Don't squawk if we're potentially racing with an update. */
-		if (w_mtx.mtx_owner != curcpu())
+		if (mtx_owner(&w_mtx) != mtx_curcpu())
 			return (0);
 		printf("witness: %s: rmatrix mismatch between %s (index %d) "
 		    "and %s (index %d): w_rmatrix[%d][%d] == %x but "
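For readers following along, here is a small userland sketch (not part of the diff) of the two conventions the new lock word relies on: mtx_owner holds the owning CPU's cpu_info address with the low bit doubling as a "waiters are parked" flag, and a contended mutex is hashed by its address into one of the 128 parking lots. park_index() mirrors the arithmetic in mtx_park() above; the other names (MTX_WAITERS, some_mutex, the fake curcpu() value) are made up for illustration.

/*
 * Standalone sketch: models the owner-word encoding and the
 * parking-lot hash from the diff above.  Build with any C compiler;
 * nothing here is kernel code.
 */
#include <stdio.h>
#include <assert.h>

#define MTX_PARKING_BITS	7
#define MTX_PARKING_LOTS	(1 << MTX_PARKING_BITS)
#define MTX_PARKING_MASK	(MTX_PARKING_LOTS - 1)

/* the low bit of mtx_owner marks "waiters are parked on this mutex" */
#define MTX_WAITERS		1UL

static unsigned long
park_index(const void *mtx)
{
	unsigned long addr = (unsigned long)mtx;

	addr >>= 6;			/* ignore sub-cacheline bits */
	addr ^= addr >> MTX_PARKING_BITS;
	addr &= MTX_PARKING_MASK;	/* pick one of the 128 lots */

	return addr;
}

int
main(void)
{
	/*
	 * stand-in for (unsigned long)curcpu(); cpu_info structures are
	 * aligned, so the low bit of a real owner value is always clear
	 */
	unsigned long self = 0xf000UL;
	unsigned long owner;
	long some_mutex;

	/* uncontended: the lock word is just the owner */
	owner = self;
	assert((owner & ~MTX_WAITERS) == self && !(owner & MTX_WAITERS));

	/* contended: same owner, waiters bit set */
	owner = self | MTX_WAITERS;
	assert((owner & ~MTX_WAITERS) == self && (owner & MTX_WAITERS));

	printf("mutex at %p would park in lot %lu of %d\n",
	    (void *)&some_mutex, park_index(&some_mutex), MTX_PARKING_LOTS);

	return (0);
}

Read this way, mtx_leave in the diff is the encoding in reverse: a CAS from self to 0 succeeds only while the waiters bit is clear; otherwise it locks the mutex's parking lot and either clears the word or hands ownership to a waiter that has already lost more than mtx_fair_prio wakeups.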