Index: dev/sdmmc/sdmmc_scsi.c =================================================================== RCS file: /cvs/src/sys/dev/sdmmc/sdmmc_scsi.c,v diff -u -p -r1.63 sdmmc_scsi.c --- dev/sdmmc/sdmmc_scsi.c 19 Apr 2023 01:46:10 -0000 1.63 +++ dev/sdmmc/sdmmc_scsi.c 11 Oct 2024 10:53:00 -0000 @@ -623,8 +623,9 @@ sdmmc_scsi_hibernate_io(dev_t dev, daddr state->sdmmc_sf.sc = &state->sdmmc_sc; /* pretend we own the lock */ - state->sdmmc_sc.sc_lock.rwl_owner = - (((long)curproc) & ~RWLOCK_MASK) | RWLOCK_WRLOCK; + state->sdmmc_sc.sc_lock.rwl_state = RW_WRITE; + state->sdmmc_sc.sc_lock.rwl_owner = curproc; + state->sdmmc_sc.sc_lock.rwl_depth = 1; /* build chip layer fake softc */ error = state->sdmmc_sc.sct->hibernate_init(state->sdmmc_sc.sch, Index: kern/kern_lock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_lock.c,v diff -u -p -r1.75 kern_lock.c --- kern/kern_lock.c 3 Jul 2024 01:36:50 -0000 1.75 +++ kern/kern_lock.c 11 Oct 2024 10:53:00 -0000 @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -129,6 +130,7 @@ __mp_lock(struct __mp_lock *mpl) { struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[cpu_number()]; unsigned long s; + unsigned int depth; #ifdef WITNESS if (!__mp_lock_held(mpl, curcpu())) @@ -136,15 +138,22 @@ __mp_lock(struct __mp_lock *mpl) LOP_EXCLUSIVE | LOP_NEWORDER, NULL); #endif + s = intr_disable(); - if (cpu->mplc_depth++ == 0) + depth = cpu->mplc_depth++; + if (depth == 0) { + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_A_START); cpu->mplc_ticket = atomic_inc_int_nv(&mpl->mpl_users); + } intr_restore(s); __mp_lock_spin(mpl, cpu->mplc_ticket); membar_enter_after_atomic(); WITNESS_LOCK(&mpl->mpl_lock_obj, LOP_EXCLUSIVE); + + if (depth == 0) + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_A_EXCL); } void @@ -164,6 +173,7 @@ __mp_unlock(struct __mp_lock *mpl) s = intr_disable(); if (--cpu->mplc_depth == 0) { + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_R_EXCL); membar_exit(); mpl->mpl_ticket++; } @@ -180,6 +190,8 @@ __mp_release_all(struct __mp_lock *mpl) int i; #endif + LLTRACE(lltrace_lock, mpl, LLTRACE_LK_K, LLTRACE_LK_R_EXCL); + s = intr_disable(); rv = cpu->mplc_depth; #ifdef WITNESS @@ -227,29 +239,64 @@ __mtx_init(struct mutex *mtx, int wantip void mtx_enter(struct mutex *mtx) { - struct schedstate_percpu *spc = &curcpu()->ci_schedstate; + struct cpu_info *owner, *ci = curcpu(); + struct schedstate_percpu *spc = &ci->ci_schedstate; + int s; #ifdef MP_LOCKDEBUG int nticks = __mp_lock_spinout; #endif +#if NLLT > 0 + unsigned int lltev = LLTRACE_LK_I_EXCL; +#endif + + /* Avoid deadlocks after panic or in DDB */ + if (panicstr || db_active) + return; WITNESS_CHECKORDER(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE | LOP_NEWORDER, NULL); - spc->spc_spinning++; - while (mtx_enter_try(mtx) == 0) { + if (mtx->mtx_wantipl != IPL_NONE) + s = splraise(mtx->mtx_wantipl); + + owner = atomic_cas_ptr(&mtx->mtx_owner, NULL, ci); +#ifdef DIAGNOSTIC + if (__predict_false(owner == ci)) + panic("mtx %p: locking against myself", mtx); +#endif + if (owner != NULL) { + LLTRACE_SPC(spc, lltrace_lock, mtx, LLTRACE_LK_MTX, + LLTRACE_LK_A_START); + + spc->spc_spinning++; do { - CPU_BUSY_CYCLE(); + do { + CPU_BUSY_CYCLE(); #ifdef MP_LOCKDEBUG - if (--nticks == 0) { - db_printf("%s: %p lock spun out\n", - __func__, mtx); - db_enter(); - nticks = __mp_lock_spinout; - } + if (--nticks == 0) { + db_printf("%s: %p lock spun out\n", + __func__, mtx); + db_enter(); + nticks = __mp_lock_spinout; + } +#endif + } while 
(mtx->mtx_owner != NULL); + } while (atomic_cas_ptr(&mtx->mtx_owner, NULL, ci) != NULL); + spc->spc_spinning--; + +#if NLLT > 0 + lltev = LLTRACE_LK_A_EXCL; #endif - } while (mtx->mtx_owner != NULL); } - spc->spc_spinning--; + + membar_enter_after_atomic(); + if (mtx->mtx_wantipl != IPL_NONE) + mtx->mtx_oldipl = s; +#ifdef DIAGNOSTIC + ci->ci_mutex_level++; +#endif + WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); + LLTRACE_SPC(spc, lltrace_lock, mtx, LLTRACE_LK_MTX, lltev); } int @@ -278,12 +325,15 @@ mtx_enter_try(struct mutex *mtx) ci->ci_mutex_level++; #endif WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); + LLTRACE_CPU(ci, lltrace_lock, mtx, LLTRACE_LK_MTX, + LLTRACE_LK_I_EXCL); return (1); } if (mtx->mtx_wantipl != IPL_NONE) splx(s); + LLTRACE_CPU(ci, lltrace_lock, mtx, LLTRACE_LK_MTX, LLTRACE_LK_I_FAIL); return (0); } #else @@ -313,6 +363,7 @@ mtx_enter(struct mutex *mtx) ci->ci_mutex_level++; #endif WITNESS_LOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); + LLTRACE(lltrace_lock, mtx, LLTRACE_LK_MTX, LLTRACE_LK_I_EXCL); } int @@ -333,6 +384,7 @@ mtx_leave(struct mutex *mtx) return; MUTEX_ASSERT_LOCKED(mtx); + LLTRACE(lltrace_lock, mtx, LLTRACE_LK_MTX, LLTRACE_LK_R_EXCL); WITNESS_UNLOCK(MUTEX_LOCK_OBJECT(mtx), LOP_EXCLUSIVE); #ifdef DIAGNOSTIC Index: kern/kern_rwlock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_rwlock.c,v diff -u -p -r1.50 kern_rwlock.c --- kern/kern_rwlock.c 14 Jul 2023 07:07:08 -0000 1.50 +++ kern/kern_rwlock.c 11 Oct 2024 10:53:00 -0000 @@ -25,11 +25,20 @@ #include #include #include +#include -void rw_do_exit(struct rwlock *, unsigned long); +#define LLTRW(_rwl, _ev) LLTRACE(lltrace_lock, (_rwl), LLTRACE_LK_RW, (_ev)) -/* XXX - temporary measure until proc0 is properly aligned */ -#define RW_PROC(p) (((long)p) & ~RWLOCK_MASK) +struct rwlock_waiter { + volatile unsigned int rww_wait; + struct proc *rww_owner; + struct rwlock_waiter **rww_prev; + struct rwlock_waiter *rww_next; +}; + +static int rw_write(struct rwlock *, int); +static int rw_read(struct rwlock *, int); +static int rw_downgrade(struct rwlock *, int); /* * Other OSes implement more sophisticated mechanism to determine how long the @@ -39,335 +48,462 @@ void rw_do_exit(struct rwlock *, unsigne */ #define RW_SPINS 1000 -#ifdef MULTIPROCESSOR -#define rw_cas(p, o, n) (atomic_cas_ulong(p, o, n) != o) -#else -static inline int -rw_cas(volatile unsigned long *p, unsigned long o, unsigned long n) +static void +_rw_init_flags_witness(struct rwlock *rwl, const char *name, int lo_flags, + const struct lock_type *type) { - if (*p != o) - return (1); - *p = n; + rwl->rwl_lock = 0; + rwl->rwl_state = 0; + rwl->rwl_readers = 0; + rwl->rwl_depth = 0; + rwl->rwl_owner = NULL; + rwl->rwl_name = name; + rwl->rwl_head = NULL; + rwl->rwl_tail = &rwl->rwl_head; - return (0); -} +#ifdef WITNESS + rwl->rwl_lock_obj.lo_flags = lo_flags; + rwl->rwl_lock_obj.lo_name = name; + rwl->rwl_lock_obj.lo_type = type; + WITNESS_INIT(&rwl->rwl_lock_obj, type); +#else + (void)type; + (void)lo_flags; #endif - -/* - * Magic wand for lock operations. Every operation checks if certain - * flags are set and if they aren't, it increments the lock with some - * value (that might need some computing in a few cases). If the operation - * fails, we need to set certain flags while waiting for the lock. - * - * RW_WRITE The lock must be completely empty. We increment it with - * RWLOCK_WRLOCK and the proc pointer of the holder. - * Sets RWLOCK_WAIT|RWLOCK_WRWANT while waiting. 
- * RW_READ RWLOCK_WRLOCK|RWLOCK_WRWANT may not be set. We increment - * with RWLOCK_READ_INCR. RWLOCK_WAIT while waiting. - */ -static const struct rwlock_op { - unsigned long inc; - unsigned long check; - unsigned long wait_set; - long proc_mult; - int wait_prio; -} rw_ops[] = { - { /* RW_WRITE */ - RWLOCK_WRLOCK, - ULONG_MAX, - RWLOCK_WAIT | RWLOCK_WRWANT, - 1, - PLOCK - 4 - }, - { /* RW_READ */ - RWLOCK_READ_INCR, - RWLOCK_WRLOCK | RWLOCK_WRWANT, - RWLOCK_WAIT, - 0, - PLOCK - }, - { /* Sparse Entry. */ - 0, - }, - { /* RW_DOWNGRADE */ - RWLOCK_READ_INCR - RWLOCK_WRLOCK, - 0, - 0, - -1, - PLOCK - }, -}; +} void -rw_enter_read(struct rwlock *rwl) +_rw_init_flags(struct rwlock *rwl, const char *name, int flags, + const struct lock_type *type) { - unsigned long owner = rwl->rwl_owner; + _rw_init_flags_witness(rwl, name, RWLOCK_LO_FLAGS(flags), type); +} - if (__predict_false((owner & (RWLOCK_WRLOCK | RWLOCK_WRWANT)) || - rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR))) - rw_enter(rwl, RW_READ); - else { - membar_enter_after_atomic(); - WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, NULL); - WITNESS_LOCK(&rwl->rwl_lock_obj, 0); +#ifdef MULTIPROCESSOR +static inline unsigned long +rw_lock_enter(struct rwlock *rwl) +{ + unsigned long s = intr_disable(); + while (atomic_cas_uint(&rwl->rwl_lock, 0, 1) != 0) { + do { + CPU_BUSY_CYCLE(); + } while (atomic_load_int(&rwl->rwl_lock) != 0); } + membar_enter_after_atomic(); + return (s); } -void -rw_enter_write(struct rwlock *rwl) +static inline void +rw_lock_leave(struct rwlock *rwl, unsigned long s) +{ + atomic_store_int(&rwl->rwl_lock, 0); + intr_restore(s); +} +#else /* MULTIPROCESSOR */ +static inline unsigned long +rw_lock_enter(struct rwlock *rwl) { - struct proc *p = curproc; + rwl->rwl_lock = 1; + return (1); +} - if (__predict_false(rw_cas(&rwl->rwl_owner, 0, - RW_PROC(p) | RWLOCK_WRLOCK))) - rw_enter(rwl, RW_WRITE); - else { - membar_enter_after_atomic(); - WITNESS_CHECKORDER(&rwl->rwl_lock_obj, - LOP_EXCLUSIVE | LOP_NEWORDER, NULL); - WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE); - } +static inline void +rw_lock_leave(struct rwlock *rwl, unsigned long s) +{ + rwl->rwl_lock = 0; } +#endif /* MULTIPROCESSOR */ -void -rw_exit_read(struct rwlock *rwl) +static inline void +rw_insert(struct rwlock *rwl, struct rwlock_waiter *rww) { - unsigned long owner; + struct rwlock_waiter **tail = rwl->rwl_tail; - rw_assert_rdlock(rwl); - WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0); + if (__predict_false(tail == NULL)) + tail = &rwl->rwl_head; - membar_exit_before_atomic(); - owner = rwl->rwl_owner; - if (__predict_false((owner & RWLOCK_WAIT) || - rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR))) - rw_do_exit(rwl, 0); + rww->rww_next = NULL; + rww->rww_prev = tail; + + *tail = rww; + rwl->rwl_tail = &rww->rww_next; } -void -rw_exit_write(struct rwlock *rwl) +static inline struct rwlock_waiter * +rw_first(struct rwlock *rwl) { - unsigned long owner; - - rw_assert_wrlock(rwl); - WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE); + return (rwl->rwl_head); +} - membar_exit_before_atomic(); - owner = rwl->rwl_owner; - if (__predict_false((owner & RWLOCK_WAIT) || - rw_cas(&rwl->rwl_owner, owner, 0))) - rw_do_exit(rwl, RWLOCK_WRLOCK); +static inline void +rw_remove(struct rwlock *rwl, struct rwlock_waiter *rww) +{ + if (rww->rww_next != NULL) + rww->rww_next->rww_prev = rww->rww_prev; + else + rwl->rwl_tail = rww->rww_prev; + *rww->rww_prev = rww->rww_next; } -#ifdef DIAGNOSTIC -/* - * Put the diagnostic functions here to keep the main code 
free - * from ifdef clutter. - */ -static void -rw_enter_diag(struct rwlock *rwl, int flags) +static int +rw_write(struct rwlock *rwl, int flags) { - switch (flags & RW_OPMASK) { - case RW_WRITE: - case RW_READ: - if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner)) - panic("rw_enter: %s locking against myself", - rwl->rwl_name); - break; - case RW_DOWNGRADE: - /* - * If we're downgrading, we must hold the write lock. - */ - if ((rwl->rwl_owner & RWLOCK_WRLOCK) == 0) - panic("rw_enter: %s downgrade of non-write lock", - rwl->rwl_name); - if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner)) - panic("rw_enter: %s downgrade, not holder", - rwl->rwl_name); - break; + struct proc *self = curproc; + unsigned int state; + struct rwlock_waiter waiter = { .rww_wait = 1, .rww_owner = self }; + int prio = PLOCK - 4; + unsigned long s; - default: - panic("rw_enter: unknown op 0x%x", flags); +#ifdef WITNESS + if (!ISSET(flags, RW_NOSLEEP)) { + int lop_flags = LOP_NEWORDER | LOP_EXCLUSIVE; + if (ISSET(flags, RW_DUPOK)) + lop_flags |= LOP_DUPOK; + WITNESS_CHECKORDER(&rwl->rwl_lock_obj, lop_flags, NULL); } -} +#endif -#else -#define rw_enter_diag(r, f) + s = rw_lock_enter(rwl); + state = rwl->rwl_state; + if (state == 0) { + KASSERT(rwl->rwl_owner == NULL); + KASSERT(rwl->rwl_depth == 0); + rwl->rwl_state = RW_WRITE; + rwl->rwl_owner = self; + rwl->rwl_depth = 1; + } else { + if (rwl->rwl_owner == self) { + KASSERT(state == RW_WRITE); + rw_lock_leave(rwl, s); + /* for rrwlocks to handle */ + return (EDEADLK); + } + if (ISSET(flags, RW_NOSLEEP)) { + rw_lock_leave(rwl, s); + return (EBUSY); + } + rw_insert(rwl, &waiter); + } + rw_lock_leave(rwl, s); + + if (state == 0) { + membar_enter_after_atomic(); + LLTRW(rwl, LLTRACE_LK_I_EXCL); + return (0); + } + + LLTRW(rwl, LLTRACE_LK_A_START) + ; + +#ifdef MULTIPROCESSOR + { + unsigned int i; + + for (i = 0; i < RW_SPINS; i++) { + CPU_BUSY_CYCLE(); + if (!atomic_load_int(&waiter.rww_wait)) + goto locked; + } + } #endif -static void -_rw_init_flags_witness(struct rwlock *rwl, const char *name, int lo_flags, - const struct lock_type *type) + if (ISSET(flags, RW_INTR)) + prio |= PCATCH; + + do { + int error; + + sleep_setup(&waiter.rww_wait, prio, rwl->rwl_name); + error = sleep_finish(0, atomic_load_int(&waiter.rww_wait)); + if (ISSET(flags, RW_INTR) && (error != 0)) { + s = rw_lock_enter(rwl); + if (waiter.rww_wait) + rw_remove(rwl, &waiter); + else { + KASSERT(rwl->rwl_state == RW_WRITE); + KASSERT(rwl->rwl_owner == self); + error = 0; + } + rw_lock_leave(rwl, s); + if (error != 0) { + LLTRW(rwl, LLTRACE_LK_A_ABORT); + return (error); + } + + goto locked; + } + } while (atomic_load_int(&waiter.rww_wait)); + +locked: + LLTRW(rwl, LLTRACE_LK_A_EXCL); + if (ISSET(flags, RW_SLEEPFAIL)) { + rw_exit(rwl); + return (EAGAIN); + } + + __builtin_prefetch(rwl, 1); + membar_enter(); + return (0); +} + +void +rw_enter_write(struct rwlock *rwl) { - rwl->rwl_owner = 0; - rwl->rwl_name = name; + int error; -#ifdef WITNESS - rwl->rwl_lock_obj.lo_flags = lo_flags; - rwl->rwl_lock_obj.lo_name = name; - rwl->rwl_lock_obj.lo_type = type; - WITNESS_INIT(&rwl->rwl_lock_obj, type); -#else - (void)type; - (void)lo_flags; -#endif + error = rw_write(rwl, 0); + if (error == EDEADLK) + panic("%s(%p): %s deadlock", __func__, rwl, rwl->rwl_name); } void -_rw_init_flags(struct rwlock *rwl, const char *name, int flags, - const struct lock_type *type) +rw_exit_write(struct rwlock *rwl) { - _rw_init_flags_witness(rwl, name, RWLOCK_LO_FLAGS(flags), type); + rw_exit(rwl); } -int -rw_enter(struct rwlock *rwl, 
int flags) +static int +rw_read(struct rwlock *rwl, int flags) { - const struct rwlock_op *op; - unsigned long inc, o; -#ifdef MULTIPROCESSOR - /* - * If process holds the kernel lock, then we want to give up on CPU - * as soon as possible so other processes waiting for the kernel lock - * can progress. Hence no spinning if we hold the kernel lock. - */ - unsigned int spin = (_kernel_lock_held()) ? 0 : RW_SPINS; -#endif - int error, prio; -#ifdef WITNESS - int lop_flags; + struct proc *self = curproc; + struct proc *owner = NULL; + unsigned int state; + int prio = PLOCK; + unsigned long s; - lop_flags = LOP_NEWORDER; - if (flags & RW_WRITE) - lop_flags |= LOP_EXCLUSIVE; - if (flags & RW_DUPOK) - lop_flags |= LOP_DUPOK; - if ((flags & RW_NOSLEEP) == 0 && (flags & RW_DOWNGRADE) == 0) +#ifdef WITNESS + if (!ISSET(flags, RW_NOSLEEP)) { + int lop_flags = LOP_NEWORDER; + if (ISSET(flags, RW_DUPOK)) + lop_flags |= LOP_DUPOK; WITNESS_CHECKORDER(&rwl->rwl_lock_obj, lop_flags, NULL); + } #endif - op = &rw_ops[(flags & RW_OPMASK) - 1]; + s = rw_lock_enter(rwl); + state = rwl->rwl_state; + switch (state) { + case 0: + rwl->rwl_state = state = RW_READ; + break; + case RW_WRITE: + owner = rwl->rwl_owner; + KASSERT(owner != NULL); + KASSERT(owner != self); + if (ISSET(flags, RW_NOSLEEP)) { + rw_lock_leave(rwl, s); + return (EBUSY); + } + break; + } + rwl->rwl_readers++; + rw_lock_leave(rwl, s); + + if (state == RW_READ) { + LLTRW(rwl, LLTRACE_LK_I_SHARED); + membar_enter_after_atomic(); + return (0); + } - inc = op->inc + RW_PROC(curproc) * op->proc_mult; -retry: - while (__predict_false(((o = rwl->rwl_owner) & op->check) != 0)) { - unsigned long set = o | op->wait_set; - int do_sleep; - - /* Avoid deadlocks after panic or in DDB */ - if (panicstr || db_active) - return (0); + LLTRW(rwl, LLTRACE_LK_A_START) + ; #ifdef MULTIPROCESSOR - /* - * It makes sense to try to spin just in case the lock - * is acquired by writer. 
- */ - if ((o & RWLOCK_WRLOCK) && (spin != 0)) { - spin--; + { + unsigned int i; + + for (i = 0; i < RW_SPINS; i++) { CPU_BUSY_CYCLE(); - continue; + state = atomic_load_int(&rwl->rwl_state); + if (state == RW_READ) + goto locked; } + } #endif - rw_enter_diag(rwl, flags); + if (ISSET(flags, RW_INTR)) + prio |= PCATCH; - if (flags & RW_NOSLEEP) - return (EBUSY); + do { + int error; + + sleep_setup(&rwl->rwl_state, prio, rwl->rwl_name); + state = atomic_load_int(&rwl->rwl_state); + error = sleep_finish(0, state != RW_READ); + if (ISSET(flags, RW_INTR) && (error != 0)) { + s = rw_lock_enter(rwl); + if (rwl->rwl_state != RW_READ) { + KASSERT(rwl->rwl_readers > 0); + rwl->rwl_readers--; + } else + error = 0; + rw_lock_leave(rwl, s); + if (error != 0) { + LLTRW(rwl, LLTRACE_LK_A_ABORT); + return (error); + } + goto locked; + } + } while (state != RW_READ); - prio = op->wait_prio; - if (flags & RW_INTR) - prio |= PCATCH; - sleep_setup(rwl, prio, rwl->rwl_name); - - do_sleep = !rw_cas(&rwl->rwl_owner, o, set); - - error = sleep_finish(0, do_sleep); - if ((flags & RW_INTR) && - (error != 0)) - return (error); - if (flags & RW_SLEEPFAIL) - return (EAGAIN); +locked: + LLTRW(rwl, LLTRACE_LK_A_SHARED); + if (ISSET(flags, RW_SLEEPFAIL)) { + rw_exit(rwl); + return (EAGAIN); } - if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc))) - goto retry; - membar_enter_after_atomic(); + membar_enter(); + return (0); +} + +void +rw_enter_read(struct rwlock *rwl) +{ + rw_read(rwl, 0); +} - /* - * If old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we - * downgraded a write lock and had possible read waiter, wake them - * to let them retry the lock. - */ - if (__predict_false((o & (RWLOCK_WRLOCK|RWLOCK_WAIT)) == - (RWLOCK_WRLOCK|RWLOCK_WAIT))) - wakeup(rwl); +void +rw_exit_read(struct rwlock *rwl) +{ + rw_exit(rwl); +} - if (flags & RW_DOWNGRADE) - WITNESS_DOWNGRADE(&rwl->rwl_lock_obj, lop_flags); - else - WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags); +static int +rw_downgrade(struct rwlock *rwl, int flags) +{ + struct proc *self = curproc; + unsigned int nwake; + unsigned long s; + + LLTRW(rwl, LLTRACE_LK_DOWNGRADE); + + s = rw_lock_enter(rwl); + KASSERT(rwl->rwl_state == RW_WRITE); + KASSERT(rwl->rwl_owner == self); + KASSERT(rwl->rwl_depth == 1); + rwl->rwl_state = RW_READ; + rwl->rwl_owner = NULL; + rwl->rwl_depth = 0; + nwake = rwl->rwl_readers++; + rw_lock_leave(rwl, s); + + if (nwake > 0) + wakeup_n(&rwl->rwl_state, nwake); return (0); } +int +rw_enter(struct rwlock *rwl, int flags) +{ + int op = flags & RW_OPMASK; + int error; + + switch (op) { + case RW_WRITE: + error = rw_write(rwl, flags); + if (error == EDEADLK) { + panic("%s(%p): %s deadlock", __func__, rwl, + rwl->rwl_name); + } + break; + case RW_READ: + error = rw_read(rwl, flags); + break; + case RW_DOWNGRADE: + error = rw_downgrade(rwl, flags); + break; + default: + panic("%s(%p, 0x%x): unknown op 0x%x", __func__, rwl, flags, + op); + /* NOTREACHED */ + } + + return (error); +} + void rw_exit(struct rwlock *rwl) { - unsigned long wrlock; + struct proc *self = curproc; + struct rwlock_waiter *rww; + volatile void *wchan = NULL; + unsigned int nwake; + unsigned long s; /* Avoid deadlocks after panic or in DDB */ if (panicstr || db_active) return; - wrlock = rwl->rwl_owner & RWLOCK_WRLOCK; - if (wrlock) - rw_assert_wrlock(rwl); - else - rw_assert_rdlock(rwl); - WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? 
LOP_EXCLUSIVE : 0); + s = rw_lock_enter(rwl); + switch (rwl->rwl_state) { + case RW_WRITE: + KASSERT(rwl->rwl_owner == self); + if (--rwl->rwl_depth > 0) + goto leave; + LLTRW(rwl, LLTRACE_LK_R_EXCL); + break; + case RW_READ: + KASSERT(rwl->rwl_owner == NULL); + if (--rwl->rwl_readers > 0) + goto leave; + LLTRW(rwl, LLTRACE_LK_R_SHARED); + break; + default: + panic("%s(%p): %s unexpected state %u", __func__, rwl, + rwl->rwl_name, rwl->rwl_state); + /* NOTREACHED */ + } + membar_exit(); - membar_exit_before_atomic(); - rw_do_exit(rwl, wrlock); -} + rww = rw_first(rwl); + if (rww != NULL) { + rw_remove(rwl, rww); + + /* move ownership */ + rwl->rwl_state = RW_WRITE; + rwl->rwl_owner = rww->rww_owner; + rwl->rwl_depth = 1; + + nwake = 1; + wchan = &rww->rww_wait; + + atomic_store_int(&rww->rww_wait, 0); + } else { + rwl->rwl_owner = NULL; + + nwake = rwl->rwl_readers; + if (nwake > 0) { + wchan = &rwl->rwl_state; + rwl->rwl_state = RW_READ; + } else + rwl->rwl_state = 0; + } +leave: + rw_lock_leave(rwl, s); -/* membar_exit_before_atomic() has to precede call of this function. */ -void -rw_do_exit(struct rwlock *rwl, unsigned long wrlock) -{ - unsigned long owner, set; + if (wchan != NULL) + wakeup_n(wchan, nwake); - do { - owner = rwl->rwl_owner; - if (wrlock) - set = 0; - else - set = (owner - RWLOCK_READ_INCR) & - ~(RWLOCK_WAIT|RWLOCK_WRWANT); - /* - * Potential MP race here. If the owner had WRWANT set, we - * cleared it and a reader can sneak in before a writer. - */ - } while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set))); - - if (owner & RWLOCK_WAIT) - wakeup(rwl); +// WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0); } int rw_status(struct rwlock *rwl) { - unsigned long owner = rwl->rwl_owner; + unsigned int state; + + /* avoid taking the spinlock to read these variables */ - if (owner & RWLOCK_WRLOCK) { - if (RW_PROC(curproc) == RW_PROC(owner)) - return RW_WRITE; - else - return RW_WRITE_OTHER; + state = atomic_load_int(&rwl->rwl_state); + if (state == RW_WRITE) { + membar_datadep_consumer(); + if (rwl->rwl_owner != curproc) + state = RW_WRITE_OTHER; } - if (owner) - return RW_READ; - return (0); + + return (state); } #ifdef DIAGNOSTIC @@ -380,11 +516,16 @@ rw_assert_wrlock(struct rwlock *rwl) #ifdef WITNESS witness_assert(&rwl->rwl_lock_obj, LA_XLOCKED); #else - if (!(rwl->rwl_owner & RWLOCK_WRLOCK)) - panic("%s: lock not held", rwl->rwl_name); - - if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner)) + switch (rw_status(rwl)) { + case RW_WRITE: + break; + case RW_WRITE_OTHER: panic("%s: lock not held by this process", rwl->rwl_name); + /* NOTREACHED */ + default: + panic("%s: lock not held", rwl->rwl_name); + /* NOTREACHED */ + } #endif } @@ -397,7 +538,7 @@ rw_assert_rdlock(struct rwlock *rwl) #ifdef WITNESS witness_assert(&rwl->rwl_lock_obj, LA_SLOCKED); #else - if (!RW_PROC(rwl->rwl_owner) || (rwl->rwl_owner & RWLOCK_WRLOCK)) + if (rw_status(rwl) != RW_READ) panic("%s: lock not shared", rwl->rwl_name); #endif } @@ -429,7 +570,7 @@ rw_assert_unlocked(struct rwlock *rwl) #ifdef WITNESS witness_assert(&rwl->rwl_lock_obj, LA_UNLOCKED); #else - if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner)) + if (rw_status(rwl) == RW_WRITE) panic("%s: lock held", rwl->rwl_name); #endif } @@ -440,7 +581,6 @@ void _rrw_init_flags(struct rrwlock *rrwl, const char *name, int flags, const struct lock_type *type) { - memset(rrwl, 0, sizeof(struct rrwlock)); _rw_init_flags_witness(&rrwl->rrwl_lock, name, RRWLOCK_LO_FLAGS(flags), type); } @@ -448,47 +588,48 @@ _rrw_init_flags(struct rrwlock 
*rrwl, co int rrw_enter(struct rrwlock *rrwl, int flags) { - int rv; + struct rwlock *rwl = &rrwl->rrwl_lock; + int op = flags & RW_OPMASK; + int error; - if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) { - if (flags & RW_RECURSEFAIL) - return (EDEADLK); - else { - rrwl->rrwl_wcnt++; - WITNESS_LOCK(&rrwl->rrwl_lock.rwl_lock_obj, - LOP_EXCLUSIVE); - return (0); + switch (op) { + case RW_WRITE: + error = rw_write(rwl, flags); + if (error == EDEADLK && !ISSET(flags, RW_RECURSEFAIL)) { + rwl->rwl_depth++; + error = 0; } + break; + case RW_READ: + error = rw_read(rwl, flags); + break; + case RW_DOWNGRADE: + panic("%s(%p, 0x%x): downgrade not supported", __func__, + rwl, flags); + /* NOTREACHED */ + default: + panic("%s(%p, 0x%x): unknown op 0x%x", __func__, rwl, flags, + op); + /* NOTREACHED */ } - rv = rw_enter(&rrwl->rrwl_lock, flags); - if (rv == 0) - rrwl->rrwl_wcnt = 1; - - return (rv); + return (error); } void rrw_exit(struct rrwlock *rrwl) { + struct rwlock *rwl = &rrwl->rrwl_lock; - if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) { - KASSERT(rrwl->rrwl_wcnt > 0); - rrwl->rrwl_wcnt--; - if (rrwl->rrwl_wcnt != 0) { - WITNESS_UNLOCK(&rrwl->rrwl_lock.rwl_lock_obj, - LOP_EXCLUSIVE); - return; - } - } - - rw_exit(&rrwl->rrwl_lock); + rw_exit(rwl); } int rrw_status(struct rrwlock *rrwl) { - return (rw_status(&rrwl->rrwl_lock)); + struct rwlock *rwl = &rrwl->rrwl_lock; + + return (rw_status(rwl)); } /*- Index: kern/vfs_subr.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_subr.c,v diff -u -p -r1.322 vfs_subr.c --- kern/vfs_subr.c 13 Jul 2024 14:37:55 -0000 1.322 +++ kern/vfs_subr.c 11 Oct 2024 10:53:00 -0000 @@ -246,10 +246,7 @@ vfs_unbusy(struct mount *mp) int vfs_isbusy(struct mount *mp) { - if (RWLOCK_OWNER(&mp->mnt_lock) > 0) - return (1); - else - return (0); + return (rw_status(&mp->mnt_lock) != 0); } /* Index: sys/rwlock.h =================================================================== RCS file: /cvs/src/sys/sys/rwlock.h,v diff -u -p -r1.28 rwlock.h --- sys/rwlock.h 11 Jan 2021 18:49:38 -0000 1.28 +++ sys/rwlock.h 11 Oct 2024 10:53:00 -0000 @@ -57,12 +57,19 @@ #include struct proc; +struct rwlock_waiter; struct rwlock { - volatile unsigned long rwl_owner; - const char *rwl_name; + unsigned int rwl_lock; + unsigned int rwl_state; + unsigned int rwl_readers; + unsigned int rwl_depth; + struct proc *rwl_owner; + const char *rwl_name; + struct rwlock_waiter *rwl_head; + struct rwlock_waiter **rwl_tail; #ifdef WITNESS - struct lock_object rwl_lock_obj; + struct lock_object rwl_lock_obj; #endif }; @@ -90,11 +97,27 @@ struct rwlock { #define RWL_IS_VNODE 0x04 #ifdef WITNESS -#define RWLOCK_INITIALIZER(name) \ - { 0, name, .rwl_lock_obj = RWLOCK_LO_INITIALIZER(name, 0) } +#define RWLOCK_INITIALIZER(name) { \ + .rwl_lock = 0, \ + .rwl_state = 0, \ + .rwl_readers = 0, \ + .rwl_depth = 0, \ + .rwl_owner = NULL, \ + .rwl_name = name, \ + .rwl_head = NULL, \ + .rwl_tail = NULL, \ + .rwl_lock_obj = RWLOCK_LO_INITIALIZER(name, 0) } \ +} #else -#define RWLOCK_INITIALIZER(name) \ - { 0, name } +#define RWLOCK_INITIALIZER(name) { \ + .rwl_lock = 0, \ + .rwl_state = 0, \ + .rwl_readers = 0, \ + .rwl_owner = NULL, \ + .rwl_name = name, \ + .rwl_head = NULL, \ + .rwl_tail = NULL, \ +} #endif #define RWLOCK_WAIT 0x01UL @@ -127,7 +150,6 @@ struct rwlock { /* recursive rwlocks; */ struct rrwlock { struct rwlock rrwl_lock; - uint32_t rrwl_wcnt; /* # writers. 
*/ }; #ifdef _KERNEL Index: uvm/uvm_map.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_map.c,v diff -u -p -r1.330 uvm_map.c --- uvm/uvm_map.c 24 Jul 2024 12:17:31 -0000 1.330 +++ uvm/uvm_map.c 11 Oct 2024 10:53:00 -0000 @@ -5201,68 +5201,77 @@ out: boolean_t vm_map_lock_try_ln(struct vm_map *map, char *file, int line) { - boolean_t rv; + int rv; if (map->flags & VM_MAP_INTRSAFE) { - rv = mtx_enter_try(&map->mtx); + if (!mtx_enter_try(&map->mtx)) + return (FALSE); } else { + struct proc *busy; + mtx_enter(&map->flags_lock); - if ((map->flags & VM_MAP_BUSY) && (map->busy != curproc)) { - mtx_leave(&map->flags_lock); + busy = map->busy; + mtx_leave(&map->flags_lock); + if (busy != NULL && busy != curproc) return (FALSE); - } + + rv = rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP); + if (rv != 0) + return (FALSE); + + /* to be sure, to be sure */ + mtx_enter(&map->flags_lock); + busy = map->busy; mtx_leave(&map->flags_lock); - rv = (rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP) == 0); - /* check if the lock is busy and back out if we won the race */ - if (rv) { - mtx_enter(&map->flags_lock); - if ((map->flags & VM_MAP_BUSY) && - (map->busy != curproc)) { - rw_exit(&map->lock); - rv = FALSE; - } - mtx_leave(&map->flags_lock); + if (busy != NULL && busy != curproc) { + rw_exit(&map->lock); + return (FALSE); } } - if (rv) { - map->timestamp++; - LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); - uvm_tree_sanity(map, file, line); - uvm_tree_size_chk(map, file, line); - } + map->timestamp++; + LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); + uvm_tree_sanity(map, file, line); + uvm_tree_size_chk(map, file, line); - return (rv); + return (TRUE); } void vm_map_lock_ln(struct vm_map *map, char *file, int line) { - if ((map->flags & VM_MAP_INTRSAFE) == 0) { - do { - mtx_enter(&map->flags_lock); -tryagain: - while ((map->flags & VM_MAP_BUSY) && - (map->busy != curproc)) { - map->flags |= VM_MAP_WANTLOCK; - msleep_nsec(&map->flags, &map->flags_lock, + if (map->flags & VM_MAP_INTRSAFE) { + mtx_enter(&map->mtx); + } else { + mtx_enter(&map->flags_lock); + for (;;) { + while (map->busy != NULL && map->busy != curproc) { + map->nbusy++; + msleep_nsec(&map->busy, &map->mtx, PVM, vmmapbsy, INFSLP); + map->nbusy--; } mtx_leave(&map->flags_lock); - } while (rw_enter(&map->lock, RW_WRITE|RW_SLEEPFAIL) != 0); - /* check if the lock is busy and back out if we won the race */ - mtx_enter(&map->flags_lock); - if ((map->flags & VM_MAP_BUSY) && (map->busy != curproc)) { - rw_exit(&map->lock); - goto tryagain; + + rw_enter_write(&map->lock); + + /* to be sure, to be sure */ + mtx_enter(&map->flags_lock); + if (map->busy != NULL && map->busy != curproc) { + /* go around again */ + rw_exit_write(&map->lock); + } else { + /* we won */ + break; + } } mtx_leave(&map->flags_lock); - } else { - mtx_enter(&map->mtx); } - if (map->busy != curproc) + if (map->busy != curproc) { + KASSERT(map->busy == NULL); map->timestamp++; + } LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); uvm_tree_sanity(map, file, line); uvm_tree_size_chk(map, file, line); @@ -5312,27 +5321,24 @@ vm_map_busy_ln(struct vm_map *map, char KASSERT(rw_write_held(&map->lock)); KASSERT(map->busy == NULL); - mtx_enter(&map->flags_lock); map->busy = curproc; - map->flags |= VM_MAP_BUSY; - mtx_leave(&map->flags_lock); } void vm_map_unbusy_ln(struct vm_map *map, char *file, int line) { - int oflags; + unsigned int nbusy; KASSERT((map->flags & VM_MAP_INTRSAFE) == 0); KASSERT(map->busy == curproc); 
mtx_enter(&map->flags_lock); - oflags = map->flags; + nbusy = map->nbusy; map->busy = NULL; - map->flags &= ~(VM_MAP_BUSY|VM_MAP_WANTLOCK); mtx_leave(&map->flags_lock); - if (oflags & VM_MAP_WANTLOCK) - wakeup(&map->flags); + + if (nbusy > 0) + wakeup_n(&map->busy, nbusy); } void Index: uvm/uvm_map.h =================================================================== RCS file: /cvs/src/sys/uvm/uvm_map.h,v diff -u -p -r1.90 uvm_map.h --- uvm/uvm_map.h 18 Jun 2024 12:37:29 -0000 1.90 +++ uvm/uvm_map.h 11 Oct 2024 10:53:00 -0000 @@ -214,17 +214,6 @@ RBT_PROTOTYPE(uvm_map_addr, vm_map_entry * map is write-locked. may be tested * without asserting `flags_lock'. * - * VM_MAP_BUSY r/w; may only be set when map is - * write-locked, may only be cleared by - * thread which set it, map read-locked - * or write-locked. must be tested - * while `flags_lock' is asserted. - * - * VM_MAP_WANTLOCK r/w; may only be set when the map - * is busy, and thread is attempting - * to write-lock. must be tested - * while `flags_lock' is asserted. - * * VM_MAP_GUARDPAGES r/o; must be specified at map * initialization time. * If set, guards will appear between @@ -257,6 +246,7 @@ RBT_PROTOTYPE(uvm_map_addr, vm_map_entry * a atomic operations * I immutable after creation or exec(2) * v `vm_map_lock' (this map `lock' or `mtx') + * f flags_lock */ struct vm_map { struct pmap *pmap; /* [I] Physical map */ @@ -266,9 +256,10 @@ struct vm_map { vsize_t size; /* virtual size */ int ref_count; /* [a] Reference count */ - int flags; /* flags */ + int flags; /* [f] flags */ unsigned int timestamp; /* Version number */ - struct proc *busy; /* [v] thread holding map busy*/ + struct proc *busy; /* [f] thread holding map busy*/ + unsigned int nbusy; /* [f] waiters for busy */ vaddr_t min_offset; /* [I] First address in map. */ vaddr_t max_offset; /* [I] Last address in map. */ @@ -323,8 +314,6 @@ struct vm_map { #define VM_MAP_PAGEABLE 0x01 /* ro: entries are pageable */ #define VM_MAP_INTRSAFE 0x02 /* ro: interrupt safe map */ #define VM_MAP_WIREFUTURE 0x04 /* rw: wire future mappings */ -#define VM_MAP_BUSY 0x08 /* rw: map is busy */ -#define VM_MAP_WANTLOCK 0x10 /* rw: want to write-lock */ #define VM_MAP_GUARDPAGES 0x20 /* rw: add guard pgs to map */ #define VM_MAP_ISVMSPACE 0x40 /* ro: map is a vmspace */ #define VM_MAP_PINSYSCALL_ONCE 0x100 /* rw: pinsyscall done */
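
As a quick illustration of the locking policy the new kern_rwlock.c implements, here is a minimal userland sketch of the same design written against pthreads. It is a model under stated assumptions, not the kernel code: the pthread mutex stands in for the rwl_lock spinlock, the condition variables stand in for the sleep_setup()/wakeup_n() wait channels (&rww->rww_wait and &rwl->rwl_state), and WITNESS, LLTRACE, the adaptive spin, recursion via rwl_depth and the RW_NOSLEEP/RW_INTR/RW_SLEEPFAIL error paths are all omitted. The rwl_*/rww_* names below only mirror the diff; none of this code is part of the patch.

#include <pthread.h>

#define RW_READ		1
#define RW_WRITE	2

struct rww {				/* one queued exclusive waiter */
	int		 rww_wait;	/* 1 while still waiting */
	pthread_t	 rww_owner;
	pthread_cond_t	 rww_cv;	/* stands in for the &rww->rww_wait wchan */
	struct rww	*rww_next;
	struct rww	**rww_prev;	/* the link that points at us */
};

struct rwl {
	pthread_mutex_t	 rwl_lock;	/* stands in for the rwl_lock spinlock */
	unsigned int	 rwl_state;	/* 0, RW_READ or RW_WRITE */
	unsigned int	 rwl_readers;
	pthread_t	 rwl_owner;	/* only valid while rwl_state == RW_WRITE */
	pthread_cond_t	 rwl_readcv;	/* stands in for the &rwl->rwl_state wchan */
	struct rww	*rwl_head;	/* FIFO of exclusive waiters */
	struct rww	**rwl_tail;
};

void
rwl_init(struct rwl *l)
{
	pthread_mutex_init(&l->rwl_lock, NULL);
	pthread_cond_init(&l->rwl_readcv, NULL);
	l->rwl_state = 0;
	l->rwl_readers = 0;
	l->rwl_head = NULL;
	l->rwl_tail = &l->rwl_head;
}

void
rwl_enter_write(struct rwl *l)
{
	struct rww w = { .rww_wait = 1, .rww_owner = pthread_self() };

	pthread_cond_init(&w.rww_cv, NULL);
	pthread_mutex_lock(&l->rwl_lock);
	if (l->rwl_state == 0) {
		l->rwl_state = RW_WRITE;
		l->rwl_owner = w.rww_owner;
	} else {
		/* append ourselves to the tail of the waiter list */
		w.rww_prev = l->rwl_tail;
		*l->rwl_tail = &w;
		l->rwl_tail = &w.rww_next;
		/* rwl_exit() makes us the owner before clearing rww_wait */
		while (w.rww_wait)
			pthread_cond_wait(&w.rww_cv, &l->rwl_lock);
	}
	pthread_mutex_unlock(&l->rwl_lock);
	pthread_cond_destroy(&w.rww_cv);
}

void
rwl_enter_read(struct rwl *l)
{
	pthread_mutex_lock(&l->rwl_lock);
	if (l->rwl_state == 0)
		l->rwl_state = RW_READ;
	l->rwl_readers++;
	/* readers queued behind a writer sleep until rwl_exit() admits them */
	while (l->rwl_state != RW_READ)
		pthread_cond_wait(&l->rwl_readcv, &l->rwl_lock);
	pthread_mutex_unlock(&l->rwl_lock);
}

void
rwl_exit(struct rwl *l)
{
	struct rww *w;

	pthread_mutex_lock(&l->rwl_lock);
	if (l->rwl_state == RW_READ && --l->rwl_readers > 0) {
		pthread_mutex_unlock(&l->rwl_lock);
		return;
	}
	w = l->rwl_head;
	if (w != NULL) {
		/* unlink the first write waiter and hand the lock straight over */
		if (w->rww_next != NULL)
			w->rww_next->rww_prev = w->rww_prev;
		else
			l->rwl_tail = w->rww_prev;
		*w->rww_prev = w->rww_next;
		l->rwl_state = RW_WRITE;
		l->rwl_owner = w->rww_owner;
		w->rww_wait = 0;
		pthread_cond_signal(&w->rww_cv);
	} else if (l->rwl_readers > 0) {
		/* no writers queued: admit every reader that piled up */
		l->rwl_state = RW_READ;
		pthread_cond_broadcast(&l->rwl_readcv);
	} else
		l->rwl_state = 0;
	pthread_mutex_unlock(&l->rwl_lock);
}

Two properties of the sketch carry over directly from the diff: rw_exit() hands the lock to exactly one queued writer by installing the new owner before waking that waiter's private wait channel, so write handoff causes no thundering herd, and the rww_prev pointer-to-a-pointer lets a waiter that aborts its sleep (the RW_INTR path) unlink itself from anywhere in the FIFO in constant time.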