Index: kern/kern_rwlock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
diff -u -p -r1.50 kern_rwlock.c
--- kern/kern_rwlock.c	14 Jul 2023 07:07:08 -0000	1.50
+++ kern/kern_rwlock.c	13 Oct 2024 11:38:44 -0000
@@ -25,11 +25,9 @@
 #include
 #include
 #include
+#include

-void	rw_do_exit(struct rwlock *, unsigned long);
-
-/* XXX - temporary measure until proc0 is properly aligned */
-#define RW_PROC(p) (((long)p) & ~RWLOCK_MASK)
+#define LLTRW(_rwl, _ev) LLTRACE(lltrace_lock, (_rwl), LLTRACE_LK_RW, (_ev))

 /*
  * Other OSes implement more sophisticated mechanism to determine how long the
@@ -40,166 +38,124 @@ void	rw_do_exit(struct rwlock *, unsigne
 #define RW_SPINS	1000

 #ifdef MULTIPROCESSOR
-#define rw_cas(p, o, n)	(atomic_cas_ulong(p, o, n) != o)
+#define rw_cas(p, e, n)	atomic_cas_ulong(p, e, n)
+#define rw_inc(p)	atomic_inc_int(p)
+#define rw_dec(p)	atomic_dec_int(p)
 #else
-static inline int
-rw_cas(volatile unsigned long *p, unsigned long o, unsigned long n)
+static inline unsigned long
+rw_cas(volatile unsigned long *p, unsigned long e, unsigned long n)
 {
-	if (*p != o)
-		return (1);
-	*p = n;
+	unsigned long o = *p;

-	return (0);
+	if (o == e)
+		*p = n;
+
+	return (o);
+}
+
+static inline void
+rw_inc(volatile unsigned int *p)
+{
+	++(*p);
+}
+
+static inline void
+rw_dec(volatile unsigned int *p)
+{
+	(*p)--;
 }
 #endif

-/*
- * Magic wand for lock operations. Every operation checks if certain
- * flags are set and if they aren't, it increments the lock with some
- * value (that might need some computing in a few cases). If the operation
- * fails, we need to set certain flags while waiting for the lock.
- *
- * RW_WRITE	The lock must be completely empty. We increment it with
- *		RWLOCK_WRLOCK and the proc pointer of the holder.
- *		Sets RWLOCK_WAIT|RWLOCK_WRWANT while waiting.
- * RW_READ	RWLOCK_WRLOCK|RWLOCK_WRWANT may not be set. We increment
- *		with RWLOCK_READ_INCR. RWLOCK_WAIT while waiting.
- */
-static const struct rwlock_op {
-	unsigned long inc;
-	unsigned long check;
-	unsigned long wait_set;
-	long proc_mult;
-	int wait_prio;
-} rw_ops[] = {
-	{	/* RW_WRITE */
-		RWLOCK_WRLOCK,
-		ULONG_MAX,
-		RWLOCK_WAIT | RWLOCK_WRWANT,
-		1,
-		PLOCK - 4
-	},
-	{	/* RW_READ */
-		RWLOCK_READ_INCR,
-		RWLOCK_WRLOCK | RWLOCK_WRWANT,
-		RWLOCK_WAIT,
-		0,
-		PLOCK
-	},
-	{	/* Sparse Entry. */
-		0,
-	},
-	{	/* RW_DOWNGRADE */
-		RWLOCK_READ_INCR - RWLOCK_WRLOCK,
-		0,
-		0,
-		-1,
-		PLOCK
-	},
-};
+static int	rw_read(struct rwlock *, int);
+static int	rw_write(struct rwlock *, int);
+static int	rw_downgrade(struct rwlock *, int);
+
+static void	rw_exited(struct rwlock *);
+
+static unsigned long
+rw_self(void)
+{
+	unsigned long self = (unsigned long)curproc;
+
+	CLR(self, RWLOCK_MASK);
+	SET(self, RWLOCK_WRLOCK);
+
+	return (self);
+}

 void
 rw_enter_read(struct rwlock *rwl)
 {
-	unsigned long owner = rwl->rwl_owner;
-
-	if (__predict_false((owner & (RWLOCK_WRLOCK | RWLOCK_WRWANT)) ||
-	    rw_cas(&rwl->rwl_owner, owner, owner + RWLOCK_READ_INCR)))
-		rw_enter(rwl, RW_READ);
-	else {
-		membar_enter_after_atomic();
-		WITNESS_CHECKORDER(&rwl->rwl_lock_obj, LOP_NEWORDER, NULL);
-		WITNESS_LOCK(&rwl->rwl_lock_obj, 0);
-	}
+	rw_read(rwl, 0);
 }

 void
 rw_enter_write(struct rwlock *rwl)
 {
-	struct proc *p = curproc;
+	rw_write(rwl, 0);
+}
+
+static void
+rw_do_exit_read(struct rwlock *rwl, unsigned long owner)
+{
+	unsigned long decr;
+	unsigned long nowner;
+
+	for (;;) {
+		decr = owner - RWLOCK_READ_INCR;
+		nowner = rw_cas(&rwl->rwl_owner, owner, decr);
+		if (owner == nowner)
+			break;
+
+		if (__predict_false(ISSET(nowner, RWLOCK_WRLOCK))) {
+			panic("%s rwlock %p: exit read on write locked lock"
+			    " (owner 0x%lx)", rwl->rwl_name, rwl, nowner);
+		}
+		if (__predict_false(nowner == 0)) {
+			panic("%s rwlock %p: exit read on unlocked lock",
+			    rwl->rwl_name, rwl);
+		}
+
+		owner = nowner;
+	}
+
+	LLTRW(rwl, LLTRACE_LK_R_SHARED);
+
+	/* read lock didn't change anything, so no barrier needed? */
-	if (__predict_false(rw_cas(&rwl->rwl_owner, 0,
-	    RW_PROC(p) | RWLOCK_WRLOCK)))
-		rw_enter(rwl, RW_WRITE);
-	else {
-		membar_enter_after_atomic();
-		WITNESS_CHECKORDER(&rwl->rwl_lock_obj,
-		    LOP_EXCLUSIVE | LOP_NEWORDER, NULL);
-		WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);
+	if (decr == 0) {
+		/* last one out */
+		rw_exited(rwl);
 	}
 }

 void
 rw_exit_read(struct rwlock *rwl)
 {
-	unsigned long owner;
-
-	rw_assert_rdlock(rwl);
-	WITNESS_UNLOCK(&rwl->rwl_lock_obj, 0);
-
-	membar_exit_before_atomic();
-	owner = rwl->rwl_owner;
-	if (__predict_false((owner & RWLOCK_WAIT) ||
-	    rw_cas(&rwl->rwl_owner, owner, owner - RWLOCK_READ_INCR)))
-		rw_do_exit(rwl, 0);
+	/* maybe we're the last one? */
+	rw_do_exit_read(rwl, RWLOCK_READ_INCR);
 }

 void
 rw_exit_write(struct rwlock *rwl)
 {
-	unsigned long owner;
-
 	rw_assert_wrlock(rwl);
 	WITNESS_UNLOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE);

-	membar_exit_before_atomic();
-	owner = rwl->rwl_owner;
-	if (__predict_false((owner & RWLOCK_WAIT) ||
-	    rw_cas(&rwl->rwl_owner, owner, 0)))
-		rw_do_exit(rwl, RWLOCK_WRLOCK);
-}
-
-#ifdef DIAGNOSTIC
-/*
- * Put the diagnostic functions here to keep the main code free
- * from ifdef clutter.
- */
-static void
-rw_enter_diag(struct rwlock *rwl, int flags)
-{
-	switch (flags & RW_OPMASK) {
-	case RW_WRITE:
-	case RW_READ:
-		if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner))
-			panic("rw_enter: %s locking against myself",
-			    rwl->rwl_name);
-		break;
-	case RW_DOWNGRADE:
-		/*
-		 * If we're downgrading, we must hold the write lock.
-		 */
-		if ((rwl->rwl_owner & RWLOCK_WRLOCK) == 0)
-			panic("rw_enter: %s downgrade of non-write lock",
-			    rwl->rwl_name);
-		if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner))
-			panic("rw_enter: %s downgrade, not holder",
-			    rwl->rwl_name);
-		break;
+	membar_exit();
+	atomic_store_long(&rwl->rwl_owner, 0);
+	LLTRW(rwl, LLTRACE_LK_R_EXCL);

-	default:
-		panic("rw_enter: unknown op 0x%x", flags);
-	}
+	rw_exited(rwl);
 }
-#else
-#define rw_enter_diag(r, f)
-#endif
-
 static void
 _rw_init_flags_witness(struct rwlock *rwl, const char *name, int lo_flags,
     const struct lock_type *type)
 {
 	rwl->rwl_owner = 0;
+	rwl->rwl_waiters = 0;
+	rwl->rwl_readers = 0;
 	rwl->rwl_name = name;

 #ifdef WITNESS
@@ -223,90 +179,219 @@ _rw_init_flags(struct rwlock *rwl, const
 int
 rw_enter(struct rwlock *rwl, int flags)
 {
-	const struct rwlock_op *op;
-	unsigned long inc, o;
-#ifdef MULTIPROCESSOR
-	/*
-	 * If process holds the kernel lock, then we want to give up on CPU
-	 * as soon as possible so other processes waiting for the kernel lock
-	 * can progress. Hence no spinning if we hold the kernel lock.
-	 */
-	unsigned int spin = (_kernel_lock_held()) ? 0 : RW_SPINS;
-#endif
-	int error, prio;
-#ifdef WITNESS
-	int lop_flags;
+	int op = flags & RW_OPMASK;
+	int error;

-	lop_flags = LOP_NEWORDER;
-	if (flags & RW_WRITE)
-		lop_flags |= LOP_EXCLUSIVE;
-	if (flags & RW_DUPOK)
-		lop_flags |= LOP_DUPOK;
-	if ((flags & RW_NOSLEEP) == 0 && (flags & RW_DOWNGRADE) == 0)
-		WITNESS_CHECKORDER(&rwl->rwl_lock_obj, lop_flags, NULL);
-#endif
+	switch (op) {
+	case RW_WRITE:
+		error = rw_write(rwl, flags);
+		break;
+	case RW_READ:
+		error = rw_read(rwl, flags);
+		break;
+	case RW_DOWNGRADE:
+		error = rw_downgrade(rwl, flags);
+		break;
+	default:
+		panic("%s rwlock %p: %s unexpected op 0x%x",
+		    rwl->rwl_name, rwl, __func__, op);
+		/* NOTREACHED */
+	}

-	op = &rw_ops[(flags & RW_OPMASK) - 1];
+	return (error);
+}

-	inc = op->inc + RW_PROC(curproc) * op->proc_mult;
-retry:
-	while (__predict_false(((o = rwl->rwl_owner) & op->check) != 0)) {
-		unsigned long set = o | op->wait_set;
-		int do_sleep;
+static int
+rw_write(struct rwlock *rwl, int flags)
+{
+	unsigned long self = rw_self();
+	unsigned long owner;
+	int prio;
+	int error;

-		/* Avoid deadlocks after panic or in DDB */
-		if (panicstr || db_active)
-			return (0);
+#ifdef WITNESS
+	if (!ISSET(flags, RW_NOSLEEP)) {
+		int lop_flags = LOP_NEWORDER | LOP_EXCLUSIVE;
+		if (ISSET(flags, RW_DUPOK))
+			lop_flags |= LOP_DUPOK;
+		WITNESS_CHECKORDER(&rwl->rwl_lock_obj, lop_flags, NULL);
+	}
+#endif
+
+	owner = rw_cas(&rwl->rwl_owner, 0, self);
+	if (owner == 0) {
+		/* wow, we won. so easy */
+		LLTRW(rwl, LLTRACE_LK_I_EXCL);
+		goto locked;
+	}

 #ifdef MULTIPROCESSOR
+	{
+		int spins;
+
 		/*
 		 * It makes sense to try to spin just in case the lock
 		 * is acquired by writer.
 		 */
-		if ((o & RWLOCK_WRLOCK) && (spin != 0)) {
-			spin--;
+
+		for (spins = 0; spins < RW_SPINS; spins++) {
 			CPU_BUSY_CYCLE();
-			continue;
+			owner = atomic_load_long(&rwl->rwl_owner);
+			if (owner != 0)
+				continue;
+
+			owner = rw_cas(&rwl->rwl_owner, 0, self);
+			if (owner == 0) {
+				/* ok, we won now. */
+				LLTRW(rwl, LLTRACE_LK_I_EXCL);
+				goto locked;
+			}
 		}
+	}
 #endif

-		rw_enter_diag(rwl, flags);
-
-		if (flags & RW_NOSLEEP)
-			return (EBUSY);
+	if (ISSET(flags, RW_NOSLEEP))
+		return (EBUSY);

-		prio = op->wait_prio;
-		if (flags & RW_INTR)
-			prio |= PCATCH;
-		sleep_setup(rwl, prio, rwl->rwl_name);
+	prio = PLOCK - 4;
+	if (ISSET(flags, RW_INTR))
+		prio |= PCATCH;

-		do_sleep = !rw_cas(&rwl->rwl_owner, o, set);
-
-		error = sleep_finish(0, do_sleep);
-		if ((flags & RW_INTR) &&
-		    (error != 0))
+	LLTRW(rwl, LLTRACE_LK_A_START);
+	rw_inc(&rwl->rwl_waiters);
+	do {
+		sleep_setup(&rwl->rwl_owner, prio, rwl->rwl_name);
+		owner = atomic_load_long(&rwl->rwl_owner);
+		error = sleep_finish(0, owner != 0);
+		if (ISSET(flags, RW_INTR) && (error != 0)) {
+			rw_dec(&rwl->rwl_waiters);
+			LLTRW(rwl, LLTRACE_LK_A_ABORT);
 			return (error);
-		if (flags & RW_SLEEPFAIL)
+		}
+		if (ISSET(flags, RW_SLEEPFAIL)) {
+			rw_dec(&rwl->rwl_waiters);
+			rw_exited(rwl);
+			LLTRW(rwl, LLTRACE_LK_A_ABORT);
 			return (EAGAIN);
+		}
+
+		owner = rw_cas(&rwl->rwl_owner, 0, self);
+	} while (owner != 0);
+	rw_dec(&rwl->rwl_waiters);
+	LLTRW(rwl, LLTRACE_LK_A_EXCL);
+
+locked:
+	membar_enter_after_atomic();
+	WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags);
+
+	return (0);
+}
+
+static int
+rw_read_incr(struct rwlock *rwl, unsigned long owner)
+{
+	unsigned long incr;
+	unsigned long nowner;
+
+	do {
+		incr = owner + RWLOCK_READ_INCR;
+		nowner = rw_cas(&rwl->rwl_owner, owner, incr);
+		if (nowner == owner)
+			return (1);
+
+		owner = nowner;
+	} while (!ISSET(owner, RWLOCK_WRLOCK));
+
+	return (0);
+}
+
+static int
+rw_read(struct rwlock *rwl, int flags)
+{
+	unsigned long owner;
+	int error;
+	int prio;
+
+#ifdef WITNESS
+	if (!ISSET(flags, RW_NOSLEEP)) {
+		int lop_flags = LOP_NEWORDER;
+		if (ISSET(flags, RW_DUPOK))
+			lop_flags |= LOP_DUPOK;
+		WITNESS_CHECKORDER(&rwl->rwl_lock_obj, lop_flags, NULL);
+	}
+#endif
+
+	owner = rw_cas(&rwl->rwl_owner, 0, RWLOCK_READ_INCR);
+	if (owner == 0) {
+		/* ermagerd, we won! */
+		LLTRW(rwl, LLTRACE_LK_I_SHARED);
+		goto locked;
 	}

-	if (__predict_false(rw_cas(&rwl->rwl_owner, o, o + inc)))
-		goto retry;
+	if (ISSET(owner, RWLOCK_WRLOCK)) {
+		if (__predict_false(owner == rw_self())) {
+			panic("%s rwlock %p: enter read deadlock",
+			    rwl->rwl_name, rwl);
+		}
+	} else if (rw_read_incr(rwl, owner)) {
+		LLTRW(rwl, LLTRACE_LK_I_SHARED);
+		goto locked;
+	}
+
+	if (ISSET(flags, RW_NOSLEEP))
+		return (EBUSY);
+
+	prio = PLOCK;
+	if (ISSET(flags, RW_INTR))
+		prio |= PCATCH;
+
+	LLTRW(rwl, LLTRACE_LK_A_START);
+	rw_inc(&rwl->rwl_readers);
+	do {
+		sleep_setup(&rwl->rwl_readers, prio, rwl->rwl_name);
+		owner = atomic_load_long(&rwl->rwl_owner);
+		error = sleep_finish(0, ISSET(owner, RWLOCK_WRLOCK));
+		if (ISSET(flags, RW_INTR) && (error != 0))
+			goto fail;
+		if (ISSET(flags, RW_SLEEPFAIL)) {
+			error = EAGAIN;
+			goto fail;
+		}
+	} while (!rw_read_incr(rwl, 0));
+	rw_dec(&rwl->rwl_readers);
+	LLTRW(rwl, LLTRACE_LK_A_SHARED);
+
+locked:
 	membar_enter_after_atomic();
+	WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags);
+
+	return (0);
+fail:
+	rw_dec(&rwl->rwl_readers);
+	LLTRW(rwl, LLTRACE_LK_A_ABORT);
+	return (error);
+}
+
+static int
+rw_downgrade(struct rwlock *rwl, int flags)
+{
+	rw_assert_wrlock(rwl);

-	/*
-	 * If old lock had RWLOCK_WAIT and RWLOCK_WRLOCK set, it means we
-	 * downgraded a write lock and had possible read waiter, wake them
-	 * to let them retry the lock.
-	 */
-	if (__predict_false((o & (RWLOCK_WRLOCK|RWLOCK_WAIT)) ==
-	    (RWLOCK_WRLOCK|RWLOCK_WAIT)))
-		wakeup(rwl);
+	membar_exit();
+	atomic_store_long(&rwl->rwl_owner, RWLOCK_READ_INCR);
+	LLTRW(rwl, LLTRACE_LK_DOWNGRADE);

-	if (flags & RW_DOWNGRADE)
+#ifdef WITNESS
+	{
+		int lop_flags = LOP_NEWORDER;
+		if (ISSET(flags, RW_DUPOK))
+			lop_flags |= LOP_DUPOK;
 		WITNESS_DOWNGRADE(&rwl->rwl_lock_obj, lop_flags);
-	else
-		WITNESS_LOCK(&rwl->rwl_lock_obj, lop_flags);
+	}
+#endif
+
+	if (atomic_load_int(&rwl->rwl_readers) > 0)
+		wakeup(&rwl->rwl_readers);

 	return (0);
 }
@@ -314,53 +399,38 @@ retry:
 void
 rw_exit(struct rwlock *rwl)
 {
-	unsigned long wrlock;
+	unsigned long owner;

-	/* Avoid deadlocks after panic or in DDB */
-	if (panicstr || db_active)
-		return;
+	owner = atomic_load_long(&rwl->rwl_owner);
+	if (__predict_false(owner == 0)) {
+		panic("%s rwlock %p: exit on unlocked lock",
+		    rwl->rwl_name, rwl);
+	}

-	wrlock = rwl->rwl_owner & RWLOCK_WRLOCK;
-	if (wrlock)
-		rw_assert_wrlock(rwl);
+	if (ISSET(owner, RWLOCK_WRLOCK))
+		rw_exit_write(rwl);
 	else
-		rw_assert_rdlock(rwl);
-	WITNESS_UNLOCK(&rwl->rwl_lock_obj, wrlock ? LOP_EXCLUSIVE : 0);
-
-	membar_exit_before_atomic();
-	rw_do_exit(rwl, wrlock);
+		rw_do_exit_read(rwl, owner);
 }

 /* membar_exit_before_atomic() has to precede call of this function. */
-void
-rw_do_exit(struct rwlock *rwl, unsigned long wrlock)
+static void
+rw_exited(struct rwlock *rwl)
 {
-	unsigned long owner, set;
-
-	do {
-		owner = rwl->rwl_owner;
-		if (wrlock)
-			set = 0;
-		else
-			set = (owner - RWLOCK_READ_INCR) &
-			    ~(RWLOCK_WAIT|RWLOCK_WRWANT);
-		/*
-		 * Potential MP race here. If the owner had WRWANT set, we
-		 * cleared it and a reader can sneak in before a writer.
-		 */
-	} while (__predict_false(rw_cas(&rwl->rwl_owner, owner, set)));
-
-	if (owner & RWLOCK_WAIT)
-		wakeup(rwl);
+	if (atomic_load_int(&rwl->rwl_waiters) && wakeup_one(&rwl->rwl_owner))
+		return;
+	if (atomic_load_int(&rwl->rwl_readers))
+		wakeup(&rwl->rwl_readers);
 }

 int
 rw_status(struct rwlock *rwl)
 {
-	unsigned long owner = rwl->rwl_owner;
+	unsigned long owner;

-	if (owner & RWLOCK_WRLOCK) {
-		if (RW_PROC(curproc) == RW_PROC(owner))
+	owner = atomic_load_long(&rwl->rwl_owner);
+	if (ISSET(owner, RWLOCK_WRLOCK)) {
+		if (rw_self() == owner)
 			return RW_WRITE;
 		else
 			return RW_WRITE_OTHER;
@@ -380,11 +450,10 @@ rw_assert_wrlock(struct rwlock *rwl)
 #ifdef WITNESS
 	witness_assert(&rwl->rwl_lock_obj, LA_XLOCKED);
 #else
-	if (!(rwl->rwl_owner & RWLOCK_WRLOCK))
-		panic("%s: lock not held", rwl->rwl_name);
-
-	if (RW_PROC(curproc) != RW_PROC(rwl->rwl_owner))
-		panic("%s: lock not held by this process", rwl->rwl_name);
+	if (atomic_load_long(&rwl->rwl_owner) != rw_self()) {
+		panic("%s rwlock %p: lock not held by this process",
+		    rwl->rwl_name, rwl);
+	}
 #endif
 }

@@ -397,8 +466,8 @@ rw_assert_rdlock(struct rwlock *rwl)
 #ifdef WITNESS
 	witness_assert(&rwl->rwl_lock_obj, LA_SLOCKED);
 #else
-	if (!RW_PROC(rwl->rwl_owner) || (rwl->rwl_owner & RWLOCK_WRLOCK))
-		panic("%s: lock not shared", rwl->rwl_name);
+	if (rw_status(rwl) != RW_READ)
+		panic("%s rwlock %p: lock not shared", rwl->rwl_name, rwl);
 #endif
 }

@@ -413,9 +482,11 @@ rw_assert_anylock(struct rwlock *rwl)
 #else
 	switch (rw_status(rwl)) {
 	case RW_WRITE_OTHER:
-		panic("%s: lock held by different process", rwl->rwl_name);
+		panic("%s rwlock %p: lock held by different process "
+		    "(self %lx, owner %lx)", rwl->rwl_name, rwl,
+		    rw_self(), rwl->rwl_owner);
 	case 0:
-		panic("%s: lock not held", rwl->rwl_name);
+		panic("%s rwlock %p: lock not held", rwl->rwl_name, rwl);
 	}
 #endif
 }
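[Note, not part of the diff: the hunks above replace the old RWLOCK_WAIT/RWLOCK_WRWANT bits with two counters (rwl_waiters, rwl_readers) and a "writers first" wakeup in rw_exited(). The standalone sketch below is only an illustration of that policy in userland C; the sleepq struct and the *_model names are invented, and the one thing taken from the diff is the new contract that wakeup_n()/wakeup_one() report how many threads they actually woke.]

/*
 * Toy model, not kernel code: like rw_exited(), prefer to hand the lock
 * to a single waiting writer, and only wake the readers if no writer woke.
 */
#include <stdio.h>

struct sleepq {
	int sleepers;			/* threads blocked on this channel */
};

/* stand-in for the new wakeup_n(): wake at most n, report how many woke */
static int
wakeup_n_model(struct sleepq *sq, int n)
{
	int c = (sq->sleepers < n) ? sq->sleepers : n;

	sq->sleepers -= c;
	return (c);
}

#define wakeup_one_model(sq)	wakeup_n_model((sq), 1)

/* mirrors rw_exited(): one channel for writers, one for readers */
static void
rw_exited_model(struct sleepq *waiters, struct sleepq *readers)
{
	if (waiters->sleepers && wakeup_one_model(waiters))
		return;
	if (readers->sleepers)
		wakeup_n_model(readers, readers->sleepers);
}

int
main(void)
{
	struct sleepq waiters = { 1 };	/* one blocked writer */
	struct sleepq readers = { 3 };	/* three blocked readers */

	rw_exited_model(&waiters, &readers);
	printf("1st unlock: %d writer(s), %d reader(s) still asleep\n",
	    waiters.sleepers, readers.sleepers);

	rw_exited_model(&waiters, &readers);
	printf("2nd unlock: %d writer(s), %d reader(s) still asleep\n",
	    waiters.sleepers, readers.sleepers);

	return (0);
}

Checking the return value rather than only the waiter count appears to be the point of making wakeup_one() report success: if rwl_waiters was already bumped but the writer has not gone to sleep yet, the unlock still falls through and wakes the readers instead of waking nobody.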
@@ -429,8 +500,8 @@ rw_assert_unlocked(struct rwlock *rwl)
 #ifdef WITNESS
 	witness_assert(&rwl->rwl_lock_obj, LA_UNLOCKED);
 #else
-	if (RW_PROC(curproc) == RW_PROC(rwl->rwl_owner))
-		panic("%s: lock held", rwl->rwl_name);
+	if (atomic_load_long(&rwl->rwl_owner) == rw_self())
+		panic("%s rwlock %p: lock held", rwl->rwl_name, rwl);
 #endif
 }
 #endif
@@ -450,7 +521,7 @@ rrw_enter(struct rrwlock *rrwl, int flag
 {
 	int rv;

-	if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) {
+	if (atomic_load_long(&rrwl->rrwl_lock.rwl_owner) == rw_self()) {
 		if (flags & RW_RECURSEFAIL)
 			return (EDEADLK);
 		else {
@@ -472,7 +543,7 @@
 void
 rrw_exit(struct rrwlock *rrwl)
 {
-	if (RW_PROC(rrwl->rrwl_lock.rwl_owner) == RW_PROC(curproc)) {
+	if (atomic_load_long(&rrwl->rrwl_lock.rwl_owner) == rw_self()) {
 		KASSERT(rrwl->rrwl_wcnt > 0);
 		rrwl->rrwl_wcnt--;
 		if (rrwl->rrwl_wcnt != 0) {
Index: kern/kern_synch.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_synch.c,v
diff -u -p -r1.206 kern_synch.c
--- kern/kern_synch.c	23 Jul 2024 08:38:02 -0000	1.206
+++ kern/kern_synch.c	13 Oct 2024 11:38:44 -0000
@@ -37,6 +37,8 @@
  *	@(#)kern_synch.c	8.6 (Berkeley) 1/21/94
  */

+#include "llt.h"
+
 #include
 #include
 #include
@@ -521,25 +523,26 @@ unsleep(struct proc *p)
 		p->p_wmesg = NULL;
 		TRACEPOINT(sched, unsleep, p->p_tid + THREAD_PID_OFFSET,
 		    p->p_p->ps_pid);
+		LLTRACE(lltrace_runnable, p);
 	}
 }

 /*
  * Make a number of processes sleeping on the specified identifier runnable.
  */
-void
+int
 wakeup_n(const volatile void *ident, int n)
 {
 	struct slpque *qp, wakeq;
-	struct proc *p;
-	struct proc *pnext;
+	struct proc *p, *np;
+	int c = 0;

 	TAILQ_INIT(&wakeq);
-	SCHED_LOCK();
 	qp = &slpque[LOOKUP(ident)];
-	for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
-		pnext = TAILQ_NEXT(p, p_runq);
+
+	SCHED_LOCK();
+	TAILQ_FOREACH_SAFE(p, qp, p_runq, np) {
 #ifdef DIAGNOSTIC
 		if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
 			panic("thread %d p_stat is %d", p->p_tid, p->p_stat);
@@ -550,26 +553,31 @@ wakeup_n(const volatile void *ident, int
 			p->p_wchan = NULL;
 			p->p_wmesg = NULL;
 			TAILQ_INSERT_TAIL(&wakeq, p, p_runq);
-			--n;
+
+			if (++c >= n)
+				break;
 		}
 	}
 	while ((p = TAILQ_FIRST(&wakeq))) {
 		TAILQ_REMOVE(&wakeq, p, p_runq);
 		TRACEPOINT(sched, unsleep, p->p_tid + THREAD_PID_OFFSET,
 		    p->p_p->ps_pid);
+		LLTRACE(lltrace_runnable, p);
 		if (p->p_stat == SSLEEP)
 			setrunnable(p);
 	}
 	SCHED_UNLOCK();
+
+	return (c);
 }

 /*
  * Make all processes sleeping on the specified identifier runnable.
*/ -void +int wakeup(const volatile void *chan) { - wakeup_n(chan, -1); + return wakeup_n(chan, INT_MAX); } int Index: kern/vfs_subr.c =================================================================== RCS file: /cvs/src/sys/kern/vfs_subr.c,v diff -u -p -r1.322 vfs_subr.c --- kern/vfs_subr.c 13 Jul 2024 14:37:55 -0000 1.322 +++ kern/vfs_subr.c 13 Oct 2024 11:38:44 -0000 @@ -246,10 +246,7 @@ vfs_unbusy(struct mount *mp) int vfs_isbusy(struct mount *mp) { - if (RWLOCK_OWNER(&mp->mnt_lock) > 0) - return (1); - else - return (0); + return (rw_status(&mp->mnt_lock) != 0); } /* Index: sys/rwlock.h =================================================================== RCS file: /cvs/src/sys/sys/rwlock.h,v diff -u -p -r1.28 rwlock.h --- sys/rwlock.h 11 Jan 2021 18:49:38 -0000 1.28 +++ sys/rwlock.h 13 Oct 2024 11:38:44 -0000 @@ -60,6 +60,8 @@ struct proc; struct rwlock { volatile unsigned long rwl_owner; + volatile unsigned int rwl_waiters; + volatile unsigned int rwl_readers; const char *rwl_name; #ifdef WITNESS struct lock_object rwl_lock_obj; @@ -91,14 +93,12 @@ struct rwlock { #ifdef WITNESS #define RWLOCK_INITIALIZER(name) \ - { 0, name, .rwl_lock_obj = RWLOCK_LO_INITIALIZER(name, 0) } + { 0, 0, 0, name, .rwl_lock_obj = RWLOCK_LO_INITIALIZER(name, 0) } #else #define RWLOCK_INITIALIZER(name) \ - { 0, name } + { 0, 0, 0, name } #endif -#define RWLOCK_WAIT 0x01UL -#define RWLOCK_WRWANT 0x02UL #define RWLOCK_WRLOCK 0x04UL #define RWLOCK_MASK 0x07UL Index: sys/systm.h =================================================================== RCS file: /cvs/src/sys/sys/systm.h,v diff -u -p -r1.171 systm.h --- sys/systm.h 28 May 2024 12:50:23 -0000 1.171 +++ sys/systm.h 13 Oct 2024 11:38:44 -0000 @@ -269,8 +269,8 @@ void cond_signal(struct cond *); struct mutex; struct rwlock; -void wakeup_n(const volatile void *, int); -void wakeup(const volatile void *); +int wakeup_n(const volatile void *, int); +int wakeup(const volatile void *); #define wakeup_one(c) wakeup_n((c), 1) int tsleep(const volatile void *, int, const char *, int); int tsleep_nsec(const volatile void *, int, const char *, uint64_t); Index: uvm/uvm_map.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_map.c,v diff -u -p -r1.330 uvm_map.c --- uvm/uvm_map.c 24 Jul 2024 12:17:31 -0000 1.330 +++ uvm/uvm_map.c 13 Oct 2024 11:38:44 -0000 @@ -5201,68 +5201,77 @@ out: boolean_t vm_map_lock_try_ln(struct vm_map *map, char *file, int line) { - boolean_t rv; + int rv; if (map->flags & VM_MAP_INTRSAFE) { - rv = mtx_enter_try(&map->mtx); + if (!mtx_enter_try(&map->mtx)) + return (FALSE); } else { + struct proc *busy; + mtx_enter(&map->flags_lock); - if ((map->flags & VM_MAP_BUSY) && (map->busy != curproc)) { - mtx_leave(&map->flags_lock); + busy = map->busy; + mtx_leave(&map->flags_lock); + if (busy != NULL && busy != curproc) return (FALSE); - } + + rv = rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP); + if (rv != 0) + return (FALSE); + + /* to be sure, to be sure */ + mtx_enter(&map->flags_lock); + busy = map->busy; mtx_leave(&map->flags_lock); - rv = (rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP) == 0); - /* check if the lock is busy and back out if we won the race */ - if (rv) { - mtx_enter(&map->flags_lock); - if ((map->flags & VM_MAP_BUSY) && - (map->busy != curproc)) { - rw_exit(&map->lock); - rv = FALSE; - } - mtx_leave(&map->flags_lock); + if (busy != NULL && busy != curproc) { + rw_exit(&map->lock); + return (FALSE); } } - if (rv) { - map->timestamp++; - LPRINTF(("map lock: %p (at %s %d)\n", map, file, 
line)); - uvm_tree_sanity(map, file, line); - uvm_tree_size_chk(map, file, line); - } + map->timestamp++; + LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); + uvm_tree_sanity(map, file, line); + uvm_tree_size_chk(map, file, line); - return (rv); + return (TRUE); } void vm_map_lock_ln(struct vm_map *map, char *file, int line) { if ((map->flags & VM_MAP_INTRSAFE) == 0) { - do { - mtx_enter(&map->flags_lock); -tryagain: - while ((map->flags & VM_MAP_BUSY) && - (map->busy != curproc)) { - map->flags |= VM_MAP_WANTLOCK; - msleep_nsec(&map->flags, &map->flags_lock, + mtx_enter(&map->flags_lock); + for (;;) { + while (map->busy != NULL && map->busy != curproc) { + map->nbusy++; + msleep_nsec(&map->busy, &map->mtx, PVM, vmmapbsy, INFSLP); + map->nbusy--; } mtx_leave(&map->flags_lock); - } while (rw_enter(&map->lock, RW_WRITE|RW_SLEEPFAIL) != 0); - /* check if the lock is busy and back out if we won the race */ - mtx_enter(&map->flags_lock); - if ((map->flags & VM_MAP_BUSY) && (map->busy != curproc)) { - rw_exit(&map->lock); - goto tryagain; + + rw_enter_write(&map->lock); + + /* to be sure, to be sure */ + mtx_enter(&map->flags_lock); + if (map->busy != NULL && map->busy != curproc) { + /* go around again */ + rw_exit_write(&map->lock); + } else { + /* we won */ + break; + } } mtx_leave(&map->flags_lock); } else { mtx_enter(&map->mtx); } - if (map->busy != curproc) + if (map->busy != curproc) { + KASSERT(map->busy == NULL); map->timestamp++; + } LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); uvm_tree_sanity(map, file, line); uvm_tree_size_chk(map, file, line); @@ -5314,25 +5323,24 @@ vm_map_busy_ln(struct vm_map *map, char mtx_enter(&map->flags_lock); map->busy = curproc; - map->flags |= VM_MAP_BUSY; mtx_leave(&map->flags_lock); } void vm_map_unbusy_ln(struct vm_map *map, char *file, int line) { - int oflags; + unsigned int nbusy; KASSERT((map->flags & VM_MAP_INTRSAFE) == 0); KASSERT(map->busy == curproc); mtx_enter(&map->flags_lock); - oflags = map->flags; + nbusy = map->nbusy; map->busy = NULL; - map->flags &= ~(VM_MAP_BUSY|VM_MAP_WANTLOCK); mtx_leave(&map->flags_lock); - if (oflags & VM_MAP_WANTLOCK) - wakeup(&map->flags); + + if (nbusy > 0) + wakeup_n(&map->busy, nbusy); } void Index: uvm/uvm_map.h =================================================================== RCS file: /cvs/src/sys/uvm/uvm_map.h,v diff -u -p -r1.90 uvm_map.h --- uvm/uvm_map.h 18 Jun 2024 12:37:29 -0000 1.90 +++ uvm/uvm_map.h 13 Oct 2024 11:38:44 -0000 @@ -214,17 +214,6 @@ RBT_PROTOTYPE(uvm_map_addr, vm_map_entry * map is write-locked. may be tested * without asserting `flags_lock'. * - * VM_MAP_BUSY r/w; may only be set when map is - * write-locked, may only be cleared by - * thread which set it, map read-locked - * or write-locked. must be tested - * while `flags_lock' is asserted. - * - * VM_MAP_WANTLOCK r/w; may only be set when the map - * is busy, and thread is attempting - * to write-lock. must be tested - * while `flags_lock' is asserted. - * * VM_MAP_GUARDPAGES r/o; must be specified at map * initialization time. 
  *			If set, guards will appear between
@@ -257,6 +246,7 @@ RBT_PROTOTYPE(uvm_map_addr, vm_map_entry
  *	a	atomic operations
  *	I	immutable after creation or exec(2)
  *	v	`vm_map_lock' (this map `lock' or `mtx')
+ *	f	flags_lock
  */
 struct vm_map {
 	struct pmap		*pmap;		/* [I] Physical map */
@@ -266,9 +256,10 @@ struct vm_map {
 	vsize_t			size;		/* virtual size */
 	int			ref_count;	/* [a] Reference count */
-	int			flags;		/* flags */
+	int			flags;		/* [f] flags */
 	unsigned int		timestamp;	/* Version number */
-	struct proc		*busy;		/* [v] thread holding map busy*/
+	struct proc		*busy;		/* [f] thread holding map busy*/
+	unsigned int		nbusy;		/* [f] waiters for busy */

 	vaddr_t			min_offset;	/* [I] First address in map. */
 	vaddr_t			max_offset;	/* [I] Last address in map. */
@@ -323,8 +314,6 @@ struct vm_map {
 #define	VM_MAP_PAGEABLE		0x01	/* ro: entries are pageable */
 #define	VM_MAP_INTRSAFE		0x02	/* ro: interrupt safe map */
 #define	VM_MAP_WIREFUTURE	0x04	/* rw: wire future mappings */
-#define	VM_MAP_BUSY		0x08	/* rw: map is busy */
-#define	VM_MAP_WANTLOCK		0x10	/* rw: want to write-lock */
 #define	VM_MAP_GUARDPAGES	0x20	/* rw: add guard pgs to map */
 #define	VM_MAP_ISVMSPACE	0x40	/* ro: map is a vmspace */
 #define	VM_MAP_PINSYSCALL_ONCE	0x100	/* rw: pinsyscall done */
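[Note, not part of the diff: the uvm_map.c/uvm_map.h hunks replace the VM_MAP_BUSY/VM_MAP_WANTLOCK flags with the busy owner pointer plus an nbusy count of sleepers, so vm_map_unbusy_ln() only calls wakeup_n(&map->busy, nbusy) when somebody is actually recorded as waiting. The pthread sketch below is a rough userland analogue for illustration only; the toy_map names are invented and a condvar broadcast stands in for the kernel sleep channel.]

/*
 * Userland sketch of the map-busy handshake: sleepers count themselves in
 * nbusy before sleeping on the busy marker, and unbusy only issues a wakeup
 * when nbusy says someone is there.  Nothing here is uvm code.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct toy_map {
	pthread_mutex_t flags_lock;
	pthread_cond_t busy_cv;		/* stand-in for wakeup on &map->busy */
	int busy;			/* nonzero: some thread holds map busy */
	unsigned int nbusy;		/* threads sleeping on the busy marker */
};

static struct toy_map map = {
	.flags_lock = PTHREAD_MUTEX_INITIALIZER,
	.busy_cv = PTHREAD_COND_INITIALIZER,
};

/* like vm_map_busy_ln(): just mark the map busy under flags_lock */
static void
toy_map_busy(void)
{
	pthread_mutex_lock(&map.flags_lock);
	map.busy = 1;
	pthread_mutex_unlock(&map.flags_lock);
}

/* like vm_map_unbusy_ln(): clear busy, wake sleepers only if nbusy > 0 */
static void
toy_map_unbusy(void)
{
	unsigned int nbusy;

	pthread_mutex_lock(&map.flags_lock);
	nbusy = map.nbusy;
	map.busy = 0;
	pthread_mutex_unlock(&map.flags_lock);

	if (nbusy > 0)
		pthread_cond_broadcast(&map.busy_cv);
}

/* like the wait loop in vm_map_lock_ln(): record yourself in nbusy */
static void *
toy_map_lock(void *arg)
{
	pthread_mutex_lock(&map.flags_lock);
	while (map.busy) {
		map.nbusy++;
		pthread_cond_wait(&map.busy_cv, &map.flags_lock);
		map.nbusy--;
	}
	pthread_mutex_unlock(&map.flags_lock);

	/* the real code would now take map->lock and re-check map->busy */
	printf("locker: map is no longer busy\n");
	return (arg);
}

int
main(void)
{
	pthread_t t;

	toy_map_busy();
	pthread_create(&t, NULL, toy_map_lock, NULL);
	sleep(1);		/* let the locker block on the busy marker */
	toy_map_unbusy();
	pthread_join(t, NULL);

	return (0);
}

Keeping the count on the sleeping side is what lets vm_map_unbusy_ln() skip the wakeup entirely when nbusy is 0, which is the role the removed VM_MAP_WANTLOCK flag used to play.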