Index: lib/libkvm/kvm_file2.c =================================================================== RCS file: /cvs/src/lib/libkvm/kvm_file2.c,v retrieving revision 1.49 diff -u -p -r1.49 kvm_file2.c --- lib/libkvm/kvm_file2.c 4 May 2016 01:28:42 -0000 1.49 +++ lib/libkvm/kvm_file2.c 12 Jul 2016 03:40:00 -0000 @@ -496,6 +496,7 @@ fill_file(kvm_t *kd, struct kinfo_file * kf->f_iflags = fp->f_iflags; kf->f_type = fp->f_type; kf->f_count = fp->f_count; + kf->f_msgcount = fp->f_msgcount; kf->f_ucred = PTRTOINT64(fp->f_cred); kf->f_uid = f_cred.cr_uid; kf->f_gid = f_cred.cr_gid; @@ -666,7 +667,6 @@ fill_file(kvm_t *kd, struct kinfo_file * _kvm_err(kd, kd->program, "can't read unpcb"); return (-1); } - kf->f_msgcount = unpcb.unp_msgcount; kf->unp_conn = PTRTOINT64(unpcb.unp_conn); kf->unp_refs = PTRTOINT64( SLIST_FIRST(&unpcb.unp_refs)); Index: sys/kern/kern_sysctl.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sysctl.c,v retrieving revision 1.305 diff -u -p -r1.305 kern_sysctl.c --- sys/kern/kern_sysctl.c 27 May 2016 19:45:04 -0000 1.305 +++ sys/kern/kern_sysctl.c 12 Jul 2016 03:40:03 -0000 @@ -1020,6 +1020,7 @@ fill_file(struct kinfo_file *kf, struct kf->f_iflags = fp->f_iflags; kf->f_type = fp->f_type; kf->f_count = fp->f_count; + kf->f_msgcount = fp->f_msgcount; if (show_pointers) kf->f_ucred = PTRTOINT64(fp->f_cred); kf->f_uid = fp->f_cred->cr_uid; @@ -1152,7 +1153,6 @@ fill_file(struct kinfo_file *kf, struct case AF_UNIX: { struct unpcb *unpcb = so->so_pcb; - kf->f_msgcount = unpcb->unp_msgcount; if (show_pointers) { kf->unp_conn = PTRTOINT64(unpcb->unp_conn); kf->unp_refs = PTRTOINT64( Index: sys/kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v retrieving revision 1.97 diff -u -p -r1.97 uipc_usrreq.c --- sys/kern/uipc_usrreq.c 25 Apr 2016 20:18:31 -0000 1.97 +++ sys/kern/uipc_usrreq.c 12 Jul 2016 03:40:03 -0000 @@ -53,25 +53,6 @@ void uipc_setaddr(const struct unpcb *, struct mbuf *); -/* list of all UNIX domain sockets, for unp_gc() */ -LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(&unp_head); - -/* - * Stack of sets of files that were passed over a socket but were - * not received and need to be closed. - */ -struct unp_deferral { - SLIST_ENTRY(unp_deferral) ud_link; - int ud_n; - /* followed by ud_n struct file * pointers */ -}; - -/* list of sets of files that were sent over sockets that are now closed */ -SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(&unp_deferred); - -struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL); - - /* * Unix communications domain. * @@ -369,7 +350,6 @@ unp_attach(struct socket *so) unp->unp_socket = so; so->so_pcb = unp; getnanotime(&unp->unp_ctime); - LIST_INSERT_HEAD(&unp_head, unp, unp_link); return (0); } @@ -378,7 +358,6 @@ unp_detach(struct unpcb *unp) { struct vnode *vp; - LIST_REMOVE(unp, unp_link); if (unp->unp_vnode) { unp->unp_vnode->v_socket = NULL; vp = unp->unp_vnode; @@ -392,9 +371,19 @@ unp_detach(struct unpcb *unp) soisdisconnected(unp->unp_socket); unp->unp_socket->so_pcb = NULL; m_freem(unp->unp_addr); - free(unp, M_PCB, sizeof *unp); - if (unp_rights) - task_add(systq, &unp_gc_task); + if (unp_rights) { + /* + * Normally the receive buffer is flushed later, + * in sofree, but if our receive buffer holds references + * to descriptors that are now garbage, we will dispose + * of those descriptor references after the garbage collector + * gets them (resulting in a "panic: closef: count < 0"). + */ + sorflush(unp->unp_socket); + free(unp, M_PCB, 0); + unp_gc(); + } else + free(unp, M_PCB, 0); } int @@ -636,22 +625,6 @@ unp_drain(void) } #endif -extern struct domain unixdomain; - -static struct unpcb * -fptounp(struct file *fp) -{ - struct socket *so; - - if (fp->f_type != DTYPE_SOCKET) - return (NULL); - if ((so = fp->f_data) == NULL) - return (NULL); - if (so->so_proto->pr_domain != &unixdomain) - return (NULL); - return (sotounpcb(so)); -} - int unp_externalize(struct mbuf *rights, socklen_t controllen, int flags) { @@ -706,10 +679,8 @@ unp_externalize(struct mbuf *rights, soc restart: fdplock(p->p_fd); if (error != 0) { - if (nfds > 0) { - rp = ((struct file **)CMSG_DATA(cm)); - unp_discard(rp, nfds); - } + rp = ((struct file **)CMSG_DATA(cm)); + unp_discard(rp, nfds); goto out; } @@ -758,11 +729,8 @@ restart: */ rp = (struct file **)CMSG_DATA(cm); for (i = 0; i < nfds; i++) { - struct unpcb *unp; - fp = *rp++; - if ((unp = fptounp(fp)) != NULL) - unp->unp_msgcount--; + fp->f_msgcount--; unp_rights--; } @@ -786,7 +754,6 @@ unp_internalize(struct mbuf *control, st struct filedesc *fdp = p->p_fd; struct cmsghdr *cm = mtod(control, struct cmsghdr *); struct file **rp, *fp; - struct unpcb *unp; int i, error; int nfds, *ip, fd, neededspace; @@ -844,7 +811,8 @@ morespace: error = EBADF; goto fail; } - if (fp->f_count == LONG_MAX-2) { + if (fp->f_count == LONG_MAX-2 || + fp->f_msgcount == LONG_MAX-2) { error = EDEADLK; goto fail; } @@ -860,10 +828,7 @@ morespace: memcpy(rp, &fp, sizeof fp); rp--; fp->f_count++; - if ((unp = fptounp(fp)) != NULL) { - unp->unp_file = fp; - unp->unp_msgcount++; - } + fp->f_msgcount++; unp_rights++; } return (0); @@ -873,8 +838,7 @@ fail: rp++; memcpy(&fp, rp, sizeof(fp)); fp->f_count--; - if ((unp = fptounp(fp)) != NULL) - unp->unp_msgcount--; + fp->f_msgcount--; unp_rights--; } @@ -882,83 +846,43 @@ fail: } int unp_defer, unp_gcing; +extern struct domain unixdomain; void -unp_gc(void *arg __unused) +unp_gc(void) { - struct unp_deferral *defer; - struct file *fp; + struct file *fp, *nextfp; struct socket *so; - struct unpcb *unp; + struct file **extra_ref, **fpp; int nunref, i; if (unp_gcing) return; unp_gcing = 1; - - /* close any fds on the deferred list */ - while ((defer = SLIST_FIRST(&unp_deferred)) != NULL) { - SLIST_REMOVE_HEAD(&unp_deferred, ud_link); - for (i = 0; i < defer->ud_n; i++) { - memcpy(&fp, &((struct file **)(defer + 1))[i], - sizeof(fp)); - FREF(fp); - if ((unp = fptounp(fp)) != NULL) - unp->unp_msgcount--; - unp_rights--; - (void) closef(fp, NULL); - } - free(defer, M_TEMP, sizeof(*defer) + sizeof(fp) * defer->ud_n); - } - unp_defer = 0; - LIST_FOREACH(unp, &unp_head, unp_link) - unp->unp_flags &= ~(UNP_GCMARK | UNP_GCDEFER | UNP_GCDEAD); + LIST_FOREACH(fp, &filehead, f_list) + fp->f_iflags &= ~(FIF_MARK|FIF_DEFER); do { - nunref = 0; - LIST_FOREACH(unp, &unp_head, unp_link) { - if (unp->unp_flags & UNP_GCDEFER) { - /* - * This socket is referenced by another - * socket which is known to be live, - * so it's certainly live. - */ - unp->unp_flags &= ~UNP_GCDEFER; + LIST_FOREACH(fp, &filehead, f_list) { + if (fp->f_iflags & FIF_DEFER) { + fp->f_iflags &= ~FIF_DEFER; unp_defer--; - } else if (unp->unp_flags & UNP_GCMARK) { - /* marked as live in previous pass */ - continue; - } else if ((fp = unp->unp_file) == NULL) { - /* not being passed, so can't be in loop */ - } else if (fp->f_count == 0) { - /* - * Already being closed, let normal close - * path take its course - */ } else { - /* - * Unreferenced by other sockets so far, - * so if all the references (f_count) are - * from passing (unp_msgcount) then this - * socket is prospectively dead - */ - if (fp->f_count == unp->unp_msgcount) { - nunref++; - unp->unp_flags |= UNP_GCDEAD; + if (fp->f_count == 0) + continue; + if (fp->f_iflags & FIF_MARK) + continue; + if (fp->f_count == fp->f_msgcount) continue; - } } + fp->f_iflags |= FIF_MARK; - /* - * This is the first time we've seen this socket on - * the mark pass and known it has a live reference, - * so mark it, then scan its receive buffer for - * sockets and note them as deferred (== referenced, - * but not yet marked). - */ - unp->unp_flags |= UNP_GCMARK; - - so = unp->unp_socket; + if (fp->f_type != DTYPE_SOCKET || + (so = fp->f_data) == NULL) + continue; + if (so->so_proto->pr_domain != &unixdomain || + (so->so_proto->pr_flags&PR_RIGHTS) == 0) + continue; #ifdef notdef if (so->so_rcv.sb_flags & SB_LOCK) { /* @@ -978,18 +902,65 @@ unp_gc(void *arg __unused) unp_scan(so->so_rcv.sb_mb, unp_mark); } } while (unp_defer); - /* - * If there are any unreferenced sockets, then for each dispose - * of files in its receive buffer and then close it. + * We grab an extra reference to each of the file table entries + * that are not otherwise accessible and then free the rights + * that are stored in messages on them. + * + * The bug in the original code is a little tricky, so I'll describe + * what's wrong with it here. + * + * It is incorrect to simply unp_discard each entry for f_msgcount + * times -- consider the case of sockets A and B that contain + * references to each other. On a last close of some other socket, + * we trigger a gc since the number of outstanding rights (unp_rights) + * is non-zero. If during the sweep phase the gc code un_discards, + * we end up doing a (full) closef on the descriptor. A closef on A + * results in the following chain. Closef calls soo_close, which + * calls soclose. Soclose calls first (through the switch + * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply + * returns because the previous instance had set unp_gcing, and + * we return all the way back to soclose, which marks the socket + * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush + * to free up the rights that are queued in messages on the socket A, + * i.e., the reference on B. The sorflush calls via the dom_dispose + * switch unp_dispose, which unp_scans with unp_discard. This second + * instance of unp_discard just calls closef on B. + * + * Well, a similar chain occurs on B, resulting in a sorflush on B, + * which results in another closef on A. Unfortunately, A is already + * being closed, and the descriptor has already been marked with + * SS_NOFDREF, and soclose panics at this point. + * + * Here, we first take an extra reference to each inaccessible + * descriptor. Then, we call sorflush ourself, since we know + * it is a Unix domain socket anyhow. After we destroy all the + * rights carried in messages, we do a last closef to get rid + * of our extra reference. This is the last close, and the + * unp_detach etc will shut down the socket. + * + * 91/09/19, bsy@cs.cmu.edu */ - if (nunref) { - LIST_FOREACH(unp, &unp_head, unp_link) { - if (unp->unp_flags & UNP_GCDEAD) - unp_scan(unp->unp_socket->so_rcv.sb_mb, - unp_discard); + extra_ref = mallocarray(nfiles, sizeof(struct file *), M_FILE, M_WAITOK); + for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; + fp != NULL; fp = nextfp) { + nextfp = LIST_NEXT(fp, f_list); + if (fp->f_count == 0) + continue; + if (fp->f_count == fp->f_msgcount && + !(fp->f_iflags & FIF_MARK)) { + *fpp++ = fp; + nunref++; + FREF(fp); + fp->f_count++; } } + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) + if ((*fpp)->f_type == DTYPE_SOCKET && (*fpp)->f_data != NULL) + sorflush((*fpp)->f_data); + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) + (void) closef(*fpp, NULL); + free(extra_ref, M_FILE, 0); unp_gcing = 0; } @@ -1033,37 +1004,37 @@ unp_scan(struct mbuf *m0, void (*op)(str void unp_mark(struct file **rp, int nfds) { - struct unpcb *unp; int i; for (i = 0; i < nfds; i++) { if (rp[i] == NULL) continue; - unp = fptounp(rp[i]); - if (unp == NULL) + if (rp[i]->f_iflags & (FIF_MARK|FIF_DEFER)) continue; - if (unp->unp_flags & (UNP_GCMARK|UNP_GCDEFER)) - continue; - - unp_defer++; - unp->unp_flags |= UNP_GCDEFER; - unp->unp_flags &= ~UNP_GCDEAD; + if (rp[i]->f_type == DTYPE_SOCKET) { + unp_defer++; + rp[i]->f_iflags |= FIF_DEFER; + } else { + rp[i]->f_iflags |= FIF_MARK; + } } } void unp_discard(struct file **rp, int nfds) { - struct unp_deferral *defer; - - /* copy the file pointers to a deferral structure */ - defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK); - defer->ud_n = nfds; - memcpy(defer + 1, rp, sizeof(*rp) * nfds); - memset(rp, 0, sizeof(*rp) * nfds); - SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link); + struct file *fp; + int i; - task_add(systq, &unp_gc_task); + for (i = 0; i < nfds; i++) { + if ((fp = rp[i]) == NULL) + continue; + rp[i] = NULL; + FREF(fp); + fp->f_msgcount--; + unp_rights--; + (void) closef(fp, NULL); + } } Index: sys/sys/file.h =================================================================== RCS file: /cvs/src/sys/sys/file.h,v retrieving revision 1.37 diff -u -p -r1.37 file.h --- sys/sys/file.h 26 Apr 2016 09:13:05 -0000 1.37 +++ sys/sys/file.h 12 Jul 2016 03:40:03 -0000 @@ -70,6 +70,7 @@ struct file { #define DTYPE_KQUEUE 4 /* event queue */ short f_type; /* descriptor type */ long f_count; /* reference count */ + long f_msgcount; /* references from message queue */ struct ucred *f_cred; /* credentials associated with descriptor */ struct fileops *f_ops; off_t f_offset; @@ -84,6 +85,8 @@ struct file { #define FIF_HASLOCK 0x01 /* descriptor holds advisory lock */ #define FIF_LARVAL 0x02 /* not fully constructed, don't use */ +#define FIF_MARK 0x04 /* mark during gc() */ +#define FIF_DEFER 0x08 /* defer for next gc() pass */ #define FILE_IS_USABLE(fp) \ (((fp)->f_iflags & FIF_LARVAL) == 0) Index: sys/sys/unpcb.h =================================================================== RCS file: /cvs/src/sys/sys/unpcb.h,v retrieving revision 1.12 diff -u -p -r1.12 unpcb.h --- sys/sys/unpcb.h 28 Aug 2015 04:38:47 -0000 1.12 +++ sys/sys/unpcb.h 12 Jul 2016 03:40:03 -0000 @@ -61,27 +61,21 @@ struct unpcb { struct socket *unp_socket; /* pointer back to socket */ struct vnode *unp_vnode; /* if associated with file */ - struct file *unp_file; /* backpointer for unp_gc() */ - struct unpcb *unp_conn; /* control block of connected socket */ ino_t unp_ino; /* fake inode number */ + struct unpcb *unp_conn; /* control block of connected socket */ SLIST_HEAD(,unpcb) unp_refs; /* referencing socket linked list */ SLIST_ENTRY(unpcb) unp_nextref; /* link in unp_refs list */ struct mbuf *unp_addr; /* bound address of socket */ - long unp_msgcount; /* references from socket rcv buf */ int unp_flags; /* this unpcb contains peer eids */ struct sockpeercred unp_connid;/* id of peer process */ struct timespec unp_ctime; /* holds creation time */ - LIST_ENTRY(unpcb) unp_link; /* link in per-AF list of sockets */ }; /* * flag bits in unp_flags */ -#define UNP_FEIDS 0x01 /* unp_connid contains information */ -#define UNP_FEIDSBIND 0x02 /* unp_connid was set by a bind */ -#define UNP_GCMARK 0x04 /* mark during unp_gc() */ -#define UNP_GCDEFER 0x08 /* ref'd, but not marked in this pass */ -#define UNP_GCDEAD 0x10 /* unref'd in this pass */ +#define UNP_FEIDS 1 /* unp_connid contains information */ +#define UNP_FEIDSBIND 2 /* unp_connid was set by a bind */ #define sotounpcb(so) ((struct unpcb *)((so)->so_pcb)) @@ -94,7 +88,7 @@ void unp_detach(struct unpcb *); void unp_discard(struct file **, int); void unp_disconnect(struct unpcb *); void unp_drop(struct unpcb *, int); -void unp_gc(void *); +void unp_gc(void); void unp_mark(struct file **, int); void unp_scan(struct mbuf *, void (*)(struct file **, int)); void unp_shutdown(struct unpcb *); Index: usr.sbin/pstat/pstat.8 =================================================================== RCS file: /cvs/src/usr.sbin/pstat/pstat.8,v retrieving revision 1.49 diff -u -p -r1.49 pstat.8 --- usr.sbin/pstat/pstat.8 28 Aug 2015 04:38:47 -0000 1.49 +++ usr.sbin/pstat/pstat.8 12 Jul 2016 03:40:04 -0000 @@ -1,4 +1,4 @@ -.\" $OpenBSD: pstat.8,v 1.49 2015/08/28 04:38:47 guenther Exp $ +.\" $OpenBSD: pstat.8,v 1.48 2014/10/01 09:56:36 mpi Exp $ .\" $NetBSD: pstat.8,v 1.9.4.1 1996/06/02 09:08:17 mrg Exp $ .\" .\" Copyright (c) 1980, 1991, 1993, 1994 @@ -30,7 +30,7 @@ .\" .\" from: @(#)pstat.8 8.4 (Berkeley) 4/19/94 .\" -.Dd $Mdocdate: August 28 2015 $ +.Dd $Mdocdate: October 1 2014 $ .Dt PSTAT 8 .Os .Sh NAME @@ -94,8 +94,12 @@ open for appending exclusive or shared lock present .It I signal pgrp when data ready +.It d +deferred for next AF_UNIX garbage collection pass .It l file descriptor slot is larval +.It m +marked during AF_UNIX garbage collection pass .El .It CNT Number of processes that know this open file. Index: usr.sbin/pstat/pstat.c =================================================================== RCS file: /cvs/src/usr.sbin/pstat/pstat.c,v retrieving revision 1.103 diff -u -p -r1.103 pstat.c --- usr.sbin/pstat/pstat.c 25 Apr 2016 19:19:34 -0000 1.103 +++ usr.sbin/pstat/pstat.c 12 Jul 2016 03:40:04 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: pstat.c,v 1.103 2016/04/25 19:19:34 tedu Exp $ */ +/* $OpenBSD: pstat.c,v 1.99 2015/03/11 14:59:04 deraadt Exp $ */ /* $NetBSD: pstat.c,v 1.27 1996/10/23 22:50:06 cgd Exp $ */ /*- @@ -1038,6 +1038,10 @@ filemode(void) *fbp++ = 'L'; if (kf->f_iflags & FIF_LARVAL) *fbp++ = 'l'; + if (kf->f_iflags & FIF_MARK) + *fbp++ = 'm'; + if (kf->f_iflags & FIF_DEFER) + *fbp++ = 'd'; *fbp = '\0'; (void)printf("%6s %3ld", flagbuf, (long)kf->f_count);