Index: if_pfsync.c
===================================================================
RCS file: /cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.288
diff -u -p -r1.288 if_pfsync.c
--- if_pfsync.c	10 Mar 2021 10:21:48 -0000	1.288
+++ if_pfsync.c	1 Jun 2021 05:40:47 -0000
@@ -2508,22 +2508,23 @@ void
 pfsync_bulk_start(void)
 {
 	struct pfsync_softc *sc = pfsyncif;
+	struct pf_state *st;
 
 	DPFPRINTF(LOG_INFO, "received bulk update request");
 
-	if (TAILQ_EMPTY(&state_list))
+	if (TAILQ_EMPTY(&pf_state_list.pfs_list))
 		pfsync_bulk_status(PFSYNC_BUS_END);
 	else {
 		sc->sc_ureq_received = getuptime();
+		mtx_enter(&pf_state_list.pfs_mtx);
 		if (sc->sc_bulk_next == NULL) {
-			PF_STATE_ENTER_READ();
-			sc->sc_bulk_next = TAILQ_FIRST(&state_list);
-			pf_state_ref(sc->sc_bulk_next);
-			PF_STATE_EXIT_READ();
+			st = TAILQ_FIRST(&pf_state_list.pfs_list);
+			sc->sc_bulk_next = pf_state_ref(st);
		}
-		sc->sc_bulk_last = sc->sc_bulk_next;
-		pf_state_ref(sc->sc_bulk_last);
+		st = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+		sc->sc_bulk_last = pf_state_ref(st);
+		mtx_leave(&pf_state_list.pfs_mtx);
 
 		pfsync_bulk_status(PFSYNC_BUS_START);
 		timeout_add(&sc->sc_bulk_tmo, 0);
@@ -2534,15 +2535,19 @@ void
 pfsync_bulk_update(void *arg)
 {
 	struct pfsync_softc *sc;
-	struct pf_state *st, *st_next;
+	struct pf_state *st;
 	int i = 0;
 
 	NET_LOCK();
 	sc = pfsyncif;
 	if (sc == NULL)
-		goto out;
+		goto unlock;
+
+	rw_enter_read(&pf_state_list.pfs_rwl);
 	st = sc->sc_bulk_next;
-	sc->sc_bulk_next = NULL;
+
+	/* states can't go away while we hold the pf_state_list rwl */
+	pf_state_unref(st);
 
 	for (;;) {
 		if (st->sync_state == PFSYNC_S_NONE &&
@@ -2552,20 +2557,7 @@ pfsync_bulk_update(void *arg)
 			i++;
 		}
 
-		/*
-		 * I wonder how we prevent infinite bulk update. IMO it can
-		 * happen when sc_bulk_last state expires before we iterate
-		 * through the whole list.
-		 */
-		PF_STATE_ENTER_READ();
-		st_next = TAILQ_NEXT(st, entry_list);
-		pf_state_unref(st);
-		st = st_next;
-		if (st == NULL)
-			st = TAILQ_FIRST(&state_list);
-		pf_state_ref(st);
-		PF_STATE_EXIT_READ();
-
+		st = TAILQ_NEXT(st, entry_list);
 		if ((st == NULL) || (st == sc->sc_bulk_last)) {
 			/* we're done */
 			pf_state_unref(sc->sc_bulk_last);
@@ -2577,12 +2569,14 @@ pfsync_bulk_update(void *arg)
 		if (i > 1 && (sc->sc_if.if_mtu - sc->sc_len) <
 		    sizeof(struct pfsync_state)) {
 			/* we've filled a packet */
-			sc->sc_bulk_next = st;
+			sc->sc_bulk_next = pf_state_ref(st);
 			timeout_add(&sc->sc_bulk_tmo, 1);
 			break;
 		}
 	}
-
- out:
+
+	rw_exit_read(&pf_state_list.pfs_rwl);
+unlock:
 	NET_UNLOCK();
 }
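
A note on the idiom above: pfsync_bulk_start() and pfsync_bulk_update() lean on pf_state_ref() returning its argument and on both reference functions tolerating NULL, so taking a reference composes with assignment ("sc->sc_bulk_next = pf_state_ref(st)"). A minimal userland sketch of that helper shape follows; the names and the non-atomic counter are illustrative only, the kernel versions use atomic reference counts:

#include <stdlib.h>

struct obj {
	unsigned int	 refcnt;	/* kernel code would use atomics */
	void		*payload;
};

/* return the argument, so "next = obj_ref(st);" reads naturally */
static struct obj *
obj_ref(struct obj *o)
{
	if (o != NULL)
		o->refcnt++;
	return (o);
}

/* tolerate NULL, so callers can unref unconditionally */
static void
obj_unref(struct obj *o)
{
	if (o == NULL)
		return;
	if (--o->refcnt == 0)
		free(o);
}
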
Index: if_tun.c
===================================================================
RCS file: /cvs/src/sys/net/if_tun.c,v
retrieving revision 1.231
diff -u -p -r1.231 if_tun.c
--- if_tun.c	9 Mar 2021 20:05:14 -0000	1.231
+++ if_tun.c	1 Jun 2021 05:40:47 -0000
@@ -118,7 +118,6 @@ int	tun_ioctl(struct ifnet *, u_long, ca
 void	tun_input(struct ifnet *, struct mbuf *);
 int	tun_output(struct ifnet *, struct mbuf *, struct sockaddr *,
	    struct rtentry *);
-int	tun_enqueue(struct ifnet *, struct mbuf *);
 int	tun_clone_create(struct if_clone *, int);
 int	tap_clone_create(struct if_clone *, int);
 int	tun_create(struct if_clone *, int, int);
@@ -231,10 +230,10 @@ tun_create(struct if_clone *ifc, int uni
 
 	/* build the interface */
 	ifp->if_ioctl = tun_ioctl;
-	ifp->if_enqueue = tun_enqueue;
 	ifp->if_start = tun_start;
 	ifp->if_hardmtu = TUNMRU;
 	ifp->if_link_state = LINK_STATE_DOWN;
+	ifq_set_maxlen(&ifp->if_snd, 2048); /* XXX */
 
 	if_counters_alloc(ifp);
@@ -606,21 +605,6 @@ tun_output(struct ifnet *ifp, struct mbu
 	return (if_enqueue(ifp, m0));
 }
 
-int
-tun_enqueue(struct ifnet *ifp, struct mbuf *m0)
-{
-	struct tun_softc *sc = ifp->if_softc;
-	int error;
-
-	error = ifq_enqueue(&ifp->if_snd, m0);
-	if (error != 0)
-		return (error);
-
-	tun_wakeup(sc);
-
-	return (0);
-}
-
 void
 tun_wakeup(struct tun_softc *sc)
 {
@@ -783,6 +767,8 @@ tun_dev_read(dev_t dev, struct uio *uio,
 	if (error != 0)
 		goto put;
 
+	KERNEL_UNLOCK();
+
 #if NBPFILTER > 0
 	if (ifp->if_bpf)
 		bpf_mtap(ifp->if_bpf, m0, BPF_DIRECTION_OUT);
@@ -804,6 +790,8 @@ tun_dev_read(dev_t dev, struct uio *uio,
 
 	m_freem(m0);
 
+	KERNEL_LOCK();
+
 put:
 	tun_put(sc);
 	return (error);
@@ -815,13 +803,21 @@ put:
 int
 tunwrite(dev_t dev, struct uio *uio, int ioflag)
 {
-	return (tun_dev_write(dev, uio, ioflag, 0));
+	int rv;
+	KERNEL_UNLOCK();
+	rv = tun_dev_write(dev, uio, ioflag, 0);
+	KERNEL_LOCK();
+	return (rv);
 }
 
 int
 tapwrite(dev_t dev, struct uio *uio, int ioflag)
 {
-	return (tun_dev_write(dev, uio, ioflag, ETHER_ALIGN));
+	int rv;
+	KERNEL_UNLOCK();
+	rv = tun_dev_write(dev, uio, ioflag, ETHER_ALIGN);
+	KERNEL_LOCK();
+	return (rv);
 }
 
 int
@@ -1061,8 +1057,7 @@ tun_start(struct ifnet *ifp)
 
 	splassert(IPL_NET);
 
-	if (ifq_len(&ifp->if_snd))
-		tun_wakeup(sc);
+	tun_wakeup(sc);
 }
 
 void
Index: ifq.c
===================================================================
RCS file: /cvs/src/sys/net/ifq.c,v
retrieving revision 1.43
diff -u -p -r1.43 ifq.c
--- ifq.c	20 Feb 2021 04:37:26 -0000	1.43
+++ ifq.c	1 Jun 2021 05:40:47 -0000
@@ -500,6 +500,9 @@ ifq_hdatalen(struct ifqueue *ifq)
 	struct mbuf *m;
 	int len = 0;
 
+	if (ifq_empty(ifq))
+		return (0);
+
 	m = ifq_deq_begin(ifq);
 	if (m != NULL) {
 		len = m->m_pkthdr.len;
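
For context, ifq_hdatalen() reports the length of the packet at the head of a send queue (tun(4) uses it to answer reads without dequeueing), and the added ifq_empty() test short-circuits the common empty case so the dequeue-begin/rollback dance is skipped entirely. A rough userland model of that peek, with a pthread mutex standing in for the ifq mutex; queue_hdatalen() and its types are hypothetical:

#include <sys/queue.h>
#include <pthread.h>
#include <stddef.h>

struct pkt {
	TAILQ_ENTRY(pkt)	entry;
	size_t			len;
};

struct queue {
	TAILQ_HEAD(, pkt)	q;
	pthread_mutex_t		mtx;
	unsigned int		qlen;	/* maintained on enqueue/dequeue */
};

size_t
queue_hdatalen(struct queue *ifq)
{
	struct pkt *p;
	size_t len = 0;

	/*
	 * cheap unlocked emptiness test, like ifq_empty(); racy by
	 * design, a slightly stale answer is acceptable here.
	 */
	if (ifq->qlen == 0)
		return (0);

	/* "deq begin": take the lock and peek at the head packet */
	pthread_mutex_lock(&ifq->mtx);
	p = TAILQ_FIRST(&ifq->q);
	if (p != NULL)
		len = p->len;
	/* "rollback": drop the lock, leaving the packet queued */
	pthread_mutex_unlock(&ifq->mtx);

	return (len);
}
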
Index: pf.c
===================================================================
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.1117
diff -u -p -r1.1117 pf.c
--- pf.c	17 May 2021 23:01:26 -0000	1.1117
+++ pf.c	1 Jun 2021 05:40:47 -0000
@@ -308,7 +308,7 @@ static __inline void pf_set_protostate(s
 struct pf_src_tree tree_src_tracking;
 
 struct pf_state_tree_id tree_id;
-struct pf_state_queue state_list;
+struct pf_state_list pf_state_list = PF_STATE_LIST_INITIALIZER(pf_state_list);
 
 RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
 RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
@@ -440,6 +440,37 @@ pf_check_threshold(struct pf_threshold *
 	return (threshold->count > threshold->limit);
 }
 
+void
+pf_state_list_insert(struct pf_state_list *pfs, struct pf_state *st)
+{
+	pf_state_ref(st); /* get a ref for the list */
+
+	/*
+	 * we can always put states on the end of the list.
+	 *
+	 * things reading the list should take a read lock, then
+	 * the mutex, get the head and tail pointers, release the
+	 * mutex, and then they can iterate between the head and tail.
+	 */
+
+	mtx_enter(&pfs->pfs_mtx);
+	TAILQ_INSERT_TAIL(&pfs->pfs_list, st, entry_list);
+	mtx_leave(&pfs->pfs_mtx);
+}
+
+void
+pf_state_list_remove(struct pf_state_list *pfs, struct pf_state *st)
+{
+	/* states can only be removed when the write lock is held */
+	rw_assert_wrlock(&pfs->pfs_rwl);
+
+	mtx_enter(&pfs->pfs_mtx);
+	TAILQ_REMOVE(&pfs->pfs_list, st, entry_list);
+	mtx_leave(&pfs->pfs_mtx);
+
+	pf_state_unref(st); /* list no longer references the state */
+}
+
 int
 pf_src_connlimit(struct pf_state **state)
 {
@@ -986,7 +1017,7 @@ pf_state_insert(struct pfi_kif *kif, str
 		PF_STATE_EXIT_WRITE();
 		return (-1);
 	}
-	TAILQ_INSERT_TAIL(&state_list, s, entry_list);
+	pf_state_list_insert(&pf_state_list, s);
 	pf_status.fcounters[FCNT_STATE_INSERT]++;
 	pf_status.states++;
 	pfi_kif_ref(kif, PFI_KIF_REF_STATE);
@@ -1247,7 +1278,8 @@ pf_purge_expired_rules(void)
 void
 pf_purge_timeout(void *unused)
 {
-	task_add(net_tq(0), &pf_purge_task);
+	/* XXX move to systqmp to avoid KERNEL_LOCK */
+	task_add(systq, &pf_purge_task);
 }
 
 void
@@ -1255,7 +1287,8 @@ pf_purge(void *xnloops)
 {
 	int *nloops = xnloops;
 
-	KERNEL_LOCK();
+	timeout_add_sec(&pf_purge_to, 1);
+
 	NET_LOCK();
 
 	/*
@@ -1284,9 +1317,6 @@ pf_purge(void *xnloops)
 		*nloops = 0;
 	}
 	NET_UNLOCK();
-	KERNEL_UNLOCK();
-
-	timeout_add_sec(&pf_purge_to, 1);
 }
 
 int32_t
@@ -1447,7 +1477,7 @@ pf_free_state(struct pf_state *cur)
 	}
 	pf_normalize_tcp_cleanup(cur);
 	pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);
-	TAILQ_REMOVE(&state_list, cur, entry_list);
+	pf_state_list_remove(&pf_state_list, cur);
 	if (cur->tag)
 		pf_tag_unref(cur->tag);
 	pf_state_unref(cur);
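
The comment in pf_state_list_insert() is the heart of the change: inserts take only the mutex, removals need the rwlock exclusively (plus the mutex around the unlink), and readers take the rwlock shared and snapshot head/tail under the mutex. Here is a self-contained userland model of the same discipline, with pthreads standing in for the kernel mutex and rwlock; all names are illustrative:

#include <sys/queue.h>
#include <pthread.h>

struct state {
	TAILQ_ENTRY(state)	entry;
	int			value;
};
TAILQ_HEAD(state_queue, state);

struct state_list {
	struct state_queue	sl_list;
	pthread_mutex_t		sl_mtx;	/* serialises insert/unlink */
	pthread_rwlock_t	sl_rwl;	/* coordinates removal */
};

struct state_list sl = {
	.sl_list = TAILQ_HEAD_INITIALIZER(sl.sl_list),
	.sl_mtx = PTHREAD_MUTEX_INITIALIZER,
	.sl_rwl = PTHREAD_RWLOCK_INITIALIZER,
};

void
state_list_insert(struct state *st)
{
	/* appending to the tail only needs the mutex */
	pthread_mutex_lock(&sl.sl_mtx);
	TAILQ_INSERT_TAIL(&sl.sl_list, st, entry);
	pthread_mutex_unlock(&sl.sl_mtx);
}

void
state_list_iterate(void (*visit)(struct state *))
{
	struct state *head, *tail, *st;

	/* a shared hold on the rwlock keeps every element alive */
	pthread_rwlock_rdlock(&sl.sl_rwl);

	/* snapshot the span we are allowed to walk */
	pthread_mutex_lock(&sl.sl_mtx);
	head = TAILQ_FIRST(&sl.sl_list);
	tail = TAILQ_LAST(&sl.sl_list, state_queue);
	pthread_mutex_unlock(&sl.sl_mtx);

	/*
	 * nothing in [head, tail] can be unlinked while we hold the
	 * shared lock; entries appended after tail are simply not seen.
	 */
	for (st = head; st != NULL; st = TAILQ_NEXT(st, entry)) {
		visit(st);
		if (st == tail)
			break;
	}

	pthread_rwlock_unlock(&sl.sl_rwl);
}

void
state_list_remove(struct state *st)
{
	/* removal must exclude all readers... */
	pthread_rwlock_wrlock(&sl.sl_rwl);
	/* ...and still takes the mutex against concurrent inserts */
	pthread_mutex_lock(&sl.sl_mtx);
	TAILQ_REMOVE(&sl.sl_list, st, entry);
	pthread_mutex_unlock(&sl.sl_mtx);
	pthread_rwlock_unlock(&sl.sl_rwl);
}
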
@@ -1459,52 +1489,73 @@ void
 pf_purge_expired_states(u_int32_t maxcheck)
 {
 	static struct pf_state *cur = NULL;
-	struct pf_state *next;
-	SLIST_HEAD(pf_state_gcl, pf_state) gcl;
+	struct pf_state *end, *st;
+	SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
 
 	PF_ASSERT_UNLOCKED();
-	SLIST_INIT(&gcl);
 
-	PF_STATE_ENTER_READ();
-	while (maxcheck--) {
-		/* wrap to start of list when we hit the end */
-		if (cur == NULL) {
-			cur = pf_state_ref(TAILQ_FIRST(&state_list));
-			if (cur == NULL)
-				break;	/* list empty */
-		}
+	rw_enter_read(&pf_state_list.pfs_rwl);
+
+	/* items in the list can't go away while the read lock is held */
+	pf_state_unref(cur);
 
-		/* get next state, as cur may get deleted */
-		next = TAILQ_NEXT(cur, entry_list);
+	mtx_enter(&pf_state_list.pfs_mtx);
+	/* wrap to start of list when we hit the end */
+	if (cur == NULL)
+		cur = TAILQ_FIRST(&pf_state_list.pfs_list);
+
+	end = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	do {
+		if (cur == NULL)
+			break;	/* list empty */
 
-		if ((cur->timeout == PFTM_UNLINKED) ||
-		    (pf_state_expires(cur) <= getuptime()))
-			SLIST_INSERT_HEAD(&gcl, cur, gc_list);
-		else
-			pf_state_unref(cur);
+		if ((cur->timeout == PFTM_UNLINKED) ||
+		    (pf_state_expires(cur) <= getuptime())) {
+			st = pf_state_ref(cur);
+			SLIST_INSERT_HEAD(&gcl, st, gc_list);
+		}
 
-		cur = pf_state_ref(next);
+		if (cur == end) {
+			mtx_enter(&pf_state_list.pfs_mtx);
+			end = TAILQ_LAST(&pf_state_list.pfs_list,
+			    pf_state_queue);
+			mtx_leave(&pf_state_list.pfs_mtx);
+
+			if (cur == end) {
+				/* this really was the end of the list */
+				cur = NULL;
+				break;
+			}
+		}
 
-		if (cur == NULL)
-			break;
-	}
-	PF_STATE_EXIT_READ();
+		cur = TAILQ_NEXT(cur, entry_list);
+	} while (maxcheck--);
+
+	pf_state_ref(cur);
+	rw_exit_read(&pf_state_list.pfs_rwl);
 
+	if (SLIST_EMPTY(&gcl))
+		return;
+
+	rw_enter_write(&pf_state_list.pfs_rwl);
 	PF_LOCK();
 	PF_STATE_ENTER_WRITE();
-	while ((next = SLIST_FIRST(&gcl)) != NULL) {
-		SLIST_REMOVE_HEAD(&gcl, gc_list);
-		if (next->timeout == PFTM_UNLINKED)
-			pf_free_state(next);
-		else {
-			pf_remove_state(next);
-			pf_free_state(next);
-		}
+	SLIST_FOREACH(st, &gcl, gc_list) {
+		if (st->timeout != PFTM_UNLINKED)
+			pf_remove_state(st);
 
-		pf_state_unref(next);
+		pf_free_state(st);
 	}
 	PF_STATE_EXIT_WRITE();
 	PF_UNLOCK();
+	rw_exit_write(&pf_state_list.pfs_rwl);
+
+	while ((st = SLIST_FIRST(&gcl)) != NULL) {
+		SLIST_REMOVE_HEAD(&gcl, gc_list);
+		pf_state_unref(st);
+	}
 }
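
pf_purge_expired_states() now splits expiry into two phases: gather references to victims under the shared lock, unlink them all in one exclusive section, and only then drop the gathered references with no lock held. A compact userland model of that shape follows; the names are illustrative, and the list mutex is elided since the exclusive rwlock already excludes everyone in this toy. Note the reference count is bumped outside the SLIST_INSERT_HEAD() argument list: the queue macros may evaluate their element argument more than once.

#include <sys/queue.h>
#include <pthread.h>
#include <stdlib.h>

struct state {
	TAILQ_ENTRY(state)	entry;		/* global list linkage */
	SLIST_ENTRY(state)	gc;		/* private gc linkage */
	unsigned int		refcnt;		/* the list holds one ref */
	int			expired;
};

TAILQ_HEAD(state_queue, state);
SLIST_HEAD(state_gcl, state);

static struct state_queue states = TAILQ_HEAD_INITIALIZER(states);
static pthread_rwlock_t states_rwl = PTHREAD_RWLOCK_INITIALIZER;

static void
state_unref(struct state *st)
{
	if (st != NULL && --st->refcnt == 0)
		free(st);
}

void
purge_expired(void)
{
	struct state_gcl gcl = SLIST_HEAD_INITIALIZER(gcl);
	struct state *st;

	/* phase 1: collect refs to victims under the shared lock */
	pthread_rwlock_rdlock(&states_rwl);
	TAILQ_FOREACH(st, &states, entry) {
		if (st->expired) {
			st->refcnt++;	/* gc ref, taken outside the macro */
			SLIST_INSERT_HEAD(&gcl, st, gc);
		}
	}
	pthread_rwlock_unlock(&states_rwl);

	if (SLIST_EMPTY(&gcl))
		return;

	/* phase 2: one exclusive section unlinks every victim */
	pthread_rwlock_wrlock(&states_rwl);
	SLIST_FOREACH(st, &gcl, gc) {
		TAILQ_REMOVE(&states, st, entry);
		state_unref(st);	/* drop the list's reference */
	}
	pthread_rwlock_unlock(&states_rwl);

	/* phase 3: drop the gc references with no lock held */
	while ((st = SLIST_FIRST(&gcl)) != NULL) {
		SLIST_REMOVE_HEAD(&gcl, gc);
		state_unref(st);
	}
}
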
+ */ + mtx_enter(&pf_state_list.pfs_mtx); + head = TAILQ_FIRST(&pf_state_list.pfs_list); + tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); + mtx_leave(&pf_state_list.pfs_mtx); + + state = head; + while (state != tail) { + if (state->timeout != PFTM_UNLINKED) { + if ((nr+1) * sizeof(*p) > ps->ps_len) + break; + + pf_state_export(&pstore, state); + error = copyout(&pstore, p, sizeof(*p)); + if (error) + goto fail; + + p++; + nr++; + } + + state = TAILQ_NEXT(state, entry_list); + } + ps->ps_len = sizeof(struct pfsync_state) * nr; + +fail: + rw_exit(&pf_state_list.pfs_rwl); + + return (error); +} + +int pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) { int error = 0; @@ -1757,50 +1815,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a break; } - case DIOCGETSTATES: { - struct pfioc_states *ps = (struct pfioc_states *)addr; - struct pf_state *state; - struct pfsync_state *p, *pstore; - u_int32_t nr = 0; - - if (ps->ps_len == 0) { - nr = pf_status.states; - ps->ps_len = sizeof(struct pfsync_state) * nr; - break; - } - - pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK); - - p = ps->ps_states; - - NET_LOCK(); - PF_STATE_ENTER_READ(); - state = TAILQ_FIRST(&state_list); - while (state) { - if (state->timeout != PFTM_UNLINKED) { - if ((nr+1) * sizeof(*p) > ps->ps_len) - break; - pf_state_export(pstore, state); - error = copyout(pstore, p, sizeof(*p)); - if (error) { - free(pstore, M_TEMP, sizeof(*pstore)); - PF_STATE_EXIT_READ(); - NET_UNLOCK(); - goto fail; - } - p++; - nr++; - } - state = TAILQ_NEXT(state, entry_list); - } - PF_STATE_EXIT_READ(); - NET_UNLOCK(); - - ps->ps_len = sizeof(struct pfsync_state) * nr; - - free(pstore, M_TEMP, sizeof(*pstore)); + case DIOCGETSTATES: + error = pf_states_get((struct pfioc_states *)addr); break; - } case DIOCGETSTATUS: { struct pf_status *s = (struct pf_status *)addr; Index: pfvar.h =================================================================== RCS file: /cvs/src/sys/net/pfvar.h,v retrieving revision 1.500 diff -u -p -r1.500 pfvar.h --- pfvar.h 10 Mar 2021 10:21:48 -0000 1.500 +++ pfvar.h 1 Jun 2021 05:40:47 -0000 @@ -1682,7 +1682,7 @@ RB_HEAD(pf_state_tree_id, pf_state); RB_PROTOTYPE(pf_state_tree_id, pf_state, entry_id, pf_state_compare_id); extern struct pf_state_tree_id tree_id; -extern struct pf_state_queue state_list; +extern struct pf_state_list pf_state_list; TAILQ_HEAD(pf_queuehead, pf_queuespec); extern struct pf_queuehead pf_queues[2]; Index: pfvar_priv.h =================================================================== RCS file: /cvs/src/sys/net/pfvar_priv.h,v retrieving revision 1.6 diff -u -p -r1.6 pfvar_priv.h --- pfvar_priv.h 9 Feb 2021 14:06:19 -0000 1.6 +++ pfvar_priv.h 1 Jun 2021 05:40:47 -0000 @@ -38,8 +38,26 @@ #ifdef _KERNEL #include +#include extern struct rwlock pf_lock; + +struct pf_state_list { + /* the actual list of states in the system */ + struct pf_state_queue pfs_list; + + /* serialises insertion of states in the list */ + struct mutex pfs_mtx; + + /* coordinate access to the list for deletion */ + struct rwlock pfs_rwl; +}; + +#define PF_STATE_LIST_INITIALIZER(_pfs) { \ + .pfs_list = TAILQ_HEAD_INITIALIZER(_pfs.pfs_list), \ + .pfs_mtx = MUTEX_INITIALIZER(IPL_SOFTNET), \ + .pfs_rwl = RWLOCK_INITIALIZER("pfstates"), \ +} struct pf_pdesc { struct {