Index: net/if.c =================================================================== RCS file: /cvs/src/sys/net/if.c,v retrieving revision 1.700 diff -u -p -r1.700 if.c --- net/if.c 12 Jun 2023 21:19:54 -0000 1.700 +++ net/if.c 21 Jun 2023 01:46:13 -0000 @@ -1014,14 +1014,6 @@ if_netisr(void *unused) t |= n; } -#if NPFSYNC > 0 - if (t & (1 << NETISR_PFSYNC)) { - KERNEL_LOCK(); - pfsyncintr(); - KERNEL_UNLOCK(); - } -#endif - NET_UNLOCK(); } Index: net/if_pfsync.c =================================================================== RCS file: /cvs/src/sys/net/if_pfsync.c,v retrieving revision 1.317 diff -u -p -r1.317 if_pfsync.c --- net/if_pfsync.c 5 Jun 2023 08:45:20 -0000 1.317 +++ net/if_pfsync.c 21 Jun 2023 01:46:13 -0000 @@ -27,7 +27,7 @@ */ /* - * Copyright (c) 2009 David Gwynne + * Copyright (c) 2009, 2022, 2023 David Gwynne * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -42,6 +42,10 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include "bpfilter.h" +#include "pfsync.h" +#include "kstat.h" + #include #include #include @@ -54,6 +58,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include @@ -85,226 +95,267 @@ #include #endif -#define PF_DEBUGNAME "pfsync: " #include #include #include -#include "bpfilter.h" -#include "pfsync.h" - -#define PFSYNC_DEFER_NSEC 20000000ULL +#if 0 +#define DPRINTF(_fmt...) do { \ + printf("%s[%u]: ", __func__, __LINE__); \ + printf(_fmt); \ + printf("\n"); \ +} while (0) +#else +#define DPRINTF(_fmt, ...) 
/* nop */ +#endif #define PFSYNC_MINPKT ( \ sizeof(struct ip) + \ sizeof(struct pfsync_header)) -int pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *, - struct pfsync_state_peer *); +struct pfsync_softc; -int pfsync_in_clr(caddr_t, int, int, int); -int pfsync_in_iack(caddr_t, int, int, int); -int pfsync_in_upd_c(caddr_t, int, int, int); -int pfsync_in_ureq(caddr_t, int, int, int); -int pfsync_in_del(caddr_t, int, int, int); -int pfsync_in_del_c(caddr_t, int, int, int); -int pfsync_in_bus(caddr_t, int, int, int); -int pfsync_in_tdb(caddr_t, int, int, int); -int pfsync_in_ins(caddr_t, int, int, int); -int pfsync_in_upd(caddr_t, int, int, int); -int pfsync_in_eof(caddr_t, int, int, int); - -int pfsync_in_error(caddr_t, int, int, int); - -void pfsync_update_state_locked(struct pf_state *); - -const struct { - int (*in)(caddr_t, int, int, int); - size_t len; -} pfsync_acts[] = { - /* PFSYNC_ACT_CLR */ - { pfsync_in_clr, sizeof(struct pfsync_clr) }, - /* PFSYNC_ACT_OINS */ - { pfsync_in_error, 0 }, - /* PFSYNC_ACT_INS_ACK */ - { pfsync_in_iack, sizeof(struct pfsync_ins_ack) }, - /* PFSYNC_ACT_OUPD */ - { pfsync_in_error, 0 }, - /* PFSYNC_ACT_UPD_C */ - { pfsync_in_upd_c, sizeof(struct pfsync_upd_c) }, - /* PFSYNC_ACT_UPD_REQ */ - { pfsync_in_ureq, sizeof(struct pfsync_upd_req) }, - /* PFSYNC_ACT_DEL */ - { pfsync_in_del, sizeof(struct pfsync_state) }, - /* PFSYNC_ACT_DEL_C */ - { pfsync_in_del_c, sizeof(struct pfsync_del_c) }, - /* PFSYNC_ACT_INS_F */ - { pfsync_in_error, 0 }, - /* PFSYNC_ACT_DEL_F */ - { pfsync_in_error, 0 }, - /* PFSYNC_ACT_BUS */ - { pfsync_in_bus, sizeof(struct pfsync_bus) }, - /* PFSYNC_ACT_OTDB */ - { pfsync_in_error, 0 }, - /* PFSYNC_ACT_EOF */ - { pfsync_in_error, 0 }, - /* PFSYNC_ACT_INS */ - { pfsync_in_ins, sizeof(struct pfsync_state) }, - /* PFSYNC_ACT_UPD */ - { pfsync_in_upd, sizeof(struct pfsync_state) }, - /* PFSYNC_ACT_TDB */ - { pfsync_in_tdb, sizeof(struct pfsync_tdb) }, +struct pfsync_deferral { + 
TAILQ_ENTRY(pfsync_deferral) pd_entry; + struct pf_state *pd_st; + struct mbuf *pd_m; + uint64_t pd_deadline; }; +TAILQ_HEAD(pfsync_deferrals, pfsync_deferral); -struct pfsync_q { - void (*write)(struct pf_state *, void *); - size_t len; - u_int8_t action; +#define PFSYNC_DEFER_NSEC 20000000ULL +#define PFSYNC_DEFER_LIMIT 128 +#define PFSYNC_BULK_SND_IVAL_MS 20 + +static struct pool pfsync_deferrals_pool; + +enum pfsync_bulk_req_state { + PFSYNC_BREQ_S_NONE, + PFSYNC_BREQ_S_START, + PFSYNC_BREQ_S_SENT, + PFSYNC_BREQ_S_BULK, + PFSYNC_BREQ_S_DONE, }; -/* we have one of these for every PFSYNC_S_ */ -void pfsync_out_state(struct pf_state *, void *); -void pfsync_out_iack(struct pf_state *, void *); -void pfsync_out_upd_c(struct pf_state *, void *); -void pfsync_out_del(struct pf_state *, void *); - -struct pfsync_q pfsync_qs[] = { - { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK }, - { pfsync_out_upd_c, sizeof(struct pfsync_upd_c), PFSYNC_ACT_UPD_C }, - { pfsync_out_del, sizeof(struct pfsync_del_c), PFSYNC_ACT_DEL_C }, - { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_INS }, - { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_UPD } +static const char *pfsync_bulk_req_state_names[] = { + [PFSYNC_BREQ_S_NONE] = "none", + [PFSYNC_BREQ_S_START] = "start", + [PFSYNC_BREQ_S_SENT] = "sent", + [PFSYNC_BREQ_S_BULK] = "bulk", + [PFSYNC_BREQ_S_DONE] = "done", }; -void pfsync_q_ins(struct pf_state *, int); -void pfsync_q_del(struct pf_state *); - -struct pfsync_upd_req_item { - TAILQ_ENTRY(pfsync_upd_req_item) ur_entry; - TAILQ_ENTRY(pfsync_upd_req_item) ur_snap; - struct pfsync_upd_req ur_msg; +enum pfsync_bulk_req_event { + PFSYNC_BREQ_EVT_UP, + PFSYNC_BREQ_EVT_DOWN, + PFSYNC_BREQ_EVT_TMO, + PFSYNC_BREQ_EVT_LINK, + PFSYNC_BREQ_EVT_BUS_START, + PFSYNC_BREQ_EVT_BUS_END, }; -TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item); -struct pfsync_deferral { - TAILQ_ENTRY(pfsync_deferral) pd_entry; - struct pf_state *pd_st; - struct mbuf *pd_m; - 
uint64_t pd_deadline; +static const char *pfsync_bulk_req_event_names[] = { + [PFSYNC_BREQ_EVT_UP] = "up", + [PFSYNC_BREQ_EVT_DOWN] = "down", + [PFSYNC_BREQ_EVT_TMO] = "timeout", + [PFSYNC_BREQ_EVT_LINK] = "link", + [PFSYNC_BREQ_EVT_BUS_START] = "bus-start", + [PFSYNC_BREQ_EVT_BUS_END] = "bus-end", }; -TAILQ_HEAD(pfsync_deferrals, pfsync_deferral); -#define PFSYNC_PLSIZE MAX(sizeof(struct pfsync_upd_req_item), \ - sizeof(struct pfsync_deferral)) +struct pfsync_slice { + struct pfsync_softc *s_pfsync; + struct mutex s_mtx; + + struct pf_state_queue s_qs[PFSYNC_S_COUNT]; + TAILQ_HEAD(, tdb) s_tdb_q; + size_t s_len; + struct mbuf_list s_ml; + + struct taskq *s_softnet; + struct task s_task; + struct timeout s_tmo; + + struct mbuf_queue s_sendq; + struct task s_send; + + struct pfsync_deferrals s_deferrals; + unsigned int s_deferred; + struct task s_deferrals_task; + struct timeout s_deferrals_tmo; + + uint64_t s_stat_locks; + uint64_t s_stat_contended; + uint64_t s_stat_write_nop; + uint64_t s_stat_task_add; + uint64_t s_stat_task_run; + uint64_t s_stat_enqueue; + uint64_t s_stat_dequeue; + + uint64_t s_stat_defer_add; + uint64_t s_stat_defer_ack; + uint64_t s_stat_defer_run; + uint64_t s_stat_defer_overlimit; -void pfsync_out_tdb(struct tdb *, void *); + struct kstat *s_kstat; +} __aligned(CACHELINESIZE); + +#define PFSYNC_SLICE_BITS 1 +#define PFSYNC_NSLICES (1 << PFSYNC_SLICE_BITS) struct pfsync_softc { struct ifnet sc_if; - unsigned int sc_sync_ifidx; + unsigned int sc_dead; + unsigned int sc_up; + struct refcnt sc_refs; + + /* config */ + struct in_addr sc_syncpeer; + unsigned int sc_maxupdates; + unsigned int sc_defer; - struct pool sc_pool; - - struct ip_moptions sc_imo; + /* operation */ + unsigned int sc_sync_ifidx; + unsigned int sc_sync_if_down; + void *sc_inm; + struct task sc_ltask; + struct task sc_dtask; + struct ip sc_template; - struct in_addr sc_sync_peer; - u_int8_t sc_maxupdates; + struct pfsync_slice sc_slices[PFSYNC_NSLICES]; - struct ip 
sc_template; + struct { + struct rwlock req_lock; + struct timeout req_tmo; + enum pfsync_bulk_req_state req_state; + unsigned int req_tries; + unsigned int req_demoted; + } sc_bulk_req; - struct pf_state_queue sc_qs[PFSYNC_S_COUNT]; - struct mutex sc_st_mtx; - size_t sc_len; - - struct pfsync_upd_reqs sc_upd_req_list; - struct mutex sc_upd_req_mtx; - - int sc_initial_bulk; - int sc_link_demoted; - - int sc_defer; - struct pfsync_deferrals sc_deferrals; - u_int sc_deferred; - struct mutex sc_deferrals_mtx; - struct timeout sc_deferrals_tmo; - - void *sc_plus; - size_t sc_pluslen; - - u_int32_t sc_ureq_sent; - int sc_bulk_tries; - struct timeout sc_bulkfail_tmo; - - u_int32_t sc_ureq_received; - struct pf_state *sc_bulk_next; - struct pf_state *sc_bulk_last; - struct timeout sc_bulk_tmo; + struct { + struct rwlock snd_lock; + struct timeout snd_tmo; + time_t snd_requested; + + struct pf_state *snd_next; + struct pf_state *snd_tail; + unsigned int snd_again; + } sc_bulk_snd; +}; - TAILQ_HEAD(, tdb) sc_tdb_q; - struct mutex sc_tdb_mtx; +static struct pfsync_softc *pfsyncif = NULL; +static struct cpumem *pfsynccounters; - struct task sc_ltask; - struct task sc_dtask; +static inline void +pfsyncstat_inc(enum pfsync_counters c) +{ + counters_inc(pfsynccounters, c); +} - struct timeout sc_tmo; +static int pfsync_clone_create(struct if_clone *, int); +static int pfsync_clone_destroy(struct ifnet *); + +static int pfsync_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); +static void pfsync_start(struct ifqueue *); + +static int pfsync_ioctl(struct ifnet *, u_long, caddr_t); +static int pfsync_up(struct pfsync_softc *); +static int pfsync_down(struct pfsync_softc *); + +static int pfsync_set_mtu(struct pfsync_softc *, unsigned int); +static int pfsync_set_parent(struct pfsync_softc *, + const struct if_parent *); +static int pfsync_get_parent(struct pfsync_softc *, struct if_parent *); +static int pfsync_del_parent(struct pfsync_softc *); + 
+static int pfsync_get_ioc(struct pfsync_softc *, struct ifreq *); +static int pfsync_set_ioc(struct pfsync_softc *, struct ifreq *); + +static void pfsync_syncif_link(void *); +static void pfsync_syncif_detach(void *); + +static void pfsync_sendout(struct pfsync_softc *, struct mbuf *); +static void pfsync_slice_drop(struct pfsync_softc *, struct pfsync_slice *); + +static void pfsync_slice_tmo(void *); +static void pfsync_slice_task(void *); +static void pfsync_slice_sendq(void *); + +static void pfsync_deferrals_tmo(void *); +static void pfsync_deferrals_task(void *); +static void pfsync_defer_output(struct pfsync_deferral *); + +static void pfsync_bulk_req_evt(struct pfsync_softc *, + enum pfsync_bulk_req_event); +static void pfsync_bulk_req_tmo(void *); + +static void pfsync_bulk_snd_tmo(void *); + +#if NKSTAT > 0 +struct pfsync_kstat_data { + struct kstat_kv pd_locks; + struct kstat_kv pd_contended; + struct kstat_kv pd_write_nop; + struct kstat_kv pd_task_add; + struct kstat_kv pd_task_run; + struct kstat_kv pd_enqueue; + struct kstat_kv pd_dequeue; + struct kstat_kv pd_qdrop; + + struct kstat_kv pd_defer_len; + struct kstat_kv pd_defer_add; + struct kstat_kv pd_defer_ack; + struct kstat_kv pd_defer_run; + struct kstat_kv pd_defer_overlimit; }; -struct pfsync_snapshot { - struct pfsync_softc *sn_sc; - struct pf_state_queue sn_qs[PFSYNC_S_COUNT]; - struct pfsync_upd_reqs sn_upd_req_list; - TAILQ_HEAD(, tdb) sn_tdb_q; - size_t sn_len; - void *sn_plus; - size_t sn_pluslen; +static const struct pfsync_kstat_data pfsync_kstat_tpl = { + KSTAT_KV_INITIALIZER("locks", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("contended", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("write-nops", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("send-sched", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("send-run", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("enqueues", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("dequeues", KSTAT_KV_T_COUNTER64), + 
KSTAT_KV_UNIT_INITIALIZER("qdrops", + KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS), + + KSTAT_KV_UNIT_INITIALIZER("defer-len", + KSTAT_KV_T_COUNTER32, KSTAT_KV_U_PACKETS), + KSTAT_KV_INITIALIZER("defer-add", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("defer-ack", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("defer-run", KSTAT_KV_T_COUNTER64), + KSTAT_KV_INITIALIZER("defer-over", KSTAT_KV_T_COUNTER64), }; -struct pfsync_softc *pfsyncif = NULL; -struct cpumem *pfsynccounters; +static int +pfsync_kstat_copy(struct kstat *ks, void *dst) +{ + struct pfsync_slice *s = ks->ks_softc; + struct pfsync_kstat_data *pd = dst; + + *pd = pfsync_kstat_tpl; + kstat_kv_u64(&pd->pd_locks) = s->s_stat_locks; + kstat_kv_u64(&pd->pd_contended) = s->s_stat_contended; + kstat_kv_u64(&pd->pd_write_nop) = s->s_stat_write_nop; + kstat_kv_u64(&pd->pd_task_add) = s->s_stat_task_add; + kstat_kv_u64(&pd->pd_task_run) = s->s_stat_task_run; + kstat_kv_u64(&pd->pd_enqueue) = s->s_stat_enqueue; + kstat_kv_u64(&pd->pd_dequeue) = s->s_stat_dequeue; + kstat_kv_u32(&pd->pd_qdrop) = mq_drops(&s->s_sendq); + + kstat_kv_u32(&pd->pd_defer_len) = s->s_deferred; + kstat_kv_u64(&pd->pd_defer_add) = s->s_stat_defer_add; + kstat_kv_u64(&pd->pd_defer_ack) = s->s_stat_defer_ack; + kstat_kv_u64(&pd->pd_defer_run) = s->s_stat_defer_run; + kstat_kv_u64(&pd->pd_defer_overlimit) = s->s_stat_defer_overlimit; -void pfsyncattach(int); -int pfsync_clone_create(struct if_clone *, int); -int pfsync_clone_destroy(struct ifnet *); -void pfsync_update_net_tdb(struct pfsync_tdb *); -int pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *); -int pfsyncioctl(struct ifnet *, u_long, caddr_t); -void pfsyncstart(struct ifqueue *); -void pfsync_syncdev_state(void *); -void pfsync_ifdetach(void *); - -void pfsync_deferred(struct pf_state *, int); -void pfsync_undefer(struct pfsync_deferral *, int); -void pfsync_deferrals_tmo(void *); - -void pfsync_cancel_full_update(struct pfsync_softc *); -void 
pfsync_request_full_update(struct pfsync_softc *); -void pfsync_request_update(u_int32_t, u_int64_t); -void pfsync_update_state_req(struct pf_state *); - -void pfsync_drop(struct pfsync_softc *); -void pfsync_sendout(void); -void pfsync_send_plus(void *, size_t); -void pfsync_timeout(void *); -void pfsync_tdb_timeout(void *); - -void pfsync_bulk_start(void); -void pfsync_bulk_status(u_int8_t); -void pfsync_bulk_update(void *); -void pfsync_bulk_fail(void *); - -void pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *); -void pfsync_drop_snapshot(struct pfsync_snapshot *); - -void pfsync_send_dispatch(void *); -void pfsync_send_pkt(struct mbuf *); - -static struct mbuf_queue pfsync_mq; -static struct task pfsync_task = - TASK_INITIALIZER(pfsync_send_dispatch, &pfsync_mq); + return (0); +} +#endif /* NKSTAT > 0 */ #define PFSYNC_MAX_BULKTRIES 12 -int pfsync_sync_ok; struct if_clone pfsync_cloner = IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy); @@ -312,63 +363,98 @@ struct if_clone pfsync_cloner = void pfsyncattach(int npfsync) { - if_clone_attach(&pfsync_cloner); pfsynccounters = counters_alloc(pfsyncs_ncounters); - mq_init(&pfsync_mq, 4096, IPL_MPFLOOR); + if_clone_attach(&pfsync_cloner); } -int +static int pfsync_clone_create(struct if_clone *ifc, int unit) { struct pfsync_softc *sc; struct ifnet *ifp; - int q; + size_t i, q; if (unit != 0) - return (EINVAL); + return (ENXIO); - pfsync_sync_ok = 1; + if (pfsync_deferrals_pool.pr_size == 0) { + pool_init(&pfsync_deferrals_pool, + sizeof(struct pfsync_deferral), 0, + IPL_MPFLOOR, 0, "pfdefer", NULL); + /* pool_cache_init(&pfsync_deferrals_pool); */ + } - sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK|M_ZERO); - for (q = 0; q < PFSYNC_S_COUNT; q++) - TAILQ_INIT(&sc->sc_qs[q]); - mtx_init(&sc->sc_st_mtx, IPL_MPFLOOR); - - pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_MPFLOOR, 0, "pfsync", - NULL); - TAILQ_INIT(&sc->sc_upd_req_list); - mtx_init(&sc->sc_upd_req_mtx, 
IPL_MPFLOOR); - TAILQ_INIT(&sc->sc_deferrals); - mtx_init(&sc->sc_deferrals_mtx, IPL_MPFLOOR); - timeout_set_proc(&sc->sc_deferrals_tmo, pfsync_deferrals_tmo, sc); - task_set(&sc->sc_ltask, pfsync_syncdev_state, sc); - task_set(&sc->sc_dtask, pfsync_ifdetach, sc); - sc->sc_deferred = 0; + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL); + if (sc == NULL) + return (ENOMEM); - TAILQ_INIT(&sc->sc_tdb_q); - mtx_init(&sc->sc_tdb_mtx, IPL_MPFLOOR); + /* sc_refs is "owned" by IFF_RUNNING */ - sc->sc_len = PFSYNC_MINPKT; + sc->sc_syncpeer.s_addr = INADDR_PFSYNC_GROUP; sc->sc_maxupdates = 128; + sc->sc_defer = 0; - sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS, - sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO); - sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; + task_set(&sc->sc_ltask, pfsync_syncif_link, sc); + task_set(&sc->sc_dtask, pfsync_syncif_detach, sc); + + rw_init(&sc->sc_bulk_req.req_lock, "pfsyncbreq"); + /* XXX grumble grumble */ + timeout_set_proc(&sc->sc_bulk_req.req_tmo, pfsync_bulk_req_tmo, sc); + + rw_init(&sc->sc_bulk_snd.snd_lock, "pfsyncbsnd"); + /* XXX grumble grumble */ + timeout_set_proc(&sc->sc_bulk_snd.snd_tmo, pfsync_bulk_snd_tmo, sc); ifp = &sc->sc_if; - snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit); + snprintf(ifp->if_xname, sizeof ifp->if_xname, "%s%d", + ifc->ifc_name, unit); ifp->if_softc = sc; - ifp->if_ioctl = pfsyncioctl; - ifp->if_output = pfsyncoutput; - ifp->if_qstart = pfsyncstart; + ifp->if_ioctl = pfsync_ioctl; + ifp->if_output = pfsync_output; + ifp->if_qstart = pfsync_start; ifp->if_type = IFT_PFSYNC; ifp->if_hdrlen = sizeof(struct pfsync_header); ifp->if_mtu = ETHERMTU; ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE; - timeout_set_proc(&sc->sc_tmo, pfsync_timeout, NULL); - timeout_set_proc(&sc->sc_bulk_tmo, pfsync_bulk_update, NULL); - timeout_set_proc(&sc->sc_bulkfail_tmo, pfsync_bulk_fail, NULL); + for (i = 0; i < nitems(sc->sc_slices); i++) { + struct pfsync_slice *s = 
&sc->sc_slices[i]; + + s->s_pfsync = sc; + + mtx_init_flags(&s->s_mtx, IPL_SOFTNET, "pfslice", 0); + s->s_softnet = net_tq(i); + timeout_set(&s->s_tmo, pfsync_slice_tmo, s); + task_set(&s->s_task, pfsync_slice_task, s); + + mq_init(&s->s_sendq, 16, IPL_SOFTNET); + task_set(&s->s_send, pfsync_slice_sendq, s); + + s->s_len = PFSYNC_MINPKT; + ml_init(&s->s_ml); + + for (q = 0; q < nitems(s->s_qs); q++) + TAILQ_INIT(&s->s_qs[q]); + TAILQ_INIT(&s->s_tdb_q); + + /* stupid NET_LOCK */ + timeout_set(&s->s_deferrals_tmo, pfsync_deferrals_tmo, s); + task_set(&s->s_deferrals_task, pfsync_deferrals_task, s); + TAILQ_INIT(&s->s_deferrals); + +#if NKSTAT > 0 + s->s_kstat = kstat_create(ifp->if_xname, 0, "pfsync-slice", i, + KSTAT_T_KV, 0); + + kstat_set_mutex(s->s_kstat, &s->s_mtx); + s->s_kstat->ks_softc = s; + s->s_kstat->ks_datalen = sizeof(pfsync_kstat_tpl); + s->s_kstat->ks_copy = pfsync_kstat_copy; + kstat_install(s->s_kstat); +#endif + } + + if_counters_alloc(ifp); if_attach(ifp); if_alloc_sadl(ifp); @@ -380,2261 +466,2851 @@ pfsync_clone_create(struct if_clone *ifc bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN); #endif - pfsyncif = sc; - return (0); } -int +static int pfsync_clone_destroy(struct ifnet *ifp) { struct pfsync_softc *sc = ifp->if_softc; - struct ifnet *ifp0; - struct pfsync_deferral *pd; - struct pfsync_deferrals deferrals; +#if NKSTAT > 0 + size_t i; +#endif NET_LOCK(); + sc->sc_dead = 1; -#if NCARP > 0 - if (!pfsync_sync_ok) - carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy"); - if (sc->sc_link_demoted) - carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy"); -#endif - if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) { - if_linkstatehook_del(ifp0, &sc->sc_ltask); - if_detachhook_del(ifp0, &sc->sc_dtask); - } - if_put(ifp0); - - /* XXXSMP breaks atomicity */ + if (ISSET(ifp->if_flags, IFF_RUNNING)) + pfsync_down(sc); NET_UNLOCK(); - if_detach(ifp); - NET_LOCK(); - pfsync_drop(sc); + if_detach(ifp); - if (sc->sc_deferred > 0) { - 
TAILQ_INIT(&deferrals); - mtx_enter(&sc->sc_deferrals_mtx); - TAILQ_CONCAT(&deferrals, &sc->sc_deferrals, pd_entry); - sc->sc_deferred = 0; - mtx_leave(&sc->sc_deferrals_mtx); +#if NKSTAT > 0 + for (i = 0; i < nitems(sc->sc_slices); i++) { + struct pfsync_slice *s = &sc->sc_slices[i]; - while ((pd = TAILQ_FIRST(&deferrals)) != NULL) { - TAILQ_REMOVE(&deferrals, pd, pd_entry); - pfsync_undefer(pd, 0); - } + kstat_destroy(s->s_kstat); } +#endif - pfsyncif = NULL; - timeout_del(&sc->sc_bulkfail_tmo); - timeout_del(&sc->sc_bulk_tmo); - timeout_del(&sc->sc_tmo); - - NET_UNLOCK(); - - pool_destroy(&sc->sc_pool); - free(sc->sc_imo.imo_membership, M_IPMOPTS, - sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *)); free(sc, M_DEVBUF, sizeof(*sc)); return (0); } -/* - * Start output on the pfsync interface. - */ -void -pfsyncstart(struct ifqueue *ifq) +static void +pfsync_dprintf(struct pfsync_softc *sc, const char *fmt, ...) { - ifq_purge(ifq); + struct ifnet *ifp = &sc->sc_if; + va_list ap; + + if (!ISSET(ifp->if_flags, IFF_DEBUG)) + return; + + printf("%s: ", ifp->if_xname); + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + printf("\n"); } -void -pfsync_syncdev_state(void *arg) +static void +pfsync_syncif_link(void *arg) { struct pfsync_softc *sc = arg; - struct ifnet *ifp; + struct ifnet *ifp0; + unsigned int sync_if_down = 1; - if ((sc->sc_if.if_flags & IFF_UP) == 0) - return; - if ((ifp = if_get(sc->sc_sync_ifidx)) == NULL) - return; + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 != NULL && LINK_STATE_IS_UP(ifp0->if_link_state)) { + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_LINK); + sync_if_down = 0; + } + if_put(ifp0); - if (ifp->if_link_state == LINK_STATE_DOWN) { - sc->sc_if.if_flags &= ~IFF_RUNNING; - if (!sc->sc_link_demoted) { #if NCARP > 0 - carp_group_demote_adj(&sc->sc_if, 1, - "pfsync link state down"); -#endif - sc->sc_link_demoted = 1; - } - - /* drop everything */ - timeout_del(&sc->sc_tmo); - pfsync_drop(sc); - - pfsync_cancel_full_update(sc); 
- } else if (sc->sc_link_demoted) { - sc->sc_if.if_flags |= IFF_RUNNING; - - pfsync_request_full_update(sc); + if (sc->sc_sync_if_down != sync_if_down) { + carp_group_demote_adj(&sc->sc_if, + sync_if_down ? 1 : -1, "pfsync link"); } +#endif - if_put(ifp); + sc->sc_sync_if_down = sync_if_down; } -void -pfsync_ifdetach(void *arg) +static void +pfsync_syncif_detach(void *arg) { struct pfsync_softc *sc = arg; - struct ifnet *ifp; + struct ifnet *ifp = &sc->sc_if; - if ((ifp = if_get(sc->sc_sync_ifidx)) != NULL) { - if_linkstatehook_del(ifp, &sc->sc_ltask); - if_detachhook_del(ifp, &sc->sc_dtask); + if (ISSET(ifp->if_flags, IFF_RUNNING)) { + pfsync_down(sc); + if_down(ifp); } - if_put(ifp); sc->sc_sync_ifidx = 0; } -int -pfsync_input(struct mbuf **mp, int *offp, int proto, int af) +static int +pfsync_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct rtentry *rt) { - struct mbuf *n, *m = *mp; - struct pfsync_softc *sc = pfsyncif; - struct ip *ip = mtod(m, struct ip *); - struct pfsync_header *ph; - struct pfsync_subheader subh; - int offset, noff, len, count, mlen, flags = 0; - int e; + m_freem(m); /* drop packet */ + return (EAFNOSUPPORT); +} - NET_ASSERT_LOCKED(); +static int +pfsync_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct pfsync_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *)data; + int error = ENOTTY; - pfsyncstat_inc(pfsyncs_ipackets); + switch (cmd) { + case SIOCSIFADDR: + error = EOPNOTSUPP; + break; + + case SIOCSIFFLAGS: + if (ISSET(ifp->if_flags, IFF_UP)) { + if (!ISSET(ifp->if_flags, IFF_RUNNING)) + error = pfsync_up(sc); + else + error = ENETRESET; + } else { + if (ISSET(ifp->if_flags, IFF_RUNNING)) + error = pfsync_down(sc); + } + break; - /* verify that we have a sync interface configured */ - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) || - sc->sc_sync_ifidx == 0 || !pf_status.running) - goto done; + case SIOCSIFMTU: + error = pfsync_set_mtu(sc, ifr->ifr_mtu); + break; - /* 
verify that the packet came in on the right interface */ - if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) { - pfsyncstat_inc(pfsyncs_badif); - goto done; - } + case SIOCSIFPARENT: + error = pfsync_set_parent(sc, (struct if_parent *)data); + break; + case SIOCGIFPARENT: + error = pfsync_get_parent(sc, (struct if_parent *)data); + break; + case SIOCDIFPARENT: + error = pfsync_del_parent(sc); + break; - sc->sc_if.if_ipackets++; - sc->sc_if.if_ibytes += m->m_pkthdr.len; + case SIOCSETPFSYNC: + error = pfsync_set_ioc(sc, ifr); + break; + case SIOCGETPFSYNC: + error = pfsync_get_ioc(sc, ifr); + break; - /* verify that the IP TTL is 255. */ - if (ip->ip_ttl != PFSYNC_DFLTTL) { - pfsyncstat_inc(pfsyncs_badttl); - goto done; + default: + break; } - offset = ip->ip_hl << 2; - n = m_pulldown(m, offset, sizeof(*ph), &noff); - if (n == NULL) { - pfsyncstat_inc(pfsyncs_hdrops); - return IPPROTO_DONE; - } - ph = (struct pfsync_header *)(n->m_data + noff); + if (error == ENETRESET) + error = 0; - /* verify the version */ - if (ph->version != PFSYNC_VERSION) { - pfsyncstat_inc(pfsyncs_badver); - goto done; - } - len = ntohs(ph->len) + offset; - if (m->m_pkthdr.len < len) { - pfsyncstat_inc(pfsyncs_badlen); - goto done; - } + return (error); +} + +static int +pfsync_set_mtu(struct pfsync_softc *sc, unsigned int mtu) +{ + struct ifnet *ifp = &sc->sc_if; + struct ifnet *ifp0; + int error = 0; - if (!bcmp(&ph->pfcksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH)) - flags = PFSYNC_SI_CKSUM; + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 == NULL) + return (EINVAL); - offset += sizeof(*ph); - while (offset <= len - sizeof(subh)) { - m_copydata(m, offset, sizeof(subh), &subh); - offset += sizeof(subh); + if (mtu <= PFSYNC_MINPKT || mtu > ifp0->if_mtu) { + error = EINVAL; + goto put; + } - mlen = subh.len << 2; - count = ntohs(subh.count); + /* commit */ + ifp->if_mtu = mtu; - if (subh.action >= PFSYNC_ACT_MAX || - subh.action >= nitems(pfsync_acts) || - mlen < pfsync_acts[subh.action].len) 
{ - /* - * subheaders are always followed by at least one - * message, so if the peer is new - * enough to tell us how big its messages are then we - * know enough to skip them. - */ - if (count > 0 && mlen > 0) { - offset += count * mlen; - continue; - } - pfsyncstat_inc(pfsyncs_badact); - goto done; - } +put: + if_put(ifp0); + return (error); +} - n = m_pulldown(m, offset, mlen * count, &noff); - if (n == NULL) { - pfsyncstat_inc(pfsyncs_badlen); - return IPPROTO_DONE; - } +static int +pfsync_set_parent(struct pfsync_softc *sc, const struct if_parent *p) +{ + struct ifnet *ifp = &sc->sc_if; + struct ifnet *ifp0; + int error = 0; - e = pfsync_acts[subh.action].in(n->m_data + noff, mlen, count, - flags); - if (e != 0) - goto done; + ifp0 = if_unit(p->ifp_parent); + if (ifp0 == NULL) + return (ENXIO); - offset += mlen * count; + if (ifp0->if_index == sc->sc_sync_ifidx) + goto put; + + if (ISSET(ifp->if_flags, IFF_RUNNING)) { + error = EBUSY; + goto put; } -done: - m_freem(m); - return IPPROTO_DONE; + /* commit */ + sc->sc_sync_ifidx = ifp0->if_index; + +put: + if_put(ifp0); + return (error); } -int -pfsync_in_clr(caddr_t buf, int len, int count, int flags) +static int +pfsync_get_parent(struct pfsync_softc *sc, struct if_parent *p) { - struct pfsync_clr *clr; - struct pf_state *st, *nexts; - struct pfi_kif *kif; - u_int32_t creatorid; - int i; + struct ifnet *ifp0; + int error = 0; - PF_LOCK(); - for (i = 0; i < count; i++) { - clr = (struct pfsync_clr *)buf + len * i; - kif = NULL; - creatorid = clr->creatorid; - if (strlen(clr->ifname) && - (kif = pfi_kif_find(clr->ifname)) == NULL) - continue; + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 == NULL) + error = EADDRNOTAVAIL; + else + strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent)); + if_put(ifp0); - PF_STATE_ENTER_WRITE(); - RBT_FOREACH_SAFE(st, pf_state_tree_id, &tree_id, nexts) { - if (st->creatorid == creatorid && - ((kif && st->kif == kif) || !kif)) { - SET(st->state_flags, PFSTATE_NOSYNC); - 
pf_remove_state(st); - } - } - PF_STATE_EXIT_WRITE(); - } - PF_UNLOCK(); + return (error); +} + +static int +pfsync_del_parent(struct pfsync_softc *sc) +{ + struct ifnet *ifp = &sc->sc_if; + + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); + + /* commit */ + sc->sc_sync_ifidx = 0; return (0); } -int -pfsync_in_ins(caddr_t buf, int len, int count, int flags) +static int +pfsync_get_ioc(struct pfsync_softc *sc, struct ifreq *ifr) { - struct pfsync_state *sp; - sa_family_t af1, af2; - int i; - - PF_LOCK(); - for (i = 0; i < count; i++) { - sp = (struct pfsync_state *)(buf + len * i); - af1 = sp->key[0].af; - af2 = sp->key[1].af; + struct pfsyncreq pfsyncr; + struct ifnet *ifp0; - /* check for invalid values */ - if (sp->timeout >= PFTM_MAX || - sp->src.state > PF_TCPS_PROXY_DST || - sp->dst.state > PF_TCPS_PROXY_DST || - sp->direction > PF_OUT || - (((af1 || af2) && - ((af1 != AF_INET && af1 != AF_INET6) || - (af2 != AF_INET && af2 != AF_INET6))) || - (sp->af != AF_INET && sp->af != AF_INET6))) { - DPFPRINTF(LOG_NOTICE, - "pfsync_input: PFSYNC5_ACT_INS: invalid value"); - pfsyncstat_inc(pfsyncs_badval); - continue; - } + memset(&pfsyncr, 0, sizeof(pfsyncr)); - if (pf_state_import(sp, flags) == ENOMEM) { - /* drop out, but process the rest of the actions */ - break; - } + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 != NULL) { + strlcpy(pfsyncr.pfsyncr_syncdev, ifp0->if_xname, + sizeof(pfsyncr.pfsyncr_syncdev)); } - PF_UNLOCK(); + if_put(ifp0); - return (0); + pfsyncr.pfsyncr_syncpeer = sc->sc_syncpeer; + pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates; + pfsyncr.pfsyncr_defer = sc->sc_defer; + + return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr))); } -int -pfsync_in_iack(caddr_t buf, int len, int count, int flags) +static int +pfsync_set_ioc(struct pfsync_softc *sc, struct ifreq *ifr) { - struct pfsync_ins_ack *ia; - struct pf_state_cmp id_key; - struct pf_state *st; - int i; + struct ifnet *ifp = &sc->sc_if; + struct pfsyncreq pfsyncr; + unsigned int 
sync_ifidx = sc->sc_sync_ifidx; + int wantdown = 0; + int error; - for (i = 0; i < count; i++) { - ia = (struct pfsync_ins_ack *)(buf + len * i); + error = suser(curproc); + if (error != 0) + return (error); + + error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr)); + if (error != 0) + return (error); - id_key.id = ia->id; - id_key.creatorid = ia->creatorid; + if (pfsyncr.pfsyncr_maxupdates > 255) + return (EINVAL); - PF_STATE_ENTER_READ(); - st = pf_find_state_byid(&id_key); - pf_state_ref(st); - PF_STATE_EXIT_READ(); - if (st == NULL) - continue; + if (pfsyncr.pfsyncr_syncdev[0] != '\0') { /* set */ + struct ifnet *ifp0 = if_unit(pfsyncr.pfsyncr_syncdev); + if (ifp0 == NULL) + return (ENXIO); - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_deferred(st, 0); + if (ifp0->if_index != sync_ifidx) + wantdown = 1; - pf_state_unref(st); - } + sync_ifidx = ifp0->if_index; + if_put(ifp0); + } else { /* del */ + wantdown = 1; + sync_ifidx = 0; + } + + if (pfsyncr.pfsyncr_syncpeer.s_addr == INADDR_ANY) + pfsyncr.pfsyncr_syncpeer.s_addr = INADDR_PFSYNC_GROUP; + if (pfsyncr.pfsyncr_syncpeer.s_addr != sc->sc_syncpeer.s_addr) + wantdown = 1; + + if (wantdown && ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); + + /* commit */ + sc->sc_sync_ifidx = sync_ifidx; + sc->sc_syncpeer = pfsyncr.pfsyncr_syncpeer; + sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates; + sc->sc_defer = pfsyncr.pfsyncr_defer; return (0); } -int -pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src, - struct pfsync_state_peer *dst) +static int +pfsync_up(struct pfsync_softc *sc) { - int sync = 0; - + struct ifnet *ifp = &sc->sc_if; + struct ifnet *ifp0; + void *inm = NULL; + int error = 0; + struct ip *ip; + + NET_ASSERT_LOCKED(); + KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING)); + + if (sc->sc_dead) + return (ENXIO); + /* - * The state should never go backwards except - * for syn-proxy states. Neither should the - * sequence window slide backwards. + * coordinate with pfsync_down(). 
if sc_up is still up and + * we're here then something else is tearing pfsync down. */ - if ((st->src.state > src->state && - (st->src.state < PF_TCPS_PROXY_SRC || - src->state >= PF_TCPS_PROXY_SRC)) || + if (sc->sc_up) + return (EBUSY); - (st->src.state == src->state && - SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))) - sync++; - else - pf_state_peer_ntoh(src, &st->src); + if (sc->sc_syncpeer.s_addr == INADDR_ANY || + sc->sc_syncpeer.s_addr == INADDR_BROADCAST) + return (EDESTADDRREQ); + + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 == NULL) + return (ENXIO); + + if (IN_MULTICAST(sc->sc_syncpeer.s_addr)) { + if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) { + error = ENODEV; + goto put; + } + inm = in_addmulti(&sc->sc_syncpeer, ifp0); + if (inm == NULL) { + error = ECONNABORTED; + goto put; + } + } - if ((st->dst.state > dst->state) || + sc->sc_up = 1; + + ip = &sc->sc_template; + memset(ip, 0, sizeof(*ip)); + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + ip->ip_tos = IPTOS_LOWDELAY; + /* len and id are set later */ + ip->ip_off = htons(IP_DF); + ip->ip_ttl = PFSYNC_DFLTTL; + ip->ip_p = IPPROTO_PFSYNC; + ip->ip_src.s_addr = INADDR_ANY; + ip->ip_dst.s_addr = sc->sc_syncpeer.s_addr; - (st->dst.state >= TCPS_SYN_SENT && - SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))) - sync++; + /* commit */ + refcnt_init(&sc->sc_refs); /* IFF_RUNNING kind of owns this */ + +#if NCARP > 0 + sc->sc_sync_if_down = 1; + carp_group_demote_adj(&sc->sc_if, 1, "pfsync up"); +#endif + + if_linkstatehook_add(ifp0, &sc->sc_ltask); + if_detachhook_add(ifp0, &sc->sc_dtask); + + sc->sc_inm = inm; + SET(ifp->if_flags, IFF_RUNNING); + + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_UP); + + refcnt_take(&sc->sc_refs); /* give one to SMR */ + SMR_PTR_SET_LOCKED(&pfsyncif, sc); + + pfsync_syncif_link(sc); /* try and push the bulk req state forward */ + +put: + if_put(ifp0); + return (error); +} + +static struct mbuf * +pfsync_encap(struct pfsync_softc *sc, struct mbuf *m) +{ + struct { + struct ip ip; + 
struct pfsync_header ph; + } __packed __aligned(4) *h; + unsigned int mlen = m->m_pkthdr.len; + + m = m_prepend(m, sizeof(*h), M_DONTWAIT); + if (m == NULL) + return (NULL); + + h = mtod(m, void *); + memset(h, 0, sizeof(*h)); + + mlen += sizeof(h->ph); + h->ph.version = PFSYNC_VERSION; + h->ph.len = htons(mlen); + /* h->ph.pfcksum */ + + mlen += sizeof(h->ip); + h->ip = sc->sc_template; + h->ip.ip_len = htons(mlen); + h->ip.ip_id = htons(ip_randomid()); + + return (m); +} + +static void +pfsync_bulk_req_send(struct pfsync_softc *sc) +{ + struct { + struct pfsync_subheader subh; + struct pfsync_upd_req ur; + } __packed __aligned(4) *h; + unsigned mlen = max_linkhdr + + sizeof(struct ip) + sizeof(struct pfsync_header) + sizeof(*h); + struct mbuf *m; + + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + goto fail; + + if (mlen > MHLEN) { + MCLGETL(m, M_DONTWAIT, mlen); + if (!ISSET(m->m_flags, M_EXT)) + goto drop; + } + + m_align(m, sizeof(*h)); + m->m_len = m->m_pkthdr.len = sizeof(*h); + + h = mtod(m, void *); + memset(h, 0, sizeof(*h)); + + h->subh.action = PFSYNC_ACT_UPD_REQ; + h->subh.len = sizeof(h->ur) >> 2; + h->subh.count = htons(1); + + h->ur.id = htobe64(0); + h->ur.creatorid = htobe32(0); + + m = pfsync_encap(sc, m); + if (m == NULL) + goto fail; + + pfsync_sendout(sc, m); + return; + +drop: + m_freem(m); +fail: + printf("%s: unable to request bulk update\n", sc->sc_if.if_xname); +} + +static void +pfsync_bulk_req_nstate(struct pfsync_softc *sc, + enum pfsync_bulk_req_state nstate, int seconds) +{ + sc->sc_bulk_req.req_state = nstate; + if (seconds > 0) + timeout_add_sec(&sc->sc_bulk_req.req_tmo, seconds); else - pf_state_peer_ntoh(dst, &st->dst); + timeout_del(&sc->sc_bulk_req.req_tmo); +} - return (sync); +static void +pfsync_bulk_req_invstate(struct pfsync_softc *sc, + enum pfsync_bulk_req_event evt) +{ + panic("%s: unexpected event %s in state %s", sc->sc_if.if_xname, + pfsync_bulk_req_event_names[evt], + 
pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state]); } -int -pfsync_in_upd(caddr_t buf, int len, int count, int flags) +static void +pfsync_bulk_req_nstate_bulk(struct pfsync_softc *sc) { - struct pfsync_state *sp; - struct pf_state_cmp id_key; - struct pf_state *st; - int sync, error; - int i; + /* calculate the number of packets we expect */ + int t = pf_pool_limits[PF_LIMIT_STATES].limit / + ((sc->sc_if.if_mtu - PFSYNC_MINPKT) / + sizeof(struct pfsync_state)); - for (i = 0; i < count; i++) { - sp = (struct pfsync_state *)(buf + len * i); + /* turn it into seconds */ + t /= 1000 / PFSYNC_BULK_SND_IVAL_MS; - /* check for invalid values */ - if (sp->timeout >= PFTM_MAX || - sp->src.state > PF_TCPS_PROXY_DST || - sp->dst.state > PF_TCPS_PROXY_DST) { - DPFPRINTF(LOG_NOTICE, - "pfsync_input: PFSYNC_ACT_UPD: invalid value"); - pfsyncstat_inc(pfsyncs_badval); - continue; - } + if (t == 0) + t = 1; - id_key.id = sp->id; - id_key.creatorid = sp->creatorid; + pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_BULK, t * 4); +} - PF_STATE_ENTER_READ(); - st = pf_find_state_byid(&id_key); - pf_state_ref(st); - PF_STATE_EXIT_READ(); - if (st == NULL) { - /* insert the update */ - PF_LOCK(); - error = pf_state_import(sp, flags); - if (error) - pfsyncstat_inc(pfsyncs_badstate); - PF_UNLOCK(); - continue; - } +static inline void +pfsync_bulk_req_nstate_done(struct pfsync_softc *sc) +{ + pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_DONE, 0); - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_deferred(st, 1); + KASSERT(sc->sc_bulk_req.req_demoted == 1); + sc->sc_bulk_req.req_demoted = 0; - if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) - sync = pfsync_upd_tcp(st, &sp->src, &sp->dst); - else { - sync = 0; +#if NCARP > 0 + carp_group_demote_adj(&sc->sc_if, -32, "pfsync done"); +#endif +} - /* - * Non-TCP protocol state machine always go - * forwards - */ - if (st->src.state > sp->src.state) - sync++; - else - pf_state_peer_ntoh(&sp->src, &st->src); +static void +pfsync_bulk_req_evt(struct 
pfsync_softc *sc, enum pfsync_bulk_req_event evt) +{ + struct ifnet *ifp = &sc->sc_if; - if (st->dst.state > sp->dst.state) - sync++; - else - pf_state_peer_ntoh(&sp->dst, &st->dst); + rw_enter_write(&sc->sc_bulk_req.req_lock); + pfsync_dprintf(sc, "%s state %s evt %s", __func__, + pfsync_bulk_req_state_names[sc->sc_bulk_req.req_state], + pfsync_bulk_req_event_names[evt]); + + if (evt == PFSYNC_BREQ_EVT_DOWN) { + /* unconditionally move down */ + sc->sc_bulk_req.req_tries = 0; + pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_NONE, 0); + + if (sc->sc_bulk_req.req_demoted) { + sc->sc_bulk_req.req_demoted = 0; +#if NCARP > 0 + carp_group_demote_adj(&sc->sc_if, -32, + "pfsync down"); +#endif } + } else switch (sc->sc_bulk_req.req_state) { + case PFSYNC_BREQ_S_NONE: + switch (evt) { + case PFSYNC_BREQ_EVT_UP: + KASSERT(sc->sc_bulk_req.req_demoted == 0); + sc->sc_bulk_req.req_demoted = 1; +#if NCARP > 0 + carp_group_demote_adj(&sc->sc_if, 32, + "pfsync start"); +#endif + pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_START, 30); + break; + default: + pfsync_bulk_req_invstate(sc, evt); + } + + break; + + case PFSYNC_BREQ_S_START: + switch (evt) { + case PFSYNC_BREQ_EVT_LINK: + pfsync_bulk_req_send(sc); + pfsync_bulk_req_nstate(sc, PFSYNC_BREQ_S_SENT, 2); + break; + case PFSYNC_BREQ_EVT_TMO: + pfsync_dprintf(sc, "timeout waiting for link"); + pfsync_bulk_req_nstate_done(sc); + break; + case PFSYNC_BREQ_EVT_BUS_START: + pfsync_bulk_req_nstate_bulk(sc); + break; + case PFSYNC_BREQ_EVT_BUS_END: + /* ignore this */ + break; + default: + pfsync_bulk_req_invstate(sc, evt); + } + break; - if (sync < 2) { - pf_state_alloc_scrub_memory(&sp->dst, &st->dst); - pf_state_peer_ntoh(&sp->dst, &st->dst); - st->expire = getuptime(); - st->timeout = sp->timeout; + case PFSYNC_BREQ_S_SENT: + switch (evt) { + case PFSYNC_BREQ_EVT_BUS_START: + pfsync_bulk_req_nstate_bulk(sc); + break; + case PFSYNC_BREQ_EVT_BUS_END: + case PFSYNC_BREQ_EVT_LINK: + /* ignore this */ + break; + case PFSYNC_BREQ_EVT_TMO: + if 
(++sc->sc_bulk_req.req_tries < + PFSYNC_MAX_BULKTRIES) { + pfsync_bulk_req_send(sc); + pfsync_bulk_req_nstate(sc, + PFSYNC_BREQ_S_SENT, 2); + break; + } + + pfsync_dprintf(sc, + "timeout waiting for bulk transfer start"); + pfsync_bulk_req_nstate_done(sc); + break; + default: + pfsync_bulk_req_invstate(sc, evt); } - st->pfsync_time = getuptime(); + break; - if (sync) { - pfsyncstat_inc(pfsyncs_stale); + case PFSYNC_BREQ_S_BULK: + switch (evt) { + case PFSYNC_BREQ_EVT_BUS_START: + case PFSYNC_BREQ_EVT_LINK: + /* ignore this */ + break; + case PFSYNC_BREQ_EVT_BUS_END: + pfsync_bulk_req_nstate_done(sc); + break; + case PFSYNC_BREQ_EVT_TMO: + if (++sc->sc_bulk_req.req_tries < + PFSYNC_MAX_BULKTRIES) { + pfsync_bulk_req_send(sc); + pfsync_bulk_req_nstate(sc, + PFSYNC_BREQ_S_SENT, 2); + } - pfsync_update_state(st); - schednetisr(NETISR_PFSYNC); + pfsync_dprintf(sc, + "timeout waiting for bulk transfer end"); + pfsync_bulk_req_nstate_done(sc); + break; + default: + pfsync_bulk_req_invstate(sc, evt); } + break; - pf_state_unref(st); + case PFSYNC_BREQ_S_DONE: /* pfsync is up and running */ + switch (evt) { + case PFSYNC_BREQ_EVT_BUS_START: + case PFSYNC_BREQ_EVT_BUS_END: + case PFSYNC_BREQ_EVT_LINK: + /* nops */ + break; + default: + pfsync_bulk_req_invstate(sc, evt); + } + break; + + default: + panic("%s: unknown event %d", ifp->if_xname, evt); + /* NOTREACHED */ } + rw_exit_write(&sc->sc_bulk_req.req_lock); +} - return (0); +static void +pfsync_bulk_req_tmo(void *arg) +{ + struct pfsync_softc *sc = arg; + + NET_LOCK(); + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_TMO); + NET_UNLOCK(); } -int -pfsync_in_upd_c(caddr_t buf, int len, int count, int flags) +static int +pfsync_down(struct pfsync_softc *sc) { - struct pfsync_upd_c *up; - struct pf_state_cmp id_key; - struct pf_state *st; + struct ifnet *ifp = &sc->sc_if; + struct ifnet *ifp0; + struct smr_entry smr; + size_t i; + void *inm = NULL; + unsigned int sndbar = 0; + struct pfsync_deferrals pds = 
TAILQ_HEAD_INITIALIZER(pds); + struct pfsync_deferral *pd; - int sync; + NET_ASSERT_LOCKED(); + KASSERT(ISSET(ifp->if_flags, IFF_RUNNING)); - int i; + /* + * tearing down pfsync involves waiting for pfsync to stop + * running in various contexts including softnet taskqs. + * this thread cannot hold netlock while waiting for a + * barrier in softnet because softnet might be waiting for + * the netlock. sc->sc_up is used to coordinate with + * pfsync_up. + */ - for (i = 0; i < count; i++) { - up = (struct pfsync_upd_c *)(buf + len * i); + CLR(ifp->if_flags, IFF_RUNNING); - /* check for invalid values */ - if (up->timeout >= PFTM_MAX || - up->src.state > PF_TCPS_PROXY_DST || - up->dst.state > PF_TCPS_PROXY_DST) { - DPFPRINTF(LOG_NOTICE, - "pfsync_input: PFSYNC_ACT_UPD_C: invalid value"); - pfsyncstat_inc(pfsyncs_badval); - continue; - } + ifp0 = if_get(sc->sc_sync_ifidx); + if (ifp0 != NULL) { + if_linkstatehook_del(ifp0, &sc->sc_ltask); + if_detachhook_del(ifp0, &sc->sc_dtask); + } + if_put(ifp0); - id_key.id = up->id; - id_key.creatorid = up->creatorid; +#if NCARP > 0 + if (sc->sc_sync_if_down) + carp_group_demote_adj(&sc->sc_if, -1, "pfsync down"); +#endif - PF_STATE_ENTER_READ(); - st = pf_find_state_byid(&id_key); - pf_state_ref(st); - PF_STATE_EXIT_READ(); - if (st == NULL) { - /* We don't have this state. Ask for it. 
*/ - pfsync_request_update(id_key.creatorid, id_key.id); - continue; - } + NET_UNLOCK(); - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_deferred(st, 1); + KASSERTMSG(SMR_PTR_GET_LOCKED(&pfsyncif) == sc, + "pfsyncif %p != sc %p", pfsyncif, sc); + SMR_PTR_SET_LOCKED(&pfsyncif, NULL); + smr_init(&smr); + smr_call(&smr, (void (*)(void *))refcnt_rele_wake, &sc->sc_refs); - if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) - sync = pfsync_upd_tcp(st, &up->src, &up->dst); - else { - sync = 0; - /* - * Non-TCP protocol state machine always go - * forwards - */ - if (st->src.state > up->src.state) - sync++; - else - pf_state_peer_ntoh(&up->src, &st->src); + /* stop pf producing work before cleaning up the timeouts and tasks */ + refcnt_finalize(&sc->sc_refs, "pfsyncfini"); - if (st->dst.state > up->dst.state) - sync++; - else - pf_state_peer_ntoh(&up->dst, &st->dst); - } - if (sync < 2) { - pf_state_alloc_scrub_memory(&up->dst, &st->dst); - pf_state_peer_ntoh(&up->dst, &st->dst); - st->expire = getuptime(); - st->timeout = up->timeout; - } - st->pfsync_time = getuptime(); + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_DOWN); + + rw_enter_read(&pf_state_list.pfs_rwl); + rw_enter_write(&sc->sc_bulk_snd.snd_lock); + if (sc->sc_bulk_snd.snd_tail != NULL) { + sndbar = !timeout_del(&sc->sc_bulk_snd.snd_tmo); + + sc->sc_bulk_snd.snd_again = 0; + sc->sc_bulk_snd.snd_next = NULL; + sc->sc_bulk_snd.snd_tail = NULL; + } + rw_exit_write(&sc->sc_bulk_snd.snd_lock); + rw_exit_read(&pf_state_list.pfs_rwl); + + /* + * do a single barrier for all the timeouts. because the + * timeouts in each slice are configured the same way, the + * barrier for one will work for all of them. 
+ */ + for (i = 0; i < nitems(sc->sc_slices); i++) { + struct pfsync_slice *s = &sc->sc_slices[i]; + + timeout_del(&s->s_tmo); + task_del(s->s_softnet, &s->s_task); + task_del(s->s_softnet, &s->s_send); + + timeout_del(&s->s_deferrals_tmo); + task_del(s->s_softnet, &s->s_deferrals_task); + } + timeout_barrier(&sc->sc_slices[0].s_tmo); + timeout_barrier(&sc->sc_bulk_req.req_tmo); /* XXX proc */ + if (sndbar) { + /* technically the preceding barrier does the same job */ + timeout_barrier(&sc->sc_bulk_snd.snd_tmo); + } + net_tq_barriers("pfsyncbar"); - if (sync) { - pfsyncstat_inc(pfsyncs_stale); + /* pfsync is no longer running */ + + if (sc->sc_inm != NULL) { + inm = sc->sc_inm; + sc->sc_inm = NULL; + } - pfsync_update_state(st); - schednetisr(NETISR_PFSYNC); + for (i = 0; i < nitems(sc->sc_slices); i++) { + struct pfsync_slice *s = &sc->sc_slices[i]; + struct pf_state *st; + + pfsync_slice_drop(sc, s); + mq_purge(&s->s_sendq); + + while ((pd = TAILQ_FIRST(&s->s_deferrals)) != NULL) { + TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry); + + st = pd->pd_st; + st->sync_defer = NULL; + + TAILQ_INSERT_TAIL(&pds, pd, pd_entry); } + s->s_deferred = 0; + } - pf_state_unref(st); + NET_LOCK(); + sc->sc_up = 0; + + if (inm != NULL) + in_delmulti(inm); + + while ((pd = TAILQ_FIRST(&pds)) != NULL) { + TAILQ_REMOVE(&pds, pd, pd_entry); + + pfsync_defer_output(pd); } return (0); } int -pfsync_in_ureq(caddr_t buf, int len, int count, int flags) +pfsync_is_up(void) { - struct pfsync_upd_req *ur; - int i; + int rv; - struct pf_state_cmp id_key; - struct pf_state *st; + smr_read_enter(); + rv = SMR_PTR_GET(&pfsyncif) != NULL; + smr_read_leave(); - for (i = 0; i < count; i++) { - ur = (struct pfsync_upd_req *)(buf + len * i); + return (rv); +} - id_key.id = ur->id; - id_key.creatorid = ur->creatorid; +static void +pfsync_start(struct ifqueue *ifq) +{ + ifq_purge(ifq); +} - if (id_key.id == 0 && id_key.creatorid == 0) - pfsync_bulk_start(); - else { - PF_STATE_ENTER_READ(); - st = 
pf_find_state_byid(&id_key); - pf_state_ref(st); - PF_STATE_EXIT_READ(); - if (st == NULL) { - pfsyncstat_inc(pfsyncs_badstate); - continue; - } - if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { - pf_state_unref(st); - continue; - } +struct pfsync_q { + void (*write)(struct pf_state *, void *); + size_t len; + u_int8_t action; +}; - pfsync_update_state_req(st); - pf_state_unref(st); - } +static struct pfsync_slice * +pfsync_slice_enter(struct pfsync_softc *sc, const struct pf_state *st) +{ + unsigned int idx = st->key[0]->hash % nitems(sc->sc_slices); + struct pfsync_slice *s = &sc->sc_slices[idx]; + + if (!mtx_enter_try(&s->s_mtx)) { + mtx_enter(&s->s_mtx); + s->s_stat_contended++; } + s->s_stat_locks++; - return (0); + return (s); } -int -pfsync_in_del(caddr_t buf, int len, int count, int flags) +static void +pfsync_slice_leave(struct pfsync_softc *sc, struct pfsync_slice *s) { - struct pfsync_state *sp; - struct pf_state_cmp id_key; - struct pf_state *st; - int i; + mtx_leave(&s->s_mtx); +} - PF_STATE_ENTER_WRITE(); - for (i = 0; i < count; i++) { - sp = (struct pfsync_state *)(buf + len * i); +/* we have one of these for every PFSYNC_S_ */ +static void pfsync_out_state(struct pf_state *, void *); +static void pfsync_out_iack(struct pf_state *, void *); +static void pfsync_out_upd_c(struct pf_state *, void *); +static void pfsync_out_del(struct pf_state *, void *); +#if defined(IPSEC) +static void pfsync_out_tdb(struct tdb *, void *); +#endif - id_key.id = sp->id; - id_key.creatorid = sp->creatorid; +static const struct pfsync_q pfsync_qs[] = { + { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK }, + { pfsync_out_upd_c, sizeof(struct pfsync_upd_c), PFSYNC_ACT_UPD_C }, + { pfsync_out_del, sizeof(struct pfsync_del_c), PFSYNC_ACT_DEL_C }, + { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_INS }, + { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_UPD } +}; - st = pf_find_state_byid(&id_key); - if (st == NULL) { - 
pfsyncstat_inc(pfsyncs_badstate); +static void +pfsync_out_state(struct pf_state *st, void *buf) +{ + struct pfsync_state *sp = buf; + + mtx_enter(&st->mtx); + pf_state_export(sp, st); + mtx_leave(&st->mtx); +} + +static void +pfsync_out_iack(struct pf_state *st, void *buf) +{ + struct pfsync_ins_ack *iack = buf; + + iack->id = st->id; + iack->creatorid = st->creatorid; +} + +static void +pfsync_out_upd_c(struct pf_state *st, void *buf) +{ + struct pfsync_upd_c *up = buf; + + memset(up, 0, sizeof(*up)); + up->id = st->id; + up->creatorid = st->creatorid; + + mtx_enter(&st->mtx); + pf_state_peer_hton(&st->src, &up->src); + pf_state_peer_hton(&st->dst, &up->dst); + up->timeout = st->timeout; + mtx_leave(&st->mtx); +} + +static void +pfsync_out_del(struct pf_state *st, void *buf) +{ + struct pfsync_del_c *dp = buf; + + dp->id = st->id; + dp->creatorid = st->creatorid; + + st->sync_state = PFSYNC_S_DEAD; +} + +#if defined(IPSEC) +static inline void +pfsync_tdb_enter(struct tdb *tdb) +{ + mtx_enter(&tdb->tdb_mtx); +} + +static inline void +pfsync_tdb_leave(struct tdb *tdb) +{ + unsigned int snapped = ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); + mtx_leave(&tdb->tdb_mtx); + if (snapped) + wakeup_one(&tdb->tdb_updates); +} +#endif /* defined(IPSEC) */ + +static void +pfsync_slice_drop(struct pfsync_softc *sc, struct pfsync_slice *s) +{ + struct pf_state *st; + int q; +#if defined(IPSEC) + struct tdb *tdb; +#endif + + for (q = 0; q < nitems(s->s_qs); q++) { + if (TAILQ_EMPTY(&s->s_qs[q])) continue; + + while ((st = TAILQ_FIRST(&s->s_qs[q])) != NULL) { + TAILQ_REMOVE(&s->s_qs[q], st, sync_list); +#ifdef PFSYNC_DEBUG + KASSERT(st->sync_state == q); +#endif + st->sync_state = PFSYNC_S_NONE; + pf_state_unref(st); } - SET(st->state_flags, PFSTATE_NOSYNC); - pf_remove_state(st); } - PF_STATE_EXIT_WRITE(); - return (0); +#if defined(IPSEC) + while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) { + TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry); + + pfsync_tdb_enter(tdb); + 
KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC)); + CLR(tdb->tdb_flags, TDBF_PFSYNC); + pfsync_tdb_leave(tdb); + } +#endif /* defined(IPSEC) */ + + timeout_del(&s->s_tmo); + s->s_len = PFSYNC_MINPKT; } -int -pfsync_in_del_c(caddr_t buf, int len, int count, int flags) +static struct mbuf * +pfsync_slice_write(struct pfsync_slice *s) { - struct pfsync_del_c *sp; - struct pf_state_cmp id_key; - struct pf_state *st; - int i; + struct pfsync_softc *sc = s->s_pfsync; + struct mbuf *m; - PF_LOCK(); - PF_STATE_ENTER_WRITE(); - for (i = 0; i < count; i++) { - sp = (struct pfsync_del_c *)(buf + len * i); + struct ip *ip; + struct pfsync_header *ph; + struct pfsync_subheader *subh; - id_key.id = sp->id; - id_key.creatorid = sp->creatorid; + unsigned int mlen = max_linkhdr + s->s_len; + unsigned int q, count; + caddr_t ptr; + size_t off; - st = pf_find_state_byid(&id_key); - if (st == NULL) { - pfsyncstat_inc(pfsyncs_badstate); + MUTEX_ASSERT_LOCKED(&s->s_mtx); + if (s->s_len == PFSYNC_MINPKT) { + s->s_stat_write_nop++; + return (NULL); + } + + task_del(s->s_softnet, &s->s_task); + + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + goto drop; + + if (mlen > MHLEN) { + MCLGETL(m, M_DONTWAIT, mlen); + if (!ISSET(m->m_flags, M_EXT)) + goto drop; + } + + m_align(m, s->s_len); + m->m_len = m->m_pkthdr.len = s->s_len; + + ptr = mtod(m, caddr_t); + off = 0; + + ip = (struct ip *)(ptr + off); + off += sizeof(*ip); + *ip = sc->sc_template; + ip->ip_len = htons(m->m_pkthdr.len); + ip->ip_id = htons(ip_randomid()); + + ph = (struct pfsync_header *)(ptr + off); + off += sizeof(*ph); + memset(ph, 0, sizeof(*ph)); + ph->version = PFSYNC_VERSION; + ph->len = htons(m->m_pkthdr.len - sizeof(*ip)); + + for (q = 0; q < nitems(s->s_qs); q++) { + struct pf_state_queue *psq = &s->s_qs[q]; + struct pf_state *st; + + if (TAILQ_EMPTY(psq)) continue; + + subh = (struct pfsync_subheader *)(ptr + off); + off += sizeof(*subh); + + count = 0; + while ((st = TAILQ_FIRST(psq)) != NULL) { + TAILQ_REMOVE(psq, 
st, sync_list); + count++; + + KASSERT(st->sync_state == q); + /* the write handler below may override this */ + st->sync_state = PFSYNC_S_NONE; + + pfsync_qs[q].write(st, ptr + off); + off += pfsync_qs[q].len; + + pf_state_unref(st); } - SET(st->state_flags, PFSTATE_NOSYNC); - pf_remove_state(st); + subh->action = pfsync_qs[q].action; + subh->len = pfsync_qs[q].len >> 2; + subh->count = htons(count); } - PF_STATE_EXIT_WRITE(); - PF_UNLOCK(); - return (0); +#if defined(IPSEC) + if (!TAILQ_EMPTY(&s->s_tdb_q)) { + struct tdb *tdb; + + subh = (struct pfsync_subheader *)(ptr + off); + off += sizeof(*subh); + + count = 0; + while ((tdb = TAILQ_FIRST(&s->s_tdb_q)) != NULL) { + TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry); + count++; + + pfsync_tdb_enter(tdb); + KASSERT(ISSET(tdb->tdb_flags, TDBF_PFSYNC)); + + /* get a consistent view of the counters */ + pfsync_out_tdb(tdb, ptr + off); + + CLR(tdb->tdb_flags, TDBF_PFSYNC); + pfsync_tdb_leave(tdb); + + off += sizeof(struct pfsync_tdb); + } + + subh->action = PFSYNC_ACT_TDB; + subh->len = sizeof(struct pfsync_tdb) >> 2; + subh->count = htons(count); + } +#endif + + timeout_del(&s->s_tmo); + s->s_len = PFSYNC_MINPKT; + + return (m); +drop: + m_freem(m); + pfsyncstat_inc(pfsyncs_onomem); + pfsync_slice_drop(sc, s); + return (NULL); +} + +static void +pfsync_sendout(struct pfsync_softc *sc, struct mbuf *m) +{ + struct ip_moptions imo; /* XXX */ + unsigned int len = m->m_pkthdr.len; +#if NBPF > 0 + caddr_t if_bpf = sc->sc_if.if_bpf; + if (if_bpf) + bpf_mtap(if_bpf, m, BPF_DIRECTION_OUT); +#endif + + imo.imo_ifidx = sc->sc_sync_ifidx; + imo.imo_ttl = PFSYNC_DFLTTL; + imo.imo_loop = 0; + + if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) == 0) { + counters_pkt(sc->sc_if.if_counters, ifc_opackets, + ifc_obytes, len); + pfsyncstat_inc(pfsyncs_opackets); + } else { + counters_inc(sc->sc_if.if_counters, ifc_oerrors); + pfsyncstat_inc(pfsyncs_oerrors); + } } -int -pfsync_in_bus(caddr_t buf, int len, int count, int flags) 
+static void +pfsync_slice_tmo(void *arg) { - struct pfsync_softc *sc = pfsyncif; - struct pfsync_bus *bus; + struct pfsync_slice *s = arg; - /* If we're not waiting for a bulk update, who cares. */ - if (sc->sc_ureq_sent == 0) - return (0); + task_add(s->s_softnet, &s->s_task); +} - bus = (struct pfsync_bus *)buf; +static void +pfsync_slice_sched(struct pfsync_slice *s) +{ + s->s_stat_task_add++; + task_add(s->s_softnet, &s->s_task); +} - switch (bus->status) { - case PFSYNC_BUS_START: - PF_LOCK(); - timeout_add(&sc->sc_bulkfail_tmo, 4 * hz + - pf_pool_limits[PF_LIMIT_STATES].limit / - ((sc->sc_if.if_mtu - PFSYNC_MINPKT) / - sizeof(struct pfsync_state))); - PF_UNLOCK(); - DPFPRINTF(LOG_INFO, "received bulk update start"); - break; +static void +pfsync_slice_task(void *arg) +{ + struct pfsync_slice *s = arg; + struct mbuf *m; - case PFSYNC_BUS_END: - if (getuptime() - ntohl(bus->endtime) >= - sc->sc_ureq_sent) { - /* that's it, we're happy */ - sc->sc_ureq_sent = 0; - sc->sc_bulk_tries = 0; - timeout_del(&sc->sc_bulkfail_tmo); -#if NCARP > 0 - if (!pfsync_sync_ok) - carp_group_demote_adj(&sc->sc_if, -1, - sc->sc_link_demoted ? 
- "pfsync link state up" : - "pfsync bulk done"); - if (sc->sc_initial_bulk) { - carp_group_demote_adj(&sc->sc_if, -32, - "pfsync init"); - sc->sc_initial_bulk = 0; - } -#endif - pfsync_sync_ok = 1; - sc->sc_link_demoted = 0; - DPFPRINTF(LOG_INFO, "received valid bulk update end"); - } else { - DPFPRINTF(LOG_WARNING, "received invalid " - "bulk update end: bad timestamp"); + mtx_enter(&s->s_mtx); + s->s_stat_task_run++; + + m = pfsync_slice_write(s); + mtx_leave(&s->s_mtx); + if (m != NULL) { + NET_LOCK(); + pfsync_sendout(s->s_pfsync, m); + NET_UNLOCK(); + } +} + +static void +pfsync_slice_sendq(void *arg) +{ + struct pfsync_slice *s = arg; + struct mbuf_list ml; + struct mbuf *m; + + mq_delist(&s->s_sendq, &ml); + if (ml_empty(&ml)) + return; + + mtx_enter(&s->s_mtx); + s->s_stat_dequeue++; + mtx_leave(&s->s_mtx); + + NET_LOCK(); + while ((m = ml_dequeue(&ml)) != NULL) + pfsync_sendout(s->s_pfsync, m); + NET_UNLOCK(); +} + +static void +pfsync_q_ins(struct pfsync_slice *s, struct pf_state *st, unsigned int q) +{ + size_t nlen = pfsync_qs[q].len; + struct mbuf *m = NULL; + + MUTEX_ASSERT_LOCKED(&s->s_mtx); + KASSERT(st->sync_state == PFSYNC_S_NONE); + KASSERT(s->s_len >= PFSYNC_MINPKT); + + if (TAILQ_EMPTY(&s->s_qs[q])) + nlen += sizeof(struct pfsync_subheader); + + if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) { + m = pfsync_slice_write(s); + if (m != NULL) { + s->s_stat_enqueue++; + if (mq_enqueue(&s->s_sendq, m) == 0) + task_add(s->s_softnet, &s->s_send); } - break; + + nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len; } - return (0); + s->s_len += nlen; + pf_state_ref(st); + TAILQ_INSERT_TAIL(&s->s_qs[q], st, sync_list); + st->sync_state = q; + + if (!timeout_pending(&s->s_tmo)) + timeout_add_sec(&s->s_tmo, 1); } -int -pfsync_in_tdb(caddr_t buf, int len, int count, int flags) +static void +pfsync_q_del(struct pfsync_slice *s, struct pf_state *st) { -#if defined(IPSEC) - struct pfsync_tdb *tp; - int i; + unsigned int q = st->sync_state; - for (i = 
0; i < count; i++) { - tp = (struct pfsync_tdb *)(buf + len * i); - pfsync_update_net_tdb(tp); + MUTEX_ASSERT_LOCKED(&s->s_mtx); + KASSERT(st->sync_state < PFSYNC_S_NONE); + + st->sync_state = PFSYNC_S_NONE; + TAILQ_REMOVE(&s->s_qs[q], st, sync_list); + pf_state_unref(st); + s->s_len -= pfsync_qs[q].len; + + if (TAILQ_EMPTY(&s->s_qs[q])) + s->s_len -= sizeof(struct pfsync_subheader); +} + +/* + * the pfsync hooks that pf calls + */ + +void +pfsync_init_state(struct pf_state *st, const struct pf_state_key *skw, + const struct pf_state_key *sks, int flags) +{ + DPRINTF("%p %016llx %08x: state_flags %x flags %x", + st, st->id, st->creatorid, st->state_flags, flags); + + /* this is called before pf_state_insert */ + + if (skw->proto == IPPROTO_PFSYNC) + SET(st->state_flags, PFSTATE_NOSYNC); + + if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { + st->sync_state = PFSYNC_S_DEAD; + return; } -#endif - return (0); + if (ISSET(flags, PFSYNC_SI_IOCTL)) { + /* all good */ + return; + } + + /* state came off the wire */ + if (ISSET(st->state_flags, PFSTATE_ACK)) { + CLR(st->state_flags, PFSTATE_ACK); + + /* peer wants an iack, not an insert */ + st->sync_state = PFSYNC_S_SYNC; + } +} + +void +pfsync_insert_state(struct pf_state *st) +{ + struct pfsync_softc *sc; + + MUTEX_ASSERT_UNLOCKED(&st->mtx); + + if (ISSET(st->state_flags, PFSTATE_NOSYNC) || + st->sync_state == PFSYNC_S_DEAD) + return; + + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + struct pfsync_slice *s = pfsync_slice_enter(sc, st); + + DPRINTF("%p %016llx %08x: sync_state %02x", + st, st->id, st->creatorid, st->sync_state); + + switch (st->sync_state) { + case PFSYNC_S_UPD_C: + /* we must have lost a race after insert */ + pfsync_q_del(s, st); + /* FALLTHROUGH */ + case PFSYNC_S_NONE: + pfsync_q_ins(s, st, PFSYNC_S_INS); + break; + case PFSYNC_S_SYNC: + st->sync_state = PFSYNC_S_NONE; /* gross */ + pfsync_q_ins(s, st, PFSYNC_S_IACK); + pfsync_slice_sched(s); /* the peer is waiting */ + break; + 
default: + panic("%s: state %p unexpected sync_state %d", + __func__, st, st->sync_state); + /* NOTREACHED */ + } + + pfsync_slice_leave(sc, s); + } + smr_read_leave(); } -#if defined(IPSEC) -/* Update an in-kernel tdb. Silently fail if no tdb is found. */ void -pfsync_update_net_tdb(struct pfsync_tdb *pt) +pfsync_update_state(struct pf_state *st) { - struct tdb *tdb; + struct pfsync_softc *sc; - NET_ASSERT_LOCKED(); + MUTEX_ASSERT_UNLOCKED(&st->mtx); - /* check for invalid values */ - if (ntohl(pt->spi) <= SPI_RESERVED_MAX || - (pt->dst.sa.sa_family != AF_INET && - pt->dst.sa.sa_family != AF_INET6)) - goto bad; + if (ISSET(st->state_flags, PFSTATE_NOSYNC) || + st->sync_state == PFSYNC_S_DEAD) + return; - tdb = gettdb(ntohs(pt->rdomain), pt->spi, - (union sockaddr_union *)&pt->dst, pt->sproto); - if (tdb) { - pt->rpl = betoh64(pt->rpl); - pt->cur_bytes = betoh64(pt->cur_bytes); + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + struct pfsync_slice *s = pfsync_slice_enter(sc, st); + int sync = 0; + + DPRINTF("%p %016llx %08x: sync_state %02x", + st, st->id, st->creatorid, st->sync_state); + + switch (st->sync_state) { + case PFSYNC_S_UPD_C: + case PFSYNC_S_UPD: + /* we're already handling it */ + if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) { + st->sync_updates++; + if (st->sync_updates >= sc->sc_maxupdates) + sync = 1; + } + /* FALLTHROUGH */ + case PFSYNC_S_INS: + case PFSYNC_S_DEL: + case PFSYNC_S_DEAD: + break; - /* Neither replay nor byte counter should ever decrease. 
*/ - if (pt->rpl < tdb->tdb_rpl || - pt->cur_bytes < tdb->tdb_cur_bytes) { - tdb_unref(tdb); - goto bad; + case PFSYNC_S_IACK: + pfsync_q_del(s, st); + /* FALLTHROUGH */ + case PFSYNC_S_NONE: + pfsync_q_ins(s, st, PFSYNC_S_UPD_C); + st->sync_updates = 0; + break; + default: + panic("%s: state %p unexpected sync_state %d", + __func__, st, st->sync_state); + /* NOTREACHED */ } - tdb->tdb_rpl = pt->rpl; - tdb->tdb_cur_bytes = pt->cur_bytes; - tdb_unref(tdb); - } - return; + if (!sync && (getuptime() - st->pfsync_time) < 2) + sync = 1; - bad: - DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: " - "invalid value"); - pfsyncstat_inc(pfsyncs_badstate); - return; + if (sync) + pfsync_slice_sched(s); + pfsync_slice_leave(sc, s); + } + smr_read_leave(); } -#endif - -int -pfsync_in_eof(caddr_t buf, int len, int count, int flags) +void +pfsync_delete_state(struct pf_state *st) { - if (len > 0 || count > 0) - pfsyncstat_inc(pfsyncs_badact); - - /* we're done. let the caller return */ - return (1); -} + struct pfsync_softc *sc; -int -pfsync_in_error(caddr_t buf, int len, int count, int flags) -{ - pfsyncstat_inc(pfsyncs_badact); - return (-1); -} + MUTEX_ASSERT_UNLOCKED(&st->mtx); -int -pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, - struct rtentry *rt) -{ - m_freem(m); /* drop packet */ - return (EAFNOSUPPORT); -} + if (ISSET(st->state_flags, PFSTATE_NOSYNC) || + st->sync_state == PFSYNC_S_DEAD) + return; -int -pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) -{ - struct proc *p = curproc; - struct pfsync_softc *sc = ifp->if_softc; - struct ifreq *ifr = (struct ifreq *)data; - struct ip_moptions *imo = &sc->sc_imo; - struct pfsyncreq pfsyncr; - struct ifnet *ifp0, *sifp; - struct ip *ip; - int error; + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + struct pfsync_slice *s = pfsync_slice_enter(sc, st); - switch (cmd) { - case SIOCSIFFLAGS: - if ((ifp->if_flags & IFF_RUNNING) == 0 && - (ifp->if_flags & IFF_UP)) { - 
ifp->if_flags |= IFF_RUNNING; + DPRINTF("%p %016llx %08x: sync_state %02x", + st, st->id, st->creatorid, st->sync_state); -#if NCARP > 0 - sc->sc_initial_bulk = 1; - carp_group_demote_adj(&sc->sc_if, 32, "pfsync init"); -#endif + switch (st->sync_state) { + case PFSYNC_S_INS: + /* let's pretend this never happened */ + pfsync_q_del(s, st); + break; - pfsync_request_full_update(sc); + case PFSYNC_S_UPD_C: + case PFSYNC_S_UPD: + case PFSYNC_S_IACK: + pfsync_q_del(s, st); + /* FALLTHROUGH */ + case PFSYNC_S_NONE: + pfsync_q_ins(s, st, PFSYNC_S_DEL); + st->sync_updates = 0; + break; + case PFSYNC_S_DEL: + case PFSYNC_S_DEAD: + /* XXX we should count this */ + break; + default: + panic("%s: state %p unexpected sync_state %d", + __func__, st, st->sync_state); + /* NOTREACHED */ } - if ((ifp->if_flags & IFF_RUNNING) && - (ifp->if_flags & IFF_UP) == 0) { - ifp->if_flags &= ~IFF_RUNNING; - /* drop everything */ - timeout_del(&sc->sc_tmo); - pfsync_drop(sc); + pfsync_slice_leave(sc, s); + } + smr_read_leave(); +} + +void +pfsync_clear_states(u_int32_t creatorid, const char *ifname) +{ + struct pfsync_softc *sc; - pfsync_cancel_full_update(sc); - } - break; - case SIOCSIFMTU: - if ((ifp0 = if_get(sc->sc_sync_ifidx)) == NULL) - return (EINVAL); - error = 0; - if (ifr->ifr_mtu <= PFSYNC_MINPKT || - ifr->ifr_mtu > ifp0->if_mtu) { - error = EINVAL; - } - if_put(ifp0); - if (error) - return error; - if (ifr->ifr_mtu < ifp->if_mtu) - pfsync_sendout(); - ifp->if_mtu = ifr->ifr_mtu; - break; - case SIOCGETPFSYNC: - bzero(&pfsyncr, sizeof(pfsyncr)); - if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) { - strlcpy(pfsyncr.pfsyncr_syncdev, - ifp0->if_xname, IFNAMSIZ); - } - if_put(ifp0); - pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer; - pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates; - pfsyncr.pfsyncr_defer = sc->sc_defer; - return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr))); + DPRINTF("creatorid %08x ifname %s", creatorid, + ifname ? 
ifname : "(unset)"); - case SIOCSETPFSYNC: - if ((error = suser(p)) != 0) - return (error); - if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr)))) - return (error); + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) + refcnt_take(&sc->sc_refs); + smr_read_leave(); - if (pfsyncr.pfsyncr_syncpeer.s_addr == 0) - sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP; - else - sc->sc_sync_peer.s_addr = - pfsyncr.pfsyncr_syncpeer.s_addr; + if (sc == NULL) + return; - if (pfsyncr.pfsyncr_maxupdates > 255) - return (EINVAL); - sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates; - - sc->sc_defer = pfsyncr.pfsyncr_defer; - - if (pfsyncr.pfsyncr_syncdev[0] == 0) { - if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) { - if_linkstatehook_del(ifp0, &sc->sc_ltask); - if_detachhook_del(ifp0, &sc->sc_dtask); - } - if_put(ifp0); - sc->sc_sync_ifidx = 0; - if (imo->imo_num_memberships > 0) { - in_delmulti(imo->imo_membership[ - --imo->imo_num_memberships]); - imo->imo_ifidx = 0; - } - break; - } + refcnt_rele_wake(&sc->sc_refs); +} - if ((sifp = if_unit(pfsyncr.pfsyncr_syncdev)) == NULL) - return (EINVAL); +int +pfsync_state_in_use(struct pf_state *st) +{ + struct pfsync_softc *sc; + int rv = 0; - ifp0 = if_get(sc->sc_sync_ifidx); + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + /* + * pfsync bulk sends run inside + * rw_enter_read(&pf_state_list.pfs_rwl), and this + * code (pfsync_state_in_use) is only called from the + * purge code inside + * rw_enter_write(&pf_state_list.pfs_rwl). therefore, + * those two sections are exclusive so we can safely + * look at the bulk send pointers. 
+ */ + /* rw_assert_wrlock(&pf_state_list.pfs_rwl); */ + if (sc->sc_bulk_snd.snd_next == st || + sc->sc_bulk_snd.snd_tail == st) + rv = 1; + } + smr_read_leave(); - if (sifp->if_mtu < sc->sc_if.if_mtu || (ifp0 != NULL && - sifp->if_mtu < ifp0->if_mtu) || - sifp->if_mtu < MCLBYTES - sizeof(struct ip)) - pfsync_sendout(); - - if (ifp0) { - if_linkstatehook_del(ifp0, &sc->sc_ltask); - if_detachhook_del(ifp0, &sc->sc_dtask); - } - if_put(ifp0); - sc->sc_sync_ifidx = sifp->if_index; + return (rv); +} - if (imo->imo_num_memberships > 0) { - in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); - imo->imo_ifidx = 0; - } +int +pfsync_defer(struct pf_state *st, struct mbuf *m) +{ + struct pfsync_softc *sc; + struct pfsync_slice *s; + struct pfsync_deferral *pd; + int sched = 0; + int rv = 0; - if (sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) { - struct in_addr addr; + if (ISSET(st->state_flags, PFSTATE_NOSYNC) || + ISSET(m->m_flags, M_BCAST|M_MCAST)) + return (0); - if (!(sifp->if_flags & IFF_MULTICAST)) { - sc->sc_sync_ifidx = 0; - if_put(sifp); - return (EADDRNOTAVAIL); - } + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc == NULL || !sc->sc_defer) + goto leave; + + pd = pool_get(&pfsync_deferrals_pool, M_NOWAIT); + if (pd == NULL) { + goto leave; + } - addr.s_addr = INADDR_PFSYNC_GROUP; + s = pfsync_slice_enter(sc, st); + s->s_stat_defer_add++; - if ((imo->imo_membership[0] = - in_addmulti(&addr, sifp)) == NULL) { - sc->sc_sync_ifidx = 0; - if_put(sifp); - return (ENOBUFS); - } - imo->imo_num_memberships++; - imo->imo_ifidx = sc->sc_sync_ifidx; - imo->imo_ttl = PFSYNC_DFLTTL; - imo->imo_loop = 0; - } - - ip = &sc->sc_template; - bzero(ip, sizeof(*ip)); - ip->ip_v = IPVERSION; - ip->ip_hl = sizeof(sc->sc_template) >> 2; - ip->ip_tos = IPTOS_LOWDELAY; - /* len and id are set later */ - ip->ip_off = htons(IP_DF); - ip->ip_ttl = PFSYNC_DFLTTL; - ip->ip_p = IPPROTO_PFSYNC; - ip->ip_src.s_addr = INADDR_ANY; - ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr; - 
- if_linkstatehook_add(sifp, &sc->sc_ltask); - if_detachhook_add(sifp, &sc->sc_dtask); - if_put(sifp); + pd->pd_st = pf_state_ref(st); + pd->pd_m = m; + pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC; - pfsync_request_full_update(sc); + m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; + st->sync_defer = pd; - break; + sched = s->s_deferred++; + TAILQ_INSERT_TAIL(&s->s_deferrals, pd, pd_entry); - default: - return (ENOTTY); + if (sched == 0) + timeout_add_nsec(&s->s_deferrals_tmo, PFSYNC_DEFER_NSEC); + else if (sched >= PFSYNC_DEFER_LIMIT) { + s->s_stat_defer_overlimit++; + timeout_del(&s->s_deferrals_tmo); + task_add(s->s_softnet, &s->s_deferrals_task); } - return (0); + pfsync_slice_sched(s); + pfsync_slice_leave(sc, s); + rv = 1; +leave: + smr_read_leave(); + + return (rv); } -void -pfsync_out_state(struct pf_state *st, void *buf) +static void +pfsync_deferred(struct pfsync_softc *sc, struct pf_state *st) { - struct pfsync_state *sp = buf; + struct pfsync_slice *s; + struct pfsync_deferral *pd; - pf_state_export(sp, st); -} + s = pfsync_slice_enter(sc, st); -void -pfsync_out_iack(struct pf_state *st, void *buf) -{ - struct pfsync_ins_ack *iack = buf; + pd = st->sync_defer; + if (pd != NULL) { + s->s_stat_defer_ack++; - iack->id = st->id; - iack->creatorid = st->creatorid; -} + TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry); + s->s_deferred--; -void -pfsync_out_upd_c(struct pf_state *st, void *buf) -{ - struct pfsync_upd_c *up = buf; + st = pd->pd_st; + st->sync_defer = NULL; + } + pfsync_slice_leave(sc, s); - bzero(up, sizeof(*up)); - up->id = st->id; - pf_state_peer_hton(&st->src, &up->src); - pf_state_peer_hton(&st->dst, &up->dst); - up->creatorid = st->creatorid; - up->timeout = st->timeout; + if (pd != NULL) + pfsync_defer_output(pd); } -void -pfsync_out_del(struct pf_state *st, void *buf) +static void +pfsync_deferrals_tmo(void *arg) { - struct pfsync_del_c *dp = buf; + struct pfsync_slice *s = arg; - dp->id = st->id; - dp->creatorid = st->creatorid; - - 
SET(st->state_flags, PFSTATE_NOSYNC); + if (READ_ONCE(s->s_deferred) > 0) + task_add(s->s_softnet, &s->s_deferrals_task); } -void -pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc) +static void +pfsync_deferrals_task(void *arg) { - int q; + struct pfsync_slice *s = arg; + struct pfsync_deferral *pd; struct pf_state *st; - struct pfsync_upd_req_item *ur; -#if defined(IPSEC) - struct tdb *tdb; -#endif - - sn->sn_sc = sc; + uint64_t now, nsec = 0; + struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds); - mtx_enter(&sc->sc_st_mtx); - mtx_enter(&sc->sc_upd_req_mtx); - mtx_enter(&sc->sc_tdb_mtx); + now = getnsecuptime(); - for (q = 0; q < PFSYNC_S_COUNT; q++) { - TAILQ_INIT(&sn->sn_qs[q]); + mtx_enter(&s->s_mtx); + s->s_stat_defer_run++; /* maybe move this into the loop */ + for (;;) { + pd = TAILQ_FIRST(&s->s_deferrals); + if (pd == NULL) + break; - while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) { - TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); - mtx_enter(&st->mtx); - if (st->snapped == 0) { - TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap); - st->snapped = 1; - mtx_leave(&st->mtx); - } else { - /* - * item is on snapshot list already, so we can - * skip it now. - */ - mtx_leave(&st->mtx); - pf_state_unref(st); - } + if (s->s_deferred < PFSYNC_DEFER_LIMIT && + now < pd->pd_deadline) { + nsec = pd->pd_deadline - now; + break; } + + TAILQ_REMOVE(&s->s_deferrals, pd, pd_entry); + s->s_deferred--; + + /* + * detach the pd from the state. the pd still refers + * to the state though. 
+ */ + st = pd->pd_st; + st->sync_defer = NULL; + + TAILQ_INSERT_TAIL(&pds, pd, pd_entry); } + mtx_leave(&s->s_mtx); - TAILQ_INIT(&sn->sn_upd_req_list); - while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { - TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); - TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap); + if (nsec > 0) { + /* we were looking at a pd, but it wasn't old enough */ + timeout_add_nsec(&s->s_deferrals_tmo, nsec); } - TAILQ_INIT(&sn->sn_tdb_q); -#if defined(IPSEC) - while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) { - TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry); - TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap); + if (TAILQ_EMPTY(&pds)) + return; - mtx_enter(&tdb->tdb_mtx); - KASSERT(!ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED)); - SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); - mtx_leave(&tdb->tdb_mtx); + NET_LOCK(); + while ((pd = TAILQ_FIRST(&pds)) != NULL) { + TAILQ_REMOVE(&pds, pd, pd_entry); + + pfsync_defer_output(pd); } -#endif + NET_UNLOCK(); +} + +static void +pfsync_defer_output(struct pfsync_deferral *pd) +{ + struct pf_pdesc pdesc; + struct pf_state *st = pd->pd_st; + + if (st->rt == PF_ROUTETO) { + if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af, + st->direction, NULL, pd->pd_m, NULL) != PF_PASS) + return; + switch (st->key[PF_SK_WIRE]->af) { + case AF_INET: + pf_route(&pdesc, st); + break; +#ifdef INET6 + case AF_INET6: + pf_route6(&pdesc, st); + break; +#endif /* INET6 */ + default: + unhandled_af(st->key[PF_SK_WIRE]->af); + } + pd->pd_m = pdesc.m; + } else { + switch (st->key[PF_SK_WIRE]->af) { + case AF_INET: + ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0); + break; +#ifdef INET6 + case AF_INET6: + ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL); + break; +#endif /* INET6 */ + default: + unhandled_af(st->key[PF_SK_WIRE]->af); + } - sn->sn_len = sc->sc_len; - sc->sc_len = PFSYNC_MINPKT; + pd->pd_m = NULL; + } - sn->sn_plus = sc->sc_plus; - sc->sc_plus = NULL; - sn->sn_pluslen = sc->sc_pluslen; - 
sc->sc_pluslen = 0; - - mtx_leave(&sc->sc_tdb_mtx); - mtx_leave(&sc->sc_upd_req_mtx); - mtx_leave(&sc->sc_st_mtx); + pf_state_unref(st); + m_freem(pd->pd_m); + pool_put(&pfsync_deferrals_pool, pd); } -void -pfsync_drop_snapshot(struct pfsync_snapshot *sn) +struct pfsync_subh_bus { + struct pfsync_subheader subh; + struct pfsync_bus bus; +} __packed __aligned(4); + +static unsigned +pfsync_bulk_snd_bus(struct pfsync_softc *sc, + struct mbuf *m, const unsigned int space, + uint32_t endtime, uint8_t status) { - struct pf_state *st; - struct pfsync_upd_req_item *ur; -#if defined(IPSEC) - struct tdb *t; -#endif - int q; + struct pfsync_subh_bus *h; + unsigned int nlen; - for (q = 0; q < PFSYNC_S_COUNT; q++) { - if (TAILQ_EMPTY(&sn->sn_qs[q])) - continue; + nlen = m->m_len + sizeof(*h); + if (nlen < sizeof(*h)) + return (0); - while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) { - mtx_enter(&st->mtx); - KASSERT(st->sync_state == q); - KASSERT(st->snapped == 1); - TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap); - st->sync_state = PFSYNC_S_NONE; - st->snapped = 0; - mtx_leave(&st->mtx); - pf_state_unref(st); - } - } + h = (struct pfsync_subh_bus *)(mtod(m, caddr_t) + m->m_len); + memset(h, 0, sizeof(*h)); - while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) { - TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_snap); - pool_put(&sn->sn_sc->sc_pool, ur); - } + h->subh.action = PFSYNC_ACT_BUS; + h->subh.len = sizeof(h->bus) >> 2; + h->subh.count = htons(1); + + h->bus.creatorid = pf_status.hostid; + h->bus.endtime = htonl(endtime); + h->bus.status = status; -#if defined(IPSEC) - while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) { - TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap); - mtx_enter(&t->tdb_mtx); - KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)); - CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED); - CLR(t->tdb_flags, TDBF_PFSYNC); - mtx_leave(&t->tdb_mtx); - } -#endif + m->m_len = nlen; + + return (1); } -int -pfsync_is_snapshot_empty(struct pfsync_snapshot *sn) +static unsigned int 
+pfsync_bulk_snd_states(struct pfsync_softc *sc, + struct mbuf *m, const unsigned int space, unsigned int len) { - int q; + struct pf_state *st; + struct pfsync_state *sp; + unsigned int nlen; + unsigned int count = 0; - for (q = 0; q < PFSYNC_S_COUNT; q++) - if (!TAILQ_EMPTY(&sn->sn_qs[q])) - return (0); + st = sc->sc_bulk_snd.snd_next; - if (!TAILQ_EMPTY(&sn->sn_upd_req_list)) - return (0); + for (;;) { + nlen = len + sizeof(*sp); + sp = (struct pfsync_state *)(mtod(m, caddr_t) + len); + if (space < nlen) + break; - if (!TAILQ_EMPTY(&sn->sn_tdb_q)) - return (0); + mtx_enter(&st->mtx); + pf_state_export(sp, st); + mtx_leave(&st->mtx); + + /* commit */ + count++; + m->m_len = len = nlen; + + if (st == sc->sc_bulk_snd.snd_tail) { + if (pfsync_bulk_snd_bus(sc, m, space, + 0, PFSYNC_BUS_END) == 0) { + /* couldn't fit the BUS */ + st = NULL; + break; + } + + /* this BUS is done */ + pfsync_dprintf(sc, "bulk send done (%s)", __func__); + sc->sc_bulk_snd.snd_again = 0; /* XXX */ + sc->sc_bulk_snd.snd_next = NULL; + sc->sc_bulk_snd.snd_tail = NULL; + return (count); + } - return (sn->sn_plus == NULL); -} + st = TAILQ_NEXT(st, entry_list); + } -void -pfsync_drop(struct pfsync_softc *sc) -{ - struct pfsync_snapshot sn; + /* there's still work to do */ + sc->sc_bulk_snd.snd_next = st; + timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, PFSYNC_BULK_SND_IVAL_MS); - pfsync_grab_snapshot(&sn, sc); - pfsync_drop_snapshot(&sn); + return (count); } -void -pfsync_send_dispatch(void *xmq) +static unsigned int +pfsync_bulk_snd_sub(struct pfsync_softc *sc, + struct mbuf *m, const unsigned int space) { - struct mbuf_queue *mq = xmq; - struct pfsync_softc *sc; - struct mbuf *m; - struct mbuf_list ml; - int error; + struct pfsync_subheader *subh; + unsigned int count; + unsigned int len, nlen; - mq_delist(mq, &ml); - if (ml_empty(&ml)) - return; + len = m->m_len; + nlen = len + sizeof(*subh); + if (nlen > space) + return (0); - NET_LOCK(); - sc = pfsyncif; - if (sc == NULL) { - ml_purge(&ml); - 
goto done; - } + subh = (struct pfsync_subheader *)(mtod(m, caddr_t) + len); - while ((m = ml_dequeue(&ml)) != NULL) { - if ((error = ip_output(m, NULL, NULL, IP_RAWOUTPUT, - &sc->sc_imo, NULL, 0)) == 0) - pfsyncstat_inc(pfsyncs_opackets); - else { - DPFPRINTF(LOG_DEBUG, - "ip_output() @ %s failed (%d)\n", __func__, error); - pfsyncstat_inc(pfsyncs_oerrors); - } - } -done: - NET_UNLOCK(); -} + /* + * pfsync_bulk_snd_states only updates m->m_len after + * filling in a state after the offset we gave it. + */ + count = pfsync_bulk_snd_states(sc, m, space, nlen); + if (count == 0) + return (0); -void -pfsync_send_pkt(struct mbuf *m) -{ - if (mq_enqueue(&pfsync_mq, m) != 0) { - pfsyncstat_inc(pfsyncs_oerrors); - DPFPRINTF(LOG_DEBUG, "mq_enqueue() @ %s failed, queue full\n", - __func__); - } else - task_add(net_tq(0), &pfsync_task); + subh->action = PFSYNC_ACT_UPD; + subh->len = sizeof(struct pfsync_state) >> 2; + subh->count = htons(count); + + return (count); } -void -pfsync_sendout(void) +static void +pfsync_bulk_snd_start(struct pfsync_softc *sc) { - struct pfsync_snapshot sn; - struct pfsync_softc *sc = pfsyncif; -#if NBPFILTER > 0 - struct ifnet *ifp = &sc->sc_if; -#endif + const unsigned int space = sc->sc_if.if_mtu - + (sizeof(struct ip) + sizeof(struct pfsync_header)); struct mbuf *m; - struct ip *ip; - struct pfsync_header *ph; - struct pfsync_subheader *subh; - struct pf_state *st; - struct pfsync_upd_req_item *ur; - int offset; - int q, count = 0; - if (sc == NULL || sc->sc_len == PFSYNC_MINPKT) - return; + rw_enter_read(&pf_state_list.pfs_rwl); - if (!ISSET(sc->sc_if.if_flags, IFF_RUNNING) || -#if NBPFILTER > 0 - (ifp->if_bpf == NULL && sc->sc_sync_ifidx == 0)) { -#else - sc->sc_sync_ifidx == 0) { -#endif - pfsync_drop(sc); - return; + rw_enter_write(&sc->sc_bulk_snd.snd_lock); + if (sc->sc_bulk_snd.snd_next != NULL) { + sc->sc_bulk_snd.snd_again = 1; + goto leave; } - pfsync_grab_snapshot(&sn, sc); + mtx_enter(&pf_state_list.pfs_mtx); + 
sc->sc_bulk_snd.snd_next = TAILQ_FIRST(&pf_state_list.pfs_list); + sc->sc_bulk_snd.snd_tail = TAILQ_LAST(&pf_state_list.pfs_list, + pf_state_queue); + mtx_leave(&pf_state_list.pfs_mtx); - /* - * Check below is sufficient to prevent us from sending empty packets, - * but it does not stop us from sending short packets. - */ - if (pfsync_is_snapshot_empty(&sn)) - return; + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + goto leave; - MGETHDR(m, M_DONTWAIT, MT_DATA); - if (m == NULL) { - sc->sc_if.if_oerrors++; - pfsyncstat_inc(pfsyncs_onomem); - pfsync_drop_snapshot(&sn); - return; + MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu); + if (!ISSET(m->m_flags, M_EXT)) { + /* some error++ */ + m_freem(m); /* drop */ + goto leave; } - if (max_linkhdr + sn.sn_len > MHLEN) { - MCLGETL(m, M_DONTWAIT, max_linkhdr + sn.sn_len); - if (!ISSET(m->m_flags, M_EXT)) { - m_free(m); - sc->sc_if.if_oerrors++; - pfsyncstat_inc(pfsyncs_onomem); - pfsync_drop_snapshot(&sn); - return; - } - } - m->m_data += max_linkhdr; - m->m_len = m->m_pkthdr.len = sn.sn_len; + m_align(m, space); + m->m_len = 0; - /* build the ip header */ - ip = mtod(m, struct ip *); - bcopy(&sc->sc_template, ip, sizeof(*ip)); - offset = sizeof(*ip); + if (sc->sc_bulk_snd.snd_tail == NULL) { + pfsync_dprintf(sc, "bulk send empty (%s)", __func__); - ip->ip_len = htons(m->m_pkthdr.len); - ip->ip_id = htons(ip_randomid()); + /* list is empty */ + if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0) + panic("%s: mtu is too low", __func__); + goto encap; + } - /* build the pfsync header */ - ph = (struct pfsync_header *)(m->m_data + offset); - bzero(ph, sizeof(*ph)); - offset += sizeof(*ph); + pfsync_dprintf(sc, "bulk send start (%s)", __func__); - ph->version = PFSYNC_VERSION; - ph->len = htons(sn.sn_len - sizeof(*ip)); - bcopy(pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH); + /* start a bulk update. 
*/ + if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_START) == 0) + panic("%s: mtu is too low", __func__); - if (!TAILQ_EMPTY(&sn.sn_upd_req_list)) { - subh = (struct pfsync_subheader *)(m->m_data + offset); - offset += sizeof(*subh); + /* fill it up with state updates. */ + pfsync_bulk_snd_sub(sc, m, space); - count = 0; - while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) { - TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_snap); +encap: + m->m_pkthdr.len = m->m_len; + m = pfsync_encap(sc, m); + if (m == NULL) + goto leave; - bcopy(&ur->ur_msg, m->m_data + offset, - sizeof(ur->ur_msg)); - offset += sizeof(ur->ur_msg); + pfsync_sendout(sc, m); - pool_put(&sc->sc_pool, ur); +leave: + rw_exit_write(&sc->sc_bulk_snd.snd_lock); - count++; - } + rw_exit_read(&pf_state_list.pfs_rwl); +} - bzero(subh, sizeof(*subh)); - subh->len = sizeof(ur->ur_msg) >> 2; - subh->action = PFSYNC_ACT_UPD_REQ; - subh->count = htons(count); +static void +pfsync_bulk_snd_tmo(void *arg) +{ + struct pfsync_softc *sc = arg; + const unsigned int space = sc->sc_if.if_mtu - + (sizeof(struct ip) + sizeof(struct pfsync_header)); + struct mbuf *m; + + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) { + /* some error++ */ + /* retry later */ + timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, + PFSYNC_BULK_SND_IVAL_MS); + return; } - /* has someone built a custom region for us to add? */ - if (sn.sn_plus != NULL) { - bcopy(sn.sn_plus, m->m_data + offset, sn.sn_pluslen); - offset += sn.sn_pluslen; - sn.sn_plus = NULL; /* XXX memory leak ? 
*/ + MCLGETL(m, M_DONTWAIT, max_linkhdr + sc->sc_if.if_mtu); + if (!ISSET(m->m_flags, M_EXT)) { + /* some error++ */ + m_freem(m); + /* retry later */ + timeout_add_msec(&sc->sc_bulk_snd.snd_tmo, + PFSYNC_BULK_SND_IVAL_MS); + return; } -#if defined(IPSEC) - if (!TAILQ_EMPTY(&sn.sn_tdb_q)) { - struct tdb *t; + m_align(m, space); + m->m_len = 0; - subh = (struct pfsync_subheader *)(m->m_data + offset); - offset += sizeof(*subh); + rw_enter_read(&pf_state_list.pfs_rwl); + rw_enter_write(&sc->sc_bulk_snd.snd_lock); - count = 0; - while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) { - TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_snap); - pfsync_out_tdb(t, m->m_data + offset); - offset += sizeof(struct pfsync_tdb); - mtx_enter(&t->tdb_mtx); - KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)); - CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED); - CLR(t->tdb_flags, TDBF_PFSYNC); - mtx_leave(&t->tdb_mtx); - tdb_unref(t); - count++; - } + if (sc->sc_bulk_snd.snd_next == NULL) { + /* there was no space in the previous packet for a BUS END */ - bzero(subh, sizeof(*subh)); - subh->action = PFSYNC_ACT_TDB; - subh->len = sizeof(struct pfsync_tdb) >> 2; - subh->count = htons(count); - } -#endif + if (pfsync_bulk_snd_bus(sc, m, space, 0, PFSYNC_BUS_END) == 0) + panic("%s: mtu is too low", __func__); - /* walk the queues */ - for (q = 0; q < PFSYNC_S_COUNT; q++) { - if (TAILQ_EMPTY(&sn.sn_qs[q])) - continue; + /* this bulk is done */ + pfsync_dprintf(sc, "bulk send done (%s)", __func__); + sc->sc_bulk_snd.snd_again = 0; /* XXX */ + sc->sc_bulk_snd.snd_tail = NULL; + } else { + pfsync_dprintf(sc, "bulk send again (%s)", __func__); - subh = (struct pfsync_subheader *)(m->m_data + offset); - offset += sizeof(*subh); + /* fill it up with state updates. 
*/ + pfsync_bulk_snd_sub(sc, m, space); + } - count = 0; - while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) { - mtx_enter(&st->mtx); - TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap); - KASSERT(st->sync_state == q); - KASSERT(st->snapped == 1); - st->sync_state = PFSYNC_S_NONE; - st->snapped = 0; - pfsync_qs[q].write(st, m->m_data + offset); - offset += pfsync_qs[q].len; - mtx_leave(&st->mtx); + m->m_pkthdr.len = m->m_len; + m = pfsync_encap(sc, m); - pf_state_unref(st); - count++; - } + rw_exit_write(&sc->sc_bulk_snd.snd_lock); + rw_exit_read(&pf_state_list.pfs_rwl); - bzero(subh, sizeof(*subh)); - subh->action = pfsync_qs[q].action; - subh->len = pfsync_qs[q].len >> 2; - subh->count = htons(count); + if (m != NULL) { + NET_LOCK(); + pfsync_sendout(sc, m); + NET_UNLOCK(); } +} - /* we're done, let's put it on the wire */ -#if NBPFILTER > 0 - if (ifp->if_bpf) { - m->m_data += sizeof(*ip); - m->m_len = m->m_pkthdr.len = sn.sn_len - sizeof(*ip); - bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT); - m->m_data -= sizeof(*ip); - m->m_len = m->m_pkthdr.len = sn.sn_len; - } +static void +pfsync_update_state_req(struct pfsync_softc *sc, struct pf_state *st) +{ + struct pfsync_slice *s = pfsync_slice_enter(sc, st); - if (sc->sc_sync_ifidx == 0) { - sc->sc_len = PFSYNC_MINPKT; - m_freem(m); - return; - } -#endif + DPRINTF("%p %016llx %08x: sync_state %02x", + st, st->id, st->creatorid, st->sync_state); - sc->sc_if.if_opackets++; - sc->sc_if.if_obytes += m->m_pkthdr.len; + switch (st->sync_state) { + case PFSYNC_S_UPD_C: + case PFSYNC_S_IACK: + pfsync_q_del(s, st); + /* FALLTHROUGH */ + case PFSYNC_S_NONE: + pfsync_q_ins(s, st, PFSYNC_S_UPD); + break; - m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain; + case PFSYNC_S_INS: + case PFSYNC_S_UPD: + case PFSYNC_S_DEL: + /* we're already handling it */ + break; + default: + panic("%s: state %p unexpected sync_state %d", + __func__, st, st->sync_state); + } - pfsync_send_pkt(m); + pfsync_slice_sched(s); + pfsync_slice_leave(sc, s); } -void 
-pfsync_insert_state(struct pf_state *st) +#if defined(IPSEC) +static void +pfsync_out_tdb(struct tdb *tdb, void *buf) { - struct pfsync_softc *sc = pfsyncif; + struct pfsync_tdb *ut = buf; - NET_ASSERT_LOCKED(); + memset(ut, 0, sizeof(*ut)); + ut->spi = tdb->tdb_spi; + memcpy(&ut->dst, &tdb->tdb_dst, sizeof(ut->dst)); + /* + * When a failover happens, the master's rpl is probably above + * what we see here (we may be up to a second late), so + * increase it a bit for outbound tdbs to manage most such + * situations. + * + * For now, just add an offset that is likely to be larger + * than the number of packets we can see in one second. The RFC + * just says the next packet must have a higher seq value. + * + * XXX What is a good algorithm for this? We could use + * a rate-determined increase, but to know it, we would have + * to extend struct tdb. + * XXX pt->rpl can wrap over MAXINT, but if so the real tdb + * will soon be replaced anyway. For now, just don't handle + * this edge case. + */ +#define RPL_INCR 16384 + ut->rpl = htobe64(tdb->tdb_rpl + + (ISSET(tdb->tdb_flags, TDBF_PFSYNC_RPL) ? RPL_INCR : 0)); + ut->cur_bytes = htobe64(tdb->tdb_cur_bytes); + ut->sproto = tdb->tdb_sproto; + ut->rdomain = htons(tdb->tdb_rdomain); +} - if (ISSET(st->rule.ptr->rule_flag, PFRULE_NOSYNC) || - st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) { - SET(st->state_flags, PFSTATE_NOSYNC); - return; +static struct pfsync_slice * +pfsync_slice_enter_tdb(struct pfsync_softc *sc, const struct tdb *t) +{ + /* + * just use the first slice for all ipsec (for now) until + * it's more obvious what property (eg, spi) we can distribute + * tdbs over slices with. 
+ */ + struct pfsync_slice *s = &sc->sc_slices[0]; + + if (!mtx_enter_try(&s->s_mtx)) { + mtx_enter(&s->s_mtx); + s->s_stat_contended++; } + s->s_stat_locks++; - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) || - ISSET(st->state_flags, PFSTATE_NOSYNC)) - return; + return (s); +} - if (sc->sc_len == PFSYNC_MINPKT) - timeout_add_sec(&sc->sc_tmo, 1); +static void +pfsync_tdb_ins(struct pfsync_slice *s, struct tdb *tdb) +{ + size_t nlen = sizeof(struct pfsync_tdb); + struct mbuf *m = NULL; - pfsync_q_ins(st, PFSYNC_S_INS); + KASSERT(s->s_len >= PFSYNC_MINPKT); - st->sync_updates = 0; -} + MUTEX_ASSERT_LOCKED(&s->s_mtx); + MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); -int -pfsync_defer(struct pf_state *st, struct mbuf *m, struct pfsync_deferral **ppd) -{ - struct pfsync_softc *sc = pfsyncif; - struct pfsync_deferral *pd; - unsigned int sched; + if (TAILQ_EMPTY(&s->s_tdb_q)) + nlen += sizeof(struct pfsync_subheader); - NET_ASSERT_LOCKED(); + if (s->s_len + nlen > s->s_pfsync->sc_if.if_mtu) { + m = pfsync_slice_write(s); + if (m != NULL) { + s->s_stat_enqueue++; + if (mq_enqueue(&s->s_sendq, m) == 0) + task_add(s->s_softnet, &s->s_send); + } - if (!sc->sc_defer || - ISSET(st->state_flags, PFSTATE_NOSYNC) || - m->m_flags & (M_BCAST|M_MCAST)) - return (0); + nlen = sizeof(struct pfsync_subheader) + + sizeof(struct pfsync_tdb); + } - pd = pool_get(&sc->sc_pool, M_NOWAIT); - if (pd == NULL) - return (0); + s->s_len += nlen; + TAILQ_INSERT_TAIL(&s->s_tdb_q, tdb, tdb_sync_entry); + tdb->tdb_updates = 0; - /* - * deferral queue grows faster, than timeout can consume, - * we have to ask packet (caller) to help timer and dispatch - * one deferral for us. - * - * We wish to call pfsync_undefer() here. Unfortunately we can't, - * because pfsync_undefer() will be calling to ip_output(), - * which in turn will call to pf_test(), which would then attempt - * to grab PF_LOCK() we currently hold. 
- */ - if (sc->sc_deferred >= 128) { - mtx_enter(&sc->sc_deferrals_mtx); - *ppd = TAILQ_FIRST(&sc->sc_deferrals); - if (*ppd != NULL) { - TAILQ_REMOVE(&sc->sc_deferrals, *ppd, pd_entry); - sc->sc_deferred--; - } - mtx_leave(&sc->sc_deferrals_mtx); - } else - *ppd = NULL; + if (!timeout_pending(&s->s_tmo)) + timeout_add_sec(&s->s_tmo, 1); +} - m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; - SET(st->state_flags, PFSTATE_ACK); +static void +pfsync_tdb_del(struct pfsync_slice *s, struct tdb *tdb) +{ + MUTEX_ASSERT_LOCKED(&s->s_mtx); + MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); - pd->pd_st = pf_state_ref(st); - pd->pd_m = m; + TAILQ_REMOVE(&s->s_tdb_q, tdb, tdb_sync_entry); - pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC; + s->s_len -= sizeof(struct pfsync_tdb); + if (TAILQ_EMPTY(&s->s_tdb_q)) + s->s_len -= sizeof(struct pfsync_subheader); +} + +/* + * the reference that pfsync has to a tdb is accounted for by the + * TDBF_PFSYNC flag, not by tdb_ref/tdb_unref. tdb_delete_tdb() is + * called after all other references to a tdb are dropped (with + * tdb_unref) as part of the tdb_free(). + * + * tdb_free() needs to wait for pfsync to let go of the tdb though, + * which would be best handled by a reference count, but tdb_free + * needs the NET_LOCK which pfsync is already fighting with. instead + * use the TDBF_PFSYNC_SNAPPED flag to coordinate the pfsync write/drop + * with tdb_free. 
+ */ + +void +pfsync_update_tdb(struct tdb *tdb, int output) +{ + struct pfsync_softc *sc; - mtx_enter(&sc->sc_deferrals_mtx); - sched = TAILQ_EMPTY(&sc->sc_deferrals); + MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); - TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry); - sc->sc_deferred++; - mtx_leave(&sc->sc_deferrals_mtx); + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb); - if (sched) - timeout_add_nsec(&sc->sc_deferrals_tmo, PFSYNC_DEFER_NSEC); + /* TDBF_PFSYNC is only changed while the slice mtx is held */ + if (!ISSET(tdb->tdb_flags, TDBF_PFSYNC)) { + mtx_enter(&tdb->tdb_mtx); + SET(tdb->tdb_flags, TDBF_PFSYNC); + mtx_leave(&tdb->tdb_mtx); - schednetisr(NETISR_PFSYNC); + pfsync_tdb_ins(s, tdb); + } else if (++tdb->tdb_updates >= sc->sc_maxupdates) + pfsync_slice_sched(s); - return (1); + /* XXX no sync timestamp on tdbs to check */ + + pfsync_slice_leave(sc, s); + } + smr_read_leave(); } void -pfsync_undefer_notify(struct pfsync_deferral *pd) +pfsync_delete_tdb(struct tdb *tdb) { - struct pf_pdesc pdesc; - struct pf_state *st = pd->pd_st; + struct pfsync_softc *sc; - /* - * pf_remove_state removes the state keys and sets st->timeout - * to PFTM_UNLINKED. this is done under NET_LOCK which should - * be held here, so we can use PFTM_UNLINKED as a test for - * whether the state keys are set for the address family - * lookup. 
- */ + MUTEX_ASSERT_UNLOCKED(&tdb->tdb_mtx); - if (st->timeout == PFTM_UNLINKED) - return; + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc != NULL) { + struct pfsync_slice *s = pfsync_slice_enter_tdb(sc, tdb); - if (st->rt == PF_ROUTETO) { - if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af, - st->direction, st->kif, pd->pd_m, NULL) != PF_PASS) - return; - switch (st->key[PF_SK_WIRE]->af) { - case AF_INET: - pf_route(&pdesc, st); - break; -#ifdef INET6 - case AF_INET6: - pf_route6(&pdesc, st); - break; -#endif /* INET6 */ - default: - unhandled_af(st->key[PF_SK_WIRE]->af); - } - pd->pd_m = pdesc.m; - } else { - switch (st->key[PF_SK_WIRE]->af) { - case AF_INET: - ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0); - break; -#ifdef INET6 - case AF_INET6: - ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL); - break; -#endif /* INET6 */ - default: - unhandled_af(st->key[PF_SK_WIRE]->af); + /* TDBF_PFSYNC is only changed while the slice mtx is held */ + if (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) { + pfsync_tdb_del(s, tdb); + + mtx_enter(&tdb->tdb_mtx); + CLR(tdb->tdb_flags, TDBF_PFSYNC); + mtx_leave(&tdb->tdb_mtx); } - pd->pd_m = NULL; + pfsync_slice_leave(sc, s); } -} + smr_read_leave(); -void -pfsync_free_deferral(struct pfsync_deferral *pd) -{ - struct pfsync_softc *sc = pfsyncif; + /* + * handle pfsync_slice_drop being called from pfsync_down + * and the smr/slice access above won't work. 
+ */ - pf_state_unref(pd->pd_st); - m_freem(pd->pd_m); - pool_put(&sc->sc_pool, pd); + mtx_enter(&tdb->tdb_mtx); + SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED); /* like a thanos snap */ + while (ISSET(tdb->tdb_flags, TDBF_PFSYNC)) { + msleep_nsec(&tdb->tdb_updates, &tdb->tdb_mtx, PWAIT, + "tdbfree", INFSLP); + } + mtx_leave(&tdb->tdb_mtx); +} +#endif /* defined(IPSEC) */ + +struct pfsync_act { + void (*in)(struct pfsync_softc *, const caddr_t, + unsigned int, unsigned int); + size_t len; +}; + +static void pfsync_in_clr(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_iack(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_upd_c(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_ureq(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_del(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_del_c(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_bus(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_tdb(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_ins(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +static void pfsync_in_upd(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); + +static const struct pfsync_act pfsync_acts[] = { + [PFSYNC_ACT_CLR] = + { pfsync_in_clr, sizeof(struct pfsync_clr) }, + [PFSYNC_ACT_INS_ACK] = + { pfsync_in_iack, sizeof(struct pfsync_ins_ack) }, + [PFSYNC_ACT_UPD_C] = + { pfsync_in_upd_c, sizeof(struct pfsync_upd_c) }, + [PFSYNC_ACT_UPD_REQ] = + { pfsync_in_ureq, sizeof(struct pfsync_upd_req) }, + [PFSYNC_ACT_DEL] = + { pfsync_in_del, sizeof(struct pfsync_state) }, + [PFSYNC_ACT_DEL_C] = + { pfsync_in_del_c, sizeof(struct pfsync_del_c) }, + [PFSYNC_ACT_BUS] = + { pfsync_in_bus, 
sizeof(struct pfsync_bus) }, + [PFSYNC_ACT_INS] = + { pfsync_in_ins, sizeof(struct pfsync_state) }, + [PFSYNC_ACT_UPD] = + { pfsync_in_upd, sizeof(struct pfsync_state) }, + [PFSYNC_ACT_TDB] = + { pfsync_in_tdb, sizeof(struct pfsync_tdb) }, +}; + +static void +pfsync_in_skip(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) +{ + /* nop */ } -void -pfsync_undefer(struct pfsync_deferral *pd, int drop) +static struct mbuf * +pfsync_input(struct mbuf *m, uint8_t ttl, unsigned int hlen) { - struct pfsync_softc *sc = pfsyncif; + struct pfsync_softc *sc; + struct pfsync_header *ph; + struct pfsync_subheader *subh; + unsigned int len; + void (*in)(struct pfsync_softc *, + const caddr_t, unsigned int, unsigned int); +#if NBPF > 0 + caddr_t if_bpf; +#endif - NET_ASSERT_LOCKED(); + pfsyncstat_inc(pfsyncs_ipackets); - if (sc == NULL) - return; + if (!pf_status.running) + return (m); - CLR(pd->pd_st->state_flags, PFSTATE_ACK); - if (!drop) - pfsync_undefer_notify(pd); + /* + * pfsyncif is only set if it is up and running correctly. + */ + smr_read_enter(); + sc = SMR_PTR_GET(&pfsyncif); + if (sc == NULL) + goto leave; - pfsync_free_deferral(pd); -} + if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) { + pfsyncstat_inc(pfsyncs_badif); + goto leave; + } -void -pfsync_deferrals_tmo(void *arg) -{ - struct pfsync_softc *sc = arg; - struct pfsync_deferral *pd; - uint64_t now, nsec = 0; - struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds); +#if NBPF > 0 +#endif - now = getnsecuptime(); + /* verify that the IP TTL is 255. 
*/ + if (ttl != PFSYNC_DFLTTL) { + pfsyncstat_inc(pfsyncs_badttl); + goto leave; + } - mtx_enter(&sc->sc_deferrals_mtx); - for (;;) { - pd = TAILQ_FIRST(&sc->sc_deferrals); - if (pd == NULL) - break; + m_adj(m, hlen); - if (now < pd->pd_deadline) { - nsec = pd->pd_deadline - now; - break; - } + if (m->m_pkthdr.len < sizeof(*ph)) { + pfsyncstat_inc(pfsyncs_hdrops); + goto leave; + } + if (m->m_len < sizeof(*ph)) { + m = m_pullup(m, sizeof(*ph)); + if (m == NULL) + goto leave; + } - TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); - sc->sc_deferred--; - TAILQ_INSERT_TAIL(&pds, pd, pd_entry); + ph = mtod(m, struct pfsync_header *); + if (ph->version != PFSYNC_VERSION) { + pfsyncstat_inc(pfsyncs_badver); + goto leave; } - mtx_leave(&sc->sc_deferrals_mtx); - if (nsec > 0) { - /* we were looking at a pd, but it wasn't old enough */ - timeout_add_nsec(&sc->sc_deferrals_tmo, nsec); + len = ntohs(ph->len); + if (m->m_pkthdr.len < len) { + pfsyncstat_inc(pfsyncs_badlen); + goto leave; } + if (m->m_pkthdr.len > len) + m->m_pkthdr.len = len; - if (TAILQ_EMPTY(&pds)) - return; + /* ok, it's serious now */ + refcnt_take(&sc->sc_refs); + smr_read_leave(); - NET_LOCK(); - while ((pd = TAILQ_FIRST(&pds)) != NULL) { - TAILQ_REMOVE(&pds, pd, pd_entry); + counters_pkt(sc->sc_if.if_counters, ifc_ipackets, ifc_ibytes, len); - pfsync_undefer(pd, 0); - } - NET_UNLOCK(); -} + m_adj(m, sizeof(*ph)); -void -pfsync_deferred(struct pf_state *st, int drop) -{ - struct pfsync_softc *sc = pfsyncif; - struct pfsync_deferral *pd; + while (m->m_pkthdr.len >= sizeof(*subh)) { + unsigned int action, mlen, count; - NET_ASSERT_LOCKED(); + if (m->m_len < sizeof(*subh)) { + m = m_pullup(m, sizeof(*subh)); + if (m == NULL) + goto rele; + } + subh = mtod(m, struct pfsync_subheader *); - mtx_enter(&sc->sc_deferrals_mtx); - TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) { - if (pd->pd_st == st) { - TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); - sc->sc_deferred--; - break; + action = subh->action; + mlen = 
subh->len << 2; + count = ntohs(subh->count); + + if (action >= PFSYNC_ACT_MAX || + action >= nitems(pfsync_acts) || + mlen < pfsync_acts[subh->action].len) { + /* + * subheaders are always followed by at least one + * message, so if the peer is new + * enough to tell us how big its messages are then we + * know enough to skip them. + */ + if (count == 0 || mlen == 0) { + pfsyncstat_inc(pfsyncs_badact); + goto rele; + } + + in = pfsync_in_skip; + } else { + in = pfsync_acts[action].in; + if (in == NULL) + in = pfsync_in_skip; + } + + m_adj(m, sizeof(*subh)); + len = mlen * count; + if (len > m->m_pkthdr.len) { + pfsyncstat_inc(pfsyncs_badlen); + goto rele; + } + if (m->m_len < len) { + m = m_pullup(m, len); + if (m == NULL) + goto rele; } + + (*in)(sc, mtod(m, caddr_t), mlen, count); + m_adj(m, len); } - mtx_leave(&sc->sc_deferrals_mtx); - if (pd != NULL) - pfsync_undefer(pd, drop); +rele: + refcnt_rele_wake(&sc->sc_refs); + return (m); + +leave: + smr_read_leave(); + return (m); } -void -pfsync_update_state(struct pf_state *st) +static void +pfsync_in_clr(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) { - struct pfsync_softc *sc = pfsyncif; - int sync = 0; - - NET_ASSERT_LOCKED(); - - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) - return; + const struct pfsync_clr *clr; + struct pf_state *head, *tail, *st, *next; + struct pfi_kif *kif; + uint32_t creatorid; + unsigned int i; - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_deferred(st, 0); - if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { - if (st->sync_state != PFSYNC_S_NONE) - pfsync_q_del(st); - return; - } + rw_enter_read(&pf_state_list.pfs_rwl); - if (sc->sc_len == PFSYNC_MINPKT) - timeout_add_sec(&sc->sc_tmo, 1); + /* get a view of the state list */ + mtx_enter(&pf_state_list.pfs_mtx); + head = TAILQ_FIRST(&pf_state_list.pfs_list); + tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); + mtx_leave(&pf_state_list.pfs_mtx); - switch (st->sync_state) { 
- case PFSYNC_S_UPD_C: - case PFSYNC_S_UPD: - case PFSYNC_S_INS: - /* we're already handling it */ + PF_LOCK(); + for (i = 0; i < count; i++) { + clr = (struct pfsync_clr *)(buf + i * mlen); - if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) { - st->sync_updates++; - if (st->sync_updates >= sc->sc_maxupdates) - sync = 1; + creatorid = clr->creatorid; + if (clr->ifname[0] == '\0') + kif = NULL; + else { + kif = pfi_kif_find(clr->ifname); + if (kif == NULL) + continue; } - break; - - case PFSYNC_S_IACK: - pfsync_q_del(st); - case PFSYNC_S_NONE: - pfsync_q_ins(st, PFSYNC_S_UPD_C); - st->sync_updates = 0; - break; - case PFSYNC_S_DEL: - case PFSYNC_S_COUNT: - case PFSYNC_S_DEFER: - break; + st = NULL; + next = head; - default: - panic("pfsync_update_state: unexpected sync state %d", - st->sync_state); - } + PF_STATE_ENTER_WRITE(); + while (st != tail) { + st = next; + next = TAILQ_NEXT(st, entry_list); - if (sync || (getuptime() - st->pfsync_time) < 2) - schednetisr(NETISR_PFSYNC); -} + if (creatorid != st->creatorid) + continue; + if (kif != NULL && kif != st->kif) + continue; -void -pfsync_cancel_full_update(struct pfsync_softc *sc) -{ - if (timeout_pending(&sc->sc_bulkfail_tmo) || - timeout_pending(&sc->sc_bulk_tmo)) { -#if NCARP > 0 - if (!pfsync_sync_ok) - carp_group_demote_adj(&sc->sc_if, -1, - "pfsync bulk cancelled"); - if (sc->sc_initial_bulk) { - carp_group_demote_adj(&sc->sc_if, -32, - "pfsync init"); - sc->sc_initial_bulk = 0; + mtx_enter(&st->mtx); + SET(st->state_flags, PFSTATE_NOSYNC); + mtx_leave(&st->mtx); + pf_remove_state(st); } -#endif - pfsync_sync_ok = 1; - DPFPRINTF(LOG_INFO, "cancelling bulk update"); + PF_STATE_EXIT_WRITE(); } - timeout_del(&sc->sc_bulkfail_tmo); - timeout_del(&sc->sc_bulk_tmo); - sc->sc_bulk_next = NULL; - sc->sc_bulk_last = NULL; - sc->sc_ureq_sent = 0; - sc->sc_bulk_tries = 0; -} + PF_UNLOCK(); -void -pfsync_request_full_update(struct pfsync_softc *sc) -{ - if (sc->sc_sync_ifidx != 0 && ISSET(sc->sc_if.if_flags, IFF_RUNNING)) 
{ - /* Request a full state table update. */ - sc->sc_ureq_sent = getuptime(); -#if NCARP > 0 - if (!sc->sc_link_demoted && pfsync_sync_ok) - carp_group_demote_adj(&sc->sc_if, 1, - "pfsync bulk start"); -#endif - pfsync_sync_ok = 0; - DPFPRINTF(LOG_INFO, "requesting bulk update"); - PF_LOCK(); - timeout_add(&sc->sc_bulkfail_tmo, 4 * hz + - pf_pool_limits[PF_LIMIT_STATES].limit / - ((sc->sc_if.if_mtu - PFSYNC_MINPKT) / - sizeof(struct pfsync_state))); - PF_UNLOCK(); - pfsync_request_update(0, 0); - } + rw_exit_read(&pf_state_list.pfs_rwl); } -void -pfsync_request_update(u_int32_t creatorid, u_int64_t id) +static void +pfsync_in_ins(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) { - struct pfsync_softc *sc = pfsyncif; - struct pfsync_upd_req_item *item; - size_t nlen, sclen; - int retry; + const struct pfsync_state *sp; + sa_family_t af1, af2; + unsigned int i; - /* - * this code does nothing to prevent multiple update requests for the - * same state being generated. 
- */ + PF_LOCK(); + for (i = 0; i < count; i++) { + sp = (struct pfsync_state *)(buf + mlen * i); + af1 = sp->key[0].af; + af2 = sp->key[1].af; - item = pool_get(&sc->sc_pool, PR_NOWAIT); - if (item == NULL) { - /* XXX stats */ - return; + /* check for invalid values */ + if (sp->timeout >= PFTM_MAX || + sp->src.state > PF_TCPS_PROXY_DST || + sp->dst.state > PF_TCPS_PROXY_DST || + sp->direction > PF_OUT || + (((af1 || af2) && + ((af1 != AF_INET && af1 != AF_INET6) || + (af2 != AF_INET && af2 != AF_INET6))) || + (sp->af != AF_INET && sp->af != AF_INET6))) { + pfsyncstat_inc(pfsyncs_badval); + continue; + } + + if (pf_state_import(sp, 0) == ENOMEM) { + /* drop out, but process the rest of the actions */ + break; + } } + PF_UNLOCK(); +} - item->ur_msg.id = id; - item->ur_msg.creatorid = creatorid; +static void +pfsync_in_iack(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) +{ + const struct pfsync_ins_ack *ia; + struct pf_state_cmp id_key; + struct pf_state *st; + unsigned int i; - for (;;) { - mtx_enter(&sc->sc_upd_req_mtx); + for (i = 0; i < count; i++) { + ia = (struct pfsync_ins_ack *)(buf + mlen * i); - nlen = sizeof(struct pfsync_upd_req); - if (TAILQ_EMPTY(&sc->sc_upd_req_list)) - nlen += sizeof(struct pfsync_subheader); - - sclen = atomic_add_long_nv(&sc->sc_len, nlen); - retry = (sclen > sc->sc_if.if_mtu); - if (retry) - atomic_sub_long(&sc->sc_len, nlen); - else - TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry); + id_key.id = ia->id; + id_key.creatorid = ia->creatorid; - mtx_leave(&sc->sc_upd_req_mtx); + PF_STATE_ENTER_READ(); + st = pf_find_state_byid(&id_key); + pf_state_ref(st); + PF_STATE_EXIT_READ(); + if (st == NULL) + continue; - if (!retry) - break; + if (READ_ONCE(st->sync_defer) != NULL) + pfsync_deferred(sc, st); - pfsync_sendout(); + pf_state_unref(st); } - - schednetisr(NETISR_PFSYNC); } -void -pfsync_update_state_req(struct pf_state *st) +static int +pfsync_upd_tcp(struct pf_state *st, const struct 
pfsync_state_peer *src, + const struct pfsync_state_peer *dst) { - struct pfsync_softc *sc = pfsyncif; + int sync = 0; - if (sc == NULL) - panic("pfsync_update_state_req: nonexistent instance"); + /* + * The state should never go backwards except + * for syn-proxy states. Neither should the + * sequence window slide backwards. + */ + if ((st->src.state > src->state && + (st->src.state < PF_TCPS_PROXY_SRC || + src->state >= PF_TCPS_PROXY_SRC)) || - if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { - if (st->sync_state != PFSYNC_S_NONE) - pfsync_q_del(st); - return; - } + (st->src.state == src->state && + SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))) + sync++; + else + pf_state_peer_ntoh(src, &st->src); - switch (st->sync_state) { - case PFSYNC_S_UPD_C: - case PFSYNC_S_IACK: - pfsync_q_del(st); - case PFSYNC_S_NONE: - pfsync_q_ins(st, PFSYNC_S_UPD); - schednetisr(NETISR_PFSYNC); - return; + if ((st->dst.state > dst->state) || - case PFSYNC_S_INS: - case PFSYNC_S_UPD: - case PFSYNC_S_DEL: - /* we're already handling it */ - return; + (st->dst.state >= TCPS_SYN_SENT && + SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))) + sync++; + else + pf_state_peer_ntoh(dst, &st->dst); - default: - panic("pfsync_update_state_req: unexpected sync state %d", - st->sync_state); - } + return (sync); } -void -pfsync_delete_state(struct pf_state *st) +static void +pfsync_in_updates(struct pfsync_softc *sc, struct pf_state *st, + const struct pfsync_state_peer *src, const struct pfsync_state_peer *dst, + uint8_t timeout) { - struct pfsync_softc *sc = pfsyncif; - - NET_ASSERT_LOCKED(); - - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) - return; + struct pf_state_scrub *sscrub = NULL; + struct pf_state_scrub *dscrub = NULL; + int sync; - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_deferred(st, 1); - if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { - if (st->sync_state != PFSYNC_S_NONE) - pfsync_q_del(st); - return; + if (src->scrub.scrub_flag && st->src.scrub == NULL) { + sscrub = 
pf_state_scrub_get(); + if (sscrub == NULL) { + /* inc error? */ + goto out; + } + } + if (dst->scrub.scrub_flag && st->dst.scrub == NULL) { + dscrub = pf_state_scrub_get(); + if (dscrub == NULL) { + /* inc error? */ + goto out; + } } - if (sc->sc_len == PFSYNC_MINPKT) - timeout_add_sec(&sc->sc_tmo, 1); + if (READ_ONCE(st->sync_defer) != NULL) + pfsync_deferred(sc, st); - switch (st->sync_state) { - case PFSYNC_S_INS: - /* we never got to tell the world so just forget about it */ - pfsync_q_del(st); - return; + mtx_enter(&st->mtx); - case PFSYNC_S_UPD_C: - case PFSYNC_S_UPD: - case PFSYNC_S_IACK: - pfsync_q_del(st); - /* - * FALLTHROUGH to putting it on the del list - * Note on reference count bookkeeping: - * pfsync_q_del() drops reference for queue - * ownership. But the st entry survives, because - * our caller still holds a reference. - */ + /* attach the scrub memory if needed */ + if (sscrub != NULL && st->src.scrub == NULL) { + st->src.scrub = sscrub; + sscrub = NULL; + } + if (dscrub != NULL && st->dst.scrub == NULL) { + st->dst.scrub = dscrub; + dscrub = NULL; + } + + if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) + sync = pfsync_upd_tcp(st, src, dst); + else { + sync = 0; - case PFSYNC_S_NONE: /* - * We either fall through here, or there is no reference to - * st owned by pfsync queues at this point. - * - * Calling pfsync_q_ins() puts st to del queue. The pfsync_q_ins() - * grabs a reference for delete queue. 
+ * Non-TCP protocol state machine always go + * forwards */ - pfsync_q_ins(st, PFSYNC_S_DEL); - return; + if (st->src.state > src->state) + sync++; + else + pf_state_peer_ntoh(src, &st->src); - default: - panic("pfsync_delete_state: unexpected sync state %d", - st->sync_state); + if (st->dst.state > dst->state) + sync++; + else + pf_state_peer_ntoh(dst, &st->dst); } -} - -void -pfsync_clear_states(u_int32_t creatorid, const char *ifname) -{ - struct pfsync_softc *sc = pfsyncif; - struct { - struct pfsync_subheader subh; - struct pfsync_clr clr; - } __packed r; - NET_ASSERT_LOCKED(); - - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) - return; - - bzero(&r, sizeof(r)); + st->pfsync_time = getuptime(); + if (sync < 2) { + st->expire = st->pfsync_time; + st->timeout = timeout; + } - r.subh.action = PFSYNC_ACT_CLR; - r.subh.len = sizeof(struct pfsync_clr) >> 2; - r.subh.count = htons(1); + mtx_leave(&st->mtx); - strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname)); - r.clr.creatorid = creatorid; + if (sync) { + pfsyncstat_inc(pfsyncs_stale); + pfsync_update_state(st); + } - pfsync_send_plus(&r, sizeof(r)); +out: + if (sscrub != NULL) + pf_state_scrub_put(sscrub); + if (dscrub != NULL) + pf_state_scrub_put(dscrub); } -void -pfsync_iack(struct pf_state *st) -{ - pfsync_q_ins(st, PFSYNC_S_IACK); - schednetisr(NETISR_PFSYNC); -} -void -pfsync_q_ins(struct pf_state *st, int q) +static void +pfsync_in_upd(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) { - struct pfsync_softc *sc = pfsyncif; - size_t nlen, sclen; + const struct pfsync_state *sp; + struct pf_state_cmp id_key; + struct pf_state *st; + int error; + unsigned int i; - if (sc->sc_len < PFSYNC_MINPKT) - panic("pfsync pkt len is too low %zd", sc->sc_len); - do { - mtx_enter(&sc->sc_st_mtx); - mtx_enter(&st->mtx); + for (i = 0; i < count; i++) { + sp = (struct pfsync_state *)(buf + mlen * i); - /* - * There are either two threads trying to update the - * the same 
state, or the state is just being processed - * (is on snapshot queue). - */ - if (st->sync_state != PFSYNC_S_NONE) { - mtx_leave(&st->mtx); - mtx_leave(&sc->sc_st_mtx); - break; + /* check for invalid values */ + if (sp->timeout >= PFTM_MAX || + sp->src.state > PF_TCPS_PROXY_DST || + sp->dst.state > PF_TCPS_PROXY_DST) { + pfsyncstat_inc(pfsyncs_badval); + continue; } - nlen = pfsync_qs[q].len; - - if (TAILQ_EMPTY(&sc->sc_qs[q])) - nlen += sizeof(struct pfsync_subheader); + id_key.id = sp->id; + id_key.creatorid = sp->creatorid; - sclen = atomic_add_long_nv(&sc->sc_len, nlen); - if (sclen > sc->sc_if.if_mtu) { - atomic_sub_long(&sc->sc_len, nlen); - mtx_leave(&st->mtx); - mtx_leave(&sc->sc_st_mtx); - pfsync_sendout(); + PF_STATE_ENTER_READ(); + st = pf_find_state_byid(&id_key); + pf_state_ref(st); + PF_STATE_EXIT_READ(); + if (st == NULL) { + /* insert the update */ + PF_LOCK(); + error = pf_state_import(sp, 0); + if (error) + pfsyncstat_inc(pfsyncs_badstate); + PF_UNLOCK(); continue; } - pf_state_ref(st); + pfsync_in_updates(sc, st, &sp->src, &sp->dst, sp->timeout); - TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list); - st->sync_state = q; - mtx_leave(&st->mtx); - mtx_leave(&sc->sc_st_mtx); - } while (0); + pf_state_unref(st); + } } -void -pfsync_q_del(struct pf_state *st) +static struct mbuf * +pfsync_upd_req_init(struct pfsync_softc *sc, unsigned int count) { - struct pfsync_softc *sc = pfsyncif; - int q; + struct mbuf *m; + unsigned int mlen; - mtx_enter(&sc->sc_st_mtx); - mtx_enter(&st->mtx); - q = st->sync_state; - /* - * re-check under mutex - * if state is snapped already, then just bail out, because we came - * too late, the state is being just processed/dispatched to peer. 
- */ - if ((q == PFSYNC_S_NONE) || (st->snapped)) { - mtx_leave(&st->mtx); - mtx_leave(&sc->sc_st_mtx); - return; + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) { + pfsyncstat_inc(pfsyncs_onomem); + return (NULL); } - atomic_sub_long(&sc->sc_len, pfsync_qs[q].len); - TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); - if (TAILQ_EMPTY(&sc->sc_qs[q])) - atomic_sub_long(&sc->sc_len, sizeof (struct pfsync_subheader)); - st->sync_state = PFSYNC_S_NONE; - mtx_leave(&st->mtx); - mtx_leave(&sc->sc_st_mtx); - pf_state_unref(st); + mlen = max_linkhdr + sizeof(sc->sc_template) + + sizeof(struct pfsync_header) + + sizeof(struct pfsync_subheader) + + sizeof(struct pfsync_upd_req) * count; + + if (mlen > MHLEN) { + MCLGETL(m, M_DONTWAIT, mlen); + if (!ISSET(m->m_flags, M_EXT)) { + m_freem(m); + return (NULL); + } + } + + m_align(m, 0); + m->m_len = 0; + + return (m); } -#if defined(IPSEC) -void -pfsync_update_tdb(struct tdb *t, int output) +static void +pfsync_in_upd_c(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) { - struct pfsync_softc *sc = pfsyncif; - size_t nlen, sclen; + const struct pfsync_upd_c *up; + struct pf_state_cmp id_key; + struct pf_state *st; + unsigned int i; + struct mbuf *m = NULL; + unsigned int rcount = 0; - if (sc == NULL) - return; + for (i = 0; i < count; i++) { + up = (struct pfsync_upd_c *)(buf + mlen * i); - if (!ISSET(t->tdb_flags, TDBF_PFSYNC)) { - do { - mtx_enter(&sc->sc_tdb_mtx); - nlen = sizeof(struct pfsync_tdb); - - mtx_enter(&t->tdb_mtx); - if (ISSET(t->tdb_flags, TDBF_PFSYNC)) { - /* we've lost race, no action for us then */ - mtx_leave(&t->tdb_mtx); - mtx_leave(&sc->sc_tdb_mtx); - break; - } + /* check for invalid values */ + if (up->timeout >= PFTM_MAX || + up->src.state > PF_TCPS_PROXY_DST || + up->dst.state > PF_TCPS_PROXY_DST) { + pfsyncstat_inc(pfsyncs_badval); + continue; + } - if (TAILQ_EMPTY(&sc->sc_tdb_q)) - nlen += sizeof(struct pfsync_subheader); + id_key.id = up->id; + id_key.creatorid = 
up->creatorid; - sclen = atomic_add_long_nv(&sc->sc_len, nlen); - if (sclen > sc->sc_if.if_mtu) { - atomic_sub_long(&sc->sc_len, nlen); - mtx_leave(&t->tdb_mtx); - mtx_leave(&sc->sc_tdb_mtx); - pfsync_sendout(); - continue; - } + PF_STATE_ENTER_READ(); + st = pf_find_state_byid(&id_key); + pf_state_ref(st); + PF_STATE_EXIT_READ(); + if (st == NULL) { + /* We don't have this state. Ask for it. */ + struct pfsync_upd_req *ur; - TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry); - tdb_ref(t); - SET(t->tdb_flags, TDBF_PFSYNC); - mtx_leave(&t->tdb_mtx); - - mtx_leave(&sc->sc_tdb_mtx); - t->tdb_updates = 0; - } while (0); - } else { - if (++t->tdb_updates >= sc->sc_maxupdates) - schednetisr(NETISR_PFSYNC); - } + if (m == NULL) { + m = pfsync_upd_req_init(sc, count); + if (m == NULL) { + pfsyncstat_inc(pfsyncs_onomem); + continue; + } + } - mtx_enter(&t->tdb_mtx); - if (output) - SET(t->tdb_flags, TDBF_PFSYNC_RPL); - else - CLR(t->tdb_flags, TDBF_PFSYNC_RPL); - mtx_leave(&t->tdb_mtx); -} -#endif + m = m_prepend(m, sizeof(*ur), M_DONTWAIT); + if (m == NULL) { + pfsyncstat_inc(pfsyncs_onomem); + continue; + } -#if defined(IPSEC) -void -pfsync_delete_tdb(struct tdb *t) -{ - struct pfsync_softc *sc = pfsyncif; - size_t nlen; + ur = mtod(m, struct pfsync_upd_req *); + ur->id = up->id; + ur->creatorid = up->creatorid; + rcount++; - if (sc == NULL || !ISSET(t->tdb_flags, TDBF_PFSYNC)) - return; + continue; + } - mtx_enter(&sc->sc_tdb_mtx); + pfsync_in_updates(sc, st, &up->src, &up->dst, up->timeout); - /* - * if tdb entry is just being processed (found in snapshot), - * then it can not be deleted. 
we just came too late - */ - if (ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)) { - mtx_leave(&sc->sc_tdb_mtx); - return; + pf_state_unref(st); } - TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry); - - mtx_enter(&t->tdb_mtx); - CLR(t->tdb_flags, TDBF_PFSYNC); - mtx_leave(&t->tdb_mtx); + if (m != NULL) { + struct pfsync_subheader *subh; - nlen = sizeof(struct pfsync_tdb); - if (TAILQ_EMPTY(&sc->sc_tdb_q)) - nlen += sizeof(struct pfsync_subheader); - atomic_sub_long(&sc->sc_len, nlen); - - mtx_leave(&sc->sc_tdb_mtx); + m = m_prepend(m, sizeof(*subh), M_DONTWAIT); + if (m == NULL) { + pfsyncstat_inc(pfsyncs_onomem); + return; + } - tdb_unref(t); -} -#endif + subh = mtod(m, struct pfsync_subheader *); + subh->action = PFSYNC_ACT_UPD_REQ; + subh->len = sizeof(struct pfsync_upd_req) >> 2; + subh->count = htons(rcount); -void -pfsync_out_tdb(struct tdb *t, void *buf) -{ - struct pfsync_tdb *ut = buf; + m = pfsync_encap(sc, m); + if (m == NULL) { + pfsyncstat_inc(pfsyncs_onomem); + return; + } - bzero(ut, sizeof(*ut)); - ut->spi = t->tdb_spi; - bcopy(&t->tdb_dst, &ut->dst, sizeof(ut->dst)); - /* - * When a failover happens, the master's rpl is probably above - * what we see here (we may be up to a second late), so - * increase it a bit for outbound tdbs to manage most such - * situations. - * - * For now, just add an offset that is likely to be larger - * than the number of packets we can see in one second. The RFC - * just says the next packet must have a higher seq value. - * - * XXX What is a good algorithm for this? We could use - * a rate-determined increase, but to know it, we would have - * to extend struct tdb. - * XXX pt->rpl can wrap over MAXINT, but if so the real tdb - * will soon be replaced anyway. For now, just don't handle - * this edge case. - */ -#define RPL_INCR 16384 - ut->rpl = htobe64(t->tdb_rpl + (ISSET(t->tdb_flags, TDBF_PFSYNC_RPL) ? 
- RPL_INCR : 0)); - ut->cur_bytes = htobe64(t->tdb_cur_bytes); - ut->sproto = t->tdb_sproto; - ut->rdomain = htons(t->tdb_rdomain); + pfsync_sendout(sc, m); + } } -void -pfsync_bulk_start(void) +static void +pfsync_in_ureq(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) { - struct pfsync_softc *sc = pfsyncif; - - NET_ASSERT_LOCKED(); - - /* - * pf gc via pfsync_state_in_use reads sc_bulk_next and - * sc_bulk_last while exclusively holding the pf_state_list - * rwlock. make sure it can't race with us setting these - * pointers. they basically act as hazards, and borrow the - * lists state reference count. - */ - rw_enter_read(&pf_state_list.pfs_rwl); + const struct pfsync_upd_req *ur; + struct pf_state_cmp id_key; + struct pf_state *st; + unsigned int i; - /* get a consistent view of the list pointers */ - mtx_enter(&pf_state_list.pfs_mtx); - if (sc->sc_bulk_next == NULL) - sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list); + for (i = 0; i < count; i++) { + ur = (struct pfsync_upd_req *)(buf + mlen * i); - sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue); - mtx_leave(&pf_state_list.pfs_mtx); + id_key.id = ur->id; + id_key.creatorid = ur->creatorid; - rw_exit_read(&pf_state_list.pfs_rwl); + if (id_key.id == 0 && id_key.creatorid == 0) { + pfsync_bulk_snd_start(sc); + continue; + } - DPFPRINTF(LOG_INFO, "received bulk update request"); + PF_STATE_ENTER_READ(); + st = pf_find_state_byid(&id_key); + if (st != NULL && st->timeout < PFTM_MAX && + !ISSET(st->state_flags, PFSTATE_NOSYNC)) + pf_state_ref(st); + else + st = NULL; + PF_STATE_EXIT_READ(); + if (st == NULL) { + pfsyncstat_inc(pfsyncs_badstate); + continue; + } - if (sc->sc_bulk_last == NULL) - pfsync_bulk_status(PFSYNC_BUS_END); - else { - sc->sc_ureq_received = getuptime(); + pfsync_update_state_req(sc, st); - pfsync_bulk_status(PFSYNC_BUS_START); - timeout_add(&sc->sc_bulk_tmo, 0); + pf_state_unref(st); } } -void -pfsync_bulk_update(void *arg) 
+static void +pfsync_in_del(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) { - struct pfsync_softc *sc; + const struct pfsync_state *sp; + struct pf_state_cmp id_key; struct pf_state *st; - int i = 0; - - NET_LOCK(); - sc = pfsyncif; - if (sc == NULL) - goto out; - - rw_enter_read(&pf_state_list.pfs_rwl); - st = sc->sc_bulk_next; - sc->sc_bulk_next = NULL; + unsigned int i; - if (st == NULL) { - rw_exit_read(&pf_state_list.pfs_rwl); - goto out; - } + PF_LOCK(); + PF_STATE_ENTER_WRITE(); + for (i = 0; i < count; i++) { + sp = (struct pfsync_state *)(buf + mlen * i); - for (;;) { - if (st->sync_state == PFSYNC_S_NONE && - st->timeout < PFTM_MAX && - st->pfsync_time <= sc->sc_ureq_received) { - pfsync_update_state_req(st); - i++; - } + id_key.id = sp->id; + id_key.creatorid = sp->creatorid; - st = TAILQ_NEXT(st, entry_list); - if ((st == NULL) || (st == sc->sc_bulk_last)) { - /* we're done */ - sc->sc_bulk_last = NULL; - pfsync_bulk_status(PFSYNC_BUS_END); - break; + st = pf_find_state_byid(&id_key); + if (st == NULL) { + pfsyncstat_inc(pfsyncs_badstate); + continue; } - if (i > 1 && (sc->sc_if.if_mtu - sc->sc_len) < - sizeof(struct pfsync_state)) { - /* we've filled a packet */ - sc->sc_bulk_next = st; - timeout_add(&sc->sc_bulk_tmo, 1); - break; - } + mtx_enter(&st->mtx); + SET(st->state_flags, PFSTATE_NOSYNC); + mtx_leave(&st->mtx); + pf_remove_state(st); } - - rw_exit_read(&pf_state_list.pfs_rwl); - out: - NET_UNLOCK(); + PF_STATE_EXIT_WRITE(); + PF_UNLOCK(); } -void -pfsync_bulk_status(u_int8_t status) +static void +pfsync_in_del_c(struct pfsync_softc *sc, + const caddr_t buf, unsigned int mlen, unsigned int count) { - struct { - struct pfsync_subheader subh; - struct pfsync_bus bus; - } __packed r; - - struct pfsync_softc *sc = pfsyncif; + const struct pfsync_del_c *sp; + struct pf_state_cmp id_key; + struct pf_state *st; + unsigned int i; - bzero(&r, sizeof(r)); + PF_LOCK(); + PF_STATE_ENTER_WRITE(); + for (i = 0; i < 
count; i++) { + sp = (struct pfsync_del_c *)(buf + mlen * i); - r.subh.action = PFSYNC_ACT_BUS; - r.subh.len = sizeof(struct pfsync_bus) >> 2; - r.subh.count = htons(1); + id_key.id = sp->id; + id_key.creatorid = sp->creatorid; - r.bus.creatorid = pf_status.hostid; - r.bus.endtime = htonl(getuptime() - sc->sc_ureq_received); - r.bus.status = status; + st = pf_find_state_byid(&id_key); + if (st == NULL) { + pfsyncstat_inc(pfsyncs_badstate); + continue; + } - pfsync_send_plus(&r, sizeof(r)); + mtx_enter(&st->mtx); + SET(st->state_flags, PFSTATE_NOSYNC); + mtx_leave(&st->mtx); + pf_remove_state(st); + } + PF_STATE_EXIT_WRITE(); + PF_UNLOCK(); } -void -pfsync_bulk_fail(void *arg) +static void +pfsync_in_bus(struct pfsync_softc *sc, + const caddr_t buf, unsigned int len, unsigned int count) { - struct pfsync_softc *sc; + const struct pfsync_bus *bus = (struct pfsync_bus *)buf; - NET_LOCK(); - sc = pfsyncif; - if (sc == NULL) - goto out; - if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) { - /* Try again */ - timeout_add_sec(&sc->sc_bulkfail_tmo, 5); - pfsync_request_update(0, 0); - } else { - /* Pretend like the transfer was ok */ - sc->sc_ureq_sent = 0; - sc->sc_bulk_tries = 0; -#if NCARP > 0 - if (!pfsync_sync_ok) - carp_group_demote_adj(&sc->sc_if, -1, - sc->sc_link_demoted ? - "pfsync link state up" : - "pfsync bulk fail"); - if (sc->sc_initial_bulk) { - carp_group_demote_adj(&sc->sc_if, -32, - "pfsync init"); - sc->sc_initial_bulk = 0; - } -#endif - pfsync_sync_ok = 1; - sc->sc_link_demoted = 0; - DPFPRINTF(LOG_ERR, "failed to receive bulk update"); + switch (bus->status) { + case PFSYNC_BUS_START: + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_START); + break; + + case PFSYNC_BUS_END: + pfsync_bulk_req_evt(sc, PFSYNC_BREQ_EVT_BUS_END); + break; } - out: - NET_UNLOCK(); } -void -pfsync_send_plus(void *plus, size_t pluslen) +#if defined(IPSEC) +/* Update an in-kernel tdb. Silently fail if no tdb is found. 
*/ +static void +pfsync_update_net_tdb(const struct pfsync_tdb *pt) { - struct pfsync_softc *sc = pfsyncif; + struct tdb *tdb; - if (sc->sc_len + pluslen > sc->sc_if.if_mtu) - pfsync_sendout(); + NET_ASSERT_LOCKED(); - sc->sc_plus = plus; - sc->sc_pluslen = pluslen; - atomic_add_long(&sc->sc_len, pluslen); + /* check for invalid values */ + if (ntohl(pt->spi) <= SPI_RESERVED_MAX || + (pt->dst.sa.sa_family != AF_INET && + pt->dst.sa.sa_family != AF_INET6)) + goto bad; - pfsync_sendout(); -} + tdb = gettdb(ntohs(pt->rdomain), pt->spi, + (union sockaddr_union *)&pt->dst, pt->sproto); + if (tdb) { + uint64_t rpl = betoh64(pt->rpl); + uint64_t cur_bytes = betoh64(pt->cur_bytes); -int -pfsync_is_up(void) -{ - struct pfsync_softc *sc = pfsyncif; + /* Neither replay nor byte counter should ever decrease. */ + mtx_enter(&tdb->tdb_mtx); + if (rpl >= tdb->tdb_rpl && + cur_bytes >= tdb->tdb_cur_bytes) { + tdb->tdb_rpl = rpl; + tdb->tdb_cur_bytes = cur_bytes; + } + mtx_leave(&tdb->tdb_mtx); - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) - return (0); + tdb_unref(tdb); + } + return; - return (1); + bad: + DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: " + "invalid value"); + pfsyncstat_inc(pfsyncs_badstate); + return; } +#endif -int -pfsync_state_in_use(struct pf_state *st) +static void +pfsync_in_tdb(struct pfsync_softc *sc, + const caddr_t buf, unsigned int len, unsigned int count) { - struct pfsync_softc *sc = pfsyncif; +#if defined(IPSEC) + const struct pfsync_tdb *tp; + unsigned int i; - if (sc == NULL) - return (0); + for (i = 0; i < count; i++) { + tp = (const struct pfsync_tdb *)(buf + len * i); + pfsync_update_net_tdb(tp); + } +#endif +} - rw_assert_wrlock(&pf_state_list.pfs_rwl); +int +pfsync_input4(struct mbuf **mp, int *offp, int proto, int af) +{ + struct mbuf *m = *mp; + struct ip *ip; - if (st->sync_state != PFSYNC_S_NONE || - st == sc->sc_bulk_next || - st == sc->sc_bulk_last) - return (1); + ip = mtod(m, struct ip *); - return (0); -} + 
m = pfsync_input(m, ip->ip_ttl, ip->ip_hl << 2); -void -pfsync_timeout(void *arg) -{ - NET_LOCK(); - pfsync_sendout(); - NET_UNLOCK(); -} + m_freem(m); + *mp = NULL; -/* this is a softnet/netisr handler */ -void -pfsyncintr(void) -{ - pfsync_sendout(); + return (IPPROTO_DONE); } int @@ -2651,8 +3327,8 @@ pfsync_sysctl_pfsyncstat(void *oldp, siz } int -pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, - size_t newlen) +pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { /* All sysctl names at this level are terminal. */ if (namelen != 1) Index: net/if_pfsync.h =================================================================== RCS file: /cvs/src/sys/net/if_pfsync.h,v retrieving revision 1.59 diff -u -p -r1.59 if_pfsync.h --- net/if_pfsync.h 11 Nov 2022 11:47:13 -0000 1.59 +++ net/if_pfsync.h 21 Jun 2023 01:46:13 -0000 @@ -177,7 +177,7 @@ struct pfsync_upd_c { struct pfsync_upd_req { u_int64_t id; u_int32_t creatorid; -} __packed; +} __packed __aligned(4); /* * DEL_C @@ -295,16 +295,6 @@ enum pfsync_counters { pfsyncs_ncounters, }; -extern struct cpumem *pfsynccounters; - -struct pfsync_deferral; - -static inline void -pfsyncstat_inc(enum pfsync_counters c) -{ - counters_inc(pfsynccounters, c); -} - /* * this shows where a pf state is with respect to the syncing. 
*/ @@ -315,10 +305,11 @@ pfsyncstat_inc(enum pfsync_counters c) #define PFSYNC_S_UPD 0x04 #define PFSYNC_S_COUNT 0x05 -#define PFSYNC_S_DEFER 0xfe -#define PFSYNC_S_NONE 0xff +#define PFSYNC_S_NONE 0xd0 +#define PFSYNC_S_SYNC 0xd1 +#define PFSYNC_S_DEAD 0xde -int pfsync_input(struct mbuf **, int *, int, int); +int pfsync_input4(struct mbuf **, int *, int, int); int pfsync_sysctl(int *, u_int, void *, size_t *, void *, size_t); @@ -329,6 +320,9 @@ int pfsync_state_import(struct pfsync_ void pfsync_state_export(struct pfsync_state *, struct pf_state *); +void pfsync_init_state(struct pf_state *, + const struct pf_state_key *, + const struct pf_state_key *, int); void pfsync_insert_state(struct pf_state *); void pfsync_update_state(struct pf_state *); void pfsync_delete_state(struct pf_state *); @@ -337,14 +331,10 @@ void pfsync_clear_states(u_int32_t, co void pfsync_update_tdb(struct tdb *, int); void pfsync_delete_tdb(struct tdb *); -int pfsync_defer(struct pf_state *, struct mbuf *, - struct pfsync_deferral **); -void pfsync_undefer(struct pfsync_deferral *, int); +int pfsync_defer(struct pf_state *, struct mbuf *); int pfsync_is_up(void); int pfsync_state_in_use(struct pf_state *); - -void pfsync_iack(struct pf_state *); #endif /* _KERNEL */ #endif /* _NET_IF_PFSYNC_H_ */ Index: net/netisr.h =================================================================== RCS file: /cvs/src/sys/net/netisr.h,v retrieving revision 1.60 diff -u -p -r1.60 netisr.h --- net/netisr.h 14 Jul 2022 10:52:21 -0000 1.60 +++ net/netisr.h 21 Jun 2023 01:46:13 -0000 @@ -42,7 +42,6 @@ * on the lowest level routine of each protocol. 
*/ #define NETISR_IP 2 /* same as AF_INET */ -#define NETISR_PFSYNC 5 /* for pfsync "immediate" tx */ #define NETISR_ARP 18 /* same as AF_LINK */ #define NETISR_IPV6 24 /* same as AF_INET6 */ #define NETISR_PIPEX 27 /* for pipex processing */ @@ -64,7 +63,6 @@ void ipintr(void); void ip6intr(void); void pppintr(void); void bridgeintr(void); -void pfsyncintr(void); void pipexintr(void); void pppoeintr(void); Index: net/pf.c =================================================================== RCS file: /cvs/src/sys/net/pf.c,v retrieving revision 1.1181 diff -u -p -r1.1181 pf.c --- net/pf.c 5 Jun 2023 08:37:27 -0000 1.1181 +++ net/pf.c 21 Jun 2023 01:46:13 -0000 @@ -100,8 +100,6 @@ #if NPFSYNC > 0 #include -#else -struct pfsync_deferral; #endif /* NPFSYNC > 0 */ /* @@ -121,10 +119,6 @@ u_char pf_tcp_secret[16]; int pf_tcp_secret_init; int pf_tcp_iss_off; -int pf_npurge; -struct task pf_purge_task = TASK_INITIALIZER(pf_purge, &pf_npurge); -struct timeout pf_purge_to = TIMEOUT_INITIALIZER(pf_purge_timeout, NULL); - enum pf_test_status { PF_TEST_FAIL = -1, PF_TEST_OK, @@ -190,8 +184,7 @@ void pf_rule_to_actions(struct pf_rul struct pf_rule_actions *); int pf_test_rule(struct pf_pdesc *, struct pf_rule **, struct pf_state **, struct pf_rule **, - struct pf_ruleset **, u_short *, - struct pfsync_deferral **); + struct pf_ruleset **, u_short *); static __inline int pf_create_state(struct pf_pdesc *, struct pf_rule *, struct pf_rule *, struct pf_rule *, struct pf_state_key **, struct pf_state_key **, @@ -250,6 +243,10 @@ void pf_counters_inc(int, struct pf_p struct pf_state *, struct pf_rule *, struct pf_rule *); +int pf_state_insert(struct pfi_kif *, + struct pf_state_key **, struct pf_state_key **, + struct pf_state *); + int pf_state_key_isvalid(struct pf_state_key *); struct pf_state_key *pf_state_key_ref(struct pf_state_key *); void pf_state_key_unref(struct pf_state_key *); @@ -1064,10 +1061,11 @@ pf_state_insert(struct pfi_kif *kif, str 
pf_status.fcounters[FCNT_STATE_INSERT]++; pf_status.states++; pfi_kif_ref(kif, PFI_KIF_REF_STATE); + PF_STATE_EXIT_WRITE(); + #if NPFSYNC > 0 pfsync_insert_state(st); #endif /* NPFSYNC > 0 */ - PF_STATE_EXIT_WRITE(); *skwp = skw; *sksp = sks; @@ -1318,6 +1316,8 @@ pf_state_export(struct pfsync_state *sp, #endif /* NPFLOG > 0 */ sp->timeout = st->timeout; sp->state_flags = htons(st->state_flags); + if (READ_ONCE(st->sync_defer) != NULL) /* XXX */ + sp->state_flags |= htons(PFSTATE_ACK); if (!SLIST_EMPTY(&st->src_nodes)) sp->sync_flags |= PFSYNC_FLAG_SRCNODE; @@ -1519,9 +1519,6 @@ pf_state_import(const struct pfsync_stat st->rule.ptr = r; st->anchor.ptr = NULL; - st->pfsync_time = getuptime(); - st->sync_state = PFSYNC_S_NONE; - PF_REF_INIT(st->refcnt); mtx_init(&st->mtx, IPL_NET); @@ -1529,15 +1526,12 @@ pf_state_import(const struct pfsync_stat r->states_cur++; r->states_tot++; + st->sync_state = PFSYNC_S_NONE; + st->pfsync_time = getuptime(); #if NPFSYNC > 0 - if (!ISSET(flags, PFSYNC_SI_IOCTL)) - SET(st->state_flags, PFSTATE_NOSYNC); + pfsync_init_state(st, skw, sks, flags); #endif - /* - * We just set PFSTATE_NOSYNC bit, which prevents - * pfsync_insert_state() to insert state to pfsync. 
- */ if (pf_state_insert(kif, &skw, &sks, st) != 0) { /* XXX when we have anchors, use STATE_DEC_COUNTERS */ r->states_cur--; @@ -1545,15 +1539,6 @@ pf_state_import(const struct pfsync_stat goto cleanup_state; } -#if NPFSYNC > 0 - if (!ISSET(flags, PFSYNC_SI_IOCTL)) { - CLR(st->state_flags, PFSTATE_NOSYNC); - if (ISSET(st->state_flags, PFSTATE_ACK)) - pfsync_iack(st); - } - CLR(st->state_flags, PFSTATE_ACK); -#endif - return (0); cleanup: @@ -1576,47 +1561,106 @@ pf_state_import(const struct pfsync_stat /* END state table stuff */ +void pf_purge_states(void *); +struct task pf_purge_states_task = + TASK_INITIALIZER(pf_purge_states, NULL); + +void pf_purge_states_tick(void *); +struct timeout pf_purge_states_to = + TIMEOUT_INITIALIZER(pf_purge_states_tick, NULL); + +unsigned int pf_purge_expired_states(unsigned int, unsigned int); + +/* + * how many states to scan this interval. + * + * this is set when the timeout fires, and reduced by the task. the + * task will reschedule itself until the limit is reduced to zero, + * and then it adds the timeout again. + */ +unsigned int pf_purge_states_limit; + +/* + * limit how many states are processed with locks held per run of + * the state purge task. 
+ */ +unsigned int pf_purge_states_collect = 64; + + void +pf_purge_states_tick(void *null) + { + unsigned int limit = pf_status.states; + unsigned int interval = pf_default_rule.timeout[PFTM_INTERVAL]; + + if (limit == 0) { + timeout_add_sec(&pf_purge_states_to, 1); + return; + } + + /* + * process a fraction of the state table every second + */ + + if (interval > 1) + limit /= interval; + + pf_purge_states_limit = limit; + task_add(systqmp, &pf_purge_states_task); +} + void -pf_purge_timeout(void *unused) +pf_purge_states(void *null) { - /* XXX move to systqmp to avoid KERNEL_LOCK */ - task_add(systq, &pf_purge_task); + unsigned int limit; + unsigned int scanned; + + limit = pf_purge_states_limit; + if (limit < pf_purge_states_collect) + limit = pf_purge_states_collect; + + scanned = pf_purge_expired_states(limit, pf_purge_states_collect); + if (scanned >= pf_purge_states_limit) { + /* we've run out of states to scan this "interval" */ + timeout_add_sec(&pf_purge_states_to, 1); + return; + } + + pf_purge_states_limit -= scanned; + task_add(systqmp, &pf_purge_states_task); } +void pf_purge_tick(void *); +struct timeout pf_purge_to = + TIMEOUT_INITIALIZER(pf_purge_tick, NULL); + +void pf_purge(void *); +struct task pf_purge_task = + TASK_INITIALIZER(pf_purge, NULL); + void -pf_purge(void *xnloops) +pf_purge_tick(void *null) { - int *nloops = xnloops; - - /* - * process a fraction of the state table every second - * Note: - * we no longer need PF_LOCK() here, because - * pf_purge_expired_states() uses pf_state_lock to maintain - * consistency. 
- */ - if (pf_default_rule.timeout[PFTM_INTERVAL] > 0) - pf_purge_expired_states(1 + (pf_status.states - / pf_default_rule.timeout[PFTM_INTERVAL])); + task_add(systqmp, &pf_purge_task); +} - NET_LOCK(); +void +pf_purge(void *null) +{ + unsigned int interval = max(1, pf_default_rule.timeout[PFTM_INTERVAL]); PF_LOCK(); - /* purge other expired types every PFTM_INTERVAL seconds */ - if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) - pf_purge_expired_src_nodes(); - PF_UNLOCK(); + pf_purge_expired_src_nodes(); + + PF_UNLOCK(); + /* * Fragments don't require PF_LOCK(), they use their own lock. */ - if ((*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) { - pf_purge_expired_fragments(); - *nloops = 0; - } - NET_UNLOCK(); - - timeout_add_sec(&pf_purge_to, 1); + pf_purge_expired_fragments(); + + /* interpret the interval as idle time between runs */ + timeout_add_sec(&pf_purge_to, interval); } int32_t @@ -1717,6 +1761,8 @@ pf_remove_state(struct pf_state *st) if (st->timeout == PFTM_UNLINKED) return; + st->timeout = PFTM_UNLINKED; + /* handle load balancing related tasks */ pf_postprocess_addr(st); @@ -1741,7 +1787,6 @@ pf_remove_state(struct pf_state *st) #if NPFSYNC > 0 pfsync_delete_state(st); #endif /* NPFSYNC > 0 */ - st->timeout = PFTM_UNLINKED; pf_src_tree_remove_state(st); pf_detach_state(st); } @@ -1795,6 +1840,7 @@ pf_free_state(struct pf_state *st) if (pfsync_state_in_use(st)) return; #endif /* NPFSYNC > 0 */ + KASSERT(st->timeout == PFTM_UNLINKED); if (--st->rule.ptr->states_cur == 0 && st->rule.ptr->src_nodes == 0) @@ -1819,8 +1865,8 @@ pf_free_state(struct pf_state *st) pf_status.states--; } -void -pf_purge_expired_states(u_int32_t maxcheck) +unsigned int +pf_purge_expired_states(const unsigned int limit, const unsigned int collect) { /* * this task/thread/context/whatever is the only thing that @@ -1834,6 +1880,8 @@ pf_purge_expired_states(u_int32_t maxche struct pf_state *st; SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl); 
time_t now; + unsigned int scanned; + unsigned int collected = 0; PF_ASSERT_UNLOCKED(); @@ -1847,7 +1895,7 @@ pf_purge_expired_states(u_int32_t maxche if (head == NULL) { /* the list is empty */ rw_exit_read(&pf_state_list.pfs_rwl); - return; + return (limit); } /* (re)start at the front of the list */ @@ -1856,13 +1904,17 @@ pf_purge_expired_states(u_int32_t maxche now = getuptime(); - do { + for (scanned = 0; scanned < limit; scanned++) { uint8_t stimeout = cur->timeout; + unsigned int limited = 0; if ((stimeout == PFTM_UNLINKED) || (pf_state_expires(cur, stimeout) <= now)) { st = pf_state_ref(cur); SLIST_INSERT_HEAD(&gcl, st, gc_list); + + if (++collected >= collect) + limited = 1; } /* don't iterate past the end of our view of the list */ @@ -1872,14 +1924,18 @@ pf_purge_expired_states(u_int32_t maxche } cur = TAILQ_NEXT(cur, entry_list); - } while (maxcheck--); + + /* don't spend too much time here. */ + if (ISSET(READ_ONCE(curcpu()->ci_schedstate.spc_schedflags), + SPCF_SHOULDYIELD) || limited) + break; + } rw_exit_read(&pf_state_list.pfs_rwl); if (SLIST_EMPTY(&gcl)) - return; + return (scanned); - NET_LOCK(); rw_enter_write(&pf_state_list.pfs_rwl); PF_LOCK(); PF_STATE_ENTER_WRITE(); @@ -1892,12 +1948,13 @@ pf_purge_expired_states(u_int32_t maxche PF_STATE_EXIT_WRITE(); PF_UNLOCK(); rw_exit_write(&pf_state_list.pfs_rwl); - NET_UNLOCK(); while ((st = SLIST_FIRST(&gcl)) != NULL) { SLIST_REMOVE_HEAD(&gcl, gc_list); pf_state_unref(st); } + + return (scanned); } int @@ -4262,8 +4319,7 @@ next_rule: int pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, struct pf_state **sm, - struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason, - struct pfsync_deferral **pdeferral) + struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason) { struct pf_rule *r = NULL; struct pf_rule *a = NULL; @@ -4475,7 +4531,7 @@ pf_test_rule(struct pf_pdesc *pd, struct * firewall has to know about it to allow * replies through it. 
*/ - if (pfsync_defer(*sm, pd->m, pdeferral)) + if (pfsync_defer(*sm, pd->m)) return (PF_DEFER); } #endif /* NPFSYNC > 0 */ @@ -4517,6 +4573,8 @@ pf_create_state(struct pf_pdesc *pd, str st->state_flags |= PFSTATE_SLOPPY; if (r->rule_flag & PFRULE_PFLOW) st->state_flags |= PFSTATE_PFLOW; + if (r->rule_flag & PFRULE_NOSYNC) + st->state_flags |= PFSTATE_NOSYNC; #if NPFLOG > 0 st->log = act->log & PF_LOG_ALL; #endif /* NPFLOG > 0 */ @@ -4535,6 +4593,7 @@ pf_create_state(struct pf_pdesc *pd, str st->set_prio[1] = act->set_prio[1]; st->delay = act->delay; SLIST_INIT(&st->src_nodes); + /* * must initialize refcnt, before pf_state_insert() gets called. * pf_state_inserts() grabs reference for pfsync! @@ -7462,7 +7521,6 @@ pf_test(sa_family_t af, int fwdir, struc int dir = (fwdir == PF_FWD) ? PF_OUT : fwdir; u_int32_t qid, pqid = 0; int have_pf_lock = 0; - struct pfsync_deferral *deferral = NULL; if (!pf_status.running) return (PF_PASS); @@ -7565,8 +7623,7 @@ pf_test(sa_family_t af, int fwdir, struc */ PF_LOCK(); have_pf_lock = 1; - action = pf_test_rule(&pd, &r, &st, &a, &ruleset, &reason, - &deferral); + action = pf_test_rule(&pd, &r, &st, &a, &ruleset, &reason); st = pf_state_ref(st); if (action != PF_PASS) REASON_SET(&reason, PFRES_FRAG); @@ -7598,7 +7655,7 @@ pf_test(sa_family_t af, int fwdir, struc PF_LOCK(); have_pf_lock = 1; action = pf_test_rule(&pd, &r, &st, &a, &ruleset, - &reason, &deferral); + &reason); st = pf_state_ref(st); } break; @@ -7630,7 +7687,7 @@ pf_test(sa_family_t af, int fwdir, struc PF_LOCK(); have_pf_lock = 1; action = pf_test_rule(&pd, &r, &st, &a, &ruleset, - &reason, &deferral); + &reason); st = pf_state_ref(st); } break; @@ -7714,7 +7771,7 @@ pf_test(sa_family_t af, int fwdir, struc PF_LOCK(); have_pf_lock = 1; action = pf_test_rule(&pd, &r, &st, &a, &ruleset, - &reason, &deferral); + &reason); st = pf_state_ref(st); } @@ -7854,14 +7911,6 @@ done: m_freem(pd.m); /* FALLTHROUGH */ case PF_DEFER: -#if NPFSYNC > 0 - /* - * We no longer hold 
PF_LOCK() here, so we can dispatch - * deferral if we are asked to do so. - */ - if (deferral != NULL) - pfsync_undefer(deferral, 0); -#endif /* NPFSYNC > 0 */ pd.m = NULL; action = PF_PASS; break; @@ -8210,7 +8259,7 @@ pf_state_unref(struct pf_state *st) #if NPFSYNC > 0 KASSERT((TAILQ_NEXT(st, sync_list) == NULL) || ((TAILQ_NEXT(st, sync_list) == _Q_INVALID) && - (st->sync_state == PFSYNC_S_NONE))); + (st->sync_state >= PFSYNC_S_NONE))); #endif /* NPFSYNC */ KASSERT((TAILQ_NEXT(st, entry_list) == NULL) || (TAILQ_NEXT(st, entry_list) == _Q_INVALID)); Index: net/pf_ioctl.c =================================================================== RCS file: /cvs/src/sys/net/pf_ioctl.c,v retrieving revision 1.405 diff -u -p -r1.405 pf_ioctl.c --- net/pf_ioctl.c 26 May 2023 12:13:26 -0000 1.405 +++ net/pf_ioctl.c 21 Jun 2023 01:46:13 -0000 @@ -1186,6 +1186,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a pf_status.stateid = gettime(); pf_status.stateid = pf_status.stateid << 32; } + timeout_add_sec(&pf_purge_states_to, 1); timeout_add_sec(&pf_purge_to, 1); pf_create_queues(); DPFPRINTF(LOG_NOTICE, "pf: started"); @@ -2771,8 +2772,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a pf_default_rule.timeout[i] = pf_default_rule_new.timeout[i]; - if (pf_default_rule.timeout[i] == PFTM_INTERVAL && - pf_default_rule.timeout[i] < old) - task_add(net_tq(0), &pf_purge_task); + if (i == PFTM_INTERVAL && + pf_default_rule.timeout[i] < old && + timeout_del(&pf_purge_to)) + task_add(systqmp, &pf_purge_task); } pfi_xcommit(); pf_trans_set_commit(); Index: net/pf_norm.c =================================================================== RCS file: /cvs/src/sys/net/pf_norm.c,v retrieving revision 1.227 diff -u -p -r1.227 pf_norm.c --- net/pf_norm.c 7 May 2023 16:23:23 -0000 1.227 +++ net/pf_norm.c 21 Jun 2023 01:46:13 -0000 @@ -1098,10 +1098,22 @@ no_fragment: } #endif /* INET6 */ +struct pf_state_scrub * +pf_state_scrub_get(void) +{ + return (pool_get(&pf_state_scrub_pl, PR_NOWAIT | PR_ZERO)); +} + +void
+pf_state_scrub_put(struct pf_state_scrub *scrub) +{ + pool_put(&pf_state_scrub_pl, scrub); +} + int pf_normalize_tcp_alloc(struct pf_state_peer *src) { - src->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT | PR_ZERO); + src->scrub = pf_state_scrub_get(); if (src->scrub == NULL) return (ENOMEM); Index: net/pfvar.h =================================================================== RCS file: /cvs/src/sys/net/pfvar.h,v retrieving revision 1.531 diff -u -p -r1.531 pfvar.h --- net/pfvar.h 26 May 2023 12:13:26 -0000 1.531 +++ net/pfvar.h 21 Jun 2023 01:46:13 -0000 @@ -1603,15 +1603,10 @@ extern void pf_tbladdr_remove(struct extern void pf_tbladdr_copyout(struct pf_addr_wrap *); extern void pf_calc_skip_steps(struct pf_rulequeue *); extern void pf_purge_expired_src_nodes(void); -extern void pf_purge_expired_states(u_int32_t); extern void pf_purge_expired_rules(void); extern void pf_remove_state(struct pf_state *); extern void pf_remove_divert_state(struct pf_state_key *); extern void pf_free_state(struct pf_state *); -extern int pf_state_insert(struct pfi_kif *, - struct pf_state_key **, - struct pf_state_key **, - struct pf_state *); int pf_insert_src_node(struct pf_src_node **, struct pf_rule *, enum pf_sn_types, sa_family_t, struct pf_addr *, @@ -1674,6 +1669,10 @@ int pf_match(u_int8_t, u_int32_t, u_int3 int pf_match_port(u_int8_t, u_int16_t, u_int16_t, u_int16_t); int pf_match_uid(u_int8_t, uid_t, uid_t, uid_t); int pf_match_gid(u_int8_t, gid_t, gid_t, gid_t); + +struct pf_state_scrub * + pf_state_scrub_get(void); +void pf_state_scrub_put(struct pf_state_scrub *); int pf_refragment6(struct mbuf **, struct m_tag *mtag, struct sockaddr_in6 *, struct ifnet *, struct rtentry *); Index: net/pfvar_priv.h =================================================================== RCS file: /cvs/src/sys/net/pfvar_priv.h,v retrieving revision 1.33 diff -u -p -r1.33 pfvar_priv.h --- net/pfvar_priv.h 10 May 2023 22:42:51 -0000 1.33 +++ net/pfvar_priv.h 21 Jun 2023 01:46:13 -0000 @@ 
-41,6 +41,12 @@ #include #include +struct pfsync_deferral; + +/* + * pf state items - links from pf_state_key to pf_states + */ + struct pf_state_item { TAILQ_ENTRY(pf_state_item) si_entry; @@ -49,6 +55,10 @@ struct pf_state_item { TAILQ_HEAD(pf_statelisthead, pf_state_item); +/* + * pf state keys - look up states by address + */ + struct pf_state_key { struct pf_addr addr[2]; u_int16_t port[2]; @@ -73,11 +83,13 @@ RBT_PROTOTYPE(pf_state_tree, pf_state_ke (key[PF_SK_WIRE]->af != (family))) /* + * pf state + * * Protection/ownership of pf_state members: * I immutable after pf_state_insert() * M pf_state mtx * P PF_STATE_LOCK - * S pfsync mutex + * S pfsync * L pf_state_list * g pf_purge gc */ @@ -89,7 +101,7 @@ struct pf_state { u_int8_t pad[3]; TAILQ_ENTRY(pf_state) sync_list; /* [S] */ - TAILQ_ENTRY(pf_state) sync_snap; /* [S] */ + struct pfsync_deferral *sync_defer; /* [S] */ TAILQ_ENTRY(pf_state) entry_list; /* [L] */ SLIST_ENTRY(pf_state) gc_list; /* [g] */ RB_ENTRY(pf_state) entry_id; /* [P] */ @@ -101,7 +113,7 @@ struct pf_state { union pf_rule_ptr natrule; /* [I] */ struct pf_addr rt_addr; /* [I] */ struct pf_sn_head src_nodes; /* [I] */ - struct pf_state_key *key[2]; /* [I] stack and wire */ + struct pf_state_key *key[2]; /* [I] stack and wire */ struct pfi_kif *kif; /* [I] */ struct mutex mtx; pf_refcnt_t refcnt; @@ -109,16 +121,16 @@ struct pf_state { u_int64_t bytes[2]; int32_t creation; /* [I] */ int32_t expire; - int32_t pfsync_time; - int rtableid[2]; /* [I] rtables stack and wire */ + int32_t pfsync_time; /* [S] */ + int rtableid[2]; /* [I] stack and wire */ u_int16_t qid; /* [I] */ u_int16_t pqid; /* [I] */ u_int16_t tag; /* [I] */ - u_int16_t state_flags; + u_int16_t state_flags; /* [M] */ u_int8_t log; /* [I] */ u_int8_t timeout; - u_int8_t sync_state; /* PFSYNC_S_x */ - u_int8_t sync_updates; + u_int8_t sync_state; /* [S] PFSYNC_S_x */ + u_int8_t sync_updates; /* [S] */ u_int8_t min_ttl; /* [I] */ u_int8_t set_tos; /* [I] */ u_int8_t set_prio[2]; 
/* [I] */ @@ -127,7 +139,6 @@ struct pf_state { u_int16_t if_index_out; /* [I] */ u_int16_t delay; /* [I] */ u_int8_t rt; /* [I] */ - u_int8_t snapped; /* [S] */ }; RBT_HEAD(pf_state_tree_id, pf_state); @@ -345,6 +356,7 @@ struct pf_trans { #define pftgr_anchor u.u_getrule.gr_anchor #define pftgr_rule u.u_getrule.gr_rule +extern struct timeout pf_purge_states_to; extern struct task pf_purge_task; extern struct timeout pf_purge_to; @@ -396,9 +408,6 @@ extern struct rwlock pf_state_lock; splassert_fail(RW_WRITE, \ rw_status(&pf_state_lock), __func__);\ } while (0) - -extern void pf_purge_timeout(void *); -extern void pf_purge(void *); /* for copies to/from network byte order */ void pf_state_peer_hton(const struct pf_state_peer *, Index: netinet/in_proto.c =================================================================== RCS file: /cvs/src/sys/netinet/in_proto.c,v retrieving revision 1.101 diff -u -p -r1.101 in_proto.c --- netinet/in_proto.c 18 May 2023 09:59:43 -0000 1.101 +++ netinet/in_proto.c 21 Jun 2023 01:46:13 -0000 @@ -343,7 +343,7 @@ const struct protosw inetsw[] = { .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PFSYNC, .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = pfsync_input, + .pr_input = pfsync_input4, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs, .pr_sysctl = pfsync_sysctl Index: netinet/ip_ipsp.h =================================================================== RCS file: /cvs/src/sys/netinet/ip_ipsp.h,v retrieving revision 1.240 diff -u -p -r1.240 ip_ipsp.h --- netinet/ip_ipsp.h 14 Jul 2022 13:52:10 -0000 1.240 +++ netinet/ip_ipsp.h 21 Jun 2023 01:46:13 -0000 @@ -50,6 +50,7 @@ * P ipo_tdb_mtx link policy to TDB global mutex * D tdb_sadb_mtx SA database global mutex * m tdb_mtx fields of struct tdb + * S pfsync fields of struct tdb */ /* IPSP global definitions. 
*/ @@ -405,7 +406,6 @@ struct tdb { /* tunnel descriptor blo u_int8_t tdb_sproto; /* [I] IPsec protocol */ u_int8_t tdb_wnd; /* Replay window */ u_int8_t tdb_satype; /* SA type (RFC2367, PF_KEY) */ - u_int8_t tdb_updates; /* pfsync update counter */ union sockaddr_union tdb_dst; /* [N] Destination address */ union sockaddr_union tdb_src; /* [N] Source address */ @@ -439,8 +439,8 @@ struct tdb { /* tunnel descriptor blo struct sockaddr_encap tdb_filtermask; /* And the mask */ TAILQ_HEAD(tdb_policy_head, ipsec_policy) tdb_policy_head; /* [P] */ - TAILQ_ENTRY(tdb) tdb_sync_entry; - TAILQ_ENTRY(tdb) tdb_sync_snap; + TAILQ_ENTRY(tdb) tdb_sync_entry; /* [S] pfsync tdb queue */ + u_int32_t tdb_updates; /* [S] pfsync update counter */ }; enum tdb_counters {