Index: art.c
===================================================================
RCS file: /cvs/src/sys/net/art.c,v
retrieving revision 1.29
diff -u -p -r1.29 art.c
--- art.c	12 Nov 2020 15:25:28 -0000	1.29
+++ art.c	9 Mar 2021 04:28:41 -0000
@@ -217,58 +217,33 @@ art_findex(struct art_table *at, uint8_t
 struct art_node *
-art_match(struct art_root *ar, void *addr, struct srp_ref *nsr)
+art_match(struct art_root *ar, void *addr)
 {
-	struct srp_ref			 dsr, ndsr;
 	void				*entry;
 	struct art_table		*at;
-	struct art_node			*dflt, *ndflt;
+	struct art_node			*dflt = NULL, *ndflt;
 	int				 j;
 
-	entry = srp_enter(nsr, &ar->ar_root);
-	at = entry;
-
-	if (at == NULL)
-		goto done;
-
-	/*
-	 * Remember the default route of each table we visit in case
-	 * we do not find a better matching route.
-	 */
-	dflt = srp_enter(&dsr, &at->at_default);
-
 	/*
 	 * Iterate until we find a leaf.
 	 */
-	while (1) {
+	for (at = SMR_PTR_GET(&ar->ar_root); at != NULL; at = SUBTABLE(entry)) {
 		/* Do a single level route lookup. */
 		j = art_findex(at, addr);
-		entry = srp_follow(nsr, &at->at_heap[j].node);
+		entry = SMR_PTR_GET(&at->at_heap[j].entry);
 
 		/* If this is a leaf (NULL is a leaf) we're done. */
 		if (ISLEAF(entry))
-			break;
+			return (entry);
 
-		at = SUBTABLE(entry);
-
-		ndflt = srp_enter(&ndsr, &at->at_default);
-		if (ndflt != NULL) {
-			srp_leave(&dsr);
-			dsr = ndsr;
+		/*
+		 * Remember the default route of this table in case
+		 * we do not find a better matching route.
+		 */
+		ndflt = SMR_PTR_GET(&at->at_default);
+		if (ndflt != NULL)
 			dflt = ndflt;
-		} else
-			srp_leave(&ndsr);
 	}
 
-	if (entry == NULL) {
-		srp_leave(nsr);
-		*nsr = dsr;
-		KASSERT(ISLEAF(dflt));
-		return (dflt);
-	}
-
-	srp_leave(&dsr);
-done:
-	KASSERT(ISLEAF(entry));
-	return (entry);
+	return (dflt);
 }
 
 /*
@@ -278,7 +253,7 @@ done:
 * it does not exist.
*/ struct art_node * -art_lookup(struct art_root *ar, void *addr, int plen, struct srp_ref *nsr) +art_lookup(struct art_root *ar, void *addr, int plen) { void *entry; struct art_table *at; @@ -286,17 +261,13 @@ art_lookup(struct art_root *ar, void *ad KASSERT(plen >= 0 && plen <= ar->ar_alen); - entry = srp_enter(nsr, &ar->ar_root); - at = entry; - + at = SMR_PTR_GET(ar->ar_root); if (at == NULL) - goto done; + return (NULL); /* Default route */ - if (plen == 0) { - entry = srp_follow(nsr, &at->at_default); - goto done; - } + if (plen == 0) + return (SMR_PTR_GET(at->at_default)); /* * If the prefix length is smaller than the sum of @@ -306,7 +277,7 @@ art_lookup(struct art_root *ar, void *ad while (plen > (at->at_offset + at->at_bits)) { /* Do a single level route lookup. */ j = art_findex(at, addr); - entry = srp_follow(nsr, &at->at_heap[j].node); + entry = SMR_PTR_GET(at->at_heap[j].entry); /* A leaf is a match, but not a perfect one, or NULL */ if (ISLEAF(entry)) @@ -319,16 +290,15 @@ art_lookup(struct art_root *ar, void *ad if (i == -1) return (NULL); - entry = srp_follow(nsr, &at->at_heap[i].node); - if (!ISLEAF(entry)) - entry = srp_follow(nsr, &SUBTABLE(entry)->at_default); + entry = SMR_PTR_GET(&at->at_heap[i].entry); + if (!ISLEAF(entry)) { + at = SUBTABLE(entry); + entry = SMR_PTR_GET(at->at_default); + } -done: - KASSERT(ISLEAF(entry)); return (entry); } - /* * Insertion API function. 
* @@ -339,29 +309,29 @@ struct art_node * art_insert(struct art_root *ar, struct art_node *an, void *addr, int plen) { struct art_table *at, *child; - struct art_node *node; + void *entry; int i, j; rw_assert_wrlock(&ar->ar_lock); KASSERT(plen >= 0 && plen <= ar->ar_alen); - at = srp_get_locked(&ar->ar_root); + at = SMR_PTR_GET_LOCKED(ar->ar_root); if (at == NULL) { at = art_table_get(ar, NULL, -1); if (at == NULL) return (NULL); - srp_swap_locked(&ar->ar_root, at); + SMR_PTR_SET_LOCKED(&ar->ar_root, at); } /* Default route */ if (plen == 0) { - node = srp_get_locked(&at->at_default); - if (node != NULL) - return (node); + entry = SMR_PTR_GET_LOCKED(at->at_default); + if (entry != NULL) + return (entry); art_table_ref(ar, at); - srp_swap_locked(&at->at_default, an); + SMR_PTR_SET_LOCKED(&at->at_default, an); return (an); } @@ -373,7 +343,7 @@ art_insert(struct art_root *ar, struct a while (plen > (at->at_offset + at->at_bits)) { /* Do a single level route lookup. */ j = art_findex(at, addr); - node = srp_get_locked(&at->at_heap[j].node); + entry = SMR_PTR_GET_LOCKED(at->at_heap[j].entry); /* * If the node corresponding to the fringe index is @@ -381,16 +351,17 @@ art_insert(struct art_root *ar, struct a * entry of this node will then become the default * route of the subtable. 
*/ - if (ISLEAF(node)) { + if (ISLEAF(entry)) { child = art_table_get(ar, at, j); if (child == NULL) return (NULL); art_table_ref(ar, at); - srp_swap_locked(&at->at_heap[j].node, ASNODE(child)); + SMR_SET_PTR_LOCKED(&at->at_heap[j].entry, + ASNODE(child)); at = child; } else - at = SUBTABLE(node); + at = SUBTABLE(entry); } i = art_bindex(at, addr, plen); @@ -407,16 +378,19 @@ struct art_node * art_table_insert(struct art_root *ar, struct art_table *at, int i, struct art_node *an) { + void **slot; + void *entry; struct art_node *prev, *node; - node = srp_get_locked(&at->at_heap[i].node); - if (!ISLEAF(node)) - prev = srp_get_locked(&SUBTABLE(node)->at_default); - else - prev = node; + slot = &at->at_heap[i].entry; + entry = SMR_PTR_GET_LOCKED(*slot); + if (!ISLEAF(entry)) { + slot = &SUBTABLE(entry)->at_default; + entry = SMR_PTR_GET_LOCKED(*slot); + } - if (art_check_duplicate(ar, prev, an)) - return (prev); + if (art_check_duplicate(ar, entry, an)) + return (entry); art_table_ref(ar, at); @@ -426,21 +400,19 @@ art_table_insert(struct art_root *ar, st * all the corresponding fringe indices. */ if (i < at->at_minfringe) - art_allot(at, i, prev, an); - else if (!ISLEAF(node)) - srp_swap_locked(&SUBTABLE(node)->at_default, an); + art_allot(at, i, entry, an); else - srp_swap_locked(&at->at_heap[i].node, an); + SMR_PTR_SET_LOCKED(slot, an); return (an); } - /* * Deletion API function. 
 */
 struct art_node *
-art_delete(struct art_root *ar, struct art_node *an, void *addr, int plen)
+art_delete(struct art_root *ar, struct art_node *an, void *addr, int plen,
+    struct art_garbage *ag)
 {
 	struct art_table		*at;
 	struct art_node			*node;
@@ -455,9 +427,9 @@ art_delete(struct art_root *ar, struct a
 
 	/* Default route */
 	if (plen == 0) {
-		node = srp_get_locked(&at->at_default);
+		node = SMR_PTR_GET_LOCKED(&at->at_default);
 		srp_swap_locked(&at->at_default, NULL);
-		art_table_free(ar, at);
+		art_table_free(ar, at, ag);
 		return (node);
 	}
 
@@ -490,29 +462,29 @@
  */
 struct art_node *
 art_table_delete(struct art_root *ar, struct art_table *at, int i,
-    struct art_node *an)
+    struct art_node *an, struct art_garbage *ag)
 {
+	void				**slot;
+	void				*entry;
-	struct art_node			*next, *node;
-#ifdef DIAGNOSTIC
-	struct art_node			*prev;
-#endif
 
-	node = srp_get_locked(&at->at_heap[i].node);
-#ifdef DIAGNOSTIC
-	if (!ISLEAF(node))
-		prev = srp_get_locked(&SUBTABLE(node)->at_default);
-	else
-		prev = node;
-
-	KASSERT(prev == an);
-#endif
+	slot = &at->at_heap[i].entry;
+	entry = SMR_PTR_GET_LOCKED(slot);
+	if (!ISLEAF(entry)) {
+		slot = &SUBTABLE(entry)->at_default;
+		entry = SMR_PTR_GET_LOCKED(slot);
+	}
+	KASSERT((struct art_node *)entry == an);
 
 	/* Get the next most specific route for the index `i'. */
 	if ((i >> 1) > 1)
-		next = srp_get_locked(&at->at_heap[i >> 1].node);
+		entry = SMR_PTR_GET_LOCKED(&at->at_heap[i >> 1].entry);
 	else
-		next = NULL;
+		entry = NULL;
 
 	/*
 	 * If the index `i' of the route that we are removing is not
@@ -520,14 +492,12 @@ art_table_delete(struct art_root *ar, st
 	 * route pointer to all the corresponding fringe indices.
 	 */
 	if (i < at->at_minfringe)
-		art_allot(at, i, an, next);
-	else if (!ISLEAF(node))
-		srp_swap_locked(&SUBTABLE(node)->at_default, next);
+		art_allot(at, i, an, entry);
 	else
-		srp_swap_locked(&at->at_heap[i].node, next);
+		SMR_PTR_SET_LOCKED(slot, entry);
 
 	/* We have removed an entry from this table.
*/ - art_table_free(ar, at); + art_table_free(ar, at, ag); return (an); } @@ -543,27 +513,27 @@ static inline int art_table_rele(struct art_table *at) { if (at == NULL) - return (0); + return (NULL); - return (--at->at_refcnt == 0); + if --at->at_refcnt > 0) + return (NULL); } int -art_table_free(struct art_root *ar, struct art_table *at) +art_table_free(struct art_root *ar, struct art_table *at, + struct art_garbage *ag) { - if (art_table_rele(at)) { - /* - * Garbage collect this table and all its parents - * that are empty. - */ - do { - at = art_table_put(ar, at); - } while (art_table_rele(at)); + int rv = 0; - return (1); + /* + * Garbage collect this table and all its parents + * that are empty. + */ + while ((at = art_table_rele(ar, at, ag)) != NULL) + rv = 1; } - return (0); + return (rv); } /* @@ -689,7 +659,6 @@ struct art_table * art_table_get(struct art_root *ar, struct art_table *parent, int j) { struct art_table *at; - struct art_node *node; void *at_heap; uint32_t lvl; @@ -732,29 +701,36 @@ art_table_get(struct art_root *ar, struc at->at_refcnt = 0; if (parent != NULL) { - node = srp_get_locked(&parent->at_heap[j].node); - /* node isn't being deleted, no srp_finalize needed */ - srp_swap_locked(&at->at_default, node); + void *entry = SMR_PTR_GET_LOCKED(parent->at_heap[j].entry); + SMR_PTR_SET_LOCKED(&at->at_default, entry); + at->at_offset = (parent->at_offset + parent->at_bits); } return (at); } - /* * Delete a table and use its index to restore its parent's default route. * * Note: Modify its parent to unlink the table from it. 
 */
 struct art_table *
-art_table_put(struct art_root *ar, struct art_table *at)
+art_table_rele(struct art_root *ar, struct art_table *at,
+    struct art_garbage *ag)
 {
-	struct art_table *parent = at->at_parent;
-	struct art_node *node;
-	uint32_t j = at->at_index;
+	struct art_table *parent;
+	void *entry;
+	uint32_t j;
+
+	if (at == NULL)
+		return (NULL);
+	if (--at->at_refcnt > 0)
+		return (NULL);
+
+	parent = at->at_parent;
+	j = at->at_index;
 
-	KASSERT(at->at_refcnt == 0);
 	KASSERT(j != 0 && j != 1);
 
 	if (parent != NULL) {
@@ -762,58 +738,36 @@ art_table_put(struct art_root *ar, struc
 		KASSERT(at->at_level == parent->at_level + 1);
 		KASSERT(parent->at_refcnt >= 1);
 
-		/* Give the route back to its parent. */
-		node = srp_get_locked(&at->at_default);
-		srp_swap_locked(&parent->at_heap[j].node, node);
+		/* Give the default entry back to its parent. */
+		entry = SMR_PTR_GET_LOCKED(&at->at_default);
+		SMR_PTR_SET_LOCKED(&parent->at_heap[j].entry, entry);
 	} else {
 		KASSERT(j == -1);
 		KASSERT(at->at_level == 0);
 
-		srp_swap_locked(&ar->ar_root, NULL);
+		SMR_PTR_SET_LOCKED(&ar->ar_root, NULL);
 	}
 
-	mtx_enter(&art_table_gc_mtx);
-	at->at_parent = art_table_gc_list;
-	art_table_gc_list = at;
-	mtx_leave(&art_table_gc_mtx);
-
-	task_add(systqmp, &art_table_gc_task);
+	at->at_gc_entry = ag->ag_entry;
+	ag->ag_entry = ASNODE(at);
 
 	return (parent);
 }
 
 void
-art_table_gc(void *null)
+art_table_put(struct art_table *at)
 {
-	struct art_table *at, *next;
-
-	mtx_enter(&art_table_gc_mtx);
-	at = art_table_gc_list;
-	art_table_gc_list = NULL;
-	mtx_leave(&art_table_gc_mtx);
-
-	while (at != NULL) {
-		next = at->at_parent;
-
-		if (at->at_level == 0)
-			srp_finalize(at, "arttfini");
-		else
-			srp_finalize(ASNODE(at), "arttfini");
-
-		switch (AT_HEAPSIZE(at->at_bits)) {
-		case AT_HEAPSIZE(4):
-			pool_put(&at_heap_4_pool, at->at_heap);
-			break;
-		case AT_HEAPSIZE(8):
-			pool_put(&at_heap_8_pool, at->at_heap);
-			break;
-		default:
-			panic("incorrect stride length %u", at->at_bits);
-		}
-
-		pool_put(&at_pool, at);
-
-		at = next;
+	switch (AT_HEAPSIZE(at->at_bits)) {
+	case AT_HEAPSIZE(4):
+		pool_put(&at_heap_4_pool, at->at_heap);
+		break;
+	case AT_HEAPSIZE(8):
+		pool_put(&at_heap_8_pool, at->at_heap);
+		break;
+	default:
+		panic("incorrect stride length %u", at->at_bits);
	}
+
+	pool_put(&at_pool, at);
 }
 
 /*
@@ -843,7 +797,9 @@ void
 art_allot(struct art_table *at, int i, struct art_node *old,
     struct art_node *new)
 {
-	struct art_node		*node, *dflt;
+	void			**slot;
+	void			*entry;
+	struct art_node		*cur;
 	int			 k = i;
 
 	KASSERT(i < at->at_minfringe);
@@ -855,24 +811,24 @@ again:
 
 	/* Change fringe nodes. */
 	while (1) {
-		node = srp_get_locked(&at->at_heap[k].node);
-		if (!ISLEAF(node)) {
-			dflt = srp_get_locked(&SUBTABLE(node)->at_default);
-			if (dflt == old) {
-				srp_swap_locked(&SUBTABLE(node)->at_default,
-				    new);
-			}
-		} else if (node == old) {
-			srp_swap_locked(&at->at_heap[k].node, new);
+		slot = &at->at_heap[k].entry;
+		entry = SMR_PTR_GET_LOCKED(slot);
+		if (!ISLEAF(entry)) {
+			slot = &SUBTABLE(entry)->at_default;
+			entry = SMR_PTR_GET_LOCKED(slot);
 		}
+		cur = entry;
+		if (cur == old)
+			SMR_PTR_SET_LOCKED(slot, new);
+
 		if (k % 2)
 			goto moveup;
 		k++;
 	}
 
 nonfringe:
-	node = srp_get_locked(&at->at_heap[k].node);
-	if (node == old)
+	cur = SMR_PTR_GET_LOCKED(&at->at_heap[k].entry);
+	if (cur == old)
 		goto again;
 moveon:
 	if (k % 2)
@@ -881,7 +837,7 @@ moveon:
 		k++;
 	else
 		goto nonfringe;
 moveup:
 	k >>= 1;
-	srp_swap_locked(&at->at_heap[k].node, new);
+	SMR_PTR_SET_LOCKED(&at->at_heap[k].entry, new);
 
 	/* Change non-fringe node.
*/ if (k != i) @@ -889,7 +845,7 @@ moveup: } struct art_node * -art_get(void *dst, uint8_t plen) +art_get(void *entry, uint8_t plen) { struct art_node *an; @@ -897,8 +853,8 @@ art_get(void *dst, uint8_t plen) if (an == NULL) return (NULL); + an->an_entry = entry; an->an_plen = plen; - SRPL_INIT(&an->an_rtlist); return (an); } @@ -906,33 +862,25 @@ art_get(void *dst, uint8_t plen) void art_put(struct art_node *an) { - KASSERT(SRPL_EMPTY_LOCKED(&an->an_rtlist)); - - mtx_enter(&art_node_gc_mtx); - an->an_gc = art_node_gc_list; - art_node_gc_list = an; - mtx_leave(&art_node_gc_mtx); - - task_add(systqmp, &art_node_gc_task); + pool_put(&an_pool, an); } void -art_gc(void *null) +art_gc(struct art_garbage *ag) { - struct art_node *an, *next; - - mtx_enter(&art_node_gc_mtx); - an = art_node_gc_list; - art_node_gc_list = NULL; - mtx_leave(&art_node_gc_mtx); + void *entry, *next; - while (an != NULL) { - next = an->an_gc; + for (entry = ag->ag_list; entry != NULL; entry = next) { + if (ISLEAF(entry)) { + struct art_node *an = entry; + next = an->an_gc_entry; - srp_finalize(an, "artnfini"); + art_put(an); + } else { + art_table *at = SUBTABLE(entry); + next = at->at_gc_entry; - pool_put(&an_pool, an); - - an = next; + art_table_put(at); + } } } Index: art.h =================================================================== RCS file: /cvs/src/sys/net/art.h,v retrieving revision 1.21 diff -u -p -r1.21 art.h --- art.h 2 Mar 2021 17:50:41 -0000 1.21 +++ art.h 9 Mar 2021 04:28:41 -0000 @@ -24,6 +24,8 @@ #define ART_MAXLVL 32 /* We currently use 32 levels for IPv6. */ +struct art_table; + /* * Root of the ART tables, equivalent to the radix head. * @@ -35,7 +37,7 @@ * is indicated below. 
*/ struct art_root { - struct srp ar_root; /* [l] First table */ + struct art_table ar_root; /* [l] First table */ struct rwlock ar_lock; /* [] Serialise modifications */ uint8_t ar_bits[ART_MAXLVL]; /* [I] Per level stride */ uint8_t ar_nlvl; /* [I] Number of levels */ @@ -52,7 +54,13 @@ struct art_root { * Allotment Table. */ struct art_table { - struct art_table *at_parent; /* Parent table */ + union { + struct art_table + *at_u_parent; /* Parent table */ + void *at_u_entry; + } at_u; +#define at_parent at_u.at_u_parent +#define at_gc_entry at_u.at_u_entry uint32_t at_index; /* Index in the parent table */ uint32_t at_minfringe; /* Index that fringe begins */ uint32_t at_level; /* Level of the table */ @@ -65,34 +73,28 @@ struct art_table { * is a route counter. */ union { - struct srp node; + void *entry; unsigned long count; } *at_heap; /* Array of 2^(slen+1) items */ }; #define at_refcnt at_heap[0].count/* Refcounter (1 per different route) */ -#define at_default at_heap[1].node /* Default route (was in parent heap) */ +#define at_default at_heap[1].entry/* Default route (was in parent heap) */ /* Heap size for an ART table of stride length ``slen''. */ #define AT_HEAPSIZE(slen) ((1 << ((slen) + 1)) * sizeof(void *)) /* - * Forward declaration needed for the list of mpath routes - * attached to a single ART node. - */ -struct rtentry; - -/* * A node is the internal representation of a route entry. 
*/ struct art_node { + void *an_entry; /* Routes on this node */ union { - SRPL_HEAD(, rtentry) an__rtlist; /* Route related to this node */ - struct art_node *an__gc; /* Entry on GC list */ - } an_pointer; - uint8_t an_plen; /* Prefix length */ + uint8_t an_u_plen; /* Prefix length */ + void *an_u_entry; + } an_u; +#define an_plen an_u.an_u_plen; +#define an_gc_entry an_u.an_u_entry; }; -#define an_rtlist an_pointer.an__rtlist -#define an_gc an_pointer.an__gc void art_init(void); struct art_root *art_alloc(unsigned int, unsigned int, unsigned int); @@ -100,13 +102,20 @@ struct art_node *art_insert(struct art_r int); struct art_node *art_delete(struct art_root *, struct art_node *, void *, int); -struct art_node *art_match(struct art_root *, void *, struct srp_ref *); -struct art_node *art_lookup(struct art_root *, void *, int, - struct srp_ref *); +struct art_node *art_match(struct art_root *, void *); +struct art_node *art_lookup(struct art_root *, void *, int); int art_walk(struct art_root *, int (*)(struct art_node *, void *), void *); struct art_node *art_get(void *, uint8_t); void art_put(struct art_node *); + +struct art_garbage { + void *ag_entry; +}; + +#define ART_GARBAGE_INITIALIZER() { NULL } + +void art_garbage_collect(struct art_garbage *); #endif /* _NET_ART_H_ */ Index: if.c =================================================================== RCS file: /cvs/src/sys/net/if.c,v retrieving revision 1.632 diff -u -p -r1.632 if.c --- if.c 20 Feb 2021 04:55:52 -0000 1.632 +++ if.c 9 Mar 2021 04:28:41 -0000 @@ -238,7 +238,7 @@ int ifq_congestion; int netisr; -#define NET_TASKQ 1 +#define NET_TASKQ 4 struct taskq *nettqmp[NET_TASKQ]; struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL); @@ -822,22 +822,8 @@ if_input_process(struct ifnet *ifp, stru if (!ISSET(ifp->if_xflags, IFXF_CLONED)) enqueue_randomness(ml_len(ml) ^ (uintptr_t)MBUF_LIST_FIRST(ml)); - /* - * We grab the NET_LOCK() before processing any packet to - * ensure there's no 
contention on the routing table lock. - * - * Without it we could race with a userland thread to insert - * a L2 entry in ip{6,}_output(). Such race would result in - * one of the threads sleeping *inside* the IP output path. - * - * Since we have a NET_LOCK() we also use it to serialize access - * to PF globals, pipex globals, unicast and multicast addresses - * lists and the socket layer. - */ - NET_LOCK(); while ((m = ml_dequeue(ml)) != NULL) (*ifp->if_input)(ifp, m); - NET_UNLOCK(); } void @@ -895,6 +881,12 @@ if_netisr(void *unused) KERNEL_UNLOCK(); } #endif + if (n & (1 << NETISR_IP)) + ipintr(); +#ifdef INET6 + if (n & (1 << NETISR_IPV6)) + ip6intr(); +#endif #if NPPP > 0 if (n & (1 << NETISR_PPP)) { KERNEL_LOCK(); @@ -3295,12 +3287,15 @@ unhandled_af(int af) * globals aren't ready to be accessed by multiple threads in * parallel. */ -int nettaskqs = NET_TASKQ; +int nettaskqs; struct taskq * net_tq(unsigned int ifindex) { struct taskq *t = NULL; + + if (nettaskqs == 0) + nettaskqs = min(NET_TASKQ, ncpus); t = nettqmp[ifindex % nettaskqs]; Index: if_ethersubr.c =================================================================== RCS file: /cvs/src/sys/net/if_ethersubr.c,v retrieving revision 1.274 diff -u -p -r1.274 if_ethersubr.c --- if_ethersubr.c 7 Mar 2021 06:02:32 -0000 1.274 +++ if_ethersubr.c 9 Mar 2021 04:28:41 -0000 @@ -222,7 +222,9 @@ ether_resolve(struct ifnet *ifp, struct switch (af) { case AF_INET: + KERNEL_LOCK(); error = arpresolve(ifp, rt, m, dst, eh->ether_dhost); + KERNEL_UNLOCK(); if (error) return (error); eh->ether_type = htons(ETHERTYPE_IP); @@ -245,7 +247,9 @@ ether_resolve(struct ifnet *ifp, struct break; #ifdef INET6 case AF_INET6: + KERNEL_LOCK(); error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost); + KERNEL_UNLOCK(); if (error) return (error); eh->ether_type = htons(ETHERTYPE_IPV6); Index: if_pfsync.c =================================================================== RCS file: /cvs/src/sys/net/if_pfsync.c,v retrieving revision 
1.287 diff -u -p -r1.287 if_pfsync.c --- if_pfsync.c 25 Feb 2021 02:48:21 -0000 1.287 +++ if_pfsync.c 9 Mar 2021 04:28:41 -0000 @@ -744,7 +744,7 @@ pfsync_input(struct mbuf **mp, int *offp int offset, noff, len, count, mlen, flags = 0; int e; - NET_ASSERT_LOCKED(); + //NET_ASSERT_LOCKED(); pfsyncstat_inc(pfsyncs_ipackets); @@ -1304,7 +1304,7 @@ pfsync_update_net_tdb(struct pfsync_tdb { struct tdb *tdb; - NET_ASSERT_LOCKED(); + //NET_ASSERT_LOCKED(); /* check for invalid values */ if (ntohl(pt->spi) <= SPI_RESERVED_MAX || @@ -1902,7 +1902,7 @@ pfsync_insert_state(struct pf_state *st) { struct pfsync_softc *sc = pfsyncif; - NET_ASSERT_LOCKED(); + //NET_ASSERT_LOCKED(); if (ISSET(st->rule.ptr->rule_flag, PFRULE_NOSYNC) || st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) { @@ -1932,7 +1932,7 @@ pfsync_defer(struct pf_state *st, struct struct pfsync_softc *sc = pfsyncif; struct pfsync_deferral *pd; - NET_ASSERT_LOCKED(); + //NET_ASSERT_LOCKED(); if (!sc->sc_defer || ISSET(st->state_flags, PFSTATE_NOSYNC) || @@ -2031,7 +2031,7 @@ pfsync_undefer(struct pfsync_deferral *p { struct pfsync_softc *sc = pfsyncif; - NET_ASSERT_LOCKED(); + //NET_ASSERT_LOCKED(); if (sc == NULL) return; @@ -2067,7 +2067,7 @@ pfsync_deferred(struct pf_state *st, int struct pfsync_softc *sc = pfsyncif; struct pfsync_deferral *pd; - NET_ASSERT_LOCKED(); + //NET_ASSERT_LOCKED(); mtx_enter(&sc->sc_deferrals_mtx); TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) { @@ -2091,7 +2091,7 @@ pfsync_update_state(struct pf_state *st) struct pfsync_softc *sc = pfsyncif; int sync = 0; - NET_ASSERT_LOCKED(); + //NET_ASSERT_LOCKED(); if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) return; @@ -2266,7 +2266,7 @@ pfsync_delete_state(struct pf_state *st) { struct pfsync_softc *sc = pfsyncif; - NET_ASSERT_LOCKED(); + //NET_ASSERT_LOCKED(); if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) return; @@ -2326,7 +2326,7 @@ pfsync_clear_states(u_int32_t creatorid, struct pfsync_clr clr; } __packed r; - 
NET_ASSERT_LOCKED(); + //NET_ASSERT_LOCKED(); if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) return; Index: if_tun.c =================================================================== RCS file: /cvs/src/sys/net/if_tun.c,v retrieving revision 1.230 diff -u -p -r1.230 if_tun.c --- if_tun.c 20 Feb 2021 04:39:16 -0000 1.230 +++ if_tun.c 9 Mar 2021 04:28:41 -0000 @@ -118,7 +118,6 @@ int tun_ioctl(struct ifnet *, u_long, ca void tun_input(struct ifnet *, struct mbuf *); int tun_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); -int tun_enqueue(struct ifnet *, struct mbuf *); int tun_clone_create(struct if_clone *, int); int tap_clone_create(struct if_clone *, int); int tun_create(struct if_clone *, int, int); @@ -231,7 +230,6 @@ tun_create(struct if_clone *ifc, int uni /* build the interface */ ifp->if_ioctl = tun_ioctl; - ifp->if_enqueue = tun_enqueue; ifp->if_start = tun_start; ifp->if_hardmtu = TUNMRU; ifp->if_link_state = LINK_STATE_DOWN; @@ -606,21 +604,6 @@ tun_output(struct ifnet *ifp, struct mbu return (if_enqueue(ifp, m0)); } -int -tun_enqueue(struct ifnet *ifp, struct mbuf *m0) -{ - struct tun_softc *sc = ifp->if_softc; - int error; - - error = ifq_enqueue(&ifp->if_snd, m0); - if (error != 0) - return (error); - - tun_wakeup(sc); - - return (0); -} - void tun_wakeup(struct tun_softc *sc) { @@ -831,6 +814,7 @@ tun_dev_write(dev_t dev, struct uio *uio struct mbuf *m0; int error = 0; size_t mlen; + struct mbuf_list ml = MBUF_LIST_INITIALIZER(); sc = tun_get(dev); if (sc == NULL) @@ -868,9 +852,8 @@ tun_dev_write(dev_t dev, struct uio *uio if (error != 0) goto drop; - NET_LOCK(); - if_vinput(ifp, m0); - NET_UNLOCK(); + ml_enqueue(&ml, m0); + if_input(ifp, &ml); tun_put(sc); return (0); @@ -1058,10 +1041,7 @@ tun_start(struct ifnet *ifp) { struct tun_softc *sc = ifp->if_softc; - splassert(IPL_NET); - - if (ifq_len(&ifp->if_snd)) - tun_wakeup(sc); + tun_wakeup(sc); } void Index: if_vxlan.c 
=================================================================== RCS file: /cvs/src/sys/net/if_vxlan.c,v retrieving revision 1.82 diff -u -p -r1.82 if_vxlan.c --- if_vxlan.c 25 Feb 2021 02:48:21 -0000 1.82 +++ if_vxlan.c 9 Mar 2021 04:28:41 -0000 @@ -1,7 +1,7 @@ -/* $OpenBSD: if_vxlan.c,v 1.82 2021/02/25 02:48:21 dlg Exp $ */ +/* $OpenBSD$ */ /* - * Copyright (c) 2013 Reyk Floeter + * Copyright (c) 2021 David Gwynne * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -17,475 +17,775 @@ */ #include "bpfilter.h" -#include "vxlan.h" -#include "vlan.h" #include "pf.h" -#include "bridge.h" #include #include +#include #include #include -#include #include +#include +#include +#include +#include +#include + +#include +#include #include #include +#include #include +#include #include - -#if NBPFILTER > 0 -#include -#endif +#include #include #include #include #include -#include #include -#include #include +#include -#if NPF > 0 -#include +#ifdef INET6 +#include +#include +#include #endif -#if NBRIDGE > 0 +/* for bridge stuff */ #include +#include + +#if NBPFILTER > 0 +#include #endif -#include +/* + * The protocol. + */ + +#define VXLANMTU 1492 +#define VXLAN_PORT 4789 + +struct vxlan_header { + uint32_t vxlan_flags; +#define VXLAN_F_I (1U << 27) + uint32_t vxlan_id; +#define VXLAN_VNI_SHIFT 8 +#define VXLAN_VNI_MASK (0xffffff << VXLAN_VNI_SHIFT) +}; + +#define VXLAN_VNI_MAX 0x00ffffff +#define VXLAN_VNI_MIN 0x00000000 + +/* + * The driver. 
+ */ + +union vxlan_addr { + struct in_addr in4; + struct in6_addr in6; +}; + +struct vxlan_softc; + +struct vxlan_peer { + RBT_ENTRY(vxlan_peer) p_entry; + + unsigned int p_mask; /* do we use addr in the comparison */ + union vxlan_addr p_addr; + struct vxlan_header p_header; + + struct vxlan_softc *p_sc; +}; + +RBT_HEAD(vxlan_peers, vxlan_peer); + +struct vxlan_tep { + TAILQ_ENTRY(vxlan_tep) vt_entry; + + sa_family_t vt_af; + unsigned int vt_rdomain; + union vxlan_addr vt_addr; +#define vt_addr4 vt_addr.in4 +#define vt_addr6 vt_addr.in6 + in_port_t vt_port; + + struct socket *vt_so; + + struct mutex vt_mtx; + struct vxlan_peers vt_peers; +}; + +TAILQ_HEAD(vxlan_teps, vxlan_tep); + +enum vxlan_tunnel_mode { + VXLAN_TMODE_UNSET, + VXLAN_TMODE_P2P, /* unicast destination, no learning */ + VXLAN_TMODE_LEARNING, /* multicast destination, learning */ + VXLAN_TMODE_ENDPOINT, /* unset destination, no learning */ +}; struct vxlan_softc { struct arpcom sc_ac; - struct ifmedia sc_media; + struct etherbridge sc_eb; + + unsigned int sc_rdomain; + sa_family_t sc_af; + union vxlan_addr sc_src; + union vxlan_addr sc_dst; + in_port_t sc_port; + struct vxlan_header sc_header; + unsigned int sc_if_index0; - struct ip_moptions sc_imo; - struct task sc_atask; - struct task sc_ltask; struct task sc_dtask; + void *sc_inmulti; + + enum vxlan_tunnel_mode sc_mode; + struct vxlan_peer *sc_ucast_peer; + struct vxlan_peer *sc_mcast_peer; + struct refcnt sc_refs; - struct sockaddr_storage sc_src; - struct sockaddr_storage sc_dst; - in_port_t sc_dstport; - u_int sc_rdomain; - int64_t sc_vnetid; uint16_t sc_df; - u_int8_t sc_ttl; + int sc_ttl; int sc_txhprio; + int sc_rxhprio; - struct task sc_sendtask; - - LIST_ENTRY(vxlan_softc) sc_entry; + struct task sc_send_task; }; -void vxlanattach(int); -int vxlanioctl(struct ifnet *, u_long, caddr_t); -void vxlanstart(struct ifnet *); -int vxlan_clone_create(struct if_clone *, int); -int vxlan_clone_destroy(struct ifnet *); -void 
vxlan_multicast_cleanup(struct ifnet *); -int vxlan_multicast_join(struct ifnet *, struct sockaddr *, - struct sockaddr *); -int vxlan_media_change(struct ifnet *); -void vxlan_media_status(struct ifnet *, struct ifmediareq *); -int vxlan_config(struct ifnet *, struct sockaddr *, struct sockaddr *); -int vxlan_output(struct ifnet *, struct mbuf *); -void vxlan_addr_change(void *); -void vxlan_if_change(void *); -void vxlan_link_change(void *); -void vxlan_send_dispatch(void *); +void vxlanattach(int); + +static int vxlan_clone_create(struct if_clone *, int); +static int vxlan_clone_destroy(struct ifnet *); + +static int vxlan_output(struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *); +static int vxlan_enqueue(struct ifnet *, struct mbuf *); +static void vxlan_start(struct ifqueue *); +static void vxlan_send(void *); + +static int vxlan_ioctl(struct ifnet *, u_long, caddr_t); +static int vxlan_up(struct vxlan_softc *); +static int vxlan_down(struct vxlan_softc *); +static int vxlan_addmulti(struct vxlan_softc *, struct ifnet *); +static void vxlan_delmulti(struct vxlan_softc *); + +static struct mbuf * + vxlan_input(void *, struct mbuf *, + struct ip *, struct ip6_hdr *, void *, int); + +static int vxlan_set_rdomain(struct vxlan_softc *, const struct ifreq *); +static int vxlan_get_rdomain(struct vxlan_softc *, struct ifreq *); +static int vxlan_set_tunnel(struct vxlan_softc *, + const struct if_laddrreq *); +static int vxlan_get_tunnel(struct vxlan_softc *, struct if_laddrreq *); +static int vxlan_del_tunnel(struct vxlan_softc *); +static int vxlan_set_vnetid(struct vxlan_softc *, const struct ifreq *); +static int vxlan_get_vnetid(struct vxlan_softc *, struct ifreq *); +static int vxlan_del_vnetid(struct vxlan_softc *); +static int vxlan_set_parent(struct vxlan_softc *, + const struct if_parent *); +static int vxlan_get_parent(struct vxlan_softc *, struct if_parent *); +static int vxlan_del_parent(struct vxlan_softc *); + +static int 
vxlan_add_addr(struct vxlan_softc *, const struct ifbareq *); +static int vxlan_del_addr(struct vxlan_softc *, const struct ifbareq *); -int vxlan_sockaddr_cmp(struct sockaddr *, struct sockaddr *); -uint16_t vxlan_sockaddr_port(struct sockaddr *); +static void vxlan_detach_hook(void *); -struct if_clone vxlan_cloner = +static struct if_clone vxlan_cloner = IF_CLONE_INITIALIZER("vxlan", vxlan_clone_create, vxlan_clone_destroy); -int vxlan_enable = 0; -u_long vxlan_tagmask; +static int vxlan_eb_port_eq(void *, void *, void *); +static void *vxlan_eb_port_take(void *, void *); +static void vxlan_eb_port_rele(void *, void *); +static size_t vxlan_eb_port_ifname(void *, char *, size_t, void *); +static void vxlan_eb_port_sa(void *, struct sockaddr_storage *, void *); + +static const struct etherbridge_ops vxlan_etherbridge_ops = { + vxlan_eb_port_eq, + vxlan_eb_port_take, + vxlan_eb_port_rele, + vxlan_eb_port_ifname, + vxlan_eb_port_sa, +}; + +static struct rwlock vxlan_lock = RWLOCK_INITIALIZER("vteps"); +static struct vxlan_teps vxlan_teps = TAILQ_HEAD_INITIALIZER(vxlan_teps); +static struct pool vxlan_endpoint_pool; -#define VXLAN_TAGHASHSIZE 32 -#define VXLAN_TAGHASH(tag) ((unsigned int)tag & vxlan_tagmask) -LIST_HEAD(vxlan_taghash, vxlan_softc) *vxlan_tagh, vxlan_any; +static inline int vxlan_peer_cmp(const struct vxlan_peer *, + const struct vxlan_peer *); + +RBT_PROTOTYPE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp); void vxlanattach(int count) { - /* Regular vxlan interfaces with a VNI */ - if ((vxlan_tagh = hashinit(VXLAN_TAGHASHSIZE, M_DEVBUF, M_NOWAIT, - &vxlan_tagmask)) == NULL) - panic("vxlanattach: hashinit"); - - /* multipoint-to-multipoint interfaces that accept any VNI */ - LIST_INIT(&vxlan_any); - if_clone_attach(&vxlan_cloner); } -int +static int vxlan_clone_create(struct if_clone *ifc, int unit) { - struct ifnet *ifp; - struct vxlan_softc *sc; + struct vxlan_softc *sc; + struct ifnet *ifp; + int error; + + if (vxlan_endpoint_pool.pr_size == 0) 
{ + pool_init(&vxlan_endpoint_pool, sizeof(union vxlan_addr), + 0, IPL_SOFTNET, 0, "vxlanep", NULL); + } - sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); - sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS, - sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO); - sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; - sc->sc_dstport = htons(VXLAN_PORT); - sc->sc_vnetid = VXLAN_VNI_UNSET; - sc->sc_txhprio = IFQ_TOS2PRIO(IPTOS_PREC_ROUTINE); /* 0 */ - sc->sc_df = htons(0); - task_set(&sc->sc_atask, vxlan_addr_change, sc); - task_set(&sc->sc_ltask, vxlan_link_change, sc); - task_set(&sc->sc_dtask, vxlan_if_change, sc); - task_set(&sc->sc_sendtask, vxlan_send_dispatch, sc); + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL); + if (sc == NULL) + return (ENOMEM); ifp = &sc->sc_ac.ac_if; - snprintf(ifp->if_xname, sizeof ifp->if_xname, "vxlan%d", unit); - ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; - ether_fakeaddr(ifp); - ifp->if_softc = sc; - ifp->if_ioctl = vxlanioctl; - ifp->if_start = vxlanstart; + snprintf(ifp->if_xname, sizeof(ifp->if_xname), "%s%d", + ifc->ifc_name, unit); - ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN; - ifp->if_capabilities = IFCAP_VLAN_MTU; - ifp->if_xflags = IFXF_CLONED; + error = etherbridge_init(&sc->sc_eb, ifp->if_xname, + &vxlan_etherbridge_ops, sc); + if (error == -1) { + free(sc, M_DEVBUF, sizeof(*sc)); + return (error); + } + + sc->sc_af = AF_UNSPEC; + sc->sc_txhprio = 0; + sc->sc_rxhprio = IF_HDRPRIO_OUTER; + sc->sc_df = 0; + sc->sc_ttl = IP_DEFAULT_MULTICAST_TTL; + + task_set(&sc->sc_dtask, vxlan_detach_hook, sc); + refcnt_init(&sc->sc_refs); + task_set(&sc->sc_send_task, vxlan_send, sc); - ifmedia_init(&sc->sc_media, 0, vxlan_media_change, - vxlan_media_status); - ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); - ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); + ifp->if_softc = sc; + ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN; + ifp->if_ioctl = vxlan_ioctl; + ifp->if_output = 
vxlan_output; + ifp->if_enqueue = vxlan_enqueue; + ifp->if_qstart = vxlan_start; + ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX; + ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE; + ether_fakeaddr(ifp); if_counters_alloc(ifp); if_attach(ifp); ether_ifattach(ifp); -#if 0 - /* - * Instead of using a decreased MTU of 1450 bytes, prefer - * to use the default Ethernet-size MTU of 1500 bytes and to - * increase the MTU of the outer transport interfaces to - * at least 1550 bytes. The following is disabled by default. - */ - ifp->if_mtu = ETHERMTU - sizeof(struct ether_header); - ifp->if_mtu -= sizeof(struct vxlanudphdr) + sizeof(struct ipovly); -#endif - - LIST_INSERT_HEAD(&vxlan_tagh[VXLAN_TAGHASH(0)], sc, sc_entry); - vxlan_enable++; - return (0); } -int +static int vxlan_clone_destroy(struct ifnet *ifp) { - struct vxlan_softc *sc = ifp->if_softc; + struct vxlan_softc *sc = ifp->if_softc; NET_LOCK(); - vxlan_multicast_cleanup(ifp); + if (ISSET(ifp->if_flags, IFF_RUNNING)) + vxlan_down(sc); NET_UNLOCK(); - vxlan_enable--; - LIST_REMOVE(sc, sc_entry); - - ifmedia_delete_instance(&sc->sc_media, IFM_INST_ANY); ether_ifdetach(ifp); if_detach(ifp); - if (!task_del(net_tq(ifp->if_index), &sc->sc_sendtask)) - taskq_barrier(net_tq(ifp->if_index)); + etherbridge_destroy(&sc->sc_eb); + + refcnt_finalize(&sc->sc_refs, "vxlanfini"); - free(sc->sc_imo.imo_membership, M_IPMOPTS, - sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *)); free(sc, M_DEVBUF, sizeof(*sc)); return (0); } -void -vxlan_multicast_cleanup(struct ifnet *ifp) +static struct vxlan_softc * +vxlan_take(struct vxlan_softc *sc) +{ + refcnt_take(&sc->sc_refs); + return (sc); +} + +static void +vxlan_rele(struct vxlan_softc *sc) { - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; - struct ip_moptions *imo = &sc->sc_imo; - struct ifnet *mifp; + refcnt_rele_wake(&sc->sc_refs); +} - mifp = if_get(imo->imo_ifidx); - if (mifp != NULL) { - if_addrhook_del(mifp, &sc->sc_atask); - 
if_linkstatehook_del(mifp, &sc->sc_ltask); - if_detachhook_del(mifp, &sc->sc_dtask); +static struct mbuf * +vxlan_encap(struct vxlan_softc *sc, struct mbuf *m, + struct mbuf *(ip_encap)(struct vxlan_softc *sc, struct mbuf *, + const union vxlan_addr *, uint8_t)) +{ + struct mbuf *m0; + union vxlan_addr gateway; + const union vxlan_addr *endpoint; + struct vxlan_header *vh; + struct udphdr *uh; + int prio; + uint8_t tos; - if_put(mifp); - } + if (sc->sc_mode == VXLAN_TMODE_UNSET) + goto drop; - if (imo->imo_num_memberships > 0) { - in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); - imo->imo_ifidx = 0; + if (sc->sc_mode == VXLAN_TMODE_P2P) + endpoint = &sc->sc_dst; + else { /* VXLAN_TMODE_LEARNING || VXLAN_TMODE_ENDPOINT */ + struct ether_header *eh = mtod(m, struct ether_header *); + + smr_read_enter(); + endpoint = etherbridge_resolve_ea(&sc->sc_eb, + (struct ether_addr *)eh->ether_dhost); + if (endpoint != NULL) { + gateway = *endpoint; + endpoint = &gateway; + } + smr_read_leave(); + + if (endpoint == NULL) { + if (sc->sc_mode == VXLAN_TMODE_ENDPOINT) + goto drop; + + /* "flood" to unknown destinations */ + endpoint = &sc->sc_dst; + } } + + /* force prepend mbuf because of payload alignment */ + m0 = m_get(M_DONTWAIT, m->m_type); + if (m0 == NULL) + goto drop; + + m_align(m0, 0); + m0->m_len = 0; + + M_MOVE_PKTHDR(m0, m); + m0->m_next = m; + + m = m_prepend(m0, sizeof(*vh), M_DONTWAIT); + if (m == NULL) + return (NULL); + + vh = mtod(m, struct vxlan_header *); + *vh = sc->sc_header; + + m = m_prepend(m, sizeof(*uh), M_DONTWAIT); + if (m == NULL) + return (NULL); + + uh = mtod(m, struct udphdr *); + uh->uh_sport = sc->sc_port; /* XXX */ + uh->uh_dport = sc->sc_port; + htobem16(&uh->uh_ulen, m->m_pkthdr.len); + uh->uh_sum = htons(0); + + SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_OUT); + + prio = sc->sc_txhprio; + if (prio == IF_HDRPRIO_PACKET) + prio = m->m_pkthdr.pf.prio; + tos = IFQ_PRIO2TOS(prio); + + CLR(m->m_flags, M_BCAST|M_MCAST); + 
m->m_pkthdr.ph_rtableid = sc->sc_rdomain; + +#if NPF > 0 + pf_pkt_addr_changed(m); +#endif + + return ((*ip_encap)(sc, m, endpoint, tos)); +drop: + m_freem(m); + return (NULL); } -int -vxlan_multicast_join(struct ifnet *ifp, struct sockaddr *src, - struct sockaddr *dst) +static struct mbuf * +vxlan_encap_ipv4(struct vxlan_softc *sc, struct mbuf *m, + const union vxlan_addr *endpoint, uint8_t tos) { - struct vxlan_softc *sc = ifp->if_softc; - struct ip_moptions *imo = &sc->sc_imo; - struct sockaddr_in *src4, *dst4; -#ifdef INET6 - struct sockaddr_in6 *dst6; -#endif /* INET6 */ - struct ifaddr *ifa; - struct ifnet *mifp; + struct ip *ip; + + m = m_prepend(m, sizeof(*ip), M_DONTWAIT); + if (m == NULL) + return (NULL); + + ip = mtod(m, struct ip *); + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + ip->ip_off = sc->sc_df; + ip->ip_tos = tos; + ip->ip_len = htons(m->m_pkthdr.len); + ip->ip_ttl = sc->sc_ttl; + ip->ip_p = IPPROTO_UDP; + ip->ip_src = sc->sc_src.in4; + ip->ip_dst = endpoint->in4; + + return (m); +} - switch (dst->sa_family) { - case AF_INET: - dst4 = satosin(dst); - if (!IN_MULTICAST(dst4->sin_addr.s_addr)) - return (0); - break; #ifdef INET6 - case AF_INET6: - dst6 = satosin6(dst); - if (!IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr)) - return (0); +static struct mbuf * +vxlan_encap_ipv6(struct vxlan_softc *sc, struct mbuf *m, + const union vxlan_addr *endpoint, uint8_t tos) +{ + struct ip6_hdr *ip6; + int len = m->m_pkthdr.len; - /* Multicast mode is currently not supported for IPv6 */ - return (EAFNOSUPPORT); + m = m_prepend(m, sizeof(*ip6), M_DONTWAIT); + if (m == NULL) + return (NULL); + + ip6 = mtod(m, struct ip6_hdr *); + ip6->ip6_flow = ISSET(m->m_pkthdr.csum_flags, M_FLOWID) ? 
+ htonl(m->m_pkthdr.ph_flowid) : 0; + ip6->ip6_vfc |= IPV6_VERSION; + ip6->ip6_flow |= htonl((uint32_t)tos << 20); + ip6->ip6_plen = htons(len); + ip6->ip6_nxt = IPPROTO_UDP; + ip6->ip6_hlim = sc->sc_ttl; + ip6->ip6_src = sc->sc_src.in6; + ip6->ip6_dst = endpoint->in6; + + if (sc->sc_df) + SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); + + return (m); +} #endif /* INET6 */ - default: - return (EAFNOSUPPORT); + +static int +vxlan_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct rtentry *rt) +{ + struct m_tag *mtag; + int error = 0; + + mtag = NULL; + while ((mtag = m_tag_find(m, PACKET_TAG_GRE, mtag)) != NULL) { + if (memcmp((caddr_t)(mtag + 1), &ifp->if_index, + sizeof(ifp->if_index)) == 0) { + error = EIO; + goto drop; + } } - src4 = satosin(src); - dst4 = satosin(dst); + mtag = m_tag_get(PACKET_TAG_GRE, sizeof(ifp->if_index), M_NOWAIT); + if (mtag == NULL) { + error = ENOBUFS; + goto drop; + } + memcpy((caddr_t)(mtag + 1), &ifp->if_index, sizeof(ifp->if_index)); + m_tag_prepend(m, mtag); - if (src4->sin_addr.s_addr == INADDR_ANY || - IN_MULTICAST(src4->sin_addr.s_addr)) - return (EINVAL); - if ((ifa = ifa_ifwithaddr(src, sc->sc_rdomain)) == NULL || - (mifp = ifa->ifa_ifp) == NULL || - (mifp->if_flags & IFF_MULTICAST) == 0) - return (EADDRNOTAVAIL); + return (ether_output(ifp, m, dst, rt)); - if ((imo->imo_membership[0] = - in_addmulti(&dst4->sin_addr, mifp)) == NULL) - return (ENOBUFS); +drop: + m_freem(m); + return (error); +} - imo->imo_num_memberships++; - imo->imo_ifidx = mifp->if_index; - if (sc->sc_ttl > 0) - imo->imo_ttl = sc->sc_ttl; - else - imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL; - imo->imo_loop = 0; +static int +vxlan_enqueue(struct ifnet *ifp, struct mbuf *m) +{ + struct vxlan_softc *sc = ifp->if_softc; + struct ifqueue *ifq = &ifp->if_snd; + + if (ifq_enqueue(ifq, m) != 0) + return (ENOBUFS); - /* - * Use interface hooks to track any changes on the interface - * that is used to send out the tunnel traffic as multicast. 
- */ - if_addrhook_add(mifp, &sc->sc_atask); - if_linkstatehook_add(mifp, &sc->sc_ltask); - if_detachhook_add(mifp, &sc->sc_dtask); + task_add(ifq->ifq_softnet, &sc->sc_send_task); return (0); } -void -vxlanstart(struct ifnet *ifp) +static void +vxlan_start(struct ifqueue *ifq) { - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; + struct ifnet *ifp = ifq->ifq_if; + struct vxlan_softc *sc = ifp->if_softc; - task_add(net_tq(ifp->if_index), &sc->sc_sendtask); + task_add(ifq->ifq_softnet, &sc->sc_send_task); } -void -vxlan_send_dispatch(void *xsc) +static uint64_t +vxlan_send_ipv4(struct vxlan_softc *sc, struct mbuf_list *ml) { - struct vxlan_softc *sc = xsc; - struct ifnet *ifp = &sc->sc_ac.ac_if; - struct mbuf *m; - struct mbuf_list ml; - - ml_init(&ml); - for (;;) { - m = ifq_dequeue(&ifp->if_snd); - if (m == NULL) - break; - -#if NBPFILTER > 0 - if (ifp->if_bpf) - bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT); -#endif - - ml_enqueue(&ml, m); - } - - if (ml_empty(&ml)) - return; + struct ip_moptions imo; + struct mbuf *m; + uint64_t oerrors = 0; + + imo.imo_ifidx = sc->sc_if_index0; + imo.imo_ttl = sc->sc_ttl; + imo.imo_loop = 0; NET_LOCK(); - while ((m = ml_dequeue(&ml)) != NULL) { - vxlan_output(ifp, m); + while ((m = ml_dequeue(ml)) != NULL) { + if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) != 0) + oerrors++; } NET_UNLOCK(); + + return (oerrors); } +#ifdef INET6 +static uint64_t +vxlan_send_ipv6(struct vxlan_softc *sc, struct mbuf_list *ml) +{ + struct ip6_moptions im6o; + struct mbuf *m; + uint64_t oerrors = 0; + + im6o.im6o_ifidx = sc->sc_if_index0; + im6o.im6o_hlim = sc->sc_ttl; + im6o.im6o_loop = 0; + + NET_LOCK(); + while ((m = ml_dequeue(ml)) != NULL) { + if (ip6_output(m, NULL, NULL, 0, &im6o, NULL) != 0) + oerrors++; + } + NET_UNLOCK(); + + return (oerrors); +} +#endif /* INET6 */ -int -vxlan_config(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) +static void +vxlan_send(void *arg) { - struct vxlan_softc *sc = 
(struct vxlan_softc *)ifp->if_softc; - int reset = 0, error, af; - socklen_t slen; - in_port_t port; - struct vxlan_taghash *tagh; - - if (src != NULL && dst != NULL) { - if ((af = src->sa_family) != dst->sa_family) - return (EAFNOSUPPORT); - } else { - /* Reset current configuration */ - af = sc->sc_src.ss_family; - src = sstosa(&sc->sc_src); - dst = sstosa(&sc->sc_dst); - reset = 1; - } + struct vxlan_softc *sc = arg; + struct ifnet *ifp = &sc->sc_ac.ac_if; + struct mbuf *(*ip_encap)(struct vxlan_softc *, struct mbuf *, + const union vxlan_addr *, uint8_t); + uint64_t (*ip_send)(struct vxlan_softc *, struct mbuf_list *); + struct mbuf_list ml = MBUF_LIST_INITIALIZER(); + struct mbuf *m; + uint64_t oerrors; + + if (!ISSET(ifp->if_flags, IFF_RUNNING)) + return; - switch (af) { + switch (sc->sc_af) { case AF_INET: - slen = sizeof(struct sockaddr_in); + ip_encap = vxlan_encap_ipv4; + ip_send = vxlan_send_ipv4; break; #ifdef INET6 case AF_INET6: - slen = sizeof(struct sockaddr_in6); + ip_encap = vxlan_encap_ipv6; + ip_send = vxlan_send_ipv6; break; -#endif /* INET6 */ +#endif default: - return (EAFNOSUPPORT); + unhandled_af(sc->sc_af); + /* NOTREACHED */ } - if (src->sa_len != slen || dst->sa_len != slen) - return (EINVAL); + while ((m = ifq_dequeue(&ifp->if_snd)) != NULL) { +#if NBPFILTER > 0 + caddr_t if_bpf = READ_ONCE(ifp->if_bpf); + if (if_bpf != NULL) + bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_OUT); +#endif + m = vxlan_encap(sc, m, ip_encap); + if (m == NULL) + continue; - vxlan_multicast_cleanup(ifp); + ml_enqueue(&ml, m); + } - /* returns without error if multicast is not configured */ - if ((error = vxlan_multicast_join(ifp, src, dst)) != 0) - return (error); + oerrors = (*ip_send)(sc, &ml); + + counters_add(ifp->if_counters, ifc_oerrors, oerrors); +} + +static struct mbuf * +vxlan_input(void *arg, struct mbuf *m, struct ip *ip, struct ip6_hdr *ip6, + void *uhp, int hlen) +{ + struct vxlan_tep *vt = arg; + struct vxlan_peer key, *p; + struct udphdr *uh; + 
struct vxlan_header *vh; + struct ether_header *eh; + int vhlen = hlen + sizeof(*vh); + struct mbuf *n; + int off; + in_port_t port; + struct vxlan_softc *sc = NULL; + struct ifnet *ifp; + + if (m->m_pkthdr.len < vhlen) + goto drop; + + uh = uhp; + port = uh->uh_sport; - if ((port = vxlan_sockaddr_port(dst)) != 0) - sc->sc_dstport = port; + memset(&key, 0, sizeof(key)); + key.p_mask = 0; + + if (ip != NULL) + key.p_addr.in4 = ip->ip_src; +#ifdef INET6 + else + key.p_addr.in6 = ip6->ip6_src; +#endif - if (!reset) { - bzero(&sc->sc_src, sizeof(sc->sc_src)); - bzero(&sc->sc_dst, sizeof(sc->sc_dst)); - memcpy(&sc->sc_src, src, src->sa_len); - memcpy(&sc->sc_dst, dst, dst->sa_len); + if (m->m_len < vhlen) { + m = m_pullup(m, vhlen); + if (m == NULL) + return (NULL); } - if (sc->sc_vnetid == VXLAN_VNI_ANY) { - /* - * If the interface accepts any VNI, put it into a separate - * list that is not part of the main hash. - */ - tagh = &vxlan_any; - } else - tagh = &vxlan_tagh[VXLAN_TAGHASH(sc->sc_vnetid)]; + vh = (struct vxlan_header *)(mtod(m, caddr_t) + hlen); + key.p_header.vxlan_flags = vh->vxlan_flags & htonl(VXLAN_F_I); + key.p_header.vxlan_id = vh->vxlan_id & htonl(VXLAN_VNI_MASK); - LIST_REMOVE(sc, sc_entry); - LIST_INSERT_HEAD(tagh, sc, sc_entry); + mtx_enter(&vt->vt_mtx); + p = RBT_FIND(vxlan_peers, &vt->vt_peers, &key); + if (p != NULL) + sc = vxlan_take(p->p_sc); + mtx_leave(&vt->vt_mtx); - return (0); + if (sc == NULL) + goto drop; + + ifp = &sc->sc_ac.ac_if; + if (ISSET(ifp->if_flags, IFF_LINK0) && port != sc->sc_port) + goto rele_drop; + + m_adj(m, vhlen); + + if (m->m_pkthdr.len < sizeof(*eh)) + goto rele_drop; + + if (m->m_len < sizeof(*eh)) { + m = m_pullup(m, sizeof(*eh)); + if (m == NULL) + goto rele; + } + + n = m_getptr(m, sizeof(*eh), &off); + if (n == NULL) + goto rele_drop; + + if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) { + n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT); + m_freem(m); + if (n == NULL) + goto rele; + m = n; + } + + if 
(sc->sc_mode == VXLAN_TMODE_LEARNING) { + eh = mtod(m, struct ether_header *); + etherbridge_map_ea(&sc->sc_eb, &key.p_addr, + (struct ether_addr *)eh->ether_shost); + } + + /* XXX prio */ + + if_vinput(ifp, m); +rele: + vxlan_rele(sc); + return (NULL); + +rele_drop: + vxlan_rele(sc); +drop: + m_freem(m); + return (NULL); } -int -vxlanioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +static int +vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; - struct ifreq *ifr = (struct ifreq *)data; - struct if_laddrreq *lifr = (struct if_laddrreq *)data; - int error = 0; + struct vxlan_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *)data; + struct ifbrparam *bparam = (struct ifbrparam *)data; + int error = 0; switch (cmd) { case SIOCSIFADDR: - ifp->if_flags |= IFF_UP; - /* FALLTHROUGH */ - + break; case SIOCSIFFLAGS: - if (ifp->if_flags & IFF_UP) { - ifp->if_flags |= IFF_RUNNING; + if (ISSET(ifp->if_flags, IFF_UP)) { + if (!ISSET(ifp->if_flags, IFF_RUNNING)) + error = vxlan_up(sc); + else + error = 0; } else { - ifp->if_flags &= ~IFF_RUNNING; + if (ISSET(ifp->if_flags, IFF_RUNNING)) + error = vxlan_down(sc); } break; - case SIOCADDMULTI: - case SIOCDELMULTI: + case SIOCSLIFPHYRTABLE: + error = vxlan_set_rdomain(sc, ifr); break; - - case SIOCGIFMEDIA: - case SIOCSIFMEDIA: - error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); + case SIOCGLIFPHYRTABLE: + error = vxlan_get_rdomain(sc, ifr); break; case SIOCSLIFPHYADDR: - error = vxlan_config(ifp, - sstosa(&lifr->addr), - sstosa(&lifr->dstaddr)); + error = vxlan_set_tunnel(sc, (const struct if_laddrreq *)data); + break; + case SIOCGLIFPHYADDR: + error = vxlan_get_tunnel(sc, (struct if_laddrreq *)data); break; - case SIOCDIFPHYADDR: - vxlan_multicast_cleanup(ifp); - bzero(&sc->sc_src, sizeof(sc->sc_src)); - bzero(&sc->sc_dst, sizeof(sc->sc_dst)); - sc->sc_dstport = htons(VXLAN_PORT); + error = vxlan_del_tunnel(sc); break; - case 
SIOCGLIFPHYADDR: - if (sc->sc_dst.ss_family == AF_UNSPEC) { - error = EADDRNOTAVAIL; - break; - } - bzero(&lifr->addr, sizeof(lifr->addr)); - bzero(&lifr->dstaddr, sizeof(lifr->dstaddr)); - memcpy(&lifr->addr, &sc->sc_src, sc->sc_src.ss_len); - memcpy(&lifr->dstaddr, &sc->sc_dst, sc->sc_dst.ss_len); + case SIOCSVNETID: + error = vxlan_set_vnetid(sc, ifr); break; - - case SIOCSLIFPHYRTABLE: - if (ifr->ifr_rdomainid < 0 || - ifr->ifr_rdomainid > RT_TABLEID_MAX || - !rtable_exists(ifr->ifr_rdomainid)) { - error = EINVAL; - break; - } - sc->sc_rdomain = ifr->ifr_rdomainid; - (void)vxlan_config(ifp, NULL, NULL); + case SIOCGVNETID: + error = vxlan_get_vnetid(sc, ifr); + break; + case SIOCDVNETID: + error = vxlan_del_vnetid(sc); break; - case SIOCGLIFPHYRTABLE: - ifr->ifr_rdomainid = sc->sc_rdomain; + case SIOCSIFPARENT: + error = vxlan_set_parent(sc, (struct if_parent *)data); + break; + case SIOCGIFPARENT: + error = vxlan_get_parent(sc, (struct if_parent *)data); + break; + case SIOCDIFPARENT: + error = vxlan_del_parent(sc); break; - case SIOCSLIFPHYTTL: - if (ifr->ifr_ttl < 0 || ifr->ifr_ttl > 0xff) { - error = EINVAL; - break; - } - if (sc->sc_ttl == (u_int8_t)ifr->ifr_ttl) + case SIOCSTXHPRIO: + error = if_txhprio_l2_check(ifr->ifr_hdrprio); + if (error != 0) break; - sc->sc_ttl = (u_int8_t)(ifr->ifr_ttl); - (void)vxlan_config(ifp, NULL, NULL); - break; - case SIOCGLIFPHYTTL: - ifr->ifr_ttl = (int)sc->sc_ttl; + sc->sc_txhprio = ifr->ifr_hdrprio; + break; + case SIOCGTXHPRIO: + ifr->ifr_hdrprio = sc->sc_txhprio; + break; + + case SIOCSRXHPRIO: + error = if_rxhprio_l2_check(ifr->ifr_hdrprio); + if (error != 0) + break; + + sc->sc_rxhprio = ifr->ifr_hdrprio; + break; + case SIOCGRXHPRIO: + ifr->ifr_hdrprio = sc->sc_rxhprio; break; case SIOCSLIFPHYDF: @@ -496,50 +796,45 @@ vxlanioctl(struct ifnet *ifp, u_long cmd ifr->ifr_df = sc->sc_df ? 
1 : 0; break; - case SIOCSTXHPRIO: - if (ifr->ifr_hdrprio == IF_HDRPRIO_PACKET) - ; /* fall through */ - else if (ifr->ifr_hdrprio < IF_HDRPRIO_MIN || - ifr->ifr_hdrprio > IF_HDRPRIO_MAX) { + case SIOCSLIFPHYTTL: + if (ifr->ifr_ttl < 1 || ifr->ifr_ttl > 0xff) { error = EINVAL; break; } - sc->sc_txhprio = ifr->ifr_hdrprio; + /* commit */ + sc->sc_ttl = (uint8_t)ifr->ifr_ttl; break; - case SIOCGTXHPRIO: - ifr->ifr_hdrprio = sc->sc_txhprio; + case SIOCGLIFPHYTTL: + ifr->ifr_ttl = (int)sc->sc_ttl; break; - case SIOCSVNETID: - if (sc->sc_vnetid == ifr->ifr_vnetid) - break; - - if ((ifr->ifr_vnetid != VXLAN_VNI_ANY) && - (ifr->ifr_vnetid > VXLAN_VNI_MAX || - ifr->ifr_vnetid < VXLAN_VNI_MIN)) { - error = EINVAL; - break; - } - - sc->sc_vnetid = (int)ifr->ifr_vnetid; - (void)vxlan_config(ifp, NULL, NULL); + case SIOCBRDGSCACHE: + error = etherbridge_set_max(&sc->sc_eb, bparam); break; - - case SIOCGVNETID: - if ((sc->sc_vnetid != VXLAN_VNI_ANY) && - (sc->sc_vnetid > VXLAN_VNI_MAX || - sc->sc_vnetid < VXLAN_VNI_MIN)) { - error = EADDRNOTAVAIL; - break; - } - - ifr->ifr_vnetid = sc->sc_vnetid; + case SIOCBRDGGCACHE: + error = etherbridge_get_max(&sc->sc_eb, bparam); + break; + case SIOCBRDGSTO: + error = etherbridge_set_tmo(&sc->sc_eb, bparam); + break; + case SIOCBRDGGTO: + error = etherbridge_get_tmo(&sc->sc_eb, bparam); break; - case SIOCDVNETID: - sc->sc_vnetid = VXLAN_VNI_UNSET; - (void)vxlan_config(ifp, NULL, NULL); + case SIOCBRDGRTS: + error = etherbridge_rtfind(&sc->sc_eb, + (struct ifbaconf *)data); + break; + case SIOCBRDGFLUSH: + etherbridge_flush(&sc->sc_eb, + ((struct ifbreq *)data)->ifbr_ifsflags); + break; + case SIOCBRDGSADDR: + error = vxlan_add_addr(sc, (struct ifbareq *)data); + break; + case SIOCBRDGDADDR: + error = vxlan_del_addr(sc, (struct ifbareq *)data); break; default: @@ -550,465 +845,964 @@ vxlanioctl(struct ifnet *ifp, u_long cmd return (error); } -int -vxlan_media_change(struct ifnet *ifp) +static struct vxlan_tep * +vxlan_tep_get(struct 
vxlan_softc *sc, const union vxlan_addr *addr) { - return (0); -} + struct vxlan_tep *vt; -void -vxlan_media_status(struct ifnet *ifp, struct ifmediareq *imr) -{ - imr->ifm_status = IFM_AVALID | IFM_ACTIVE; + TAILQ_FOREACH(vt, &vxlan_teps, vt_entry) { + if (sc->sc_af == vt->vt_af && + sc->sc_rdomain == vt->vt_rdomain && + memcmp(addr, &vt->vt_addr, sizeof(*addr)) == 0 && + sc->sc_port == vt->vt_port) + return (vt); + } + + return (NULL); } -int -vxlan_sockaddr_cmp(struct sockaddr *srcsa, struct sockaddr *dstsa) +static int +vxlan_tep_add_addr(struct vxlan_softc *sc, const union vxlan_addr *addr, + struct vxlan_peer *p) { - struct sockaddr_in *src4, *dst4; + struct mbuf m; + struct vxlan_tep *vt; + struct socket *so; + struct sockaddr_in *sin; #ifdef INET6 - struct sockaddr_in6 *src6, *dst6; -#endif /* INET6 */ + struct sockaddr_in6 *sin6; +#endif + int error; + int s; - if (srcsa->sa_family != dstsa->sa_family) - return (1); + vt = vxlan_tep_get(sc, addr); + if (vt != NULL) { + struct vxlan_peer *op; + + mtx_enter(&vt->vt_mtx); + op = RBT_INSERT(vxlan_peers, &vt->vt_peers, p); + mtx_leave(&vt->vt_mtx); + + if (op != NULL) + return (EADDRINUSE); + + return (0); + } - switch (dstsa->sa_family) { + vt = malloc(sizeof(*vt), M_DEVBUF, M_NOWAIT|M_ZERO); + if (vt == NULL) + return (ENOMEM); + + vt->vt_af = sc->sc_af; + vt->vt_rdomain = sc->sc_rdomain; + vt->vt_addr = *addr; + vt->vt_port = sc->sc_port; + + mtx_init(&vt->vt_mtx, IPL_SOFTNET); + RBT_INIT(vxlan_peers, &vt->vt_peers); + RBT_INSERT(vxlan_peers, &vt->vt_peers, p); + + error = socreate(vt->vt_af, &so, SOCK_DGRAM, IPPROTO_UDP); + if (error != 0) + goto free; + + s = solock(so); + + sotoinpcb(so)->inp_upcall = vxlan_input; + sotoinpcb(so)->inp_upcall_arg = vt; + + m_inithdr(&m); + m.m_len = sizeof(vt->vt_rdomain); + *mtod(&m, unsigned int *) = vt->vt_rdomain; + error = sosetopt(so, SOL_SOCKET, SO_RTABLE, &m); + if (error != 0) + goto close; + + m_inithdr(&m); + switch (vt->vt_af) { case AF_INET: - src4 = 
satosin(srcsa); - dst4 = satosin(dstsa); - if (src4->sin_addr.s_addr == dst4->sin_addr.s_addr) - return (0); + sin = mtod(&m, struct sockaddr_in *); + memset(sin, 0, sizeof(*sin)); + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_addr = addr->in4; + sin->sin_port = vt->vt_port; + + m.m_len = sizeof(*sin); break; + #ifdef INET6 case AF_INET6: - src6 = satosin6(srcsa); - dst6 = satosin6(dstsa); - if (IN6_ARE_ADDR_EQUAL(&src6->sin6_addr, &dst6->sin6_addr) && - src6->sin6_scope_id == dst6->sin6_scope_id) - return (0); + sin6 = mtod(&m, struct sockaddr_in6 *); + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_family = AF_INET6; + in6_recoverscope(sin6, &addr->in6); + sin6->sin6_port = sc->sc_port; + + m.m_len = sizeof(*sin6); break; -#endif /* INET6 */ +#endif + default: + unhandled_af(vt->vt_af); } - return (1); + error = sobind(so, &m, curproc); + if (error != 0) + goto close; + + sounlock(so, s); + + rw_assert_wrlock(&vxlan_lock); + TAILQ_INSERT_TAIL(&vxlan_teps, vt, vt_entry); + + vt->vt_so = so; + + return (0); + +close: + sounlock(so, s); + soclose(so, MSG_DONTWAIT); +free: + free(vt, M_DEVBUF, sizeof(*vt)); + return (error); } -uint16_t -vxlan_sockaddr_port(struct sockaddr *sa) +static void +vxlan_tep_del_addr(struct vxlan_softc *sc, const union vxlan_addr *addr, + struct vxlan_peer *p) { - struct sockaddr_in *sin4; -#ifdef INET6 - struct sockaddr_in6 *sin6; -#endif /* INET6 */ + struct vxlan_tep *vt; + int empty; - switch (sa->sa_family) { - case AF_INET: - sin4 = satosin(sa); - return (sin4->sin_port); -#ifdef INET6 - case AF_INET6: - sin6 = satosin6(sa); - return (sin6->sin6_port); -#endif /* INET6 */ - default: - break; - } + vt = vxlan_tep_get(sc, addr); + if (vt == NULL) + panic("unable to find vxlan_tep for peer %p (sc %p)", p, sc); + + mtx_enter(&vt->vt_mtx); + RBT_REMOVE(vxlan_peers, &vt->vt_peers, p); + empty = RBT_EMPTY(vxlan_peers, &vt->vt_peers); + mtx_leave(&vt->vt_mtx); + + if (!empty) + return; + + rw_assert_wrlock(&vxlan_lock); 
+	TAILQ_REMOVE(&vxlan_teps, vt, vt_entry);
+
+	soclose(vt->vt_so, MSG_DONTWAIT);
+	free(vt, M_DEVBUF, sizeof(*vt));
+}
+
+static int
+vxlan_tep_up(struct vxlan_softc *sc)
+{
+	struct vxlan_peer *up, *mp;
+	int error;
+
+	up = malloc(sizeof(*up), M_DEVBUF, M_NOWAIT|M_ZERO);
+	if (up == NULL)
+		return (ENOMEM);
+
+	up->p_mask = (sc->sc_mode != VXLAN_TMODE_P2P);
+	up->p_addr = sc->sc_dst;
+	up->p_header = sc->sc_header;
+	up->p_sc = vxlan_take(sc);
+
+	error = vxlan_tep_add_addr(sc, &sc->sc_src, up);
+	if (error != 0)
+		goto freeup;
+
+	sc->sc_ucast_peer = up;
+
+	if (sc->sc_mode != VXLAN_TMODE_LEARNING)
+		return (0);
+
+	mp = malloc(sizeof(*mp), M_DEVBUF, M_NOWAIT|M_ZERO);
+	if (mp == NULL) {
+		error = ENOMEM;
+		goto delup;
+	}
+
+	mp->p_mask = 1;
+	/* addr is masked, leave it as 0s */
+	mp->p_header = sc->sc_header;
+	mp->p_sc = vxlan_take(sc);
+
+	/* destination address is a multicast group we want to join */
+	error = vxlan_tep_add_addr(sc, &sc->sc_dst, mp);
+	if (error != 0)
+		goto freemp;
+
+	sc->sc_mcast_peer = mp;
 
 	return (0);
+
+freemp:
+	vxlan_rele(mp->p_sc);
+	free(mp, M_DEVBUF, sizeof(*mp));
+delup:
+	vxlan_tep_del_addr(sc, &sc->sc_src, up);
+freeup:
+	vxlan_rele(up->p_sc);
+	free(up, M_DEVBUF, sizeof(*up));
+	return (error);
 }
 
-int
-vxlan_lookup(struct mbuf *m, struct udphdr *uh, int iphlen,
-    struct sockaddr *srcsa, struct sockaddr *dstsa)
-{
-	struct vxlan_softc	*sc = NULL, *sc_cand = NULL;
-	struct vxlan_header	 v;
-	int			 vni;
-	struct ifnet		*ifp;
-	int			 skip;
-#if NBRIDGE > 0
-	struct bridge_tunneltag	*brtag;
-#endif
-	struct mbuf		*n;
-	int			 off;
-
-	/* XXX Should verify the UDP port first before copying the packet */
-	skip = iphlen + sizeof(*uh);
-	if (m->m_pkthdr.len - skip < sizeof(v))
-		return (0);
-	m_copydata(m, skip, sizeof(v), &v);
-	skip += sizeof(v);
-
-	if (v.vxlan_flags & htonl(VXLAN_RESERVED1) ||
-	    v.vxlan_id & htonl(VXLAN_RESERVED2))
-		return (0);
-
-	vni = ntohl(v.vxlan_id) >> VXLAN_VNI_S;
-	if ((v.vxlan_flags & htonl(VXLAN_FLAGS_VNI)) == 
0) { - if (vni != 0) - return (0); +static void +vxlan_tep_down(struct vxlan_softc *sc) +{ + struct vxlan_peer *up = sc->sc_ucast_peer; - vni = VXLAN_VNI_UNSET; + if (sc->sc_mode == VXLAN_TMODE_LEARNING) { + struct vxlan_peer *mp = sc->sc_mcast_peer; + vxlan_tep_del_addr(sc, &sc->sc_dst, mp); + vxlan_rele(mp->p_sc); + free(mp, M_DEVBUF, sizeof(*mp)); } + vxlan_tep_del_addr(sc, &sc->sc_src, up); + vxlan_rele(up->p_sc); + free(up, M_DEVBUF, sizeof(*up)); +} + +static int +vxlan_up(struct vxlan_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + struct ifnet *ifp0 = NULL; + int error; + + KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING)); NET_ASSERT_LOCKED(); - /* First search for a vxlan(4) interface with the packet's VNI */ - LIST_FOREACH(sc, &vxlan_tagh[VXLAN_TAGHASH(vni)], sc_entry) { - if ((uh->uh_dport == sc->sc_dstport) && - vni == sc->sc_vnetid && - sc->sc_rdomain == rtable_l2(m->m_pkthdr.ph_rtableid)) { - sc_cand = sc; - if (vxlan_sockaddr_cmp(srcsa, sstosa(&sc->sc_dst)) == 0) - goto found; - } + + if (sc->sc_af == AF_UNSPEC) + return (EDESTADDRREQ); + KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET); + + NET_UNLOCK(); + + error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR); + if (error != 0) + goto netlock; + + NET_LOCK(); + if (ISSET(ifp->if_flags, IFF_RUNNING)) { + /* something else beat us */ + rw_exit(&vxlan_lock); + return (0); } + NET_UNLOCK(); - /* - * Now loop through all the vxlan(4) interfaces that are configured - * to accept any VNI and operating in multipoint-to-multipoint mode - * that is used in combination with bridge(4) or switch(4). - * If a vxlan(4) interface has been found for the packet's VNI, this - * code is not reached as the other interface is more specific. 
- */ - LIST_FOREACH(sc, &vxlan_any, sc_entry) { - if ((uh->uh_dport == sc->sc_dstport) && - (sc->sc_rdomain == rtable_l2(m->m_pkthdr.ph_rtableid))) { - sc_cand = sc; - goto found; - } + if (sc->sc_mode != VXLAN_TMODE_P2P) { + error = etherbridge_up(&sc->sc_eb); + if (error != 0) + goto unlock; } - if (sc_cand) { - sc = sc_cand; - goto found; + if (sc->sc_mode == VXLAN_TMODE_LEARNING) { + ifp0 = if_get(sc->sc_if_index0); + if (ifp0 == NULL) { + error = ENXIO; + goto down; + } + + /* check again if multicast will work on top of the parent */ + if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) { + error = EPROTONOSUPPORT; + goto put; + } + + error = vxlan_addmulti(sc, ifp0); + if (error != 0) + goto put; + + /* Register callback if parent wants to unregister */ + if_detachhook_add(ifp0, &sc->sc_dtask); + } else { + if (sc->sc_if_index0 != 0) { + error = EPROTONOSUPPORT; + goto down; + } } - /* not found */ + error = vxlan_tep_up(sc); + if (error != 0) + goto del; + + if_put(ifp0); + + NET_LOCK(); + SET(ifp->if_flags, IFF_RUNNING); + rw_exit(&vxlan_lock); + return (0); - found: - if (m->m_pkthdr.len < skip + sizeof(struct ether_header)) { - m_freem(m); - return (EINVAL); +del: + if (ifp0 != NULL) + if_detachhook_del(ifp0, &sc->sc_dtask); + vxlan_delmulti(sc); +put: + if_put(ifp0); +down: + if (sc->sc_mode != VXLAN_TMODE_P2P) + etherbridge_down(&sc->sc_eb); +unlock: + rw_exit(&vxlan_lock); +netlock: + NET_LOCK(); + + return (error); +} + +static int +vxlan_down(struct vxlan_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + struct ifnet *ifp0; + int error; + + KASSERT(ISSET(ifp->if_flags, IFF_RUNNING)); + NET_UNLOCK(); + + error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR); + if (error != 0) { + NET_LOCK(); + return (error); } - m_adj(m, skip); - ifp = &sc->sc_ac.ac_if; + NET_LOCK(); + if (!ISSET(ifp->if_flags, IFF_RUNNING)) { + /* something else beat us */ + rw_exit(&vxlan_lock); + return (0); + } + NET_UNLOCK(); + + vxlan_tep_down(sc); -#if NBRIDGE > 0 - /* Store the 
tunnel src/dst IP and vni for the bridge or switch */ - if ((ifp->if_bridgeidx != 0 || ifp->if_switchport != NULL) && - srcsa->sa_family != AF_UNSPEC && - ((brtag = bridge_tunneltag(m)) != NULL)) { - memcpy(&brtag->brtag_peer.sa, srcsa, srcsa->sa_len); - memcpy(&brtag->brtag_local.sa, dstsa, dstsa->sa_len); - brtag->brtag_id = vni; + if (sc->sc_mode == VXLAN_TMODE_LEARNING) { + vxlan_delmulti(sc); + ifp0 = if_get(sc->sc_if_index0); + if (ifp0 != NULL) { + if_detachhook_del(ifp0, &sc->sc_dtask); + } + if_put(ifp0); } -#endif - m->m_flags &= ~(M_BCAST|M_MCAST); + if (sc->sc_mode != VXLAN_TMODE_P2P) + etherbridge_down(&sc->sc_eb); -#if NPF > 0 - pf_pkt_addr_changed(m); -#endif - if ((m->m_len < sizeof(struct ether_header)) && - (m = m_pullup(m, sizeof(struct ether_header))) == NULL) - return (ENOBUFS); + taskq_del_barrier(ifp->if_snd.ifq_softnet, &sc->sc_send_task); + NET_LOCK(); + CLR(ifp->if_flags, IFF_RUNNING); + rw_exit(&vxlan_lock); - n = m_getptr(m, sizeof(struct ether_header), &off); - if (n == NULL) { - m_freem(m); - return (EINVAL); + return (0); +} + +static int +vxlan_addmulti(struct vxlan_softc *sc, struct ifnet *ifp0) +{ + int error = 0; + + NET_LOCK(); + + switch (sc->sc_af) { + case AF_INET: + sc->sc_inmulti = in_addmulti(&sc->sc_dst.in4, ifp0); + if (sc->sc_inmulti == NULL) + error = EADDRNOTAVAIL; + break; +#ifdef INET6 + case AF_INET6: + sc->sc_inmulti = in6_addmulti(&sc->sc_dst.in6, ifp0, &error); + break; +#endif + default: + unhandled_af(sc->sc_af); } - if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) { - n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT); - /* Dispose of the original mbuf chain */ - m_freem(m); - if (n == NULL) - return (ENOBUFS); - m = n; + + NET_UNLOCK(); + + return (error); +} + +static void +vxlan_delmulti(struct vxlan_softc *sc) +{ + NET_LOCK(); + + switch (sc->sc_af) { + case AF_INET: + in_delmulti(sc->sc_inmulti); + break; +#ifdef INET6 + case AF_INET6: + in6_delmulti(sc->sc_inmulti); + break; +#endif + default: + 
unhandled_af(sc->sc_af); } - if_vinput(ifp, m); + sc->sc_inmulti = NULL; /* keep it tidy */ - /* success */ - return (1); + NET_UNLOCK(); } -struct mbuf * -vxlan_encap4(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *src, struct sockaddr *dst) -{ - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; - struct ip *ip; - - /* - * Remove multicast and broadcast flags or encapsulated packet - * ends up as multicast or broadcast packet. - */ - m->m_flags &= ~(M_BCAST|M_MCAST); +static int +vxlan_set_rdomain(struct vxlan_softc *sc, const struct ifreq *ifr) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; - M_PREPEND(m, sizeof(*ip), M_DONTWAIT); - if (m == NULL) - return (NULL); + if (ifr->ifr_rdomainid < 0 || + ifr->ifr_rdomainid > RT_TABLEID_MAX) + return (EINVAL); + if (!rtable_exists(ifr->ifr_rdomainid)) + return (EADDRNOTAVAIL); - ip = mtod(m, struct ip *); - ip->ip_v = IPVERSION; - ip->ip_hl = sizeof(struct ip) >> 2; - ip->ip_id = htons(ip_randomid()); - ip->ip_off = sc->sc_df; - ip->ip_p = IPPROTO_UDP; - ip->ip_tos = IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ? 
- m->m_pkthdr.pf.prio : sc->sc_txhprio); - ip->ip_len = htons(m->m_pkthdr.len); + if (sc->sc_rdomain == ifr->ifr_rdomainid) + return (0); - ip->ip_src = satosin(src)->sin_addr; - ip->ip_dst = satosin(dst)->sin_addr; + if (!ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); - if (sc->sc_ttl > 0) - ip->ip_ttl = sc->sc_ttl; - else - ip->ip_ttl = IPDEFTTL; + /* commit */ + sc->sc_rdomain = ifr->ifr_rdomainid; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); - return (m); + return (0); +} + +static int +vxlan_get_rdomain(struct vxlan_softc *sc, struct ifreq *ifr) +{ + ifr->ifr_rdomainid = sc->sc_rdomain; + + return (0); } +static int +vxlan_set_tunnel(struct vxlan_softc *sc, const struct if_laddrreq *req) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + struct sockaddr *src = (struct sockaddr *)&req->addr; + struct sockaddr *dst = (struct sockaddr *)&req->dstaddr; + struct sockaddr_in *src4, *dst4; #ifdef INET6 -struct mbuf * -vxlan_encap6(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *src, struct sockaddr *dst) -{ - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; - struct ip6_hdr *ip6; - struct in6_addr *in6a; - uint32_t flow; - - /* - * Remove multicast and broadcast flags or encapsulated packet - * ends up as multicast or broadcast packet. - */ - m->m_flags &= ~(M_BCAST|M_MCAST); + struct sockaddr_in6 *src6, *dst6; + int error; +#endif + union vxlan_addr saddr, daddr; + unsigned int mode = VXLAN_TMODE_ENDPOINT; + in_port_t port = htons(VXLAN_PORT); - M_PREPEND(m, sizeof(struct ip6_hdr), M_DONTWAIT); - if (m == NULL) - return (NULL); + memset(&saddr, 0, sizeof(saddr)); + memset(&daddr, 0, sizeof(daddr)); - flow = (uint32_t)IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ? 
- m->m_pkthdr.pf.prio : sc->sc_txhprio) << 20; + /* validate */ + switch (src->sa_family) { + case AF_INET: + src4 = (struct sockaddr_in *)src; + if (in_nullhost(src4->sin_addr) || + IN_MULTICAST(src4->sin_addr.s_addr)) + return (EINVAL); - ip6 = mtod(m, struct ip6_hdr *); - ip6->ip6_flow = htonl(flow); - ip6->ip6_vfc &= ~IPV6_VERSION_MASK; - ip6->ip6_vfc |= IPV6_VERSION; - ip6->ip6_nxt = IPPROTO_UDP; - ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); - if (in6_embedscope(&ip6->ip6_src, satosin6(src), NULL) != 0) - goto drop; - if (in6_embedscope(&ip6->ip6_dst, satosin6(dst), NULL) != 0) - goto drop; + if (src4->sin_port != htons(0)) + port = src4->sin_port; - if (sc->sc_ttl > 0) - ip6->ip6_hlim = sc->sc_ttl; - else - ip6->ip6_hlim = ip6_defhlim; + if (dst->sa_family != AF_UNSPEC) { + if (dst->sa_family != AF_INET) + return (EINVAL); + + dst4 = (struct sockaddr_in *)dst; + if (in_nullhost(dst4->sin_addr)) + return (EINVAL); + + /* all good */ + mode = IN_MULTICAST(dst4->sin_addr.s_addr) ? + VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P; + daddr.in4 = dst4->sin_addr; + } - if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)) { - if (in6_selectsrc(&in6a, satosin6(dst), NULL, - sc->sc_rdomain) != 0) - goto drop; + saddr.in4 = src4->sin_addr; + break; + +#ifdef INET6 + case AF_INET6: + src6 = (struct sockaddr_in6 *)src; + if (IN6_IS_ADDR_UNSPECIFIED(&src6->sin6_addr) || + IN6_IS_ADDR_MULTICAST(&src6->sin6_addr)) + return (EINVAL); + + if (src6->sin6_port != htons(0)) + port = src6->sin6_port; - ip6->ip6_src = *in6a; + if (dst->sa_family != AF_UNSPEC) { + if (dst->sa_family != AF_INET6) + return (EINVAL); + + dst6 = (struct sockaddr_in6 *)dst; + if (IN6_IS_ADDR_UNSPECIFIED(&dst6->sin6_addr)) + return (EINVAL); + + if (src6->sin6_scope_id != dst6->sin6_scope_id) + return (EINVAL); + + /* all good */ + mode = IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr) ? 
+ VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P; + error = in6_embedscope(&daddr.in6, dst6, NULL); + if (error != 0) + return (error); + } + + error = in6_embedscope(&saddr.in6, src6, NULL); + if (error != 0) + return (error); + + break; +#endif + default: + return (EAFNOSUPPORT); } - if (sc->sc_df) - SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); + if (memcmp(&sc->sc_src, &saddr, sizeof(sc->sc_src)) == 0 && + memcmp(&sc->sc_dst, &daddr, sizeof(sc->sc_dst)) == 0 && + sc->sc_port == port) + return (0); - /* - * The UDP checksum of VXLAN packets should be set to zero, - * but the IPv6 UDP checksum is not optional. There is an RFC 6539 - * to relax the IPv6 UDP checksum requirement for tunnels, but it - * is currently not supported by most implementations. - */ - m->m_pkthdr.csum_flags |= M_UDP_CSUM_OUT; + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); - return (m); + /* commit */ + sc->sc_af = src->sa_family; + sc->sc_src = saddr; + sc->sc_dst = daddr; + sc->sc_port = port; + sc->sc_mode = mode; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); -drop: - m_freem(m); - return (NULL); + return (0); } -#endif /* INET6 */ -int -vxlan_output(struct ifnet *ifp, struct mbuf *m) +static int +vxlan_get_tunnel(struct vxlan_softc *sc, struct if_laddrreq *req) { - struct vxlan_softc *sc = (struct vxlan_softc *)ifp->if_softc; - struct vxlanudphdr *vu; - struct sockaddr *src, *dst; -#if NBRIDGE > 0 - struct bridge_tunneltag *brtag; -#endif - int error, af; - uint32_t tag; - struct mbuf *m0; - - /* VXLAN header, needs new mbuf because of alignment issues */ - MGET(m0, M_DONTWAIT, m->m_type); - if (m0 == NULL) { - ifp->if_oerrors++; - return (ENOBUFS); - } - M_MOVE_PKTHDR(m0, m); - m0->m_next = m; - m = m0; - m_align(m, sizeof(*vu)); - m->m_len = sizeof(*vu); - m->m_pkthdr.len += sizeof(*vu); - - src = sstosa(&sc->sc_src); - dst = sstosa(&sc->sc_dst); - af = src->sa_family; - - vu = mtod(m, struct vxlanudphdr *); - vu->vu_u.uh_sport = sc->sc_dstport; - vu->vu_u.uh_dport = 
sc->sc_dstport; - vu->vu_u.uh_ulen = htons(m->m_pkthdr.len); - vu->vu_u.uh_sum = 0; - tag = sc->sc_vnetid; - -#if NBRIDGE > 0 - if ((brtag = bridge_tunnel(m)) != NULL) { - dst = &brtag->brtag_peer.sa; - - /* If accepting any VNI, source ip address is from brtag */ - if (sc->sc_vnetid == VXLAN_VNI_ANY) { - src = &brtag->brtag_local.sa; - tag = (uint32_t)brtag->brtag_id; - af = src->sa_family; - } - - if (dst->sa_family != af) { - ifp->if_oerrors++; - m_freem(m); - return (EINVAL); - } - } else + struct sockaddr *dstaddr = (struct sockaddr *)&req->dstaddr; + struct sockaddr_in *sin; +#ifdef INET6 + struct sockaddr_in6 *sin6; #endif - if (sc->sc_vnetid == VXLAN_VNI_ANY) { - /* - * If accepting any VNI, build the vxlan header only by - * bridge_tunneltag or drop packet if the tag does not exist. - */ - ifp->if_oerrors++; - m_freem(m); - return (ENETUNREACH); - } - if (sc->sc_vnetid != VXLAN_VNI_UNSET) { - vu->vu_v.vxlan_flags = htonl(VXLAN_FLAGS_VNI); - vu->vu_v.vxlan_id = htonl(tag << VXLAN_VNI_S); - } else { - vu->vu_v.vxlan_flags = htonl(0); - vu->vu_v.vxlan_id = htonl(0); - } + if (sc->sc_af == AF_UNSPEC) + return (EADDRNOTAVAIL); + KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET); + + memset(&req->addr, 0, sizeof(req->addr)); + memset(&req->dstaddr, 0, sizeof(req->dstaddr)); - switch (af) { + /* default to endpoint */ + dstaddr->sa_len = 2; + dstaddr->sa_family = AF_UNSPEC; + + switch (sc->sc_af) { case AF_INET: - m = vxlan_encap4(ifp, m, src, dst); + sin = (struct sockaddr_in *)&req->addr; + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_addr = sc->sc_src.in4; + sin->sin_port = sc->sc_port; + + if (sc->sc_mode == VXLAN_TMODE_ENDPOINT) + break; + + sin = (struct sockaddr_in *)&req->dstaddr; + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_addr = sc->sc_dst.in4; break; + #ifdef INET6 case AF_INET6: - m = vxlan_encap6(ifp, m, src, dst); + sin6 = (struct sockaddr_in6 *)&req->addr; + sin6->sin6_len = sizeof(*sin6); + 
sin6->sin6_family = AF_INET6; + in6_recoverscope(sin6, &sc->sc_src.in6); + sin6->sin6_port = sc->sc_port; + + if (sc->sc_mode == VXLAN_TMODE_ENDPOINT) + break; + + sin6 = (struct sockaddr_in6 *)&req->dstaddr; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_family = AF_INET6; + in6_recoverscope(sin6, &sc->sc_dst.in6); break; -#endif /* INET6 */ +#endif default: - m_freem(m); - m = NULL; + unhandled_af(sc->sc_af); } - if (m == NULL) { - ifp->if_oerrors++; - return (ENOBUFS); + return (0); +} + +static int +vxlan_del_tunnel(struct vxlan_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + + if (sc->sc_af == AF_UNSPEC) + return (0); + + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); + + /* commit */ + sc->sc_af = AF_UNSPEC; + memset(&sc->sc_src, 0, sizeof(sc->sc_src)); + memset(&sc->sc_dst, 0, sizeof(sc->sc_dst)); + sc->sc_port = htons(0); + sc->sc_mode = VXLAN_TMODE_UNSET; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); + + return (0); +} + +static int +vxlan_set_vnetid(struct vxlan_softc *sc, const struct ifreq *ifr) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + uint32_t vni; + + if (ifr->ifr_vnetid < VXLAN_VNI_MIN || + ifr->ifr_vnetid > VXLAN_VNI_MAX) + return (EINVAL); + + vni = htonl(ifr->ifr_vnetid << VXLAN_VNI_SHIFT); + if (ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)) && + sc->sc_header.vxlan_id == vni) + return (0); + + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); + + /* commit */ + SET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)); + sc->sc_header.vxlan_id = vni; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); + + return (0); +} + +static int +vxlan_get_vnetid(struct vxlan_softc *sc, struct ifreq *ifr) +{ + uint32_t vni; + + if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I))) + return (EADDRNOTAVAIL); + + vni = ntohl(sc->sc_header.vxlan_id); + vni &= VXLAN_VNI_MASK; + vni >>= VXLAN_VNI_SHIFT; + + ifr->ifr_vnetid = vni; + + return (0); +} + +static int +vxlan_del_vnetid(struct vxlan_softc *sc) +{ + struct ifnet *ifp = 
&sc->sc_ac.ac_if; + + if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I))) + return (0); + + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); + + /* commit */ + CLR(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)); + sc->sc_header.vxlan_id = htonl(0 << VXLAN_VNI_SHIFT); + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); + + return (0); +} + +static int +vxlan_set_parent(struct vxlan_softc *sc, const struct if_parent *p) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + struct ifnet *ifp0; + int error = 0; + + ifp0 = if_unit(p->ifp_parent); + if (ifp0 == NULL) + return (ENXIO); + + if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) { + error = ENXIO; + goto put; } -#if NBRIDGE > 0 - if (brtag != NULL) - bridge_tunneluntag(m); -#endif + if (sc->sc_if_index0 == ifp0->if_index) + goto put; - m->m_pkthdr.ph_rtableid = sc->sc_rdomain; + if (ISSET(ifp->if_flags, IFF_RUNNING)) { + error = EBUSY; + goto put; + } -#if NPF > 0 - pf_pkt_addr_changed(m); + /* commit */ + sc->sc_if_index0 = ifp0->if_index; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); + +put: + if_put(ifp0); + return (error); +} + +static int +vxlan_get_parent(struct vxlan_softc *sc, struct if_parent *p) +{ + struct ifnet *ifp0; + int error = 0; + + ifp0 = if_get(sc->sc_if_index0); + if (ifp0 == NULL) + error = EADDRNOTAVAIL; + else + strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent)); + if_put(ifp0); + + return (error); +} + +static int +vxlan_del_parent(struct vxlan_softc *sc) +{ + struct ifnet *ifp = &sc->sc_ac.ac_if; + + if (sc->sc_if_index0 == 0) + return (0); + + if (ISSET(ifp->if_flags, IFF_RUNNING)) + return (EBUSY); + + /* commit */ + sc->sc_if_index0 = 0; + etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL); + + return (0); +} + +static int +vxlan_add_addr(struct vxlan_softc *sc, const struct ifbareq *ifba) +{ + struct sockaddr_in *sin; +#ifdef INET6 + struct sockaddr_in6 *sin6; + struct sockaddr_in6 src6 = { + .sin6_len = sizeof(src6), + .sin6_family = AF_UNSPEC, + }; + int error; #endif + union 
vxlan_addr endpoint; + unsigned int type; + + switch (sc->sc_mode) { + case VXLAN_TMODE_UNSET: + return (ENOPROTOOPT); + case VXLAN_TMODE_P2P: + return (EPROTONOSUPPORT); + default: + break; + } + + /* ignore ifba_ifsname */ + + if (ISSET(ifba->ifba_flags, ~IFBAF_TYPEMASK)) + return (EINVAL); + switch (ifba->ifba_flags & IFBAF_TYPEMASK) { + case IFBAF_DYNAMIC: + type = EBE_DYNAMIC; + break; + case IFBAF_STATIC: + type = EBE_STATIC; + break; + default: + return (EINVAL); + } + + memset(&endpoint, 0, sizeof(endpoint)); - switch (af) { + if (ifba->ifba_dstsa.ss_family != sc->sc_af) + return (EAFNOSUPPORT); + switch (ifba->ifba_dstsa.ss_family) { case AF_INET: - error = ip_output(m, NULL, NULL, IP_RAWOUTPUT, - &sc->sc_imo, NULL, 0); + sin = (struct sockaddr_in *)&ifba->ifba_dstsa; + if (in_nullhost(sin->sin_addr) || + IN_MULTICAST(sin->sin_addr.s_addr)) + return (EADDRNOTAVAIL); + + if (sin->sin_port != htons(0)) + return (EADDRNOTAVAIL); + + endpoint.in4 = sin->sin_addr; break; + #ifdef INET6 case AF_INET6: - error = ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL); + sin6 = (struct sockaddr_in6 *)&ifba->ifba_dstsa; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || + IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) + return (EADDRNOTAVAIL); + + in6_recoverscope(&src6, &sc->sc_src.in6); + if (src6.sin6_scope_id != sin6->sin6_scope_id) + return (EADDRNOTAVAIL); + + if (sin6->sin6_port != htons(0)) + return (EADDRNOTAVAIL); + + error = in6_embedscope(&endpoint.in6, sin6, NULL); + if (error != 0) + return (error); + break; -#endif /* INET6 */ - default: - m_freem(m); - error = EAFNOSUPPORT; +#endif + default: /* AF_UNSPEC */ + return (EADDRNOTAVAIL); } - if (error) - ifp->if_oerrors++; + return (etherbridge_add_addr(&sc->sc_eb, &endpoint, + &ifba->ifba_dst, type)); +} - return (error); +static int +vxlan_del_addr(struct vxlan_softc *sc, const struct ifbareq *ifba) +{ + return (etherbridge_del_addr(&sc->sc_eb, &ifba->ifba_dst)); } void -vxlan_addr_change(void *arg) 
+vxlan_detach_hook(void *arg) { - struct vxlan_softc *sc = arg; - struct ifnet *ifp = &sc->sc_ac.ac_if; - int error; - - /* - * Reset the configuration after resume or any possible address - * configuration changes. - */ - if ((error = vxlan_config(ifp, NULL, NULL))) { - /* - * The source address of the tunnel can temporarily disappear, - * after a link state change when running the DHCP client, - * so keep it configured. - */ + struct vxlan_softc *sc = arg; + struct ifnet *ifp = &sc->sc_ac.ac_if; + + if (ISSET(ifp->if_flags, IFF_RUNNING)) { + vxlan_down(sc); + CLR(ifp->if_flags, IFF_UP); } + + sc->sc_if_index0 = 0; } -void -vxlan_if_change(void *arg) +static int +vxlan_eb_port_eq(void *arg, void *a, void *b) { - struct vxlan_softc *sc = arg; - struct ifnet *ifp = &sc->sc_ac.ac_if; + const union vxlan_addr *va = a, *vb = b; + size_t i; - /* - * Reset the configuration after the parent interface disappeared. - */ - vxlan_multicast_cleanup(ifp); - memset(&sc->sc_src, 0, sizeof(sc->sc_src)); - memset(&sc->sc_dst, 0, sizeof(sc->sc_dst)); - sc->sc_dstport = htons(VXLAN_PORT); + for (i = 0; i < nitems(va->in6.s6_addr32); i++) { + if (va->in6.s6_addr32[i] != vb->in6.s6_addr32[i]) + return (0); + } + + return (1); } -void -vxlan_link_change(void *arg) +static void * +vxlan_eb_port_take(void *arg, void *port) { - struct vxlan_softc *sc = arg; - struct ifnet *ifp = &sc->sc_ac.ac_if; + union vxlan_addr *endpoint; + + endpoint = pool_get(&vxlan_endpoint_pool, PR_NOWAIT); + if (endpoint == NULL) + return (NULL); + + *endpoint = *(union vxlan_addr *)port; - /* - * The machine might have lost its multicast associations after - * link state changes. This fixes a problem with VMware after - * suspend/resume of the host or guest. 
- */ - (void)vxlan_config(ifp, NULL, NULL); + return (endpoint); } + +static void +vxlan_eb_port_rele(void *arg, void *port) +{ + union vxlan_addr *endpoint = port; + + pool_put(&vxlan_endpoint_pool, endpoint); +} + +static size_t +vxlan_eb_port_ifname(void *arg, char *dst, size_t len, void *port) +{ + struct vxlan_softc *sc = arg; + + return (strlcpy(dst, sc->sc_ac.ac_if.if_xname, len)); +} + +static void +vxlan_eb_port_sa(void *arg, struct sockaddr_storage *ss, void *port) +{ + struct vxlan_softc *sc = arg; + union vxlan_addr *endpoint = port; + + switch (sc->sc_af) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)ss; + + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_addr = endpoint->in4; + break; + } +#ifdef INET6 + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss; + + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_family = AF_INET6; + in6_recoverscope(sin6, &endpoint->in6); + break; + } +#endif /* INET6 */ + default: + unhandled_af(sc->sc_af); + } +} + +static inline int +vxlan_peer_cmp(const struct vxlan_peer *ap, const struct vxlan_peer *bp) +{ + size_t i; + + if (ap->p_header.vxlan_id > bp->p_header.vxlan_id) + return (1); + if (ap->p_header.vxlan_id < bp->p_header.vxlan_id) + return (-1); + if (ap->p_header.vxlan_flags > bp->p_header.vxlan_flags) + return (1); + if (ap->p_header.vxlan_flags < bp->p_header.vxlan_flags) + return (-1); + + if (ap->p_mask || bp->p_mask) + return (0); + + for (i = 0; i < nitems(ap->p_addr.in6.s6_addr32); i++) { + if (ap->p_addr.in6.s6_addr32[i] > + bp->p_addr.in6.s6_addr32[i]) + return (1); + if (ap->p_addr.in6.s6_addr32[i] < + bp->p_addr.in6.s6_addr32[i]) + return (-1); + } + + return (0); +} + +RBT_GENERATE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp); Index: if_wg.c =================================================================== RCS file: /cvs/src/sys/net/if_wg.c,v retrieving revision 1.15 diff -u -p -r1.15 if_wg.c --- if_wg.c 25 Jan 2021 09:11:36 
-0000 1.15 +++ if_wg.c 9 Mar 2021 04:28:41 -0000 @@ -646,11 +646,16 @@ wg_aip_remove(struct wg_softc *sc, struc int ret = 0; switch (d->a_af) { - case AF_INET: root = sc->sc_aip4; break; + case AF_INET: + root = sc->sc_aip4; + break; #ifdef INET6 - case AF_INET6: root = sc->sc_aip6; break; + case AF_INET6: + root = sc->sc_aip6; + break; #endif - default: return EAFNOSUPPORT; + default: + return (EAFNOSUPPORT); } rw_enter_write(&root->ar_lock); @@ -667,9 +672,8 @@ wg_aip_remove(struct wg_softc *sc, struc LIST_REMOVE(aip, a_entry); pool_put(&wg_aip_pool, aip); } - - srp_leave(&sr); rw_exit_write(&root->ar_lock); + return ret; } Index: ifq.c =================================================================== RCS file: /cvs/src/sys/net/ifq.c,v retrieving revision 1.43 diff -u -p -r1.43 ifq.c --- ifq.c 20 Feb 2021 04:37:26 -0000 1.43 +++ ifq.c 9 Mar 2021 04:28:41 -0000 @@ -500,6 +500,9 @@ ifq_hdatalen(struct ifqueue *ifq) struct mbuf *m; int len = 0; + if (ifq_empty(ifq)) + return (0); + m = ifq_deq_begin(ifq); if (m != NULL) { len = m->m_pkthdr.len; Index: netisr.h =================================================================== RCS file: /cvs/src/sys/net/netisr.h,v retrieving revision 1.55 diff -u -p -r1.55 netisr.h --- netisr.h 5 Jan 2021 20:43:36 -0000 1.55 +++ netisr.h 9 Mar 2021 04:28:41 -0000 @@ -41,8 +41,10 @@ * interrupt used for scheduling the network code to calls * on the lowest level routine of each protocol. 
*/ +#define NETISR_IP 2 /* same as AF_INET */ #define NETISR_PFSYNC 5 /* for pfsync "immediate" tx */ #define NETISR_ARP 18 /* same as AF_LINK */ +#define NETISR_IPV6 24 /* same as AF_INET6 */ #define NETISR_PPP 28 /* for PPP processing */ #define NETISR_BRIDGE 29 /* for bridge processing */ #define NETISR_SWITCH 31 /* for switch dataplane */ @@ -57,6 +59,8 @@ extern int netisr; /* scheduling bits extern struct task if_input_task_locked; void arpintr(void); +void ipintr(void); +void ip6intr(void); void pppintr(void); void bridgeintr(void); void switchintr(void); Index: pf.c =================================================================== RCS file: /cvs/src/sys/net/pf.c,v retrieving revision 1.1113 diff -u -p -r1.1113 pf.c --- pf.c 1 Mar 2021 11:05:42 -0000 1.1113 +++ pf.c 9 Mar 2021 04:28:41 -0000 @@ -3239,14 +3239,14 @@ pf_socket_lookup(struct pf_pdesc *pd) sport = pd->hdr.tcp.th_sport; dport = pd->hdr.tcp.th_dport; PF_ASSERT_LOCKED(); - NET_ASSERT_LOCKED(); + //NET_ASSERT_LOCKED(); tb = &tcbtable; break; case IPPROTO_UDP: sport = pd->hdr.udp.uh_sport; dport = pd->hdr.udp.uh_dport; PF_ASSERT_LOCKED(); - NET_ASSERT_LOCKED(); + //NET_ASSERT_LOCKED(); tb = &udbtable; break; default: Index: route.h =================================================================== RCS file: /cvs/src/sys/net/route.h,v retrieving revision 1.183 diff -u -p -r1.183 route.h --- route.h 29 Oct 2020 21:15:27 -0000 1.183 +++ route.h 9 Mar 2021 04:28:41 -0000 @@ -94,7 +94,7 @@ struct rt_metrics { struct rtentry { struct sockaddr *rt_dest; /* destination */ - SRPL_ENTRY(rtentry) rt_next; /* Next multipath entry to our dst. */ + struct rtentry *rt_next; /* Next multipath entry to our dst. 
*/ struct sockaddr *rt_gateway; /* value */ struct ifaddr *rt_ifa; /* the answer: interface addr to use */ caddr_t rt_llinfo; /* pointer to link level info cache or Index: rtable.c =================================================================== RCS file: /cvs/src/sys/net/rtable.c,v retrieving revision 1.72 diff -u -p -r1.72 rtable.c --- rtable.c 7 Nov 2020 09:51:40 -0000 1.72 +++ rtable.c 9 Mar 2021 04:28:41 -0000 @@ -87,6 +87,9 @@ void rtable_init_backend(void); void *rtable_alloc(unsigned int, unsigned int, unsigned int); void *rtable_get(unsigned int, sa_family_t); +static int + + void rtmap_init(void) { @@ -639,10 +642,8 @@ rtable_delete(unsigned int rtableid, str { struct art_root *ar; struct art_node *an; - struct srp_ref sr; uint8_t *addr; int plen; - struct rtentry *mrt; int npaths = 0; int error = 0; @@ -655,46 +656,43 @@ rtable_delete(unsigned int rtableid, str if (plen == -1) return (EINVAL); - rtref(rt); /* guarantee rtfree won't do anything under ar_lock */ rw_enter_write(&ar->ar_lock); - an = art_lookup(ar, addr, plen, &sr); - srp_leave(&sr); /* an can't go away while we have the lock */ + an = art_lookup(ar, addr, plen); /* Make sure we've got a perfect match. */ if (!an_match(an, dst, plen)) { - error = ESRCH; - goto leave; + rw_exit_write(&ar->ar_lock); + return (ESRCH); } - /* - * If other multipath route entries are still attached to - * this ART node we only have to unlink it. 
- */ - SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) - npaths++; + if (an->an_rtcount == 1) { + /* this is the last entry on the node, remove the node */ + if (art_delete(ar, an, addr, plen) == NULL) + panic("art_delete failed to find node %p", an); + art_put(an); + } else { + struct rtentry **rtp = &an->an_rtlist; + struct rtentry *nrt = SMR_PTR_GET_LOCKED(rtp); - if (npaths > 1) { - KASSERT(rt->rt_refcnt >= 1); - SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, - rt_next); + while (nrt != rt) { + rtp = &nrt->rt_next; + nrt = SMR_PTR_GET_LOCKED(rtp); + } + SMR_PTR_SET_LOCKED(rtp, rt->rt_next); - mrt = SRPL_FIRST_LOCKED(&an->an_rtlist); - if (npaths == 2) - mrt->rt_flags &= ~RTF_MPATH; + if (--an->an_rtcount == 1) { + nrt = SMR_PTR_GET_LOCKED(&an->an_rtlist); + CLR(nrt->rt_flags, RTF_MPATH); + } - goto leave; + an = NULL; } + rw_exit_write(&ar->ar_lock); - if (art_delete(ar, an, addr, plen) == NULL) - panic("art_delete failed to find node %p", an); - - KASSERT(rt->rt_refcnt >= 1); - SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next); - art_put(an); + smr_barrier(); -leave: - rw_exit_write(&ar->ar_lock); - rtfree(rt); + if (an != NULL) + art_put(an); return (error); } @@ -779,7 +777,6 @@ rtable_mpath_reprio { struct art_root *ar; struct art_node *an; - struct srp_ref sr; uint8_t *addr; int error = 0; @@ -790,8 +787,7 @@ rtable_mpath_reprio addr = satoaddr(ar, dst); rw_enter_write(&ar->ar_lock); - an = art_lookup(ar, addr, plen, &sr); - srp_leave(&sr); /* an can't go away while we have the lock */ + an = art_lookup(ar, addr, plen); /* Make sure we've got a perfect match. */ if (!an_match(an, dst, plen)) { @@ -863,22 +859,6 @@ an_match(struct art_node *an, struct soc SRPL_LEAVE(&sr); return (match); -} - -void -rtentry_ref(void *null, void *xrt) -{ - struct rtentry *rt = xrt; - - rtref(rt); -} - -void -rtentry_unref(void *null, void *xrt) -{ - struct rtentry *rt = xrt; - - rtfree(rt); } /*