Index: share/man/man9/task_add.9
===================================================================
RCS file: /cvs/src/share/man/man9/task_add.9,v
retrieving revision 1.16
diff -u -p -r1.16 task_add.9
--- share/man/man9/task_add.9	14 Sep 2015 15:14:55 -0000	1.16
+++ share/man/man9/task_add.9	9 Nov 2017 04:56:15 -0000
@@ -20,6 +20,7 @@
 .Sh NAME
 .Nm taskq_create ,
 .Nm taskq_destroy ,
+.Nm taskq_barrier ,
 .Nm task_set ,
 .Nm task_add ,
 .Nm task_del ,
@@ -37,6 +38,8 @@
 .Ft void
 .Fn taskq_destroy "struct taskq *tq"
 .Ft void
+.Fn taskq_barrier "struct taskq *tq"
+.Ft void
 .Fn task_set "struct task *t" "void (*fn)(void *)" "void *arg"
 .Ft int
 .Fn task_add "struct taskq *tq" "struct task *t"
@@ -88,6 +91,15 @@ Calling
 against the system taskq is an error and will lead to undefined
 behaviour or a system fault.
 .Pp
+.Fn taskq_barrier
+guarantees that any task that was running on the
+.Fa tq
+taskq when the barrier was called has finished by the time the barrier
+returns.
+.Fn taskq_barrier
+is only supported on taskqs serviced by 1 thread,
+and may not be called by a task running in the specified taskq.
+.Pp
 It is the responsibility of the caller to provide the
 .Fn task_set ,
 .Fn task_add ,
@@ -163,6 +175,8 @@ argument given in
 and
 .Fn taskq_destroy
 can be called during autoconf, or from process context.
+.Fn taskq_barrier
+can be called from process context.
 .Fn task_set ,
 .Fn task_add ,
 and
Index: sys/dev/pci/if_ix.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.152
diff -u -p -r1.152 if_ix.c
--- sys/dev/pci/if_ix.c	22 Jun 2017 02:44:37 -0000	1.152
+++ sys/dev/pci/if_ix.c	9 Nov 2017 04:56:15 -0000
@@ -133,14 +133,17 @@ void	ixgbe_rxrefill(void *);
 void	ixgbe_enable_intr(struct ix_softc *);
 void	ixgbe_disable_intr(struct ix_softc *);
 void	ixgbe_update_stats_counters(struct ix_softc *);
-int	ixgbe_txeof(struct tx_ring *);
-int	ixgbe_rxeof(struct ix_queue *);
+void	ixgbe_txeof(void *);
+void	ixgbe_rxeof(void *);
+void	ixgbe_rxfill_tmo(void *);
+void	ixgbe_rxfill_task(void *);
 void	ixgbe_rx_checksum(uint32_t, struct mbuf *, uint32_t);
 void	ixgbe_iff(struct ix_softc *);
 #ifdef IX_DEBUG
 void	ixgbe_print_hw_stats(struct ix_softc *);
 #endif
-void	ixgbe_update_link_status(struct ix_softc *);
+//void	ixgbe_update_link_status(struct ix_softc *);
+void	ixgbe_update_link_status(void *);
 int	ixgbe_get_buf(struct rx_ring *, int);
 int	ixgbe_encap(struct tx_ring *, struct mbuf *);
 int	ixgbe_dma_malloc(struct ix_softc *, bus_size_t,
@@ -227,7 +230,6 @@ ixgbe_attach(struct device *parent, stru

 	/* Set up the timer callout */
 	timeout_set(&sc->timer, ixgbe_local_timer, sc);
-	timeout_set(&sc->rx_refill, ixgbe_rxrefill, sc);

 	/* Determine hardware revision */
 	ixgbe_identify_hardware(sc);
@@ -353,11 +355,14 @@ ixgbe_detach(struct device *self, int fl
 	ctrl_ext &= ~IXGBE_CTRL_EXT_DRV_LOAD;
 	IXGBE_WRITE_REG(&sc->hw, IXGBE_CTRL_EXT, ctrl_ext);

+	printf("%s %s\n", ifp->if_xname, __func__);
 	ether_ifdetach(ifp);
+	printf("%s %s\n", ifp->if_xname, __func__);
 	if_detach(ifp);
+	printf("%s %s\n", ifp->if_xname, __func__);

 	timeout_del(&sc->timer);
-	timeout_del(&sc->rx_refill);
+	//timeout_del(&sc->rx_refill);

 	ixgbe_free_pci_resources(sc);
 	ixgbe_free_transmit_structures(sc);
@@ -383,14 +388,21 @@ ixgbe_start(struct ifqueue *ifq)
 {
 	struct ifnet		*ifp = ifq->ifq_if;
 	struct ix_softc		*sc = ifp->if_softc;
-	struct tx_ring		*txr = sc->tx_rings;
-	struct mbuf		*m_head;
+	struct tx_ring		*txr = ifq->ifq_softc;
+	struct mbuf		*m;
+	unsigned int		 head, free, used;
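
/*
 * The rewritten ixgbe_start() below stops maintaining the shared
 * txr->tx_avail counter with atomics and instead derives the free
 * descriptor count from the producer/consumer ring indexes.  A
 * minimal standalone sketch of that arithmetic (illustrative names,
 * not driver API):
 */
static inline unsigned int
tx_ring_space(unsigned int prod, unsigned int cons, unsigned int ndesc)
{
	unsigned int space;

	/* descriptors between the producer and the consumer are free */
	space = cons;
	if (space <= prod)
		space += ndesc;		/* consumer wrapped behind us */

	return (space - prod);
}
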
int post = 0; - if (!(ifp->if_flags & IFF_RUNNING) || ifq_is_oactive(ifq)) - return; - if (!sc->link_up) + if (!sc->link_up) { + ifq_purge(ifq); return; + } + + head = txr->next_avail_desc; + free = txr->next_to_clean; + if (free <= head) + free += sc->num_tx_desc; + free -= head; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, 0, txr->txdma.dma_map->dm_mapsize, @@ -398,29 +410,28 @@ ixgbe_start(struct ifqueue *ifq) for (;;) { /* Check that we have the minimal number of TX descriptors. */ - if (txr->tx_avail <= IXGBE_TX_OP_THRESHOLD) { + if (free <= IXGBE_TX_OP_THRESHOLD) { ifq_set_oactive(ifq); break; } - m_head = ifq_dequeue(ifq); - if (m_head == NULL) + m = ifq_dequeue(ifq); + if (m == NULL) break; - if (ixgbe_encap(txr, m_head)) { - m_freem(m_head); + used = ixgbe_encap(txr, m); + if (used == 0) { + m_freem(m); continue; } + free -= used; + #if NBPFILTER > 0 if (ifp->if_bpf) - bpf_mtap_ether(ifp->if_bpf, m_head, BPF_DIRECTION_OUT); + bpf_mtap_ether(ifp->if_bpf, m, BPF_DIRECTION_OUT); #endif - /* Set timeout in case hardware has problems transmitting */ - txr->watchdog_timer = IXGBE_TX_TIMEOUT; - ifp->if_timer = IXGBE_TX_TIMEOUT; - post = 1; } @@ -521,7 +532,7 @@ ixgbe_rxrinfo(struct ix_softc *sc, struc for (i = 0; i < sc->num_queues; i++) { rxr = &sc->rx_rings[i]; ifr[n].ifr_size = sc->rx_mbuf_sz; - snprintf(ifr[n].ifr_name, sizeof(ifr[n].ifr_name), "/%d", i); + snprintf(ifr[n].ifr_name, sizeof(ifr[n].ifr_name), "%d", i); ifr[n].ifr_info = rxr->rx_ring; n++; } @@ -569,7 +580,6 @@ ixgbe_watchdog(struct ifnet * ifp) if (!(IXGBE_READ_REG(hw, IXGBE_TFCS) & IXGBE_TFCS_TXON)) { for (i = 0; i < sc->num_queues; i++, txr++) txr->watchdog_timer = IXGBE_TX_TIMEOUT; - ifp->if_timer = IXGBE_TX_TIMEOUT; return; } @@ -579,8 +589,6 @@ ixgbe_watchdog(struct ifnet * ifp) printf("%s: Queue(%d) tdh = %d, hw tdt = %d\n", ifp->if_xname, i, IXGBE_READ_REG(hw, IXGBE_TDH(i)), IXGBE_READ_REG(hw, IXGBE_TDT(i))); - printf("%s: TX(%d) desc avail = %d, Next TX to Clean = %d\n", ifp->if_xname, - i, txr->tx_avail, txr->next_to_clean); } ifp->if_flags &= ~IFF_RUNNING; sc->watchdog_events++; @@ -714,9 +722,10 @@ ixgbe_init(void *arg) timeout_add_sec(&sc->timer, 1); + /* set up interrupt routing */ + ixgbe_configure_ivars(sc); /* Set up MSI/X routing */ if (sc->msix > 1) { - ixgbe_configure_ivars(sc); /* Set up auto-mask */ if (sc->hw.mac.type == ixgbe_mac_82598EB) IXGBE_WRITE_REG(&sc->hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE); @@ -725,8 +734,8 @@ ixgbe_init(void *arg) IXGBE_WRITE_REG(&sc->hw, IXGBE_EIAM_EX(1), 0xFFFFFFFF); } } else { /* Simple settings for Legacy/MSI */ - ixgbe_set_ivar(sc, 0, 0, 0); - ixgbe_set_ivar(sc, 0, 0, 1); + //ixgbe_set_ivar(sc, 0, 0, 0); + //ixgbe_set_ivar(sc, 0, 0, 1); IXGBE_WRITE_REG(&sc->hw, IXGBE_EIAM, IXGBE_EICS_RTX_QUEUE); } @@ -744,7 +753,7 @@ ixgbe_init(void *arg) itr = (4000000 / IXGBE_INTS_PER_SEC) & 0xff8; if (sc->hw.mac.type != ixgbe_mac_82598EB) itr |= IXGBE_EITR_LLI_MOD | IXGBE_EITR_CNT_WDIS; - IXGBE_WRITE_REG(&sc->hw, IXGBE_EITR(0), itr); +// IXGBE_WRITE_REG(&sc->hw, IXGBE_EITR(0), itr); /* Enable power to the phy */ if (sc->hw.phy.ops.set_phy_power) @@ -917,12 +926,12 @@ int ixgbe_intr(void *arg) { struct ix_softc *sc = (struct ix_softc *)arg; - struct ix_queue *que = sc->queues; struct ifnet *ifp = &sc->arpcom.ac_if; - struct tx_ring *txr = sc->tx_rings; struct ixgbe_hw *hw = &sc->hw; uint32_t reg_eicr, mod_mask, msf_mask; - int i, refill = 0; + int i; + struct tx_ring *txr; + struct rx_ring *rxr; reg_eicr = IXGBE_READ_REG(&sc->hw, IXGBE_EICR); if (reg_eicr == 0) { @@ 
-930,28 +939,25 @@ ixgbe_intr(void *arg) return (0); } - if (ISSET(ifp->if_flags, IFF_RUNNING)) { - ixgbe_rxeof(que); - ixgbe_txeof(txr); - refill = 1; - } + if (ISSET(ifp->if_flags, IFF_RUNNING) && + (reg_eicr & IXGBE_EICR_RTX_QUEUE)) { + struct ix_queue *que; - if (refill) { - if (ixgbe_rxfill(que->rxr)) { - /* Advance the Rx Queue "Tail Pointer" */ - IXGBE_WRITE_REG(&sc->hw, IXGBE_RDT(que->rxr->me), - que->rxr->last_desc_filled); - } else - timeout_add(&sc->rx_refill, 1); + for (i = 0; i < sc->num_queues; i++) { + txr = &sc->tx_rings[i]; + rxr = &sc->rx_rings[i]; + que = &sc->queues[i]; + + if (ISSET(reg_eicr, 1 << rxr->msix)) + task_add(que->tq, &rxr->rxeof); + if (ISSET(reg_eicr, 1 << txr->msix)) + task_add(que->tq, &txr->txeof); + } } /* Link status change */ - if (reg_eicr & IXGBE_EICR_LSC) { - KERNEL_LOCK(); - ixgbe_update_link_status(sc); - KERNEL_UNLOCK(); - ifq_start(&ifp->if_snd); - } + if (reg_eicr & IXGBE_EICR_LSC) + task_add(systq, &sc->linkch); if (hw->mac.type != ixgbe_mac_82598EB) { if (reg_eicr & IXGBE_EICR_ECC) { @@ -1015,8 +1021,14 @@ ixgbe_intr(void *arg) KERNEL_UNLOCK(); } - for (i = 0; i < sc->num_queues; i++, que++) - ixgbe_enable_queue(sc, que->msix); +#if 1 + for (i = 0; i < sc->num_queues; i++) { + txr = &sc->tx_rings[i]; + rxr = &sc->rx_rings[i]; + ixgbe_enable_queue(sc, txr->msix); + ixgbe_enable_queue(sc, rxr->msix); + } +#endif return (1); } @@ -1146,11 +1158,11 @@ ixgbe_media_change(struct ifnet *ifp) **********************************************************************/ int -ixgbe_encap(struct tx_ring *txr, struct mbuf *m_head) +ixgbe_encap(struct tx_ring *txr, struct mbuf *m) { struct ix_softc *sc = txr->sc; uint32_t olinfo_status = 0, cmd_type_len; - int i, j, error; + int i, seg, ntxc; int first, last = 0; bus_dmamap_t map; struct ixgbe_tx_buf *txbuf; @@ -1161,7 +1173,7 @@ ixgbe_encap(struct tx_ring *txr, struct IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT); #if NVLAN > 0 - if (m_head->m_flags & M_VLANTAG) + if (m->m_flags & M_VLANTAG) cmd_type_len |= IXGBE_ADVTXD_DCMD_VLE; #endif @@ -1177,81 +1189,67 @@ ixgbe_encap(struct tx_ring *txr, struct /* * Map the packet for DMA. */ - error = bus_dmamap_load_mbuf(txr->txdma.dma_tag, map, m_head, - BUS_DMA_NOWAIT); - switch (error) { + switch (bus_dmamap_load_mbuf(txr->txdma.dma_tag, map, m, + BUS_DMA_NOWAIT)) { case 0: break; case EFBIG: - if (m_defrag(m_head, M_NOWAIT) == 0 && - (error = bus_dmamap_load_mbuf(txr->txdma.dma_tag, map, - m_head, BUS_DMA_NOWAIT)) == 0) + if (m_defrag(m, M_NOWAIT) == 0 && + bus_dmamap_load_mbuf(txr->txdma.dma_tag, map, + m, BUS_DMA_NOWAIT) == 0) break; /* FALLTHROUGH */ default: sc->no_tx_dma_setup++; - return (error); + return (0); } - /* Make certain there are enough descriptors */ - KASSERT(map->dm_nsegs <= txr->tx_avail - 2); - /* * Set the appropriate offload context * this will becomes the first descriptor. 
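 *
 * (Return-value contract in this rewrite: ixgbe_tx_ctx_setup()
 * returns how many context descriptors it consumed, or -1 for an
 * unusable packet, and ixgbe_encap() returns the total number of
 * descriptors the packet used, with 0 meaning the mbuf could not be
 * loaded and must be freed by the caller, which is exactly how the
 * new ixgbe_start() handles it.)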
*/ - error = ixgbe_tx_ctx_setup(txr, m_head, &cmd_type_len, &olinfo_status); - if (error) + ntxc = ixgbe_tx_ctx_setup(txr, m, &cmd_type_len, &olinfo_status); + if (ntxc == -1) goto xmit_fail; - i = txr->next_avail_desc; - for (j = 0; j < map->dm_nsegs; j++) { + i = txr->next_avail_desc + ntxc; + if (i == sc->num_tx_desc) + i = 0; + + for (seg = 0; seg < map->dm_nsegs; seg++) { txbuf = &txr->tx_buffers[i]; txd = &txr->tx_base[i]; - txd->read.buffer_addr = htole64(map->dm_segs[j].ds_addr); - txd->read.cmd_type_len = htole32(txr->txd_cmd | - cmd_type_len | map->dm_segs[j].ds_len); - txd->read.olinfo_status = htole32(olinfo_status); + htolem64(&txd->read.buffer_addr, map->dm_segs[seg].ds_addr); + htolem32(&txd->read.cmd_type_len, txr->txd_cmd | + cmd_type_len | map->dm_segs[seg].ds_len); + htolem32(&txd->read.olinfo_status, olinfo_status); last = i; /* descriptor that will get completion IRQ */ if (++i == sc->num_tx_desc) i = 0; - - txbuf->m_head = NULL; - txbuf->eop_index = -1; } txd->read.cmd_type_len |= htole32(IXGBE_TXD_CMD_EOP | IXGBE_TXD_CMD_RS); - txbuf->m_head = m_head; - /* - * Here we swap the map so the last descriptor, - * which gets the completion interrupt has the - * real map, and the first descriptor gets the - * unused map from this descriptor. - */ - txr->tx_buffers[first].map = txbuf->map; - txbuf->map = map; bus_dmamap_sync(txr->txdma.dma_tag, map, 0, map->dm_mapsize, BUS_DMASYNC_PREWRITE); /* Set the index of the descriptor that will be marked done */ txbuf = &txr->tx_buffers[first]; + txbuf->m_head = m; txbuf->eop_index = last; membar_producer(); - atomic_sub_int(&txr->tx_avail, map->dm_nsegs); txr->next_avail_desc = i; - ++txr->tx_packets; - return (0); + return (ntxc + seg); xmit_fail: bus_dmamap_unload(txr->txdma.dma_tag, txbuf->map); - return (error); + return (0); } void @@ -1343,8 +1341,9 @@ ixgbe_local_timer(void *arg) } void -ixgbe_update_link_status(struct ix_softc *sc) +ixgbe_update_link_status(void *arg) { + struct ix_softc *sc = arg; struct ifnet *ifp = &sc->arpcom.ac_if; int link_state = LINK_STATE_DOWN; @@ -1391,6 +1390,7 @@ ixgbe_stop(void *arg) { struct ix_softc *sc = arg; struct ifnet *ifp = &sc->arpcom.ac_if; + int i; /* Tell the stack that the interface is no longer active */ ifp->if_flags &= ~IFF_RUNNING; @@ -1407,21 +1407,25 @@ ixgbe_stop(void *arg) if (sc->hw.mac.ops.disable_tx_laser) sc->hw.mac.ops.disable_tx_laser(&sc->hw); timeout_del(&sc->timer); - timeout_del(&sc->rx_refill); + /* stop queues */ //timeout_del(&sc->rx_refill); /* reprogram the RAR[0] in case user changed it. */ ixgbe_set_rar(&sc->hw, 0, sc->hw.mac.addr, 0, IXGBE_RAH_AV); - ifq_barrier(&ifp->if_snd); intr_barrier(sc->tag); - KASSERT((ifp->if_flags & IFF_RUNNING) == 0); + for (i = 0; i < sc->num_queues; i++) { + struct ifqueue *ifq = ifp->if_ifqs[i]; + ifq_barrier(ifq); + ifq_clr_oactive(ifq); + } - ifq_clr_oactive(&ifp->if_snd); + KASSERT((ifp->if_flags & IFF_RUNNING) == 0); /* Should we really clear all structures on stop? 
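 *
 * (Teardown ordering above: the chip is stopped, intr_barrier()
 * waits out any interrupt handler still running, then ifq_barrier()
 * is called per transmit queue.  Tasks already on the per-queue
 * taskqs are not drained here; that is the gap taskq_barrier() is
 * intended to close, e.g. something like
 *
 *	for (i = 0; i < sc->num_queues; i++)
 *		taskq_barrier(sc->queues[i].tq);
 *
 * which this diff does not add yet.)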
*/ ixgbe_free_transmit_structures(sc); ixgbe_free_receive_structures(sc); + printf("%s %s\n", ifp->if_xname, __func__); } @@ -1563,7 +1567,7 @@ ixgbe_allocate_pci_resources(struct ix_s sc->hw.hw_addr = (uint8_t *)os->os_membase; /* Legacy defaults */ - sc->num_queues = 1; + sc->num_queues = 8; /* XXX */ sc->hw.back = os; #ifdef notyet @@ -1606,6 +1610,7 @@ void ixgbe_setup_interface(struct ix_softc *sc) { struct ifnet *ifp = &sc->arpcom.ac_if; + int i; strlcpy(ifp->if_xname, sc->dev.dv_xname, IFNAMSIZ); ifp->if_softc = sc; @@ -1638,9 +1643,26 @@ ixgbe_setup_interface(struct ix_softc *s ixgbe_add_media_types(sc); ifmedia_set(&sc->media, IFM_ETHER | IFM_AUTO); + task_set(&sc->linkch, ixgbe_update_link_status, sc); + if_attach(ifp); ether_ifattach(ifp); + if_attach_queues(ifp, sc->num_queues); + if_attach_iqueues(ifp, sc->num_queues); + for (i = 0; i < sc->num_queues; i++) { + struct ifqueue *ifq = ifp->if_ifqs[i]; + struct tx_ring *txr = &sc->tx_rings[i]; + struct ifiqueue *ifiq = ifp->if_iqs[i]; + struct rx_ring *rxr = &sc->rx_rings[i]; + + ifq->ifq_softc = txr; + txr->ifq = ifq; + + ifiq->ifiq_softc = rxr; + rxr->ifiq = ifiq; + } + sc->max_frame_size = IXGBE_MAX_FRAME_SIZE; } @@ -1737,7 +1759,7 @@ ixgbe_dma_malloc(struct ix_softc *sc, bu dma->dma_tag = os->os_pa.pa_dmat; r = bus_dmamap_create(dma->dma_tag, size, 1, - size, 0, BUS_DMA_NOWAIT, &dma->dma_map); + size, 0, BUS_DMA_NOWAIT | BUS_DMA_64BIT, &dma->dma_map); if (r != 0) { printf("%s: ixgbe_dma_malloc: bus_dmamap_create failed; " "error %u\n", ifp->if_xname, r); @@ -1852,6 +1874,7 @@ ixgbe_allocate_queues(struct ix_softc *s txr = &sc->tx_rings[i]; txr->sc = sc; txr->me = i; + txr->msix = i; if (ixgbe_dma_malloc(sc, tsize, &txr->txdma, BUS_DMA_NOWAIT)) { @@ -1861,6 +1884,8 @@ ixgbe_allocate_queues(struct ix_softc *s } txr->tx_base = (union ixgbe_adv_tx_desc *)txr->txdma.dma_vaddr; bzero((void *)txr->tx_base, tsize); + + task_set(&txr->txeof, ixgbe_txeof, txr); } /* @@ -1873,6 +1898,8 @@ ixgbe_allocate_queues(struct ix_softc *s /* Set up some basics */ rxr->sc = sc; rxr->me = i; + rxr->msix = i + 8; + rxr->que = &sc->queues[i]; if (ixgbe_dma_malloc(sc, rsize, &rxr->rxdma, BUS_DMA_NOWAIT)) { @@ -1882,6 +1909,10 @@ ixgbe_allocate_queues(struct ix_softc *s } rxr->rx_base = (union ixgbe_adv_rx_desc *)rxr->rxdma.dma_vaddr; bzero((void *)rxr->rx_base, rsize); + + task_set(&rxr->rxeof, ixgbe_rxeof, rxr); + task_set(&rxr->rxfill, ixgbe_rxfill_task, rxr); + timeout_set(&rxr->refill, ixgbe_rxfill_tmo, rxr); } /* @@ -1892,6 +1923,11 @@ ixgbe_allocate_queues(struct ix_softc *s que->sc = sc; que->txr = &sc->tx_rings[i]; que->rxr = &sc->rx_rings[i]; + snprintf(que->tqname, sizeof(que->tqname), "%s.%d", + sc->dev.dv_xname, i); + que->tq = taskq_create(que->tqname, 1, IPL_NET, TASKQ_MPSAFE); + if (que->tq == NULL) + goto err_rx_desc; } return (0); @@ -1940,7 +1976,7 @@ ixgbe_allocate_transmit_buffers(struct t txbuf = &txr->tx_buffers[i]; error = bus_dmamap_create(txr->txdma.dma_tag, IXGBE_TSO_SIZE, sc->num_segs, PAGE_SIZE, 0, - BUS_DMA_NOWAIT, &txbuf->map); + BUS_DMA_NOWAIT | BUS_DMA_64BIT, &txbuf->map); if (error != 0) { printf("%s: Unable to create TX DMA map\n", @@ -1977,9 +2013,6 @@ ixgbe_setup_transmit_ring(struct tx_ring txr->next_avail_desc = 0; txr->next_to_clean = 0; - /* Set number of descriptors available */ - txr->tx_avail = sc->num_tx_desc; - bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, 0, txr->txdma.dma_map->dm_mapsize, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); @@ -2118,16 +2151,15 @@ ixgbe_free_transmit_buffers(struct 
tx_ri if (txr->tx_buffers == NULL) return; - tx_buffer = txr->tx_buffers; - for (i = 0; i < sc->num_tx_desc; i++, tx_buffer++) { - if (tx_buffer->map != NULL && tx_buffer->map->dm_nsegs > 0) { + for (i = 0; i < sc->num_tx_desc; i++) { + tx_buffer = &txr->tx_buffers[i]; + + if (tx_buffer->m_head != NULL) { bus_dmamap_sync(txr->txdma.dma_tag, tx_buffer->map, 0, tx_buffer->map->dm_mapsize, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->txdma.dma_tag, tx_buffer->map); - } - if (tx_buffer->m_head != NULL) { m_freem(tx_buffer->m_head); tx_buffer->m_head = NULL; } @@ -2138,9 +2170,8 @@ ixgbe_free_transmit_buffers(struct tx_ri } } - if (txr->tx_buffers != NULL) - free(txr->tx_buffers, M_DEVBUF, - sc->num_tx_desc * sizeof(struct ixgbe_tx_buf)); + free(txr->tx_buffers, M_DEVBUF, + sc->num_tx_desc * sizeof(struct ixgbe_tx_buf)); txr->tx_buffers = NULL; txr->txtag = NULL; } @@ -2155,7 +2186,6 @@ int ixgbe_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp, uint32_t *cmd_type_len, uint32_t *olinfo_status) { - struct ix_softc *sc = txr->sc; struct ixgbe_adv_tx_context_desc *TXD; struct ixgbe_tx_buf *tx_buffer; #if NVLAN > 0 @@ -2215,12 +2245,12 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, * helpful for QinQ too. */ if (mp->m_len < sizeof(struct ether_header)) - return (1); + return (-1); #if NVLAN > 0 eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { if (mp->m_len < sizeof(struct ether_vlan_header)) - return (1); + return (-1); etype = ntohs(eh->evl_proto); ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { @@ -2239,7 +2269,7 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, switch (etype) { case ETHERTYPE_IP: if (mp->m_pkthdr.len < ehdrlen + sizeof(*ip)) - return (1); + return (-1); m = m_getptr(mp, ehdrlen, &ipoff); KASSERT(m != NULL && m->m_len - ipoff >= sizeof(*ip)); ip = (struct ip *)(m->m_data + ipoff); @@ -2250,7 +2280,7 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, #ifdef notyet case ETHERTYPE_IPV6: if (mp->m_pkthdr.len < ehdrlen + sizeof(*ip6)) - return (1); + return (-1); m = m_getptr(mp, ehdrlen, &ipoff); KASSERT(m != NULL && m->m_len - ipoff >= sizeof(*ip6)); ip6 = (struct ip6 *)(m->m_data + ipoff); @@ -2286,23 +2316,15 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, *olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8; /* Now copy bits into descriptor */ - TXD->vlan_macip_lens = htole32(vlan_macip_lens); - TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl); + htolem32(&TXD->vlan_macip_lens, vlan_macip_lens); + htolem32(&TXD->type_tucmd_mlhl, type_tucmd_mlhl); TXD->seqnum_seed = htole32(0); TXD->mss_l4len_idx = htole32(0); tx_buffer->m_head = NULL; tx_buffer->eop_index = -1; - membar_producer(); - - /* We've consumed the first desc, adjust counters */ - if (++ctxd == sc->num_tx_desc) - ctxd = 0; - txr->next_avail_desc = ctxd; - atomic_dec_int(&txr->tx_avail); - - return (0); + return (1); } /********************************************************************** @@ -2312,108 +2334,57 @@ ixgbe_tx_ctx_setup(struct tx_ring *txr, * tx_buffer is put back on the free queue. 
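 *
 * (The rewrite below retires packets by walking from next_to_clean
 * towards next_avail_desc one packet at a time: each tx_buffer
 * records its eop_index, and ixgbe_encap() sets IXGBE_TXD_CMD_RS
 * only on that final descriptor, so a single IXGBE_TXD_STAT_DD test
 * on the EOP descriptor tells whether the whole packet can be
 * reclaimed.)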
* **********************************************************************/ -int -ixgbe_txeof(struct tx_ring *txr) +void +ixgbe_txeof(void *arg) { + struct tx_ring *txr = arg; struct ix_softc *sc = txr->sc; - struct ifnet *ifp = &sc->arpcom.ac_if; - uint32_t first, last, done, processed; - uint32_t num_avail; - struct ixgbe_tx_buf *tx_buffer; - struct ixgbe_legacy_tx_desc *tx_desc, *eop_desc; + struct ifqueue *ifq = txr->ifq; + unsigned int head, tail, last; + struct ixgbe_tx_buf *txbuf; + struct ixgbe_legacy_tx_desc *desc; - if (!ISSET(ifp->if_flags, IFF_RUNNING)) - return FALSE; - - if (txr->tx_avail == sc->num_tx_desc) { - txr->queue_status = IXGBE_QUEUE_IDLE; - return FALSE; - } - - membar_consumer(); - - processed = 0; - first = txr->next_to_clean; - /* was the txt queue cleaned up in the meantime */ - if (txr->tx_buffers == NULL) - return FALSE; - tx_buffer = &txr->tx_buffers[first]; - /* For cleanup we just use legacy struct */ - tx_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[first]; - last = tx_buffer->eop_index; - if (last == -1) - return FALSE; - eop_desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[last]; + head = txr->next_avail_desc; + tail = txr->next_to_clean; - /* - * Get the index of the first descriptor - * BEYOND the EOP and call that 'done'. - * I do this so the comparison in the - * inner while loop below can be simple - */ - if (++last == sc->num_tx_desc) last = 0; - done = last; + if (head == tail) + return; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, 0, txr->txdma.dma_map->dm_mapsize, BUS_DMASYNC_POSTREAD); - while (eop_desc->upper.fields.status & IXGBE_TXD_STAT_DD) { - /* We clean the range of the packet */ - while (first != done) { - tx_desc->upper.data = 0; - tx_desc->lower.data = 0; - tx_desc->buffer_addr = 0; - ++processed; - - if (tx_buffer->m_head) { - bus_dmamap_sync(txr->txdma.dma_tag, - tx_buffer->map, - 0, tx_buffer->map->dm_mapsize, - BUS_DMASYNC_POSTWRITE); - bus_dmamap_unload(txr->txdma.dma_tag, - tx_buffer->map); - m_freem(tx_buffer->m_head); - tx_buffer->m_head = NULL; - } - tx_buffer->eop_index = -1; + do { + txbuf = &txr->tx_buffers[tail]; + last = txbuf->eop_index; + desc = (struct ixgbe_legacy_tx_desc *)&txr->tx_base[last]; + + if (!ISSET(desc->upper.fields.status, IXGBE_TXD_STAT_DD)) + break; - if (++first == sc->num_tx_desc) - first = 0; + bus_dmamap_sync(txr->txdma.dma_tag, txbuf->map, + 0, txbuf->map->dm_mapsize, BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(txr->txdma.dma_tag, txbuf->map); + m_freem(txbuf->m_head); - tx_buffer = &txr->tx_buffers[first]; - tx_desc = (struct ixgbe_legacy_tx_desc *) - &txr->tx_base[first]; - } - ++txr->packets; - /* See if there is more work now */ - last = tx_buffer->eop_index; - if (last != -1) { - eop_desc = - (struct ixgbe_legacy_tx_desc *)&txr->tx_base[last]; - /* Get next done point */ - if (++last == sc->num_tx_desc) last = 0; - done = last; - } else - break; - } + txbuf->m_head = NULL; + txbuf->eop_index = -1; + + tail = last + 1; + if (tail == sc->num_tx_desc) + tail = 0; + } while (head != tail); bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, 0, txr->txdma.dma_map->dm_mapsize, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - - txr->next_to_clean = first; + BUS_DMASYNC_PREREAD); - num_avail = atomic_add_int_nv(&txr->tx_avail, processed); + txr->next_to_clean = tail; - /* All clean, turn off the timer */ - if (num_avail == sc->num_tx_desc) - ifp->if_timer = 0; + ixgbe_enable_queue(sc, txr->msix); - if (ifq_is_oactive(&ifp->if_snd)) - ifq_restart(&ifp->if_snd); - - return TRUE; 
+ if (ifq_is_oactive(ifq)) + ifq_restart(ifq); } /********************************************************************* @@ -2461,7 +2432,7 @@ ixgbe_get_buf(struct rx_ring *rxr, int i bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, dsize * i, dsize, BUS_DMASYNC_POSTWRITE); - rxdesc->read.pkt_addr = htole64(rxbuf->map->dm_segs[0].ds_addr); + htolem64(&rxdesc->read.pkt_addr, rxbuf->map->dm_segs[0].ds_addr); bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, dsize * i, dsize, BUS_DMASYNC_PREWRITE); @@ -2496,7 +2467,7 @@ ixgbe_allocate_receive_buffers(struct rx rxbuf = rxr->rx_buffers; for (i = 0; i < sc->num_rx_desc; i++, rxbuf++) { error = bus_dmamap_create(rxr->rxdma.dma_tag, 16 * 1024, 1, - 16 * 1024, 0, BUS_DMA_NOWAIT, &rxbuf->map); + 16 * 1024, 0, BUS_DMA_NOWAIT | BUS_DMA_64BIT, &rxbuf->map); if (error) { printf("%s: Unable to create Pack DMA map\n", ifp->if_xname); @@ -2577,20 +2548,25 @@ ixgbe_rxfill(struct rx_ring *rxr) } void -ixgbe_rxrefill(void *xsc) +ixgbe_rxfill_tmo(void *arg) { - struct ix_softc *sc = xsc; - struct ix_queue *que = sc->queues; - int s; + struct rx_ring *rxr = arg; + /* serialise the refill with rxeof */ + task_add(rxr->que->tq, &rxr->rxfill); +} - s = splnet(); - if (ixgbe_rxfill(que->rxr)) { +void +ixgbe_rxfill_task(void *arg) +{ + struct rx_ring *rxr = arg; + struct ix_softc *sc = rxr->sc; + + if (ixgbe_rxfill(rxr)) { /* Advance the Rx Queue "Tail Pointer" */ - IXGBE_WRITE_REG(&sc->hw, IXGBE_RDT(que->rxr->me), - que->rxr->last_desc_filled); + IXGBE_WRITE_REG(&sc->hw, IXGBE_RDT(rxr->me), + rxr->last_desc_filled); } else - timeout_add(&sc->rx_refill, 1); - splx(s); + timeout_add(&rxr->refill, 1); } /********************************************************************* @@ -2821,11 +2797,11 @@ ixgbe_free_receive_buffers(struct rx_rin * dma'ed into host memory to upper layer. 
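 *
 * (In this version ixgbe_rxeof() feeds its own ifiqueue through
 * ifiq_input() with the rx ring's current watermark as the pressure
 * hint; when ifiq_input() reports backpressure the watermark is
 * lowered via if_rxr_livelocked() instead of the old global livelock
 * accounting.  Ring refill also moves onto the per-queue taskq so it
 * is serialised with rxeof itself.)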
* *********************************************************************/ -int -ixgbe_rxeof(struct ix_queue *que) +void +ixgbe_rxeof(void *arg) { - struct ix_softc *sc = que->sc; - struct rx_ring *rxr = que->rxr; + struct rx_ring *rxr = arg; + struct ix_softc *sc = rxr->sc; struct ifnet *ifp = &sc->arpcom.ac_if; struct mbuf_list ml = MBUF_LIST_INITIALIZER(); struct mbuf *mp, *sendmp; @@ -2838,7 +2814,7 @@ ixgbe_rxeof(struct ix_queue *que) int i, nextp; if (!ISSET(ifp->if_flags, IFF_RUNNING)) - return FALSE; + return; i = rxr->next_to_check; while (if_rxr_inuse(&rxr->rx_ring) > 0) { @@ -2846,7 +2822,7 @@ ixgbe_rxeof(struct ix_queue *que) dsize * i, dsize, BUS_DMASYNC_POSTREAD); rxdesc = &rxr->rx_base[i]; - staterr = letoh32(rxdesc->wb.upper.status_error); + staterr = lemtoh32(&rxdesc->wb.upper.status_error); if (!ISSET(staterr, IXGBE_RXD_STAT_DD)) { bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, dsize * i, dsize, @@ -2864,10 +2840,10 @@ ixgbe_rxeof(struct ix_queue *que) bus_dmamap_unload(rxr->rxdma.dma_tag, rxbuf->map); mp = rxbuf->buf; - len = letoh16(rxdesc->wb.upper.length); - ptype = letoh32(rxdesc->wb.lower.lo_dword.data) & + len = lemtoh16(&rxdesc->wb.upper.length); + ptype = lemtoh32(&rxdesc->wb.lower.lo_dword.data) & IXGBE_RXDADV_PKTTYPE_MASK; - vtag = letoh16(rxdesc->wb.upper.vlan); + vtag = lemtoh16(&rxdesc->wb.upper.vlan); eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0); if (staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) { @@ -2957,12 +2933,12 @@ next_desc: } rxr->next_to_check = i; - if_input(ifp, &ml); + if (ifiq_input(rxr->ifiq, &ml, if_rxr_cwm(&rxr->rx_ring)) != 0) + if_rxr_livelocked(&rxr->rx_ring); - if (!(staterr & IXGBE_RXD_STAT_DD)) - return FALSE; + ixgbe_rxfill_task(rxr); - return TRUE; + ixgbe_enable_queue(sc, rxr->msix); } /********************************************************************* @@ -3034,11 +3010,11 @@ void ixgbe_enable_intr(struct ix_softc *sc) { struct ixgbe_hw *hw = &sc->hw; - struct ix_queue *que = sc->queues; uint32_t mask, fwsm; int i; mask = (IXGBE_EIMS_ENABLE_MASK & ~IXGBE_EIMS_RTX_QUEUE); + //mask = IXGBE_EIMS_ENABLE_MASK; /* Enable Fan Failure detection */ if (hw->device_id == IXGBE_DEV_ID_82598AT) mask |= IXGBE_EIMS_GPI_SDP1; @@ -3088,8 +3064,12 @@ ixgbe_enable_intr(struct ix_softc *sc) * allow for handling the extended (beyond 32) MSIX * vectors that can be used by 82599 */ - for (i = 0; i < sc->num_queues; i++, que++) - ixgbe_enable_queue(sc, que->msix); + for (i = 0; i < sc->num_queues; i++) { + struct rx_ring *rxr = &sc->rx_rings[i]; + struct tx_ring *txr = &sc->tx_rings[i]; + ixgbe_enable_queue(sc, rxr->msix); + ixgbe_enable_queue(sc, txr->msix); + } IXGBE_WRITE_FLUSH(hw); } @@ -3166,7 +3146,6 @@ ixgbe_set_ivar(struct ix_softc *sc, uint vector |= IXGBE_IVAR_ALLOC_VAL; switch (hw->mac.type) { - case ixgbe_mac_82598EB: if (type == -1) entry = IXGBE_IVAR_OTHER_CAUSES_INDEX; @@ -3192,8 +3171,10 @@ ixgbe_set_ivar(struct ix_softc *sc, uint } else { /* RX/TX IVARS */ index = (16 * (entry & 1)) + (8 * type); ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(entry >> 1)); +printf("%s: ivar entry %u type %u pre %u=%08x\n", sc->dev.dv_xname, entry, type, IXGBE_IVAR(entry >> 1), ivar); ivar &= ~(0xFF << index); ivar |= (vector << index); +printf("%s: ivar entry %u type %u post %u=%08x\n", sc->dev.dv_xname, entry, type, IXGBE_IVAR(entry >> 1), ivar); IXGBE_WRITE_REG(hw, IXGBE_IVAR(entry >> 1), ivar); } @@ -3205,29 +3186,24 @@ ixgbe_set_ivar(struct ix_softc *sc, uint void ixgbe_configure_ivars(struct ix_softc *sc) { -#if notyet struct ix_queue *que = sc->queues; 
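
/*
 * Vector layout used throughout this diff: tx ring i gets MSI-X
 * entry i and rx ring i gets entry i + 8 (see ixgbe_allocate_queues),
 * which is why ixgbe_intr() can tell tx and rx work apart with plain
 * (1 << msix) tests against EICR, and why the IVAR/EITR setup below
 * programs the two halves separately.
 */
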
uint32_t newitr; int i; - if (ixgbe_max_interrupt_rate > 0) - newitr = (4000000 / ixgbe_max_interrupt_rate) & 0x0FF8; - else - newitr = 0; + newitr = (4000000 / IXGBE_INTS_PER_SEC) & 0x0FF8; for (i = 0; i < sc->num_queues; i++, que++) { /* First the RX queue entry */ - ixgbe_set_ivar(sc, i, que->msix, 0); + ixgbe_set_ivar(sc, i, que->rxr->msix, 0); /* ... and the TX */ - ixgbe_set_ivar(sc, i, que->msix, 1); + ixgbe_set_ivar(sc, i, que->txr->msix, 1); /* Set an Initial EITR value */ - IXGBE_WRITE_REG(&sc->hw, - IXGBE_EITR(que->msix), newitr); + IXGBE_WRITE_REG(&sc->hw, IXGBE_EITR(que->rxr->msix), newitr); + IXGBE_WRITE_REG(&sc->hw, IXGBE_EITR(que->txr->msix), newitr); } /* For the Link interrupt */ - ixgbe_set_ivar(sc, 1, sc->linkvec, -1); -#endif +// ixgbe_set_ivar(sc, 1, sc->linkvec, -1); } /* Index: sys/dev/pci/if_ix.h =================================================================== RCS file: /cvs/src/sys/dev/pci/if_ix.h,v retrieving revision 1.32 diff -u -p -r1.32 if_ix.h --- sys/dev/pci/if_ix.h 21 Nov 2016 17:21:33 -0000 1.32 +++ sys/dev/pci/if_ix.h 9 Nov 2017 04:56:15 -0000 @@ -48,7 +48,7 @@ * bytes. Performance tests have show the 2K value to be optimal for top * performance. */ -#define DEFAULT_TXD 256 +#define DEFAULT_TXD 1024 #define PERFORM_TXD 2048 #define MAX_TXD 4096 #define MIN_TXD 64 @@ -63,7 +63,7 @@ * against the system mbuf pool limit, you can tune nmbclusters * to adjust for this. */ -#define DEFAULT_RXD 256 +#define DEFAULT_RXD 1024 #define PERFORM_RXD 2048 #define MAX_RXD 4096 #define MIN_RXD 64 @@ -151,12 +151,13 @@ struct ixgbe_dma_alloc { */ struct ix_queue { struct ix_softc *sc; - uint32_t msix; /* This queue's MSIX vector */ uint32_t eims; /* This queue's EIMS bit */ uint32_t eitr_setting; void *tag; struct tx_ring *txr; struct rx_ring *rxr; + char tqname[8]; + struct taskq *tq; /* fake multiple cpus */ }; /* @@ -165,11 +166,11 @@ struct ix_queue { struct tx_ring { struct ix_softc *sc; uint32_t me; + uint32_t msix; /* This queue's MSIX vector */ uint32_t watchdog_timer; union ixgbe_adv_tx_desc *tx_base; struct ixgbe_tx_buf *tx_buffers; struct ixgbe_dma_alloc txdma; - volatile uint32_t tx_avail; uint32_t next_avail_desc; uint32_t next_to_clean; enum { @@ -183,6 +184,9 @@ struct tx_ring { uint32_t packets; /* Soft Stats */ uint64_t tx_packets; + + struct ifqueue *ifq; + struct task txeof; }; @@ -191,7 +195,9 @@ struct tx_ring { */ struct rx_ring { struct ix_softc *sc; + struct ix_queue *que; uint32_t me; + uint32_t msix; /* This queue's MSIX vector */ union ixgbe_adv_rx_desc *rx_base; struct ixgbe_dma_alloc rxdma; #if 0 @@ -215,6 +221,11 @@ struct rx_ring { uint64_t rx_bytes; uint64_t rx_discarded; uint64_t rsc_num; + + struct timeout refill; + struct task rxfill; + struct task rxeof; + struct ifiqueue *ifiq; }; /* Our adapter structure */ @@ -229,7 +240,7 @@ struct ix_softc { struct ifmedia media; struct timeout timer; - struct timeout rx_refill; + struct task linkch; int msix; int if_flags; Index: sys/kern/kern_task.c =================================================================== RCS file: /cvs/src/sys/kern/kern_task.c,v retrieving revision 1.20 diff -u -p -r1.20 kern_task.c --- sys/kern/kern_task.c 30 Oct 2017 14:01:42 -0000 1.20 +++ sys/kern/kern_task.c 9 Nov 2017 04:56:15 -0000 @@ -22,6 +22,7 @@ #include #include #include +#include #define TASK_ONQUEUE 1 @@ -68,6 +69,7 @@ struct taskq *const systqmp = &taskq_sys void taskq_init(void); /* called in init_main.c */ void taskq_create_thread(void *); +void taskq_barrier_task(void *); int taskq_sleep(const 
volatile void *, struct mutex *, int, const char *, int); int taskq_next_work(struct taskq *, struct task *, sleepfn); @@ -176,6 +178,30 @@ taskq_create_thread(void *arg) } while (tq->tq_running < tq->tq_nthreads); mtx_leave(&tq->tq_mtx); +} + +void +taskq_barrier(struct taskq *tq) +{ + struct sleep_state sls; + unsigned int notdone = 1; + struct task t = TASK_INITIALIZER(taskq_barrier_task, ¬done); + + task_add(tq, &t); + + while (notdone) { + sleep_setup(&sls, ¬done, PWAIT, "tqbar"); + sleep_finish(&sls, notdone); + } +} + +void +taskq_barrier_task(void *p) +{ + unsigned int *notdone = p; + + *notdone = 0; + wakeup_one(notdone); } void Index: sys/kern/uipc_mbuf.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v retrieving revision 1.250 diff -u -p -r1.250 uipc_mbuf.c --- sys/kern/uipc_mbuf.c 12 Oct 2017 09:14:16 -0000 1.250 +++ sys/kern/uipc_mbuf.c 9 Nov 2017 04:56:15 -0000 @@ -195,7 +195,7 @@ mbinit(void) } (void)mextfree_register(m_extfree_pool); - KASSERT(num_extfree_fns == 1); + KDASSERT(num_extfree_fns == 1); } void @@ -300,7 +300,7 @@ m_resethdr(struct mbuf *m) int len = m->m_pkthdr.len; u_int8_t loopcnt = m->m_pkthdr.ph_loopcnt; - KASSERT(m->m_flags & M_PKTHDR); + KDASSERT(m->m_flags & M_PKTHDR); m->m_flags &= (M_EXT|M_PKTHDR|M_EOR|M_EXTWR|M_ZEROIZE); /* delete all mbuf tags to reset the state */ @@ -466,7 +466,7 @@ m_extunref(struct mbuf *m) u_int mextfree_register(void (*fn)(caddr_t, u_int, void *)) { - KASSERT(num_extfree_fns < nitems(mextfree_fns)); + KDASSERT(num_extfree_fns < nitems(mextfree_fns)); mextfree_fns[num_extfree_fns] = fn; return num_extfree_fns++; } @@ -475,7 +475,7 @@ void m_extfree(struct mbuf *m) { if (m_extunref(m) == 0) { - KASSERT(m->m_ext.ext_free_fn < num_extfree_fns); + KDASSERT(m->m_ext.ext_free_fn < num_extfree_fns); mextfree_fns[m->m_ext.ext_free_fn](m->m_ext.ext_buf, m->m_ext.ext_size, m->m_ext.ext_arg); } @@ -520,7 +520,7 @@ m_defrag(struct mbuf *m, int how) if (m->m_next == NULL) return (0); - KASSERT(m->m_flags & M_PKTHDR); + KDASSERT(m->m_flags & M_PKTHDR); if ((m0 = m_gethdr(how, m->m_type)) == NULL) return (ENOBUFS); @@ -931,7 +931,7 @@ m_pullup(struct mbuf *n, int len) m->m_data += adj; } - KASSERT(M_TRAILINGSPACE(m) >= len); + KDASSERT(M_TRAILINGSPACE(m) >= len); do { if (n == NULL) { @@ -1079,14 +1079,14 @@ m_makespace(struct mbuf *m0, int skip, i struct mbuf *m; unsigned remain; - KASSERT(m0->m_flags & M_PKTHDR); + KDASSERT(m0->m_flags & M_PKTHDR); /* * Limit the size of the new header to MHLEN. In case * skip = 0 and the first buffer is not a cluster this * is the maximum space available in that mbuf. * In other words this code never prepends a mbuf. 
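 *
 * (On the KASSERT -> KDASSERT conversions in this file: KDASSERT is
 * only compiled in on DEBUG kernels, so these checks disappear from
 * kernels built with just DIAGNOSTIC, taking the tests out of the
 * hot mbuf paths by default.)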
*/ - KASSERT(hlen < MHLEN); + KDASSERT(hlen < MHLEN); for (m = m0; m && skip > m->m_len; m = m->m_next) skip -= m->m_len; @@ -1318,7 +1318,7 @@ m_dup_pkthdr(struct mbuf *to, struct mbu { int error; - KASSERT(from->m_flags & M_PKTHDR); + KDASSERT(from->m_flags & M_PKTHDR); to->m_flags = (to->m_flags & (M_EXT | M_EXTWR)); to->m_flags |= (from->m_flags & M_COPYFLAGS); @@ -1345,7 +1345,7 @@ m_dup_pkt(struct mbuf *m0, unsigned int struct mbuf *m; int len; - KASSERT(m0->m_flags & M_PKTHDR); + KDASSERT(m0->m_flags & M_PKTHDR); len = m0->m_pkthdr.len + adj; if (len > MAXMCLBYTES) /* XXX */ @@ -1512,14 +1512,15 @@ ml_dequeue(struct mbuf_list *ml) struct mbuf *m; m = ml->ml_head; - if (m != NULL) { - ml->ml_head = m->m_nextpkt; - if (ml->ml_head == NULL) - ml->ml_tail = NULL; + if (m == NULL) + return (NULL); - m->m_nextpkt = NULL; - ml->ml_len--; - } + ml->ml_head = m->m_nextpkt; + if (ml->ml_head == NULL) + ml->ml_tail = NULL; + + ml->ml_len--; + m->m_nextpkt = NULL; return (m); } Index: sys/net/if.c =================================================================== RCS file: /cvs/src/sys/net/if.c,v retrieving revision 1.523 diff -u -p -r1.523 if.c --- sys/net/if.c 4 Nov 2017 16:58:46 -0000 1.523 +++ sys/net/if.c 9 Nov 2017 04:56:15 -0000 @@ -155,7 +155,6 @@ int if_group_egress_build(void); void if_watchdog_task(void *); -void if_input_process(void *); void if_netisr(void *); #ifdef DDB @@ -227,7 +226,7 @@ int ifq_congestion; int netisr; -#define SOFTNET_TASKS 1 +#define SOFTNET_TASKS 8 struct taskq *softnettq[SOFTNET_TASKS]; struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL); @@ -238,12 +237,27 @@ struct task if_input_task_locked = TASK_ */ struct rwlock netlock = RWLOCK_INITIALIZER("netlock"); +static void +net_peg(void *null) +{ + CPU_INFO_ITERATOR cii; + struct cpu_info *ci; + + CPU_INFO_FOREACH(cii, ci) { + if (CPU_INFO_UNIT(ci) == 2) { + sched_peg_curproc(ci); + return; + } + } +} + /* * Network interface utility routines. 
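 *
 * (SOFTNET_TASKS grows from 1 to 8 above, giving eight softnet
 * taskqs for input processing; net_tq() maps an interface index to
 * one of them.  The net_peg() task queued from ifinit() looks like a
 * temporary benchmarking aid that pins the first softnet thread to a
 * particular CPU.)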
*/ void ifinit(void) { + static struct task t = TASK_INITIALIZER(net_peg, NULL); unsigned int i; /* @@ -260,6 +274,8 @@ ifinit(void) panic("unable to create softnet taskq"); } + task_add(softnettq[0], &t); + net_tick(&net_tick_to); } @@ -436,8 +452,6 @@ if_attachsetup(struct ifnet *ifp) ifidx = ifp->if_index; - mq_init(&ifp->if_inputqueue, 8192, IPL_NET); - task_set(ifp->if_inputtask, if_input_process, (void *)ifidx); task_set(ifp->if_watchdogtask, if_watchdog_task, (void *)ifidx); task_set(ifp->if_linkstatetask, if_linkstate_task, (void *)ifidx); @@ -545,7 +559,7 @@ if_attach_queues(struct ifnet *ifp, unsi KASSERT(ifp->if_ifqs == ifp->if_snd.ifq_ifqs); KASSERT(nqs != 0); - map = mallocarray(sizeof(*map), nqs, M_DEVBUF, M_WAITOK); + map = mallocarray(nqs, sizeof(*map), M_DEVBUF, M_WAITOK); ifp->if_snd.ifq_softc = NULL; map[0] = &ifp->if_snd; @@ -562,6 +576,30 @@ if_attach_queues(struct ifnet *ifp, unsi } void +if_attach_iqueues(struct ifnet *ifp, unsigned int nqs) +{ + struct ifiqueue **map; + struct ifiqueue *ifiq; + int i; + + KASSERT(nqs != 0); + + map = mallocarray(nqs, sizeof(*map), M_DEVBUF, M_WAITOK); + + ifp->if_iq.ifiq_softc = NULL; + map[0] = &ifp->if_iq; + + for (i = 1; i < nqs; i++) { + ifiq = malloc(sizeof(*ifiq), M_DEVBUF, M_WAITOK|M_ZERO); + ifiq_init(ifiq, ifp, i); + map[i] = ifiq; + } + + ifp->if_iqs = map; + ifp->if_niqs = nqs; +} + +void if_attach_common(struct ifnet *ifp) { KASSERT(ifp->if_ioctl != NULL); @@ -586,6 +624,12 @@ if_attach_common(struct ifnet *ifp) ifp->if_ifqs = ifp->if_snd.ifq_ifqs; ifp->if_nifqs = 1; + ifiq_init(&ifp->if_iq, ifp, 0); + + ifp->if_iq.ifiq_ifiqs[0] = &ifp->if_iq; + ifp->if_iqs = ifp->if_iq.ifiq_ifiqs; + ifp->if_niqs = 1; + ifp->if_addrhooks = malloc(sizeof(*ifp->if_addrhooks), M_TEMP, M_WAITOK); TAILQ_INIT(ifp->if_addrhooks); @@ -604,11 +648,7 @@ if_attach_common(struct ifnet *ifp) M_TEMP, M_WAITOK|M_ZERO); ifp->if_linkstatetask = malloc(sizeof(*ifp->if_linkstatetask), M_TEMP, M_WAITOK|M_ZERO); - ifp->if_inputtask = malloc(sizeof(*ifp->if_inputtask), - M_TEMP, M_WAITOK|M_ZERO); ifp->if_llprio = IFQ_DEFPRIO; - - SRPL_INIT(&ifp->if_inputs); } void @@ -645,7 +685,7 @@ if_qstart_compat(struct ifqueue *ifq) * this provides compatability between the stack and the older * drivers by translating from the only queue they have * (ifp->if_snd) back to the interface and calling if_start. 
- */ + */ KERNEL_LOCK(); s = splnet(); @@ -661,7 +701,7 @@ if_enqueue(struct ifnet *ifp, struct mbu struct ifqueue *ifq; int error; -#if NBRIDGE > 0 +#if 0 && NBRIDGE > 0 if (ifp->if_bridgeport && (m->m_flags & M_PROTO1) == 0) { KERNEL_LOCK(); error = bridge_output(ifp, m, NULL, NULL); @@ -685,55 +725,105 @@ if_enqueue(struct ifnet *ifp, struct mbu if (error) return (error); - ifq_start(ifq); + if (ifq_len(ifq) >= 4) { + task_del(ifq->ifq_softnet, &ifq->ifq_bundle); + ifq_start(ifq); + } else + task_add(ifq->ifq_softnet, &ifq->ifq_bundle); return (0); } void -if_input(struct ifnet *ifp, struct mbuf_list *ml) +if_input_process(struct mbuf_list *ml) { + struct srp_ref sr; + struct ifnet *ifp; struct mbuf *m; - size_t ibytes = 0; -#if NBPFILTER > 0 - caddr_t if_bpf; + int s; +#ifdef IPSEC + int locked = 0; + extern int ipsec_in_use; +#endif /* IPSEC */ + + add_net_randomness(ml_len(ml)); + +#ifdef IPSEC + if (ipsec_in_use) { + KERNEL_LOCK(); + locked = 1; + } #endif - if (ml_empty(ml)) - return; + NET_LOCK(); + s = splnet(); - MBUF_LIST_FOREACH(ml, m) { - m->m_pkthdr.ph_ifidx = ifp->if_index; - m->m_pkthdr.ph_rtableid = ifp->if_rdomain; - ibytes += m->m_pkthdr.len; + while ((m = ml_dequeue(ml)) != NULL) { + ifp = if_enter(&sr, m->m_pkthdr.ph_ifidx); + if (ifp != NULL) + ifp->if_input(ml, ifp, m); + else + m_freem(m); + if_leave(&sr); } - ifp->if_ipackets += ml_len(ml); - ifp->if_ibytes += ibytes; + splx(s); + NET_UNLOCK(); +#ifdef IPSEC + if (locked) + KERNEL_UNLOCK(); +#endif +} + +void +if_input_m(struct mbuf_list *ml, struct ifnet *ifp, struct mbuf *m) +{ #if NBPFILTER > 0 - if_bpf = ifp->if_bpf; - if (if_bpf) { - struct mbuf_list ml0; + caddr_t if_bpf; +#endif - ml_init(&ml0); - ml_enlist(&ml0, ml); - ml_init(ml); - - while ((m = ml_dequeue(&ml0)) != NULL) { - if (bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_IN)) - m_freem(m); - else - ml_enqueue(ml, m); - } + m->m_pkthdr.ph_ifidx = ifp->if_index; + m->m_pkthdr.ph_rtableid = ifp->if_rdomain; - if (ml_empty(ml)) + ifp->if_ipackets++; + ifp->if_ibytes += m->m_pkthdr.len; + +#if NBPFILTER > 0 + if_bpf = ifp->if_bpf; + if (if_bpf) { + if (bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_IN)) { + m_freem(m); return; + } } #endif - if (mq_enlist(&ifp->if_inputqueue, ml) == 0) - task_add(net_tq(ifp->if_index), ifp->if_inputtask); + ml_enqueue(ml, m); +} + +void +if_input(struct ifnet *ifp, struct mbuf_list *ml) +{ + ifiq_input(&ifp->if_iq, ml, IFQ_MAXLEN); +} + +int +if_output_local(struct ifnet *ifp, struct mbuf *m, sa_family_t af) +{ + struct ifiqueue *ifiq; + unsigned int flow = 0; + + m->m_pkthdr.ph_family = af; + m->m_pkthdr.ph_ifidx = ifp->if_index; + m->m_pkthdr.ph_rtableid = ifp->if_rdomain; + + if (ISSET(m->m_pkthdr.ph_flowid, M_FLOWID_VALID)) + flow = m->m_pkthdr.ph_flowid & M_FLOWID_MASK; + + ifiq = ifp->if_iqs[flow % ifp->if_niqs]; + + return (ifiq_enqueue(ifiq, m) == 0 ? 
0 : ENOBUFS); } int @@ -788,145 +878,6 @@ if_input_local(struct ifnet *ifp, struct return (0); } -struct ifih { - SRPL_ENTRY(ifih) ifih_next; - int (*ifih_input)(struct ifnet *, struct mbuf *, - void *); - void *ifih_cookie; - int ifih_refcnt; - struct refcnt ifih_srpcnt; -}; - -void if_ih_ref(void *, void *); -void if_ih_unref(void *, void *); - -struct srpl_rc ifih_rc = SRPL_RC_INITIALIZER(if_ih_ref, if_ih_unref, NULL); - -void -if_ih_insert(struct ifnet *ifp, int (*input)(struct ifnet *, struct mbuf *, - void *), void *cookie) -{ - struct ifih *ifih; - - /* the kernel lock guarantees serialised modifications to if_inputs */ - KERNEL_ASSERT_LOCKED(); - - SRPL_FOREACH_LOCKED(ifih, &ifp->if_inputs, ifih_next) { - if (ifih->ifih_input == input && ifih->ifih_cookie == cookie) { - ifih->ifih_refcnt++; - break; - } - } - - if (ifih == NULL) { - ifih = malloc(sizeof(*ifih), M_DEVBUF, M_WAITOK); - - ifih->ifih_input = input; - ifih->ifih_cookie = cookie; - ifih->ifih_refcnt = 1; - refcnt_init(&ifih->ifih_srpcnt); - SRPL_INSERT_HEAD_LOCKED(&ifih_rc, &ifp->if_inputs, - ifih, ifih_next); - } -} - -void -if_ih_ref(void *null, void *i) -{ - struct ifih *ifih = i; - - refcnt_take(&ifih->ifih_srpcnt); -} - -void -if_ih_unref(void *null, void *i) -{ - struct ifih *ifih = i; - - refcnt_rele_wake(&ifih->ifih_srpcnt); -} - -void -if_ih_remove(struct ifnet *ifp, int (*input)(struct ifnet *, struct mbuf *, - void *), void *cookie) -{ - struct ifih *ifih; - - /* the kernel lock guarantees serialised modifications to if_inputs */ - KERNEL_ASSERT_LOCKED(); - - SRPL_FOREACH_LOCKED(ifih, &ifp->if_inputs, ifih_next) { - if (ifih->ifih_input == input && ifih->ifih_cookie == cookie) - break; - } - - KASSERT(ifih != NULL); - - if (--ifih->ifih_refcnt == 0) { - SRPL_REMOVE_LOCKED(&ifih_rc, &ifp->if_inputs, ifih, - ifih, ifih_next); - - refcnt_finalize(&ifih->ifih_srpcnt, "ifihrm"); - free(ifih, M_DEVBUF, sizeof(*ifih)); - } -} - -void -if_input_process(void *xifidx) -{ - unsigned int ifidx = (unsigned long)xifidx; - struct mbuf_list ml; - struct mbuf *m; - struct ifnet *ifp; - struct ifih *ifih; - struct srp_ref sr; - int s; - - ifp = if_get(ifidx); - if (ifp == NULL) - return; - - mq_delist(&ifp->if_inputqueue, &ml); - if (ml_empty(&ml)) - goto out; - - if (!ISSET(ifp->if_xflags, IFXF_CLONED)) - add_net_randomness(ml_len(&ml)); - - /* - * We grab the NET_LOCK() before processing any packet to - * ensure there's no contention on the routing table lock. - * - * Without it we could race with a userland thread to insert - * a L2 entry in ip{6,}_output(). Such race would result in - * one of the threads sleeping *inside* the IP output path. - * - * Since we have a NET_LOCK() we also use it to serialize access - * to PF globals, pipex globals, unicast and multicast addresses - * lists. - */ - NET_LOCK(); - s = splnet(); - while ((m = ml_dequeue(&ml)) != NULL) { - /* - * Pass this mbuf to all input handlers of its - * interface until it is consumed. 
- */ - SRPL_FOREACH(ifih, &sr, &ifp->if_inputs, ifih_next) { - if ((*ifih->ifih_input)(ifp, m, ifih->ifih_cookie)) - break; - } - SRPL_LEAVE(&sr); - - if (ifih == NULL) - m_freem(m); - } - splx(s); - NET_UNLOCK(); -out: - if_put(ifp); -} - void if_netisr(void *unused) { @@ -1032,10 +983,6 @@ if_detach(struct ifnet *ifp) ifp->if_ioctl = if_detached_ioctl; ifp->if_watchdog = NULL; - /* Remove the input task */ - task_del(net_tq(ifp->if_index), ifp->if_inputtask); - mq_purge(&ifp->if_inputqueue); - /* Remove the watchdog timeout & task */ timeout_del(ifp->if_slowtimo); task_del(net_tq(ifp->if_index), ifp->if_watchdogtask); @@ -1089,7 +1036,6 @@ if_detach(struct ifnet *ifp) free(ifp->if_slowtimo, M_TEMP, sizeof(*ifp->if_slowtimo)); free(ifp->if_watchdogtask, M_TEMP, sizeof(*ifp->if_watchdogtask)); free(ifp->if_linkstatetask, M_TEMP, sizeof(*ifp->if_linkstatetask)); - free(ifp->if_inputtask, M_TEMP, sizeof(*ifp->if_inputtask)); for (i = 0; (dp = domains[i]) != NULL; i++) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) @@ -1112,6 +1058,17 @@ if_detach(struct ifnet *ifp) free(ifp->if_ifqs, M_DEVBUF, sizeof(struct ifqueue *) * ifp->if_nifqs); } + + for (i = 0; i < ifp->if_niqs; i++) + ifq_destroy(ifp->if_ifqs[i]); + if (ifp->if_iqs != ifp->if_iq.ifiq_ifiqs) { + for (i = 1; i < ifp->if_niqs; i++) { + free(ifp->if_iqs[i], M_DEVBUF, + sizeof(struct ifiqueue)); + } + free(ifp->if_iqs, M_DEVBUF, + sizeof(struct ifiqueue *) * ifp->if_niqs); + } } /* @@ -1653,24 +1610,37 @@ ifunit(const char *name) * Map interface index to interface structure pointer. */ struct ifnet * -if_get(unsigned int index) +if_enter(struct srp_ref *sr, unsigned int index) { - struct srp_ref sr; struct if_map *if_map; struct srp *map; struct ifnet *ifp = NULL; - if_map = srp_enter(&sr, &if_idxmap.map); + if_map = srp_enter(sr, &if_idxmap.map); if (index < if_map->limit) { map = (struct srp *)(if_map + 1); - - ifp = srp_follow(&sr, &map[index]); - if (ifp != NULL) { - KASSERT(ifp->if_index == index); - if_ref(ifp); - } + ifp = srp_follow(sr, &map[index]); } - srp_leave(&sr); + + return (ifp); +} + +void +if_leave(struct srp_ref *sr) +{ + srp_leave(sr); +} + +struct ifnet * +if_get(unsigned int index) +{ + struct srp_ref sr; + struct ifnet *ifp; + + ifp = if_enter(&sr, index); + if (ifp != NULL) + if_ref(ifp); + if_leave(&sr); return (ifp); } @@ -2276,30 +2246,65 @@ void if_getdata(struct ifnet *ifp, struct if_data *data) { unsigned int i; - struct ifqueue *ifq; - uint64_t opackets = 0; - uint64_t obytes = 0; - uint64_t omcasts = 0; - uint64_t oqdrops = 0; + uint64_t packets = 0; + uint64_t bytes = 0; + uint64_t mcasts = 0; + uint64_t qdrops = 0; + + *data = ifp->if_data; + + NET_UNLOCK(); for (i = 0; i < ifp->if_nifqs; i++) { - ifq = ifp->if_ifqs[i]; + struct ifqueue *ifq = ifp->if_ifqs[i]; mtx_enter(&ifq->ifq_mtx); - opackets += ifq->ifq_packets; - obytes += ifq->ifq_bytes; - oqdrops += ifq->ifq_qdrops; - omcasts += ifq->ifq_mcasts; + packets += ifq->ifq_packets; + bytes += ifq->ifq_bytes; + qdrops += ifq->ifq_qdrops; + mcasts += ifq->ifq_mcasts; mtx_leave(&ifq->ifq_mtx); /* ifq->ifq_errors */ } - *data = ifp->if_data; - data->ifi_opackets += opackets; - data->ifi_obytes += obytes; - data->ifi_oqdrops += oqdrops; - data->ifi_omcasts += omcasts; + data->ifi_opackets += packets; + data->ifi_obytes += bytes; + data->ifi_oqdrops += qdrops; + data->ifi_omcasts += mcasts; /* ifp->if_data.ifi_oerrors */ + + for (i = 0; i < ifp->if_niqs; i++) { + struct ifiqueue *ifiq = ifp->if_iqs[i]; + unsigned int enter, leave; + + enter = 
ifiq->ifiq_gen; + for (;;) { + /* the generation number is odd during an update */ + while (enter & 1) { + yield(); + enter = ifiq->ifiq_gen; + } + + membar_consumer(); + packets = ifiq->ifiq_packets; + bytes = ifiq->ifiq_bytes; + qdrops = ifiq->ifiq_qdrops; + membar_consumer(); + + leave = ifiq->ifiq_gen; + + if (enter == leave) + break; + + enter = leave; + } + + data->ifi_ipackets += packets; + data->ifi_ibytes += bytes; + data->ifi_iqdrops += qdrops; + } + + NET_LOCK(); } /* @@ -2819,11 +2824,24 @@ if_rxr_adjust_cwm(struct if_rxring *rxr) rxr->rxr_adjusted = ticks; } -u_int +void +if_rxr_livelocked(struct if_rxring *rxr) +{ + extern int ticks; + + if (ticks - rxr->rxr_adjusted >= 1) { + if (rxr->rxr_cwm > rxr->rxr_lwm) + rxr->rxr_cwm--; + + rxr->rxr_adjusted = ticks; + } +} + +unsigned int if_rxr_get(struct if_rxring *rxr, u_int max) { extern int ticks; - u_int diff; + unsigned int diff; if (ticks - rxr->rxr_adjusted >= 1) { /* we're free to try for an adjustment */ Index: sys/net/if.h =================================================================== RCS file: /cvs/src/sys/net/if.h,v retrieving revision 1.187 diff -u -p -r1.187 if.h --- sys/net/if.h 31 Oct 2017 22:05:12 -0000 1.187 +++ sys/net/if.h 9 Nov 2017 04:56:15 -0000 @@ -458,11 +458,13 @@ struct if_parent { struct socket; struct ifnet; struct ifq_ops; +struct srp_ref; void if_alloc_sadl(struct ifnet *); void if_free_sadl(struct ifnet *); void if_attach(struct ifnet *); void if_attach_queues(struct ifnet *, unsigned int); +void if_attach_iqueues(struct ifnet *, unsigned int); void if_attach_ifq(struct ifnet *, const struct ifq_ops *, void *); void if_attachtail(struct ifnet *); void if_attachhead(struct ifnet *); @@ -482,6 +484,8 @@ int if_addgroup(struct ifnet *, const ch int if_delgroup(struct ifnet *, const char *); void if_group_routechange(struct sockaddr *, struct sockaddr *); struct ifnet *ifunit(const char *); +struct ifnet *if_enter(struct srp_ref *, unsigned int); +void if_leave(struct srp_ref *); struct ifnet *if_get(unsigned int); void if_put(struct ifnet *); void ifnewlladdr(struct ifnet *); Index: sys/net/if_bridge.c =================================================================== RCS file: /cvs/src/sys/net/if_bridge.c,v retrieving revision 1.298 diff -u -p -r1.298 if_bridge.c --- sys/net/if_bridge.c 17 Aug 2017 10:14:08 -0000 1.298 +++ sys/net/if_bridge.c 9 Nov 2017 04:56:15 -0000 @@ -185,6 +185,7 @@ bridge_clone_create(struct if_clone *ifc ifp->if_start = NULL; ifp->if_type = IFT_BRIDGE; ifp->if_hdrlen = ETHER_HDR_LEN; + ifp->if_input = ether_input; if_attach(ifp); if_alloc_sadl(ifp); @@ -194,8 +195,6 @@ bridge_clone_create(struct if_clone *ifc DLT_EN10MB, ETHER_HDR_LEN); #endif - if_ih_insert(ifp, ether_input, NULL); - return (0); } @@ -225,10 +224,6 @@ bridge_clone_destroy(struct ifnet *ifp) /* Undo pseudo-driver changes. 
*/ if_deactivate(ifp); - if_ih_remove(ifp, ether_input, NULL); - - KASSERT(SRPL_EMPTY_LOCKED(&ifp->if_inputs)); - if_detach(ifp); free(sc, M_DEVBUF, sizeof *sc); @@ -247,7 +242,6 @@ bridge_delete(struct bridge_softc *sc, s error = ifpromisc(p->ifp, 0); hook_disestablish(p->ifp->if_detachhooks, p->bif_dhcookie); - if_ih_remove(p->ifp, bridge_input, NULL); TAILQ_REMOVE(&sc->sc_iflist, p, next); bridge_rtdelete(sc, p->ifp, 0); bridge_flushrule(p); @@ -361,7 +355,6 @@ bridge_ioctl(struct ifnet *ifp, u_long c ifs->if_bridgeport = (caddr_t)p; p->bif_dhcookie = hook_establish(ifs->if_detachhooks, 0, bridge_ifdetach, ifs); - if_ih_insert(p->ifp, bridge_input, NULL); TAILQ_INSERT_TAIL(&sc->sc_iflist, p, next); break; case SIOCBRDGDEL: Index: sys/net/if_ethersubr.c =================================================================== RCS file: /cvs/src/sys/net/if_ethersubr.c,v retrieving revision 1.246 diff -u -p -r1.246 if_ethersubr.c --- sys/net/if_ethersubr.c 31 May 2017 05:59:09 -0000 1.246 +++ sys/net/if_ethersubr.c 9 Nov 2017 04:56:15 -0000 @@ -98,16 +98,27 @@ didn't get a copy, you may request one f #include #include #include +#include #if NBPFILTER > 0 #include #endif +#include "vlan.h" +#if NVLAN > 0 +#include +#endif + #include "pppoe.h" #if NPPPOE > 0 #include #endif +#include "carp.h" +#if NCARP > 0 +#include +#endif + #ifdef INET6 #include #include @@ -178,24 +189,18 @@ ether_rtrequest(struct ifnet *ifp, int r break; } } -/* - * Ethernet output routine. - * Encapsulate a packet of type family for the local net. - * Assumes that ifp is actually pointer to arpcom structure. - */ + int -ether_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, - struct rtentry *rt) +ether_resolve(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct rtentry *rt, struct ether_header *eh) { - u_int16_t etype; - u_char edst[ETHER_ADDR_LEN]; - u_char *esrc; - struct mbuf *mcopy = NULL; - struct ether_header *eh; struct arpcom *ac = (struct arpcom *)ifp; sa_family_t af = dst->sa_family; int error = 0; + if (!ISSET(ifp->if_flags, IFF_RUNNING)) + senderr(ENETDOWN); + KASSERT(rt != NULL || ISSET(m->m_flags, M_MCAST|M_BCAST) || af == AF_UNSPEC || af == pseudo_AF_HDRCMPLT); @@ -207,28 +212,31 @@ ether_output(struct ifnet *ifp, struct m } #endif - esrc = ac->ac_enaddr; - - if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) - senderr(ENETDOWN); - switch (af) { case AF_INET: - error = arpresolve(ifp, rt, m, dst, edst); + error = arpresolve(ifp, rt, m, dst, eh->ether_dhost); if (error) - return (error == EAGAIN ? 0 : error); + return (error); + eh->ether_type = htons(ETHERTYPE_IP); + /* If broadcasting on a simplex interface, loopback a copy */ - if ((m->m_flags & M_BCAST) && (ifp->if_flags & IFF_SIMPLEX) && - !m->m_pkthdr.pf.routed) + if (ISSET(m->m_flags, M_BCAST) && + ISSET(ifp->if_flags, IFF_SIMPLEX) && + !m->m_pkthdr.pf.routed) { + struct mbuf *mcopy; + + /* XXX Should we input an unencrypted IPsec packet? */ mcopy = m_copym(m, 0, M_COPYALL, M_NOWAIT); - etype = htons(ETHERTYPE_IP); + if (mcopy != NULL) + if_input_local(ifp, mcopy, af); + } break; #ifdef INET6 case AF_INET6: - error = nd6_resolve(ifp, rt, m, dst, edst); + error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost); if (error) - return (error == EAGAIN ? 
0 : error); - etype = htons(ETHERTYPE_IPV6); + return (error); + eh->ether_type = htons(ETHERTYPE_IPV6); break; #endif #ifdef MPLS @@ -242,230 +250,197 @@ ether_output(struct ifnet *ifp, struct m senderr(ENETUNREACH); switch (dst->sa_family) { - case AF_LINK: - if (satosdl(dst)->sdl_alen < sizeof(edst)) - senderr(EHOSTUNREACH); - memcpy(edst, LLADDR(satosdl(dst)), - sizeof(edst)); - break; - case AF_INET: - case AF_MPLS: - error = arpresolve(ifp, rt, m, dst, edst); - if (error) - return (error == EAGAIN ? 0 : error); - break; - default: + case AF_LINK: + if (satosdl(dst)->sdl_alen < sizeof(eh->ether_dhost)) senderr(EHOSTUNREACH); + memcpy(eh->ether_dhost, LLADDR(satosdl(dst)), + sizeof(eh->ether_dhost)); + break; + case AF_INET: + case AF_MPLS: + error = arpresolve(ifp, rt, m, dst, eh->ether_dhost); + if (error) + return (error); + break; + default: + senderr(EHOSTUNREACH); } /* XXX handling for simplex devices in case of M/BCAST ?? */ if (m->m_flags & (M_BCAST | M_MCAST)) - etype = htons(ETHERTYPE_MPLS_MCAST); + eh->ether_type = htons(ETHERTYPE_MPLS_MCAST); else - etype = htons(ETHERTYPE_MPLS); + eh->ether_type = htons(ETHERTYPE_MPLS); break; #endif /* MPLS */ + case pseudo_AF_HDRCMPLT: - eh = (struct ether_header *)dst->sa_data; - esrc = eh->ether_shost; - /* FALLTHROUGH */ + /* take the whole header from the sa */ + memcpy(eh, (struct ether_header *)dst->sa_data, sizeof(*eh)); + return (0); case AF_UNSPEC: - eh = (struct ether_header *)dst->sa_data; - memcpy(edst, eh->ether_dhost, sizeof(edst)); - /* AF_UNSPEC doesn't swap the byte order of the ether_type. */ - etype = eh->ether_type; + /* take the dst and type from the sa, but fall for the src */ + memcpy(eh, (struct ether_header *)dst->sa_data, sizeof(*eh)); break; default: - printf("%s: can't handle af%d\n", ifp->if_xname, - dst->sa_family); + printf("%s: can't handle af%d\n", ifp->if_xname, af); senderr(EAFNOSUPPORT); } - /* XXX Should we feed-back an unencrypted IPsec packet ? */ - if (mcopy) - if_input_local(ifp, mcopy, dst->sa_family); + memcpy(eh->ether_shost, ac->ac_enaddr, sizeof(eh->ether_shost)); - M_PREPEND(m, sizeof(struct ether_header) + ETHER_ALIGN, M_DONTWAIT); - if (m == NULL) - return (ENOBUFS); - m_adj(m, ETHER_ALIGN); - eh = mtod(m, struct ether_header *); - eh->ether_type = etype; - memcpy(eh->ether_dhost, edst, sizeof(eh->ether_dhost)); - memcpy(eh->ether_shost, esrc, sizeof(eh->ether_shost)); + return (0); - return (if_enqueue(ifp, m)); bad: m_freem(m); return (error); } /* - * Process a received Ethernet packet; - * the packet is in the mbuf chain m without - * the ether header, which is provided separately. + * Ethernet output routine. + * Encapsulate a packet of type family for the local net. + * Assumes that ifp is actually pointer to arpcom structure. 
*/ int -ether_input(struct ifnet *ifp, struct mbuf *m, void *cookie) +ether_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct rtentry *rt) +{ + struct ether_header eh; + int error = 0; + + error = ether_resolve(ifp, m, dst, rt, &eh); + switch (error) { + case 0: + break; + case EAGAIN: + return (0); + default: + return (error); + } + + M_PREPEND(m, ETHER_ALIGN + sizeof(eh), M_DONTWAIT); + if (m == NULL) + return (ENOBUFS); + + m_adj(m, ETHER_ALIGN); + memcpy(mtod(m, struct ether_header *), &eh, sizeof(eh)); + + return (if_enqueue(ifp, m)); +} + +void +ether_input(struct mbuf_list *ml, struct ifnet *ifp, struct mbuf *m) { - struct ether_header *eh; - struct niqueue *inq; - u_int16_t etype; - int llcfound = 0; - struct llc *l; struct arpcom *ac; -#if NPPPOE > 0 - struct ether_header *eh_tmp; -#endif + struct niqueue *inq; + struct ether_header *eh; + uint16_t etype; - /* Drop short frames */ - if (m->m_len < ETHER_HDR_LEN) - goto dropanyway; + if (m->m_len < sizeof(*eh)) + goto drop; ac = (struct arpcom *)ifp; eh = mtod(m, struct ether_header *); - m_adj(m, ETHER_HDR_LEN); + etype = eh->ether_type; + + if (ISSET(m->m_flags, M_VLANTAG) || + etype == htons(ETHERTYPE_VLAN) || + etype == htons(ETHERTYPE_QINQ)) { +#if NVLAN > 0 + vlan_input(ml, ifp, m); + return; +#else + goto drop; +#endif + } + +#if NCARP > 0 + if (ifp->if_carp && carp_input(ml, ifp, m)) + return; +#endif + + if (memcmp(ac->ac_enaddr, eh->ether_dhost, ETHER_ADDR_LEN) != 0) { + if (!ETHER_IS_MULTICAST(eh->ether_dhost)) + goto drop; - if (ETHER_IS_MULTICAST(eh->ether_dhost)) { /* * If this is not a simplex interface, drop the packet * if it came from us. */ - if ((ifp->if_flags & IFF_SIMPLEX) == 0) { - if (memcmp(ac->ac_enaddr, eh->ether_shost, - ETHER_ADDR_LEN) == 0) { - m_freem(m); - return (1); - } - } + if (!ISSET(ifp->if_flags, IFF_SIMPLEX) && + memcmp(ac->ac_enaddr, eh->ether_shost, + ETHER_ADDR_LEN) == 0) + goto drop; + + SET(m->m_flags, (memcmp(etherbroadcastaddr, eh->ether_dhost, + sizeof(etherbroadcastaddr)) == 0) ? M_BCAST : M_MCAST); - if (memcmp(etherbroadcastaddr, eh->ether_dhost, - sizeof(etherbroadcastaddr)) == 0) - m->m_flags |= M_BCAST; - else - m->m_flags |= M_MCAST; ifp->if_imcasts++; } - /* - * HW vlan tagged packets that were not collected by vlan(4) must - * be dropped now. - */ - if (m->m_flags & M_VLANTAG) { - m_freem(m); - return (1); - } +#if NPPPOE > 0 || defined(PIPEX) + switch (etype) { + case HTON16(ETHERTYPE_PPPOEDISC): + case HTON16(ETHERTYPE_PPPOE): + if (ISSET(m->m_flags, M_MCAST|M_BCAST)) + goto drop; - /* - * If packet is unicast, make sure it is for us. Drop otherwise. - * This check is required in promiscous mode, and for some hypervisors - * where the MAC filter is 'best effort' only. 
- */ - if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) { - if (memcmp(ac->ac_enaddr, eh->ether_dhost, ETHER_ADDR_LEN)) { - m_freem(m); - return (1); +#ifdef PIPEX + if (pipex_enable) { + struct pipex_session *session; + + if ((session = pipex_pppoe_lookup_session(m)) != NULL) { + pipex_pppoe_input(m, session); + return; + } } +#endif + if (etype == HTON16(ETHERTYPE_PPPOEDISC)) + inq = &pppoediscinq; + else + inq = &pppoeinq; + + niq_enqueue(inq, m); + return; } +#endif - etype = ntohs(eh->ether_type); + m_adj(m, sizeof(*eh)); -decapsulate: switch (etype) { - case ETHERTYPE_IP: + case HTON16(ETHERTYPE_IP): ipv4_input(ifp, m); - return (1); + return; - case ETHERTYPE_ARP: - if (ifp->if_flags & IFF_NOARP) - goto dropanyway; + case HTON16(ETHERTYPE_ARP): + if (ISSET(ifp->if_flags, IFF_NOARP)) + goto drop; arpinput(ifp, m); - return (1); + return; - case ETHERTYPE_REVARP: - if (ifp->if_flags & IFF_NOARP) - goto dropanyway; + case HTON16(ETHERTYPE_REVARP): + if (ISSET(ifp->if_flags, IFF_NOARP)) + goto drop; revarpinput(ifp, m); - return (1); + return; #ifdef INET6 - /* - * Schedule IPv6 software interrupt for incoming IPv6 packet. - */ - case ETHERTYPE_IPV6: + case HTON16(ETHERTYPE_IPV6): ipv6_input(ifp, m); - return (1); + return; #endif /* INET6 */ -#if NPPPOE > 0 || defined(PIPEX) - case ETHERTYPE_PPPOEDISC: - case ETHERTYPE_PPPOE: - if (m->m_flags & (M_MCAST | M_BCAST)) - goto dropanyway; - M_PREPEND(m, sizeof(*eh), M_DONTWAIT); - if (m == NULL) - return (1); - - eh_tmp = mtod(m, struct ether_header *); - /* - * danger! - * eh_tmp and eh may overlap because eh - * is stolen from the mbuf above. - */ - memmove(eh_tmp, eh, sizeof(struct ether_header)); -#ifdef PIPEX - if (pipex_enable) { - struct pipex_session *session; - if ((session = pipex_pppoe_lookup_session(m)) != NULL) { - pipex_pppoe_input(m, session); - return (1); - } - } -#endif - if (etype == ETHERTYPE_PPPOEDISC) - inq = &pppoediscinq; - else - inq = &pppoeinq; - break; -#endif #ifdef MPLS - case ETHERTYPE_MPLS: - case ETHERTYPE_MPLS_MCAST: + case HTON16(ETHERTYPE_MPLS): + case HTON16(ETHERTYPE_MPLS_MCAST): mpls_input(m); - return (1); + return; #endif - default: - if (llcfound || etype > ETHERMTU || - m->m_len < sizeof(struct llc)) - goto dropanyway; - llcfound = 1; - l = mtod(m, struct llc *); - switch (l->llc_dsap) { - case LLC_SNAP_LSAP: - if (l->llc_control == LLC_UI && - l->llc_dsap == LLC_SNAP_LSAP && - l->llc_ssap == LLC_SNAP_LSAP) { - /* SNAP */ - if (m->m_pkthdr.len > etype) - m_adj(m, etype - m->m_pkthdr.len); - m_adj(m, 6); - M_PREPEND(m, sizeof(*eh), M_DONTWAIT); - if (m == NULL) - return (1); - *mtod(m, struct ether_header *) = *eh; - goto decapsulate; - } - default: - goto dropanyway; - } } - niq_enqueue(inq, m); - return (1); -dropanyway: +drop: m_freem(m); - return (1); } /* @@ -528,7 +503,7 @@ ether_ifattach(struct ifnet *ifp) ifp->if_output = ether_output; ifp->if_rtrequest = ether_rtrequest; - if_ih_insert(ifp, ether_input, NULL); + ifp->if_input = ether_input; if (ifp->if_hardmtu == 0) ifp->if_hardmtu = ETHERMTU; @@ -550,9 +525,7 @@ ether_ifdetach(struct ifnet *ifp) /* Undo pseudo-driver changes. 
*/ if_deactivate(ifp); - if_ih_remove(ifp, ether_input, NULL); - - KASSERT(SRPL_EMPTY_LOCKED(&ifp->if_inputs)); + KASSERT(ifp->if_input == ether_input); for (enm = LIST_FIRST(&ac->ac_multiaddrs); enm != NULL; Index: sys/net/if_loop.c =================================================================== RCS file: /cvs/src/sys/net/if_loop.c,v retrieving revision 1.83 diff -u -p -r1.83 if_loop.c --- sys/net/if_loop.c 31 Oct 2017 22:05:12 -0000 1.83 +++ sys/net/if_loop.c 9 Nov 2017 04:56:15 -0000 @@ -143,7 +143,7 @@ int loioctl(struct ifnet *, u_long, caddr_t); void loopattach(int); void lortrequest(struct ifnet *, int, struct rtentry *); -int loinput(struct ifnet *, struct mbuf *, void *); +void loinput(struct mbuf_list *, struct ifnet *, struct mbuf *); int looutput(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); @@ -192,7 +192,7 @@ loop_clone_create(struct if_clone *ifc, #if NBPFILTER > 0 bpfattach(&ifp->if_bpf, ifp, DLT_LOOP, sizeof(u_int32_t)); #endif - if_ih_insert(ifp, loinput, NULL); + ifp->if_input = loinput; return (0); } @@ -202,26 +202,20 @@ loop_clone_destroy(struct ifnet *ifp) if (ifp->if_index == rtable_loindex(ifp->if_rdomain)) return (EPERM); - if_ih_remove(ifp, loinput, NULL); if_detach(ifp); free(ifp, M_DEVBUF, sizeof(*ifp)); return (0); } -int -loinput(struct ifnet *ifp, struct mbuf *m, void *cookie) +void +loinput(struct mbuf_list *ml, struct ifnet *ifp, struct mbuf *m) { - int error; - if ((m->m_flags & M_PKTHDR) == 0) panic("%s: no header mbuf", __func__); - error = if_input_local(ifp, m, m->m_pkthdr.ph_family); - if (error) + if (if_input_local(ifp, m, m->m_pkthdr.ph_family) != 0) ifp->if_ierrors++; - - return (1); } int @@ -241,12 +235,7 @@ looutput(struct ifnet *ifp, struct mbuf if ((m->m_flags & M_LOOP) == 0) return (if_input_local(ifp, m, dst->sa_family)); - m->m_pkthdr.ph_family = dst->sa_family; - if (mq_enqueue(&ifp->if_inputqueue, m)) - return ENOBUFS; - task_add(net_tq(ifp->if_index), ifp->if_inputtask); - - return (0); + return (if_output_local(ifp, m, dst->sa_family)); } void Index: sys/net/if_mpw.c =================================================================== RCS file: /cvs/src/sys/net/if_mpw.c,v retrieving revision 1.22 diff -u -p -r1.22 if_mpw.c --- sys/net/if_mpw.c 15 May 2017 14:03:53 -0000 1.22 +++ sys/net/if_mpw.c 9 Nov 2017 04:56:15 -0000 @@ -63,7 +63,9 @@ int mpw_ioctl(struct ifnet *, u_long, ca int mpw_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); void mpw_start(struct ifnet *); +#if 0 int mpw_input(struct ifnet *, struct mbuf *, void *); +#endif #if NVLAN > 0 struct mbuf *mpw_vlan_handle(struct mbuf *, struct mpw_softc *); #endif /* NVLAN */ @@ -108,8 +110,6 @@ mpw_clone_create(struct if_clone *ifc, i sc->sc_smpls.smpls_len = sizeof(sc->sc_smpls); sc->sc_smpls.smpls_family = AF_MPLS; - if_ih_insert(ifp, mpw_input, NULL); - #if NBPFILTER > 0 bpfattach(&ifp->if_bpf, ifp, DLT_EN10MB, ETHER_HDR_LEN); #endif /* NBFILTER */ @@ -129,14 +129,13 @@ mpw_clone_destroy(struct ifnet *ifp) smplstosa(&sc->sc_smpls)); } - if_ih_remove(ifp, mpw_input, NULL); - if_detach(ifp); free(sc, M_DEVBUF, sizeof(*sc)); return (0); } +#if 0 int mpw_input(struct ifnet *ifp, struct mbuf *m, void *cookie) { @@ -144,6 +143,7 @@ mpw_input(struct ifnet *ifp, struct mbuf m_freem(m); return (1); } +#endif int mpw_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) Index: sys/net/if_switch.c =================================================================== RCS file: /cvs/src/sys/net/if_switch.c,v retrieving revision 1.20 diff -u 
-p -r1.20 if_switch.c --- sys/net/if_switch.c 31 May 2017 05:59:09 -0000 1.20 +++ sys/net/if_switch.c 9 Nov 2017 04:56:15 -0000 @@ -543,7 +543,6 @@ switch_port_add(struct switch_softc *sc, swpo->swpo_switch = sc; swpo->swpo_ifindex = ifs->if_index; ifs->if_switchport = (caddr_t)swpo; - if_ih_insert(ifs, switch_input, NULL); swpo->swpo_port_no = swofp_assign_portno(sc, ifs->if_index); swpo->swpo_dhcookie = hook_establish(ifs->if_detachhooks, 0, switch_port_detach, ifs); @@ -615,7 +614,6 @@ switch_port_detach(void *arg) ifp->if_switchport = NULL; hook_disestablish(ifp->if_detachhooks, swpo->swpo_dhcookie); ifpromisc(ifp, 0); - if_ih_remove(ifp, switch_input, NULL); TAILQ_REMOVE(&sc->sc_swpo_list, swpo, swpo_list_next); free(swpo, M_DEVBUF, sizeof(*swpo)); } Index: sys/net/if_switch.h =================================================================== RCS file: /cvs/src/sys/net/if_switch.h,v retrieving revision 1.10 diff -u -p -r1.10 if_switch.h --- sys/net/if_switch.h 20 Nov 2016 12:45:26 -0000 1.10 +++ sys/net/if_switch.h 9 Nov 2017 04:56:15 -0000 @@ -176,6 +176,8 @@ struct switch_port { uint32_t swpo_flags; void *swpo_dhcookie; void (*swop_bk_start)(struct ifnet *); + void (*swpo_if_input)(struct mbuf_list *, + struct ifnet *, struct mbuf *); }; TAILQ_HEAD(switch_fwdp_queue, switch_port); Index: sys/net/if_trunk.c =================================================================== RCS file: /cvs/src/sys/net/if_trunk.c,v retrieving revision 1.134 diff -u -p -r1.134 if_trunk.c --- sys/net/if_trunk.c 14 Aug 2017 08:31:00 -0000 1.134 +++ sys/net/if_trunk.c 9 Nov 2017 04:56:15 -0000 @@ -75,8 +75,10 @@ int trunk_ether_delmulti(struct trunk_s void trunk_ether_purgemulti(struct trunk_softc *); int trunk_ether_cmdmulti(struct trunk_port *, u_long); int trunk_ioctl_allports(struct trunk_softc *, u_long, caddr_t); -int trunk_input(struct ifnet *, struct mbuf *, void *); -void trunk_start(struct ifnet *); +void trunk_port_input_m(struct mbuf_list *, struct ifnet *, struct mbuf *); +int trunk_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); +void trunk_start(struct ifqueue *); void trunk_init(struct ifnet *); void trunk_stop(struct ifnet *); int trunk_media_change(struct ifnet *); @@ -182,9 +184,10 @@ trunk_clone_create(struct if_clone *ifc, ifp = &tr->tr_ac.ac_if; ifp->if_softc = tr; - ifp->if_start = trunk_start; + ifp->if_qstart = trunk_start; ifp->if_ioctl = trunk_ioctl; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; + ifp->if_xflags = IFXF_MPSAFE; ifp->if_capabilities = trunk_capabilities(tr); IFQ_SET_MAXLEN(&ifp->if_snd, 1); @@ -199,6 +202,8 @@ trunk_clone_create(struct if_clone *ifc, if_attach(ifp); ether_ifattach(ifp); + ifp->if_output = trunk_output; + /* Insert into the global list of trunks */ SLIST_INSERT_HEAD(&trunk_list, tr, tr_entries); @@ -331,22 +336,24 @@ trunk_port_create(struct trunk_softc *tr } } - /* Change the interface type */ + tp->tp_if = ifp; + tp->tp_trunk = tr; tp->tp_iftype = ifp->if_type; - ifp->if_type = IFT_IEEE8023ADLAG; - + tp->tp_input = ifp->if_input; tp->tp_ioctl = ifp->if_ioctl; - ifp->if_ioctl = trunk_port_ioctl; - tp->tp_output = ifp->if_output; - ifp->if_output = trunk_port_output; - - tp->tp_if = ifp; - tp->tp_trunk = tr; /* Save port link layer address */ bcopy(((struct arpcom *)ifp)->ac_enaddr, tp->tp_lladdr, ETHER_ADDR_LEN); + ifp->if_trunkport = (caddr_t)tp; + + /* Change the interface type */ + ifp->if_type = IFT_IEEE8023ADLAG; + ifp->if_ioctl = trunk_port_ioctl; + ifp->if_output = trunk_port_output; + 
ifp->if_input = trunk_port_input_m; + if (SLIST_EMPTY(&tr->tr_ports)) { tr->tr_primary = tp; tp->tp_flags |= TRUNK_PORT_MASTER; @@ -378,9 +385,6 @@ trunk_port_create(struct trunk_softc *tr if (tr->tr_port_create != NULL) error = (*tr->tr_port_create)(tp); - /* Change input handler of the physical interface. */ - if_ih_insert(ifp, trunk_input, tp); - return (error); } @@ -408,9 +412,6 @@ trunk_port_destroy(struct trunk_port *tp struct trunk_port *tp_ptr; struct ifnet *ifp = tp->tp_if; - /* Restore previous input handler. */ - if_ih_remove(ifp, trunk_input, tp); - /* Remove multicast addresses from this port */ trunk_ether_cmdmulti(tp, SIOCDELMULTI); @@ -428,6 +429,7 @@ trunk_port_destroy(struct trunk_port *tp ifp->if_ioctl = tp->tp_ioctl; ifp->if_output = tp->tp_output; + ifp->if_input = tp->tp_input; hook_disestablish(ifp->if_linkstatehooks, tp->lh_cookie); hook_disestablish(ifp->if_detachhooks, tp->dh_cookie); @@ -648,8 +650,6 @@ trunk_ioctl(struct ifnet *ifp, u_long cm * running trunk_input's on this port to finish * granting us an exclusive access to it. */ - SLIST_FOREACH(tp, &tr->tr_ports, tp_entries) - if_ih_remove(tp->tp_if, trunk_input, tp); if (tr->tr_proto != TRUNK_PROTO_NONE) error = tr->tr_detach(tr); if (error != 0) @@ -663,9 +663,6 @@ trunk_ioctl(struct ifnet *ifp, u_long cm tr->tr_proto = trunk_protos[i].ti_proto; if (tr->tr_proto != TRUNK_PROTO_NONE) error = trunk_protos[i].ti_attach(tr); - SLIST_FOREACH(tp, &tr->tr_ports, tp_entries) - if_ih_insert(tp->tp_if, - trunk_input, tp); /* Update trunk capabilities */ tr->tr_capabilities = trunk_capabilities(tr); goto out; @@ -905,18 +902,73 @@ trunk_ioctl_allports(struct trunk_softc return (error); } +int +trunk_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct rtentry *rt) +{ +#if NBPFILTER > 0 + caddr_t if_bpf; +#endif + struct trunk_softc *tr; + struct trunk_port *tp; + struct sockaddr_storage ss; + struct sockaddr *edst = (struct sockaddr *)&ss; + struct ether_header *eh = (struct ether_header *)edst->sa_data; + int error; + + if (!ifq_priq(&ifp->if_snd)) { + /* + * user wants to delay packets, which relies on the ifq + * machinery. fall back to if_enqueue via ether_output. 
+ */ + return (ether_output(ifp, m, dst, rt)); + } + + error = ether_resolve(ifp, m, dst, rt, eh); + switch (error) { + case 0: + break; + case EAGAIN: + return (0); + default: + return (error); + } + +#if NBPFILTER > 0 + if_bpf = ifp->if_bpf; + if (if_bpf) { + bpf_mtap_hdr(if_bpf, (caddr_t)eh, sizeof(*eh), m, + BPF_DIRECTION_OUT, NULL); + } +#endif + + tr = (struct trunk_softc *)ifp->if_softc; + if (tr->tr_proto == TRUNK_PROTO_NONE || tr->tr_count == 0) + goto busy; + + tp = lacp_select_tx_port(tr, m); + if (tp == NULL) + goto busy; + + ifq_count(&ifp->if_snd, m); + + edst->sa_family = pseudo_AF_HDRCMPLT; + return (ether_output(tp->tp_if, m, edst, NULL)); + +busy: + m_freem(m); + return (EBUSY); +} + void -trunk_start(struct ifnet *ifp) +trunk_start(struct ifqueue *ifq) { + struct ifnet *ifp = ifq->ifq_if; struct trunk_softc *tr = (struct trunk_softc *)ifp->if_softc; struct mbuf *m; int error; - for (;;) { - IFQ_DEQUEUE(&ifp->if_snd, m); - if (m == NULL) - break; - + while ((m = ifq_dequeue(ifq)) != NULL) { #if NBPFILTER > 0 if (ifp->if_bpf) bpf_mtap_ether(ifp->if_bpf, m, BPF_DIRECTION_OUT); @@ -925,11 +977,11 @@ trunk_start(struct ifnet *ifp) if (tr->tr_proto != TRUNK_PROTO_NONE && tr->tr_count) { error = (*tr->tr_start)(tr, m); if (error != 0) - ifp->if_oerrors++; + ifq->ifq_errors++; } else { m_freem(m); if (tr->tr_proto != TRUNK_PROTO_NONE) - ifp->if_oerrors++; + ifq->ifq_errors++; } } } @@ -1022,66 +1074,36 @@ trunk_stop(struct ifnet *ifp) (*tr->tr_stop)(tr); } -int -trunk_input(struct ifnet *ifp, struct mbuf *m, void *cookie) +void +trunk_port_input_m(struct mbuf_list *ml, struct ifnet *ifp, struct mbuf *m) { - struct trunk_softc *tr; struct trunk_port *tp; - struct ifnet *trifp = NULL; - struct ether_header *eh; - struct mbuf_list ml = MBUF_LIST_INITIALIZER(); - - eh = mtod(m, struct ether_header *); - if (ETHER_IS_MULTICAST(eh->ether_dhost)) - ifp->if_imcasts++; - - /* Should be checked by the caller */ - if (ifp->if_type != IFT_IEEE8023ADLAG) - goto bad; - - tp = (struct trunk_port *)cookie; - if ((tr = (struct trunk_softc *)tp->tp_trunk) == NULL) - goto bad; + struct trunk_softc *tr; + struct ifnet *trifp; + tp = (struct trunk_port *)ifp->if_trunkport; + tr = (struct trunk_softc *)tp->tp_trunk; trifp = &tr->tr_ac.ac_if; + if (tr->tr_proto == TRUNK_PROTO_NONE) goto bad; - if ((*tr->tr_input)(tr, tp, m)) { + if ((*tr->tr_input)(tr, tp, m) != 0) { /* * We stop here if the packet has been consumed * by the protocol routine. */ - return (1); + return; } - if ((trifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) + if (!ISSET(trifp->if_flags, IFF_RUNNING)) goto bad; - /* - * Drop promiscuously received packets if we are not in - * promiscuous mode. 
- */ - if (!ETHER_IS_MULTICAST(eh->ether_dhost) && - (ifp->if_flags & IFF_PROMISC) && - (trifp->if_flags & IFF_PROMISC) == 0) { - if (bcmp(&tr->tr_ac.ac_enaddr, eh->ether_dhost, - ETHER_ADDR_LEN)) { - m_freem(m); - return (1); - } - } - - - ml_enqueue(&ml, m); - if_input(trifp, &ml); - return (1); + if_input_m(ml, trifp, m); + return; bad: - if (trifp != NULL) - trifp->if_ierrors++; m_freem(m); - return (1); } int Index: sys/net/if_trunk.h =================================================================== RCS file: /cvs/src/sys/net/if_trunk.h,v retrieving revision 1.25 diff -u -p -r1.25 if_trunk.h --- sys/net/if_trunk.h 23 Sep 2015 12:40:12 -0000 1.25 +++ sys/net/if_trunk.h 9 Nov 2017 04:56:15 -0000 @@ -139,6 +139,7 @@ struct trunk_port { void *dh_cookie; /* if detach hook */ /* Redirected callbacks */ + void (*tp_input)(struct mbuf_list *, struct ifnet *, struct mbuf *); int (*tp_ioctl)(struct ifnet *, u_long, caddr_t); int (*tp_output)(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); Index: sys/net/if_var.h =================================================================== RCS file: /cvs/src/sys/net/if_var.h,v retrieving revision 1.83 diff -u -p -r1.83 if_var.h --- sys/net/if_var.h 31 Oct 2017 22:05:12 -0000 1.83 +++ sys/net/if_var.h 9 Nov 2017 04:56:15 -0000 @@ -116,6 +116,7 @@ struct ifnet { /* and the entries */ caddr_t if_bpf; /* packet filter structure */ caddr_t if_bridgeport; /* used by bridge ports */ caddr_t if_switchport; /* used by switch ports */ + caddr_t if_trunkport; caddr_t if_mcast; /* used by multicast code */ caddr_t if_mcast6; /* used by IPv6 multicast code */ caddr_t if_pf_kif; /* pf interface abstraction */ @@ -140,11 +141,7 @@ struct ifnet { /* and the entries */ struct task *if_linkstatetask; /* task to do route updates */ /* procedure handles */ - struct mbuf_queue if_inputqueue; - struct task *if_inputtask; /* input task */ - SRPL_HEAD(, ifih) if_inputs; /* input routines (dequeue) */ - - /* output routine (enqueue) */ + void (*if_input)(struct mbuf_list *, struct ifnet *, struct mbuf *); int (*if_output)(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); @@ -153,6 +150,7 @@ struct ifnet { /* and the entries */ struct sockaddr *, struct rtentry *); /* initiate output routine */ void (*if_start)(struct ifnet *); + void (*if_qstart)(struct ifqueue *); /* ioctl routine */ int (*if_ioctl)(struct ifnet *, u_long, caddr_t); /* timer routine */ @@ -161,9 +159,12 @@ struct ifnet { /* and the entries */ struct ifqueue if_snd; /* transmit queue */ struct ifqueue **if_ifqs; /* pointer to an array of sndqs */ - void (*if_qstart)(struct ifqueue *); unsigned int if_nifqs; + struct ifiqueue if_iq; /* input queue */ + struct ifiqueue **if_iqs; /* pointer to the array of iqs */ + unsigned int if_niqs; + struct sockaddr_dl *if_sadl; /* pointer to our sockaddr_dl */ void *if_afdata[AF_MAX]; @@ -303,7 +304,10 @@ void if_start(struct ifnet *); int if_enqueue_try(struct ifnet *, struct mbuf *); int if_enqueue(struct ifnet *, struct mbuf *); void if_input(struct ifnet *, struct mbuf_list *); +void if_input_process(struct mbuf_list *); +void if_input_m(struct mbuf_list *, struct ifnet *, struct mbuf *); int if_input_local(struct ifnet *, struct mbuf *, sa_family_t); +int if_output_local(struct ifnet *, struct mbuf *, sa_family_t); void if_rtrequest_dummy(struct ifnet *, int, struct rtentry *); void p2p_rtrequest(struct ifnet *, int, struct rtentry *); @@ -333,11 +337,13 @@ void if_ih_insert(struct ifnet *, int (* void if_ih_remove(struct ifnet *, 
int (*)(struct ifnet *, struct mbuf *, void *), void *); +void if_rxr_livelocked(struct if_rxring *); void if_rxr_init(struct if_rxring *, u_int, u_int); u_int if_rxr_get(struct if_rxring *, u_int); #define if_rxr_put(_r, _c) do { (_r)->rxr_alive -= (_c); } while (0) #define if_rxr_inuse(_r) ((_r)->rxr_alive) +#define if_rxr_cwm(_r) ((_r)->rxr_cwm) int if_rxr_info_ioctl(struct if_rxrinfo *, u_int, struct if_rxring_info *); int if_rxr_ioctl(struct if_rxrinfo *, const char *, u_int, Index: sys/net/if_vlan.c =================================================================== RCS file: /cvs/src/sys/net/if_vlan.c,v retrieving revision 1.174 diff -u -p -r1.174 if_vlan.c --- sys/net/if_vlan.c 22 Jun 2017 11:34:51 -0000 1.174 +++ sys/net/if_vlan.c 9 Nov 2017 04:56:15 -0000 @@ -84,7 +84,6 @@ void vlanattach(int count); int vlan_clone_create(struct if_clone *, int); int vlan_clone_destroy(struct ifnet *); -int vlan_input(struct ifnet *, struct mbuf *, void *); void vlan_start(struct ifqueue *ifq); int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); @@ -185,6 +184,7 @@ vlan_clone_create(struct if_clone *ifc, ifp->if_link_state = LINK_STATE_DOWN; if_attach(ifp); ether_ifattach(ifp); + ifp->if_output = vlan_output; ifp->if_hdrlen = EVL_ENCAPLEN; return (0); @@ -223,27 +223,102 @@ vlan_clone_destroy(struct ifnet *ifp) return (0); } -static inline int -vlan_mplstunnel(int ifidx) +int +vlan_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *edst, + struct rtentry *rt) { -#if NMPW > 0 - struct ifnet *ifp; - int rv = 0; - - ifp = if_get(ifidx); - if (ifp != NULL) { - rv = ifp->if_type == IFT_MPLSTUNNEL; - if_put(ifp); + struct ifvlan *ifv; + struct ifnet *ifp0; + struct srp_ref sr; +#if NBPFILTER > 0 + caddr_t if_bpf; +#endif + struct sockaddr_storage ss; + struct sockaddr *vdst = (struct sockaddr *)&ss; + struct ether_header *eh = (struct ether_header *)vdst->sa_data; + uint16_t tag; + uint8_t prio; + int error; + + if (!ifq_priq(&ifp->if_snd)) { + /* + * user wants to delay packets, which relies on the ifq + * machinery. fall back to if_enqueue via ether_output. + */ + return (ether_output(ifp, m, edst, rt)); + } + + error = ether_resolve(ifp, m, edst, rt, eh); + switch (error) { + case 0: + break; + case EAGAIN: + return (0); + default: + return (error); + } + + ifv = ifp->if_softc; + ifp0 = if_enter(&sr, ifv->ifv_ifp0); + if (ifp0 == NULL || !ISSET(ifp0->if_flags, IFF_RUNNING)) { + m_freem(m); + error = ENETDOWN; + goto leave; + } + +#if NBPFILTER > 0 + if_bpf = ifp->if_bpf; + if (if_bpf) { + bpf_mtap_hdr(if_bpf, (caddr_t)eh, sizeof(*eh), m, + BPF_DIRECTION_OUT, NULL); } - return (rv); -#else - return (0); #endif + + /* IEEE 802.1p has prio 0 and 1 swapped */ + prio = m->m_pkthdr.pf.prio; + if (prio <= 1) + prio = !prio; + + tag = (prio << EVL_PRIO_BITS) | ifv->ifv_tag; + + /* + * If the underlying interface cannot do VLAN tag insertion + * itself, create an encapsulation header. 
+ */ + if (ISSET(ifp0->if_capabilities, IFCAP_VLAN_HWTAGGING) && + ifv->ifv_type == ETHERTYPE_VLAN) { + m->m_pkthdr.ether_vtag = tag; + m->m_flags |= M_VLANTAG; + } else { + struct ether_vlan_shim *evl; + + M_PREPEND(m, sizeof(*evl), M_DONTWAIT); + if (m == NULL) { + error = ENOBUFS; + goto leave; + } + + evl = mtod(m, struct ether_vlan_shim *); + evl->evl_tag = htons(tag); + evl->evl_proto = eh->ether_type; + + eh->ether_type = htons(ifv->ifv_type); + } + + ifq_count(&ifp->if_snd, m); + + vdst->sa_family = pseudo_AF_HDRCMPLT; + error = ifp0->if_output(ifp0, m, vdst, NULL); + +leave: + if_leave(&sr); + return (error); } void vlan_start(struct ifqueue *ifq) { + struct srp_ref sr; struct ifnet *ifp = ifq->ifq_if; struct ifvlan *ifv; struct ifnet *ifp0; @@ -251,9 +326,8 @@ vlan_start(struct ifqueue *ifq) uint8_t prio; ifv = ifp->if_softc; - ifp0 = if_get(ifv->ifv_ifp0); - if (ifp0 == NULL || (ifp0->if_flags & (IFF_UP|IFF_RUNNING)) != - (IFF_UP|IFF_RUNNING)) { + ifp0 = if_enter(&sr, ifv->ifv_ifp0); + if (ifp0 == NULL || !ISSET(ifp0->if_flags, IFF_RUNNING)) { ifq_purge(ifq); goto leave; } @@ -271,17 +345,10 @@ vlan_start(struct ifqueue *ifq) prio = !prio; /* - * If this packet came from a pseudowire it means it already - * has all tags it needs, so just output it. - */ - if (vlan_mplstunnel(m->m_pkthdr.ph_ifidx)) { - /* NOTHING */ - - /* * If the underlying interface cannot do VLAN tag insertion * itself, create an encapsulation header. */ - } else if ((ifp0->if_capabilities & IFCAP_VLAN_HWTAGGING) && + if ((ifp0->if_capabilities & IFCAP_VLAN_HWTAGGING) && (ifv->ifv_type == ETHERTYPE_VLAN)) { m->m_pkthdr.ether_vtag = ifv->ifv_tag + (prio << EVL_PRIO_BITS); @@ -303,7 +370,7 @@ vlan_start(struct ifqueue *ifq) } leave: - if_put(ifp0); + if_leave(&sr); } struct mbuf * @@ -328,45 +395,40 @@ vlan_inject(struct mbuf *m, uint16_t typ return (m); } -/* - * vlan_input() returns 1 if it has consumed the packet, 0 otherwise. - */ -int -vlan_input(struct ifnet *ifp0, struct mbuf *m, void *cookie) +void +vlan_input(struct mbuf_list *ml, struct ifnet *ifp0, struct mbuf *m) { struct ifvlan *ifv; struct ether_vlan_header *evl; struct ether_header *eh; SRPL_HEAD(, ifvlan) *tagh, *list; struct srp_ref sr; + uint64_t vtag; u_int tag; - struct mbuf_list ml = MBUF_LIST_INITIALIZER(); u_int16_t etype; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); - if (m->m_flags & M_VLANTAG) { + if (ISSET(m->m_flags, M_VLANTAG)) { etype = ETHERTYPE_VLAN; + vtag = m->m_pkthdr.ether_vtag; tagh = vlan_tagh; - } else if ((etype == ETHERTYPE_VLAN) || (etype == ETHERTYPE_QINQ)) { + } else { if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { ifp0->if_ierrors++; - return (1); + return; } evl = mtod(m, struct ether_vlan_header *); - m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); + vtag = ntohs(evl->evl_tag); tagh = etype == ETHERTYPE_QINQ ? svlan_tagh : vlan_tagh; - } else { - /* Skip non-VLAN packets. 
*/ - return (0); } /* From now on ether_vtag is fine */ - tag = EVL_VLANOFTAG(m->m_pkthdr.ether_vtag); - m->m_pkthdr.pf.prio = EVL_PRIOFTAG(m->m_pkthdr.ether_vtag); + tag = EVL_VLANOFTAG(vtag); + m->m_pkthdr.pf.prio = EVL_PRIOFTAG(vtag); /* IEEE 802.1p has prio 0 and 1 swapped */ if (m->m_pkthdr.pf.prio <= 1) @@ -374,7 +436,8 @@ vlan_input(struct ifnet *ifp0, struct mb list = &tagh[TAG_HASH(tag)]; SRPL_FOREACH(ifv, &sr, list, ifv_list) { - if (ifp0->if_index == ifv->ifv_ifp0 && tag == ifv->ifv_tag && + if (ifp0->if_index == ifv->ifv_ifp0 && + tag == ifv->ifv_tag && etype == ifv->ifv_type) break; } @@ -384,8 +447,7 @@ vlan_input(struct ifnet *ifp0, struct mb goto drop; } - if ((ifv->ifv_if.if_flags & (IFF_UP|IFF_RUNNING)) != - (IFF_UP|IFF_RUNNING)) + if (!ISSET(ifv->ifv_if.if_flags, IFF_RUNNING)) goto drop; /* @@ -393,23 +455,22 @@ vlan_input(struct ifnet *ifp0, struct mb * the given source interface and vlan tag, remove the * encapsulation. */ - if (m->m_flags & M_VLANTAG) { - m->m_flags &= ~M_VLANTAG; - } else { + if (ISSET(m->m_flags, M_VLANTAG)) + CLR(m->m_flags, M_VLANTAG); + else { eh->ether_type = evl->evl_proto; memmove((char *)eh + EVL_ENCAPLEN, eh, sizeof(*eh)); m_adj(m, EVL_ENCAPLEN); } - ml_enqueue(&ml, m); - if_input(&ifv->ifv_if, &ml); + if_input_m(ml, &ifv->ifv_if, m); + SRPL_LEAVE(&sr); - return (1); + return; drop: SRPL_LEAVE(&sr); m_freem(m); - return (1); } int @@ -433,8 +494,6 @@ vlan_parent_up(struct ifvlan *ifv, struc vlan_multi_apply(ifv, ifp0, SIOCADDMULTI); - if_ih_insert(ifp0, vlan_input, NULL); - return (0); } @@ -552,7 +611,6 @@ vlan_down(struct ifvlan *ifv) ifp0 = if_get(ifv->ifv_ifp0); if (ifp0 != NULL) { - if_ih_remove(ifp0, vlan_input, NULL); if (ISSET(ifv->ifv_flags, IFVF_PROMISC)) ifpromisc(ifp0, 0); vlan_multi_apply(ifv, ifp0, SIOCDELMULTI); Index: sys/net/if_vlan_var.h =================================================================== RCS file: /cvs/src/sys/net/if_vlan_var.h,v retrieving revision 1.37 diff -u -p -r1.37 if_vlan_var.h --- sys/net/if_vlan_var.h 24 Jan 2017 10:08:30 -0000 1.37 +++ sys/net/if_vlan_var.h 9 Nov 2017 04:56:15 -0000 @@ -85,6 +85,10 @@ struct ifvlan { #define IFVF_LLADDR 0x02 /* don't inherit the parents mac */ struct mbuf *vlan_inject(struct mbuf *, uint16_t, uint16_t); +void vlan_input(struct mbuf_list *, struct ifnet *, struct mbuf *); +int vlan_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); + #endif /* _KERNEL */ #endif /* _NET_IF_VLAN_VAR_H_ */ Index: sys/net/ifq.c =================================================================== RCS file: /cvs/src/sys/net/ifq.c,v retrieving revision 1.12 diff -u -p -r1.12 ifq.c --- sys/net/ifq.c 2 Jun 2017 00:07:12 -0000 1.12 +++ sys/net/ifq.c 9 Nov 2017 04:56:15 -0000 @@ -16,6 +16,8 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ +#include "bpfilter.h" + #include #include #include @@ -25,6 +27,10 @@ #include #include +#if NBPFILTER > 0 +#include +#endif + /* * priq glue */ @@ -64,6 +70,7 @@ struct priq { void ifq_start_task(void *); void ifq_restart_task(void *); void ifq_barrier_task(void *); +void ifq_bundle_task(void *); #define TASK_ONQUEUE 0x1 @@ -140,6 +147,9 @@ ifq_barrier(struct ifqueue *ifq) /* this should only be called from converted drivers */ KASSERT(ISSET(ifq->ifq_if->if_xflags, IFXF_MPSAFE)); + if (!task_del(ifq->ifq_softnet, &ifq->ifq_bundle)) + taskq_barrier(ifq->ifq_softnet); + if (ifq->ifq_serializer == NULL) return; @@ -160,6 +170,14 @@ ifq_barrier_task(void *p) wakeup_one(notdone); } +void +ifq_bundle_task(void *p) +{ + struct ifqueue *ifq = p; + + ifq_start(ifq); +} + /* * ifqueue mbuf queue API */ @@ -168,6 +186,7 @@ void ifq_init(struct ifqueue *ifq, struct ifnet *ifp, unsigned int idx) { ifq->ifq_if = ifp; + ifq->ifq_softnet = net_tq(ifp->if_index + idx); ifq->ifq_softc = NULL; mtx_init(&ifq->ifq_mtx, IPL_NET); @@ -193,6 +212,8 @@ ifq_init(struct ifqueue *ifq, struct ifn task_set(&ifq->ifq_start, ifq_start_task, ifq); task_set(&ifq->ifq_restart, ifq_restart_task, ifq); + task_set(&ifq->ifq_bundle, ifq_bundle_task, ifq); + if (ifq->ifq_maxlen == 0) ifq_set_maxlen(ifq, IFQ_MAXLEN); @@ -248,6 +269,39 @@ ifq_destroy(struct ifqueue *ifq) ml_purge(&ml); } +static inline void +ifq_enter(struct ifqueue *ifq) +{ + mtx_enter(&ifq->ifq_mtx); +} + +static inline void +ifq_leave(struct ifqueue *ifq) +{ + struct mbuf *m; + unsigned int drops; + + drops = ml_len(&ifq->ifq_free); + if (drops != 0) { + m = ml_dechain(&ifq->ifq_free); + ifq->ifq_qdrops += drops; + } + + mtx_leave(&ifq->ifq_mtx); + + if (drops) + m_purge(m); +} + +static inline void +ifq_count_locked(struct ifqueue *ifq, const struct mbuf *m) +{ + ifq->ifq_packets++; + ifq->ifq_bytes += m->m_pkthdr.len; + if (ISSET(m->m_flags, M_MCAST)) + ifq->ifq_mcasts++; +} + int ifq_enqueue(struct ifqueue *ifq, struct mbuf *m) { @@ -274,36 +328,19 @@ ifq_enqueue(struct ifqueue *ifq, struct return (dm == m ? 
ENOBUFS : 0); } -static inline void -ifq_deq_enter(struct ifqueue *ifq) -{ - mtx_enter(&ifq->ifq_mtx); -} - -static inline void -ifq_deq_leave(struct ifqueue *ifq) -{ - struct mbuf_list ml; - - ml = ifq->ifq_free; - ml_init(&ifq->ifq_free); - - mtx_leave(&ifq->ifq_mtx); - - if (!ml_empty(&ml)) - ml_purge(&ml); -} - struct mbuf * ifq_deq_begin(struct ifqueue *ifq) { struct mbuf *m = NULL; void *cookie; - ifq_deq_enter(ifq); - if (ifq->ifq_len == 0 || - (m = ifq->ifq_ops->ifqop_deq_begin(ifq, &cookie)) == NULL) { - ifq_deq_leave(ifq); + if (ifq_empty(ifq)) + return (NULL); + + ifq_enter(ifq); + m = ifq->ifq_ops->ifqop_deq_begin(ifq, &cookie); + if (m == NULL) { + ifq_leave(ifq); return (NULL); } @@ -317,32 +354,38 @@ ifq_deq_commit(struct ifqueue *ifq, stru { void *cookie; - KASSERT(m != NULL); + KDASSERT(m != NULL); cookie = m->m_pkthdr.ph_cookie; ifq->ifq_ops->ifqop_deq_commit(ifq, m, cookie); ifq->ifq_len--; - ifq_deq_leave(ifq); + ifq_leave(ifq); } void ifq_deq_rollback(struct ifqueue *ifq, struct mbuf *m) { - KASSERT(m != NULL); + KDASSERT(m != NULL); - ifq_deq_leave(ifq); + ifq_leave(ifq); } struct mbuf * ifq_dequeue(struct ifqueue *ifq) { struct mbuf *m; + void *cookie; - m = ifq_deq_begin(ifq); - if (m == NULL) + if (ifq_empty(ifq)) return (NULL); - ifq_deq_commit(ifq, m); + ifq_enter(ifq); + m = ifq->ifq_ops->ifqop_deq_begin(ifq, &cookie); + if (m != NULL) { + ifq->ifq_ops->ifqop_deq_commit(ifq, m, cookie); + ifq->ifq_len--; + } + ifq_leave(ifq); return (m); } @@ -360,13 +403,21 @@ ifq_purge(struct ifqueue *ifq) ifq->ifq_qdrops += rv; mtx_leave(&ifq->ifq_mtx); - KASSERT(rv == ml_len(&ml)); + KDASSERT(rv == ml_len(&ml)); ml_purge(&ml); return (rv); } +void +ifq_count(struct ifqueue *ifq, const struct mbuf *m) +{ + mtx_enter(&ifq->ifq_mtx); + ifq_count_locked(ifq, m); + mtx_leave(&ifq->ifq_mtx); +} + void * ifq_q_enter(struct ifqueue *ifq, const struct ifq_ops *ops) { @@ -382,31 +433,153 @@ ifq_q_leave(struct ifqueue *ifq, void *q) void ifq_q_leave(struct ifqueue *ifq, void *q) { - KASSERT(q == ifq->ifq_q); + KDASSERT(q == ifq->ifq_q); mtx_leave(&ifq->ifq_mtx); } void ifq_mfreem(struct ifqueue *ifq, struct mbuf *m) { - MUTEX_ASSERT_LOCKED(&ifq->ifq_mtx); - ifq->ifq_len--; - ifq->ifq_qdrops++; ml_enqueue(&ifq->ifq_free, m); } void ifq_mfreeml(struct ifqueue *ifq, struct mbuf_list *ml) { - MUTEX_ASSERT_LOCKED(&ifq->ifq_mtx); - ifq->ifq_len -= ml_len(ml); - ifq->ifq_qdrops += ml_len(ml); ml_enlist(&ifq->ifq_free, ml); } /* + * ifiq + */ + +static void ifiq_process(void *); + +void +ifiq_init(struct ifiqueue *ifiq, struct ifnet *ifp, unsigned int idx) +{ + ifiq->ifiq_if = ifp; + ifiq->ifiq_softnet = net_tq(ifp->if_index + idx); + ifiq->ifiq_softc = NULL; + + mtx_init(&ifiq->ifiq_mtx, IPL_NET); + ml_init(&ifiq->ifiq_ml); + task_set(&ifiq->ifiq_task, ifiq_process, ifiq); + + ifiq->ifiq_packets = 0; + ifiq->ifiq_bytes = 0; + ifiq->ifiq_qdrops = 0; + ifiq->ifiq_errors = 0; + + ifiq->ifiq_idx = idx; +} + +void +ifiq_destroy(struct ifiqueue *ifiq) +{ + /* don't need to lock because this is the last use of the ifiq */ + ml_purge(&ifiq->ifiq_ml); +} + +int +ifiq_input(struct ifiqueue *ifiq, struct mbuf_list *ml, unsigned int cwm) +{ + struct ifnet *ifp = ifiq->ifiq_if; + struct mbuf *m; + uint64_t bytes = 0; +#if NBPFILTER > 0 + caddr_t if_bpf; +#endif + int rv; + + if (ml_empty(ml)) + return (0); + + MBUF_LIST_FOREACH(ml, m) { + m->m_pkthdr.ph_ifidx = ifp->if_index; + m->m_pkthdr.ph_rtableid = ifp->if_rdomain; + bytes += m->m_pkthdr.len; + } + + ifiq_count(ifiq, 
ml_len(ml), bytes); + +#if NBPFILTER > 0 + if_bpf = ifp->if_bpf; + if (if_bpf) { + struct mbuf_list ml0 = *ml; + + ml_init(ml); + + while ((m = ml_dequeue(&ml0)) != NULL) { + if (bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_IN)) + m_freem(m); + else + ml_enqueue(ml, m); + } + + if (ml_empty(ml)) + return (0); + } +#endif + + if (ifiq_len(ifiq) >= cwm * 5) { + IFIQ_ADD(ifiq, ifiq_qdrops, ml_purge(ml)); + return (1); + } + + rv = (ifiq_len(ifiq) >= cwm * 3); + + mtx_enter(&ifiq->ifiq_mtx); + ml_enlist(&ifiq->ifiq_ml, ml); + mtx_leave(&ifiq->ifiq_mtx); + + task_add(ifiq->ifiq_softnet, &ifiq->ifiq_task); + + return (rv); +} + +void +ifiq_barrier(struct ifiqueue *ifiq) +{ + if (!task_del(ifiq->ifiq_softnet, &ifiq->ifiq_task)) + taskq_barrier(ifiq->ifiq_softnet); +} + +int +ifiq_enqueue(struct ifiqueue *ifiq, struct mbuf *m) +{ + /* this can be called from anywhere at any time, so must lock */ + + mtx_enter(&ifiq->ifiq_mtx); + ml_enqueue(&ifiq->ifiq_ml, m); + mtx_leave(&ifiq->ifiq_mtx); + + task_add(ifiq->ifiq_softnet, &ifiq->ifiq_task); + + return (0); +} + +static void +ifiq_process(void *arg) +{ + struct ifiqueue *ifiq = arg; + struct mbuf_list ml; + + if (ifiq_empty(ifiq)) + return; + + mtx_enter(&ifiq->ifiq_mtx); + ml = ifiq->ifiq_ml; + ml_init(&ifiq->ifiq_ml); + mtx_leave(&ifiq->ifiq_mtx); + + if_input_process(&ml); +} + +/* * priq implementation */ @@ -444,21 +617,21 @@ priq_enq(struct ifqueue *ifq, struct mbu { struct priq *pq; struct mbuf_list *pl; - struct mbuf *n = NULL; + struct mbuf *dm = NULL; unsigned int prio; pq = ifq->ifq_q; - KASSERT(m->m_pkthdr.pf.prio <= IFQ_MAXPRIO); + KDASSERT(m->m_pkthdr.pf.prio <= IFQ_MAXPRIO); /* Find a lower priority queue to drop from */ if (ifq_len(ifq) >= ifq->ifq_maxlen) { for (prio = 0; prio < m->m_pkthdr.pf.prio; prio++) { pl = &pq->pq_lists[prio]; - if (ml_len(pl) > 0) { - n = ml_dequeue(pl); + dm = ml_dequeue(pl); + if (dm != NULL) goto enqueue; - } } + /* * There's no lower priority queue that we can * drop from so don't enqueue this one. 
@@ -470,7 +643,7 @@ priq_enq(struct ifqueue *ifq, struct mbu pl = &pq->pq_lists[m->m_pkthdr.pf.prio]; ml_enqueue(pl, m); - return (n); + return (dm); } struct mbuf * @@ -498,7 +671,7 @@ priq_deq_commit(struct ifqueue *ifq, str { struct mbuf_list *pl = cookie; - KASSERT(MBUF_LIST_FIRST(pl) == m); + KDASSERT(MBUF_LIST_FIRST(pl) == m); ml_dequeue(pl); } Index: sys/net/ifq.h =================================================================== RCS file: /cvs/src/sys/net/ifq.h,v retrieving revision 1.13 diff -u -p -r1.13 ifq.h --- sys/net/ifq.h 3 May 2017 20:55:29 -0000 1.13 +++ sys/net/ifq.h 9 Nov 2017 04:56:15 -0000 @@ -25,6 +25,7 @@ struct ifq_ops; struct ifqueue { struct ifnet *ifq_if; + struct taskq *ifq_softnet; union { void *_ifq_softc; /* @@ -62,11 +63,41 @@ struct ifqueue { struct task ifq_start; struct task ifq_restart; + struct task ifq_bundle; + /* properties */ unsigned int ifq_maxlen; unsigned int ifq_idx; }; +struct ifiqueue { + struct ifnet *ifiq_if; + struct taskq *ifiq_softnet; + union { + void *_ifiq_softc; + struct ifiqueue *_ifiq_ifiqs[1]; + } _ifiq_ptr; +#define ifiq_softc _ifiq_ptr._ifiq_softc +#define ifiq_ifiqs _ifiq_ptr._ifiq_ifiqs + + struct mutex ifiq_mtx; + struct mbuf_list ifiq_ml; + struct task ifiq_task; + + /* counters */ + volatile unsigned int ifiq_gen; + + uint64_t ifiq_packets; + uint64_t ifiq_bytes; + uint64_t ifiq_qdrops; + uint64_t ifiq_errors; + uint64_t ifiq_mcasts; + uint64_t ifiq_noproto; + + /* properties */ + unsigned int ifiq_idx; +}; + #ifdef _KERNEL #define IFQ_MAXLEN 256 @@ -385,6 +416,7 @@ struct mbuf *ifq_dequeue(struct ifqueue void ifq_mfreem(struct ifqueue *, struct mbuf *); void ifq_mfreeml(struct ifqueue *, struct mbuf_list *); unsigned int ifq_purge(struct ifqueue *); +void ifq_count(struct ifqueue *, const struct mbuf *); void *ifq_q_enter(struct ifqueue *, const struct ifq_ops *); void ifq_q_leave(struct ifqueue *, void *); void ifq_serialize(struct ifqueue *, struct task *); @@ -394,6 +426,7 @@ void ifq_barrier(struct ifqueue *); #define ifq_len(_ifq) ((_ifq)->ifq_len) #define ifq_empty(_ifq) (ifq_len(_ifq) == 0) #define ifq_set_maxlen(_ifq, _l) ((_ifq)->ifq_maxlen = (_l)) +#define ifq_priq(_ifq) ((_ifq)->ifq_ops == ifq_priq_ops) static inline void ifq_set_oactive(struct ifqueue *ifq) @@ -434,6 +467,64 @@ ifq_idx(struct ifqueue *ifq, unsigned in #define IFQ_ASSERT_SERIALIZED(_ifq) KASSERT(ifq_is_serialized(_ifq)) extern const struct ifq_ops * const ifq_priq_ops; + +/* ifiq */ + +void ifiq_init(struct ifiqueue *, struct ifnet *, unsigned int); +int ifiq_input(struct ifiqueue *, struct mbuf_list *, + unsigned int); +int ifiq_enqueue(struct ifiqueue *, struct mbuf *); + +#define ifiq_len(_ifiq) ml_len(&(_ifiq)->ifiq_ml) +#define ifiq_empty(_ifiq) ml_empty(&(_ifiq)->ifiq_ml) + +#include + +struct ifiq_ref { + unsigned int gen; +}; + +static inline void +ifiq_enter(struct ifiq_ref *ref, struct ifiqueue *ifiq) +{ + ref->gen = ++ifiq->ifiq_gen; + membar_producer(); +} + +static inline void +ifiq_leave(struct ifiq_ref *ref, struct ifiqueue *ifiq) +{ + membar_producer(); + ifiq->ifiq_gen = ++ref->gen; +} + +#define IFIQ_INC(_ifiq, _counter) do { \ + struct ifiq_ref ref; \ + ifiq_enter(&ref, (_ifiq)); \ + (_ifiq)->_counter++; \ + ifiq_leave(&ref, (_ifiq)); \ +} while (0) + +#define IFIQ_ADD(_ifiq, _counter, _v) do { \ + struct ifiq_ref ref; \ + ifiq_enter(&ref, (_ifiq)); \ + (_ifiq)->_counter += (_v); \ + ifiq_leave(&ref, (_ifiq)); \ +} while (0) + +#define IFIQ_PKTS(_ifiq, _p, _pkts, _b, _bytes) do { \ + struct ifiq_ref ref; \ + ifiq_enter(&ref, 
(_ifiq)); \ + (_ifiq)->_p += (_pkts); \ + (_ifiq)->_b += (_bytes); \ + ifiq_leave(&ref, (_ifiq)); \ +} while (0) + +static inline void +ifiq_count(struct ifiqueue *ifiq, uint64_t pkts, uint64_t bytes) +{ + IFIQ_PKTS(ifiq, ifiq_packets, pkts, ifiq_bytes, bytes); +} #endif /* _KERNEL */ Index: sys/net/pf.c =================================================================== RCS file: /cvs/src/sys/net/pf.c,v retrieving revision 1.1043 diff -u -p -r1.1043 pf.c --- sys/net/pf.c 31 Oct 2017 22:05:12 -0000 1.1043 +++ sys/net/pf.c 9 Nov 2017 04:56:16 -0000 @@ -7089,10 +7089,6 @@ pf_ouraddr(struct mbuf *m) if (sk != NULL) { if (sk->inp != NULL) return (1); - - /* If we have linked state keys it is certainly forwarded. */ - if (sk->reverse != NULL) - return (0); } return (-1); Index: sys/net/trunklacp.c =================================================================== RCS file: /cvs/src/sys/net/trunklacp.c,v retrieving revision 1.29 diff -u -p -r1.29 trunklacp.c --- sys/net/trunklacp.c 24 Jan 2017 10:08:30 -0000 1.29 +++ sys/net/trunklacp.c 9 Nov 2017 04:56:16 -0000 @@ -231,16 +231,14 @@ lacp_input(struct trunk_port *tp, struct struct ether_header *eh; u_int8_t subtype; - eh = mtod(m, struct ether_header *); + if (m->m_len < sizeof(*eh)) + goto drop; - if (ntohs(eh->ether_type) == ETHERTYPE_SLOW) { -#if NBPFILTER > 0 - if (tp->tp_if->if_bpf) - bpf_mtap_ether(tp->tp_if->if_bpf, m, BPF_DIRECTION_IN); -#endif + eh = mtod(m, struct ether_header *); + if (eh->ether_type == ntohs(ETHERTYPE_SLOW)) { if (m->m_pkthdr.len < (sizeof(*eh) + sizeof(subtype))) - return (-1); + goto drop; m_copydata(m, sizeof(*eh), sizeof(subtype), &subtype); switch (subtype) { @@ -257,14 +255,16 @@ lacp_input(struct trunk_port *tp, struct * free and return. */ /* This port is joined to the active aggregator */ - if ((lp->lp_state & LACP_STATE_COLLECTING) == 0 || - la == NULL || la != lsc->lsc_active_aggregator) { - m_freem(m); - return (-1); - } + if (!ISSET(lp->lp_state, LACP_STATE_COLLECTING) || + la == NULL || la != lsc->lsc_active_aggregator) + goto drop; /* Not a subtype we are interested in */ return (0); + +drop: + m_freem(m); + return (-1); } void Index: sys/netinet/if_ether.h =================================================================== RCS file: /cvs/src/sys/netinet/if_ether.h,v retrieving revision 1.73 diff -u -p -r1.73 if_ether.h --- sys/netinet/if_ether.h 29 Nov 2016 10:09:57 -0000 1.73 +++ sys/netinet/if_ether.h 9 Nov 2017 04:56:16 -0000 @@ -92,6 +92,11 @@ struct ether_vlan_header { u_int16_t evl_proto; }; +struct ether_vlan_shim { + u_int16_t evl_tag; + u_int16_t evl_proto; +}; + #define EVL_VLID_MASK 0xFFF #define EVL_VLID_NULL 0x000 /* 0x000 and 0xfff are reserved */ @@ -239,7 +244,9 @@ int ether_multiaddr(struct sockaddr *, u void ether_ifattach(struct ifnet *); void ether_ifdetach(struct ifnet *); int ether_ioctl(struct ifnet *, struct arpcom *, u_long, caddr_t); -int ether_input(struct ifnet *, struct mbuf *, void *); +void ether_input(struct mbuf_list *, struct ifnet *, struct mbuf *); +int ether_resolve(struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *, struct ether_header *); int ether_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); void ether_rtrequest(struct ifnet *, int, struct rtentry *); Index: sys/netinet/ip_carp.c =================================================================== RCS file: /cvs/src/sys/netinet/ip_carp.c,v retrieving revision 1.317 diff -u -p -r1.317 ip_carp.c --- sys/netinet/ip_carp.c 16 Oct 2017 13:20:20 -0000 1.317 +++ 
sys/netinet/ip_carp.c 9 Nov 2017 04:56:16 -0000 @@ -211,7 +211,6 @@ void carp_hmac_generate(struct carp_vhos unsigned char *, u_int8_t); int carp_hmac_verify(struct carp_vhost_entry *, u_int32_t *, unsigned char *); -int carp_input(struct ifnet *, struct mbuf *, void *); void carp_proto_input_c(struct ifnet *, struct mbuf *, struct carp_header *, int, sa_family_t); int carp_proto_input_if(struct ifnet *, struct mbuf **, int *, int); @@ -931,9 +930,6 @@ carpdetach(struct carp_softc *sc) cif = (struct carp_if *)ifp0->if_carp; - /* Restore previous input handler. */ - if_ih_remove(ifp0, carp_input, cif); - if (sc->lh_cookie != NULL) hook_disestablish(ifp0->if_linkstatehooks, sc->lh_cookie); @@ -1402,30 +1398,20 @@ carp_vhe_match(struct carp_softc *sc, ui } int -carp_input(struct ifnet *ifp0, struct mbuf *m, void *cookie) +carp_input(struct mbuf_list *ml, struct ifnet *ifp0, struct mbuf *m) { struct ether_header *eh; - struct mbuf_list ml = MBUF_LIST_INITIALIZER(); struct carp_if *cif; struct carp_softc *sc; struct srp_ref sr; -#if NVLAN > 0 - /* - * If the underlying interface removed the VLAN header itself, - * it's not for us. - */ - if (ISSET(m->m_flags, M_VLANTAG)) - return (0); -#endif + NET_ASSERT_LOCKED(); eh = mtod(m, struct ether_header *); - cif = (struct carp_if *)cookie; + cif = (struct carp_if *)ifp0->if_carp; KASSERT(cif == (struct carp_if *)ifp0->if_carp); SRPL_FOREACH(sc, &sr, &cif->vhif_vrs, sc_list) { - if ((sc->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) != - (IFF_UP|IFF_RUNNING)) + if (!ISSET(sc->sc_if.if_flags, IFF_RUNNING)) continue; if (carp_vhe_match(sc, eh->ether_dhost)) { @@ -1462,25 +1448,21 @@ carp_input(struct ifnet *ifp0, struct mb SRPL_FOREACH(sc, &sr, &cif->vhif_vrs, sc_list) { struct mbuf *m0; - if (!(sc->sc_if.if_flags & IFF_UP)) + if (!ISSET(sc->sc_if.if_flags, IFF_RUNNING)) continue; m0 = m_dup_pkt(m, ETHER_ALIGN, M_DONTWAIT); if (m0 == NULL) continue; - ml_init(&ml); - ml_enqueue(&ml, m0); - - if_input(&sc->sc_if, &ml); + if_input_m(ml, &sc->sc_if, m0); } SRPL_LEAVE(&sr); return (0); } - ml_enqueue(&ml, m); - if_input(&sc->sc_if, &ml); + if_input_m(ml, &sc->sc_if, m); out: SRPL_LEAVE(&sr); @@ -1780,9 +1762,6 @@ carp_set_ifp(struct carp_softc *sc, stru sc->lh_cookie = hook_establish(ifp0->if_linkstatehooks, 1, carp_carpdev_state, ifp0); - - /* Change input handler of the physical interface. 
*/ - if_ih_insert(ifp0, carp_input, cif); carp_carpdev_state(ifp0); Index: sys/netinet/ip_carp.h =================================================================== RCS file: /cvs/src/sys/netinet/ip_carp.h,v retrieving revision 1.43 diff -u -p -r1.43 ip_carp.h --- sys/netinet/ip_carp.h 30 May 2017 12:09:27 -0000 1.43 +++ sys/netinet/ip_carp.h 9 Nov 2017 04:56:16 -0000 @@ -201,6 +201,7 @@ int carp6_proto_input(struct mbuf **, int carp_iamatch(struct ifnet *); int carp_iamatch6(struct ifnet *); struct ifnet *carp_ourether(void *, u_int8_t *); +int carp_input(struct mbuf_list *, struct ifnet *, struct mbuf *); int carp_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); int carp_sysctl(int *, u_int, void *, size_t *, void *, size_t); Index: sys/sys/_endian.h =================================================================== RCS file: /cvs/src/sys/sys/_endian.h,v retrieving revision 1.2 diff -u -p -r1.2 _endian.h --- sys/sys/_endian.h 21 Apr 2017 19:04:22 -0000 1.2 +++ sys/sys/_endian.h 9 Nov 2017 04:56:16 -0000 @@ -43,13 +43,14 @@ #define _BIG_ENDIAN 4321 #define _PDP_ENDIAN 3412 +#define __swap16op(_x) \ + (__uint16_t)(((_x) & 0xff) << 8 | ((_x) & 0xff00) >> 8) + #ifdef __GNUC__ #define __swap16gen(x) __statement({ \ __uint16_t __swap16gen_x = (x); \ - \ - (__uint16_t)((__swap16gen_x & 0xff) << 8 | \ - (__swap16gen_x & 0xff00) >> 8); \ + __swap16op(__swap16gen_x); \ }) #define __swap32gen(x) __statement({ \ @@ -168,6 +169,7 @@ #define __htole64(x) ((__uint64_t)(x)) #ifdef _KERNEL +#define __HTON16(x) __swap16op(x) #ifdef __HAVE_MD_SWAPIO #define __bemtoh16(_x) __mswap16(_x) @@ -239,6 +241,11 @@ #define __htolem32(_x, _v) (*(__uint32_t *)(_x) = __htole32(_v)) #define __htolem64(_x, _v) (*(__uint64_t *)(_x) = __htole64(_v)) #endif + +#ifndef __HTON16 +#define __HTON16(x) ((__uint16_t)(x)) +#endif + #endif /* _KERNEL */ #endif /* _SYS__ENDIAN_H_ */ Index: sys/sys/endian.h =================================================================== RCS file: /cvs/src/sys/sys/endian.h,v retrieving revision 1.25 diff -u -p -r1.25 endian.h --- sys/sys/endian.h 21 Dec 2014 04:49:00 -0000 1.25 +++ sys/sys/endian.h 9 Nov 2017 04:56:16 -0000 @@ -119,6 +119,8 @@ #define htolem16 __htolem16 #define htolem32 __htolem32 #define htolem64 __htolem64 + +#define HTON16 __HTON16 #endif /* _KERNEL */ #endif /* _SYS_ENDIAN_H_ */ Index: sys/sys/sockio.h =================================================================== RCS file: /cvs/src/sys/sys/sockio.h,v retrieving revision 1.72 diff -u -p -r1.72 sockio.h --- sys/sys/sockio.h 24 Oct 2017 09:36:13 -0000 1.72 +++ sys/sys/sockio.h 9 Nov 2017 04:56:16 -0000 @@ -59,6 +59,7 @@ #define SIOCAIFADDR _IOW('i', 26, struct ifaliasreq)/* add/chg IF alias */ #define SIOCGIFDATA _IOWR('i', 27, struct ifreq) /* get if_data */ #define SIOCSIFLLADDR _IOW('i', 31, struct ifreq) /* set link level addr */ +#define SIOCDIFLLADDR _IOW('i', 32, struct ifreq) /* reset lladdr */ #define SIOCADDMULTI _IOW('i', 49, struct ifreq) /* add m'cast addr */ #define SIOCDELMULTI _IOW('i', 50, struct ifreq) /* del m'cast addr */ Index: sys/sys/task.h =================================================================== RCS file: /cvs/src/sys/sys/task.h,v retrieving revision 1.11 diff -u -p -r1.11 task.h --- sys/sys/task.h 7 Jun 2016 07:53:33 -0000 1.11 +++ sys/sys/task.h 9 Nov 2017 04:56:16 -0000 @@ -43,6 +43,7 @@ extern struct taskq *const systqmp; struct taskq *taskq_create(const char *, unsigned int, int, unsigned int); void taskq_destroy(struct taskq *); +void 
taskq_barrier(struct taskq *); void task_set(struct task *, void (*)(void *), void *); int task_add(struct taskq *, struct task *);
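The ether_resolve()/ether_output() split above is what lets stacked drivers build the ethernet header once and then skip a second resolution on the parent interface. Below is a minimal sketch of the pseudo_AF_HDRCMPLT handoff that vlan_output() and trunk_output() rely on; ifp0 (the parent interface) and the local variable names are illustrative, not part of the diff:

	struct sockaddr_storage ss;
	struct sockaddr *hdst = (struct sockaddr *)&ss;
	struct ether_header *eh = (struct ether_header *)hdst->sa_data;
	int error;

	/* resolve the header for our own interface... */
	error = ether_resolve(ifp, m, dst, rt, eh);
	if (error)
		return (error == EAGAIN ? 0 : error);

	/*
	 * ...then hand the completed header to the parent.
	 * pseudo_AF_HDRCMPLT makes the parent's ether_resolve() take
	 * the whole header from the sockaddr instead of resolving again.
	 */
	hdst->sa_family = pseudo_AF_HDRCMPLT;
	return (ether_output(ifp0, m, hdst, NULL));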
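When the parent interface cannot insert the tag in hardware, vlan_output() falls back to software encapsulation using the new struct ether_vlan_shim. A sketch of that path, assuming eh is the stack header already filled by ether_resolve() and tag already carries the priority bits:

	struct ether_vlan_shim *evl;

	/* push a 4-byte shim in front of the payload */
	M_PREPEND(m, sizeof(*evl), M_DONTWAIT);
	if (m == NULL)
		return (ENOBUFS);

	evl = mtod(m, struct ether_vlan_shim *);
	evl->evl_tag = htons(tag);		/* prio + vlan id */
	evl->evl_proto = eh->ether_type;	/* inner protocol */

	/* outer type: ETHERTYPE_VLAN, or ETHERTYPE_QINQ for svlan */
	eh->ether_type = htons(ETHERTYPE_VLAN);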
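The HTON16() macro added to endian.h swaps the byte order of a constant at compile time. That is what lets the reworked ether_input() switch directly on the wire-order ether_type instead of paying for an ntohs() on every packet; a sketch of the idiom:

	uint16_t etype = eh->ether_type;	/* still in network order */

	switch (etype) {
	case HTON16(ETHERTYPE_IP):	/* constant swapped at build time */
		ipv4_input(ifp, m);
		return;
	case HTON16(ETHERTYPE_ARP):
		arpinput(ifp, m);
		return;
	}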
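On the receive side, the diff points toward drivers feeding the new per-interface input queues with ifiq_input() and treating its return value as backpressure. A sketch of a driver rx completion path under those assumptions; sc_rxr and rxeof() stand in for real driver details:

	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
	struct mbuf *m;

	/* harvest the completed hardware descriptors */
	while ((m = rxeof(sc)) != NULL)		/* hypothetical hw dequeue */
		ml_enqueue(&ml, m);

	/*
	 * a nonzero return means the input queue is backed up past
	 * the current watermark, so shrink the rx ring.
	 */
	if (ifiq_input(&ifp->if_iq, &ml, if_rxr_cwm(&sc->sc_rxr)))
		if_rxr_livelocked(&sc->sc_rxr);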
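The ifiq counters are updated under a generation number (ifiq_enter()/ifiq_leave()) instead of a mutex: ifiq_gen is odd while an update is in flight. A reader wanting a consistent snapshot would retry in the usual seqlock style; a sketch, assuming membar_consumer() as the read side barrier:

	uint64_t packets, bytes;
	unsigned int gen;

	do {
		/* odd means a writer is between enter and leave */
		while ((gen = ifiq->ifiq_gen) & 1)
			;

		membar_consumer();
		packets = ifiq->ifiq_packets;
		bytes = ifiq->ifiq_bytes;
		membar_consumer();
	} while (gen != ifiq->ifiq_gen);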
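Finally, the taskq_barrier() added to task.h is what makes the task_del() then barrier teardown in ifq_barrier() and ifiq_barrier() safe. The same pattern generalizes to any driver deferring work to a single threaded taskq; a sketch with hypothetical softc names:

	/* stop new work */
	if (!task_del(sc->sc_tq, &sc->sc_task)) {
		/*
		 * the task was not on the queue: it is either idle or
		 * running right now. taskq_barrier() returns only once
		 * any currently running task has finished.
		 */
		taskq_barrier(sc->sc_tq);
	}

	/* sc_task can no longer be running; safe to free sc */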