Index: sys/net/if_tun.c
===================================================================
RCS file: /cvs/src/sys/net/if_tun.c,v
diff -u -p -r1.243 if_tun.c
--- sys/net/if_tun.c	16 Oct 2024 11:12:31 -0000	1.243
+++ sys/net/if_tun.c	5 Nov 2024 07:01:15 -0000
@@ -88,6 +88,7 @@ struct tun_softc {
 	struct sigio_ref	sc_sigio;	/* async I/O registration */
 	unsigned int		sc_flags;	/* misc flags */
 #define TUN_DEAD			(1 << 16)
+#define TUN_HDR				(1 << 17) /* read/write carry a struct tun_hdr */
 
 	dev_t			sc_dev;
 	struct refcnt		sc_refs;
@@ -104,6 +105,13 @@ int tundebug = TUN_DEBUG;
 /* Pretend that these IFF flags are changeable by TUNSIFINFO */
 #define TUN_IFF_FLAGS (IFF_POINTOPOINT|IFF_MULTICAST|IFF_BROADCAST)
 
+#define TUN_IF_CAPS ( /* if_capabilities bits TUNSCAP may set */ \
+	IFCAP_CSUM_IPv4 | \
+	IFCAP_CSUM_TCPv4|IFCAP_CSUM_UDPv4|IFCAP_CSUM_TCPv6|IFCAP_CSUM_UDPv6 | \
+	IFCAP_VLAN_MTU|IFCAP_VLAN_HWTAGGING|IFCAP_VLAN_HWOFFLOAD | \
+	IFCAP_TSOv4|IFCAP_TSOv6|IFCAP_LRO \
+)
+
 void	tunattach(int);
 
 int	tun_dev_open(dev_t, const struct if_clone *, int, struct proc *);
@@ -627,6 +635,47 @@ tapioctl(dev_t dev, u_long cmd, caddr_t
 	return (tun_dev_ioctl(dev, cmd, data));
 }
 
+static int /* TUNSCAP: install cap and turn on TUN_HDR; 0 or EINVAL */
+tun_set_capabilities(struct tun_softc *sc, const struct tun_capabilities *cap)
+{
+	if (ISSET(cap->tun_if_capabilities, ~TUN_IF_CAPS)) /* bit outside the mask */
+		return (EINVAL);
+
+	NET_LOCK();
+	CLR(sc->sc_if.if_capabilities, TUN_IF_CAPS); /* replace, do not merge */
+	SET(sc->sc_if.if_capabilities, cap->tun_if_capabilities);
+	SET(sc->sc_flags, TUN_HDR); /* set even when cap is 0 */
+	NET_UNLOCK();
+	return (0);
+}
+
+static int /* TUNGCAP: report the TUN_IF_CAPS bits currently set */
+tun_get_capabilities(struct tun_softc *sc, struct tun_capabilities *cap)
+{
+	int error = 0;
+
+	NET_LOCK_SHARED();
+	if (ISSET(sc->sc_flags, TUN_HDR)) {
+		cap->tun_if_capabilities =
+		    (sc->sc_if.if_capabilities & TUN_IF_CAPS);
+	} else
+		error = ENODEV; /* TUN_HDR is not set; cap is left unwritten */
+	NET_UNLOCK_SHARED();
+
+	return (error);
+}
+
+static int /* TUNDCAP: clear TUN_HDR and every TUN_IF_CAPS bit; always 0 */
+tun_del_capabilities(struct tun_softc *sc)
+{
+	NET_LOCK();
+	CLR(sc->sc_flags, TUN_HDR);
+	CLR(sc->sc_if.if_capabilities, TUN_IF_CAPS);
+	NET_UNLOCK();
+
+	return (0);
+}
+
 int
 tun_dev_ioctl(dev_t dev, u_long cmd, void *data)
 {
@@ 
-678,6 +727,18 @@ tun_dev_ioctl(dev_t dev, u_long cmd, voi } break; + case TUNSCAP: + error = tun_set_capabilities(sc, + (const struct tun_capabilities *)data); + break; + case TUNGCAP: + error = tun_get_capabilities(sc, + (struct tun_capabilities *)data); + break; + case TUNDCAP: + error = tun_del_capabilities(sc); + break; + case FIONBIO: break; case FIOASYNC: @@ -745,6 +806,7 @@ tun_dev_read(dev_t dev, struct uio *uio, struct tun_softc *sc; struct ifnet *ifp; struct mbuf *m, *m0; + size_t len; int error = 0; sc = tun_get(dev); @@ -760,12 +822,49 @@ tun_dev_read(dev_t dev, struct uio *uio, #if NBPFILTER > 0 if (ifp->if_bpf) - bpf_mtap(ifp->if_bpf, m0, BPF_DIRECTION_OUT); + (*ifp->if_bpf_mtap)(ifp->if_bpf, m0, BPF_DIRECTION_OUT); #endif + if (ISSET(sc->sc_flags, TUN_HDR)) { + struct tun_hdr th; + + KASSERT(ISSET(m0->m_flags, M_PKTHDR)); + + th.th_flags = 0; + if (ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) + SET(th.th_flags, TUN_H_IPV4_CSUM); + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) + SET(th.th_flags, TUN_H_TCP_CSUM); + if (ISSET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) + SET(th.th_flags, TUN_H_UDP_CSUM); + if (ISSET(m0->m_pkthdr.csum_flags, M_ICMP_CSUM_OUT)) + SET(th.th_flags, TUN_H_ICMP_CSUM); + + th.th_pad = 0; + + th.th_vtag = 0; + if (ISSET(m0->m_flags, M_VLANTAG)) { + SET(th.th_flags, TUN_H_VTAG); + th.th_vtag = m0->m_pkthdr.ether_vtag; + } + + th.th_mss = 0; + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO)) { + SET(th.th_flags, TUN_H_TCP_MSS); + th.th_mss = m0->m_pkthdr.ph_mss; + } + + len = ulmin(uio->uio_resid, sizeof(th)); + if (len > 0) { + error = uiomove(&th, len, uio); + if (error != 0) + goto free; + } + } + m = m0; while (uio->uio_resid > 0) { - size_t len = ulmin(uio->uio_resid, m->m_len); + len = ulmin(uio->uio_resid, m->m_len); if (len > 0) { error = uiomove(mtod(m, void *), len, uio); if (error != 0) @@ -777,6 +876,7 @@ tun_dev_read(dev_t dev, struct uio *uio, break; } +free: m_freem(m0); put: @@ -807,6 +907,8 @@ 
tun_dev_write(dev_t dev, struct uio *uio struct mbuf *m0; int error = 0; size_t mlen; + size_t hlen; + struct tun_hdr th; sc = tun_get(dev); if (sc == NULL) @@ -814,8 +916,11 @@ tun_dev_write(dev_t dev, struct uio *uio ifp = &sc->sc_if; - if (uio->uio_resid < ifp->if_hdrlen || - uio->uio_resid > (ifp->if_hdrlen + ifp->if_hardmtu)) { + hlen = ifp->if_hdrlen; + if (ISSET(sc->sc_flags, TUN_HDR)) + hlen += sizeof(th); + if (uio->uio_resid < hlen || + uio->uio_resid > (hlen + ifp->if_hardmtu)) { error = EMSGSIZE; goto put; } @@ -839,6 +944,39 @@ tun_dev_write(dev_t dev, struct uio *uio m_align(m0, mlen); m0->m_pkthdr.len = m0->m_len = mlen; m_adj(m0, align); + + if (ISSET(sc->sc_flags, TUN_HDR)) { + error = uiomove(&th, sizeof(th), uio); + if (error != 0) + goto drop; + + if (ISSET(th.th_flags, TUN_H_IPV4_CSUM)) { + SET(m0->m_pkthdr.csum_flags, + M_IPV4_CSUM_OUT | M_IPV4_CSUM_IN_OK); + } + if (ISSET(th.th_flags, TUN_H_TCP_CSUM)) { + SET(m0->m_pkthdr.csum_flags, + M_TCP_CSUM_OUT | M_TCP_CSUM_IN_OK); + } + if (ISSET(th.th_flags, TUN_H_UDP_CSUM)) { + SET(m0->m_pkthdr.csum_flags, + M_UDP_CSUM_OUT | M_UDP_CSUM_IN_OK); + } + if (ISSET(th.th_flags, TUN_H_ICMP_CSUM)) { + SET(m0->m_pkthdr.csum_flags, + M_ICMP_CSUM_OUT | M_ICMP_CSUM_IN_OK); + } + + if (ISSET(th.th_flags, TUN_H_VTAG)) { + SET(m0->m_flags, M_VLANTAG); + m0->m_pkthdr.ether_vtag = th.th_vtag; + } + + if (ISSET(th.th_flags, TUN_H_TCP_MSS)) { + SET(m0->m_pkthdr.csum_flags, M_TCP_TSO); + m0->m_pkthdr.ph_mss = th.th_mss; + } + } error = uiomove(mtod(m0, void *), m0->m_len, uio); if (error != 0) Index: sys/net/if_tun.h =================================================================== RCS file: /cvs/src/sys/net/if_tun.h,v diff -u -p -r1.15 if_tun.h --- sys/net/if_tun.h 6 Feb 2007 10:49:40 -0000 1.15 +++ sys/net/if_tun.h 5 Nov 2024 07:01:15 -0000 @@ -31,6 +31,22 @@ #include +struct tun_hdr { + uint16_t th_flags; +#define TUN_H_VTAG (1 << 0) /* th_vtag is set */ +#define TUN_H_TCP_MSS (1 << 1) /* Cut TCP frame up by th_mss 
*/ + +#define TUN_H_IPV4_CSUM (1 << 8) /* IPv4 cksum needed */ +#define TUN_H_TCP_CSUM (1 << 9) /* TCP cksum needed */ +#define TUN_H_UDP_CSUM (1 << 10) /* UDP cksum needed */ +#define TUN_H_ICMP_CSUM (1 << 11) /* ICMP/ICMPv6 cksum needed */ + + uint16_t th_pad; + + uint16_t th_vtag; + uint16_t th_mss; +}; + #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_RCOLL 0x0004 /* unused */ @@ -67,5 +83,13 @@ struct tuninfo { /* ioctl's for get/set debug */ #define TUNSDEBUG _IOW('t', 94, int) #define TUNGDEBUG _IOR('t', 95, int) + +struct tun_capabilities { + uint32_t tun_if_capabilities; +}; + +#define TUNSCAP _IOW('t', 96, struct tun_capabilities) +#define TUNGCAP _IOR('t', 96, struct tun_capabilities) +#define TUNDCAP _IO('t', 96) #endif /* _NET_IF_TUN_H_ */ Index: usr.sbin/vmd/vionet.c =================================================================== RCS file: /cvs/src/usr.sbin/vmd/vionet.c,v diff -u -p -r1.17 vionet.c --- usr.sbin/vmd/vionet.c 26 Sep 2024 01:45:13 -0000 1.17 +++ usr.sbin/vmd/vionet.c 5 Nov 2024 07:01:15 -0000 @@ -17,11 +17,13 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include +#include #include #include #include +#include #include #include @@ -33,6 +35,10 @@ #include #include #include +#include + +#include +#include #include "atomicio.h" #include "virtio.h" @@ -71,6 +77,8 @@ static void read_pipe_tx(int, short, voi static void vionet_assert_pic_irq(struct virtio_dev *); static void vionet_deassert_pic_irq(struct virtio_dev *); +static void tun_hdr_extract(struct tun_hdr *, const struct iovec *, int); + /* Device Globals */ struct event ev_tap; struct event ev_inject; @@ -101,13 +109,7 @@ vionet_main(int fd, int fd_vmm) struct vm_create_params *vcp; ssize_t sz; int ret; - - /* - * stdio - needed for read/write to disk fds and channels to the vm. - * vmm + proc - needed to create shared vm mappings. 
- */ - if (pledge("stdio vmm proc", NULL) == -1) - fatal("pledge"); + struct tun_capabilities tcap; /* Initialize iovec arrays. */ memset(iov_rx, 0, sizeof(iov_rx)); @@ -132,6 +134,21 @@ vionet_main(int fd, int fd_vmm) ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd, dev.async_fd, fd_vmm); + /* + * IFCAPs should be tweaked based on feature negotiation + * with the guest. + */ + tcap.tun_if_capabilities = 0; + if (ioctl(vionet->data_fd, TUNSCAP, &tcap) == -1) + fatal("tap(4) TUNSCAP"); + + /* + * stdio - needed for read/write to disk fds and channels to the vm. + * vmm + proc - needed to create shared vm mappings. + */ + if (pledge("stdio vmm proc", NULL) == -1) + fatal("pledge"); + /* Receive our vm information from the vm process. */ memset(&vm, 0, sizeof(vm)); sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm)); @@ -725,6 +742,8 @@ vionet_tx(struct virtio_dev *dev) struct iovec *iov; struct packet pkt; uint8_t status = 0; + struct virtio_net_hdr vh, *vhp; + struct tun_hdr th; status = vionet->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK; @@ -752,8 +771,8 @@ vionet_tx(struct virtio_dev *dev) goto reset; } - iov = &iov_tx[0]; - iov_cnt = 0; + iov = &iov_tx[1]; + iov_cnt = 1; chain_len = 0; /* @@ -761,18 +780,23 @@ vionet_tx(struct virtio_dev *dev) * descriptor sized to the virtio_net_hdr. However, the framing * is not guaranteed, so check for packet data. */ - iov->iov_len = desc->len; - if (iov->iov_len < sizeof(struct virtio_net_hdr)) { + if (desc->len < sizeof(vh)) { log_warnx("%s: invalid descriptor length", __func__); goto reset; - } else if (iov->iov_len > sizeof(struct virtio_net_hdr)) { - /* Chop off the virtio header, leaving packet data. 
*/ - iov->iov_len -= sizeof(struct virtio_net_hdr); - chain_len += iov->iov_len; - iov->iov_base = hvaddr_mem(desc->addr + - sizeof(struct virtio_net_hdr), iov->iov_len); + } + + /* Chop the virtio net header off */ + vhp = hvaddr_mem(desc->addr, sizeof(*vhp)); + if (vhp == NULL) + goto reset; + + iov->iov_len = desc->len - sizeof(*vhp); + if (iov->iov_len > 0) { + iov->iov_base = hvaddr_mem(desc->addr + sizeof(*vhp), + iov->iov_len); if (iov->iov_base == NULL) goto reset; + chain_len += iov->iov_len; iov_cnt++; } @@ -816,7 +840,7 @@ vionet_tx(struct virtio_dev *dev) * descriptor with packet data contains a large enough buffer * for this inspection. */ - iov = &iov_tx[0]; + iov = &iov_tx[1]; if (vionet->lockedmac) { if (iov->iov_len < ETHER_HDR_LEN) { log_warnx("%s: insufficient header data", @@ -842,6 +866,17 @@ vionet_tx(struct virtio_dev *dev) } } + memset(&th, 0, sizeof(th)); + iov_tx[0].iov_base = &th; + iov_tx[0].iov_len = sizeof(th); + + /* + * if we look at more of vhp we might need to copy + * it so it's aligned properly + */ + if (vhp->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) + tun_hdr_extract(&th, iov_tx + 1, iov_cnt - 1); + /* Write our packet to the tap(4). 
*/
 	sz = writev(vionet->data_fd, iov_tx, iov_cnt);
 	if (sz == -1 && errno != ENOBUFS) {
@@ -1391,4 +1426,102 @@ vionet_deassert_pic_irq(struct virtio_de
 	    &msg, sizeof(msg), ev_base_main);
 	if (ret == -1)
 		log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
+}
+
+/*
+ * Copy len bytes, starting off bytes into the data described by the
+ * iovcnt entries at iov, into buf.
+ *
+ * Returns 0 on success, or -1 if the iovec array ends before off + len
+ * bytes; buf may have been partially written in that case.
+ */
+static int
+memcpyv(void *buf, size_t len, size_t off,
+    const struct iovec *iov, int iovcnt)
+{
+	uint8_t *dst = buf;
+	size_t l;
+
+	/* skip the entries that lie entirely before off */
+	for (;;) {
+		if (iovcnt == 0)
+			return (-1);
+
+		if (off < iov->iov_len)
+			break;
+
+		off -= iov->iov_len;
+		iov++;
+		iovcnt--;
+	}
+
+	for (;;) {
+		/* off is only non-zero for the first entry copied from */
+		l = iov->iov_len - off;
+		if (l > len)
+			l = len;
+
+		memcpy(dst, (const uint8_t *)iov->iov_base + off, l);
+		dst += l;
+		len -= l;
+
+		if (len == 0)
+			break;
+
+		/* the remainder starts at the head of the next entry */
+		off = 0;
+		iov++;
+		if (--iovcnt == 0)
+			return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Read the Ethernet (optionally VLAN) header and the IPv4/IPv6 protocol
+ * field of the frame in iov, and set TUN_H_TCP_CSUM or TUN_H_UDP_CSUM in
+ * th->th_flags to match.  th is left untouched if a field cannot be
+ * read, the ethertype is not IPv4/IPv6, or the protocol is not TCP/UDP.
+ */
+static void
+tun_hdr_extract(struct tun_hdr *th, const struct iovec *iov, int iovcnt)
+{
+	uint16_t etype;
+	uint8_t ipproto;
+	size_t offs;
+
+	if (memcpyv(&etype, sizeof(etype),
+	    offsetof(struct ether_header, ether_type),
+	    iov, iovcnt) == -1)
+		return;
+
+	if (etype == htons(ETHERTYPE_VLAN)) {
+		if (memcpyv(&etype, sizeof(etype),
+		    offsetof(struct ether_vlan_header, evl_proto),
+		    iov, iovcnt) == -1)
+			return;
+
+		offs = sizeof(struct ether_vlan_header);
+	} else
+		offs = sizeof(struct ether_header);
+
+	if (etype == htons(ETHERTYPE_IP))
+		offs += offsetof(struct ip, ip_p);
+	else if (etype == htons(ETHERTYPE_IPV6))
+		offs += offsetof(struct ip6_hdr, ip6_nxt);
+	else
+		return;
+
+	if (memcpyv(&ipproto, sizeof(ipproto), offs, iov, iovcnt) == -1)
+		return;
+
+	switch (ipproto) {
+	case IPPROTO_TCP:
+		th->th_flags |= TUN_H_TCP_CSUM;
+		break;
+	case IPPROTO_UDP:
+		th->th_flags |= TUN_H_UDP_CSUM;
+		break;
+	}
 }
Index: usr.sbin/vmd/virtio.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
diff -u -p -r1.116 virtio.c
--- usr.sbin/vmd/virtio.c	26 Sep 2024 01:45:13 -0000	1.116
+++ usr.sbin/vmd/virtio.c	5 Nov 2024 07:01:15 -0000
@@ -55,6 +55,7 @@ SLIST_HEAD(virtio_dev_head, virtio_dev)
 
 #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
 
+#define VIRTIO_NET_F_CSUM	(1<<0)
 #define VIRTIO_NET_F_MAC	(1<<5)
 
 #define VMMCI_F_TIMESYNC	(1<<0)
@@ -579,6 +580,9 @@ virtio_init(struct vmd_vm *vm, int child
 			/* MAC address has been assigned by the parent */
 			memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
 			dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
+
+			/* add csum offload, keep VIRTIO_NET_F_MAC set */
+			dev->vionet.cfg.device_feature |= VIRTIO_NET_F_CSUM;
 
 			dev->vionet.lockedmac =
 			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
Index: usr.sbin/vmd/virtio.h
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v
diff -u -p -r1.52 virtio.h
--- usr.sbin/vmd/virtio.h	10 Jul 2024 09:27:33 -0000	1.52
+++ usr.sbin/vmd/virtio.h	5 Nov 2024 07:01:15 -0000
@@ -307,6 +307,9 @@ struct virtio_net_hdr {
 	/* uint16_t num_buffers; */
 };
 
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1 /* flags */
+#define VIRTIO_NET_HDR_F_DATA_VALID	2 /* flags */
+
 enum vmmci_cmd {
 	VMMCI_NONE = 0,
 	VMMCI_SHUTDOWN,