Index: sys/net/if_tun.c =================================================================== RCS file: /cvs/src/sys/net/if_tun.c,v diff -u -p -r1.243 if_tun.c --- sys/net/if_tun.c 16 Oct 2024 11:12:31 -0000 1.243 +++ sys/net/if_tun.c 5 Nov 2024 12:17:57 -0000 @@ -1,4 +1,4 @@ -/* $OpenBSD: if_tun.c,v 1.243 2024/10/16 11:12:31 dlg Exp $ */ +/* $OpenBSD: if_tun.c,v 1.244 2024/11/01 02:07:14 jsg Exp $ */ /* $NetBSD: if_tun.c,v 1.24 1996/05/07 02:40:48 thorpej Exp $ */ /* @@ -88,6 +88,7 @@ struct tun_softc { struct sigio_ref sc_sigio; /* async I/O registration */ unsigned int sc_flags; /* misc flags */ #define TUN_DEAD (1 << 16) +#define TUN_HDR (1 << 17) dev_t sc_dev; struct refcnt sc_refs; @@ -104,6 +105,13 @@ int tundebug = TUN_DEBUG; /* Pretend that these IFF flags are changeable by TUNSIFINFO */ #define TUN_IFF_FLAGS (IFF_POINTOPOINT|IFF_MULTICAST|IFF_BROADCAST) +#define TUN_IF_CAPS ( \ + IFCAP_CSUM_IPv4 | \ + IFCAP_CSUM_TCPv4|IFCAP_CSUM_UDPv4|IFCAP_CSUM_TCPv6|IFCAP_CSUM_UDPv6 | \ + IFCAP_VLAN_MTU|IFCAP_VLAN_HWTAGGING|IFCAP_VLAN_HWOFFLOAD | \ + IFCAP_TSOv4|IFCAP_TSOv6|IFCAP_LRO \ +) + void tunattach(int); int tun_dev_open(dev_t, const struct if_clone *, int, struct proc *); @@ -496,10 +504,11 @@ tun_dev_close(dev_t dev, struct proc *p) */ NET_LOCK(); CLR(ifp->if_flags, IFF_UP | IFF_RUNNING); + CLR(ifp->if_capabilities, TUN_IF_CAPS); NET_UNLOCK(); ifq_purge(&ifp->if_snd); - CLR(sc->sc_flags, TUN_ASYNC); + CLR(sc->sc_flags, TUN_ASYNC|TUN_HDR); sigio_free(&sc->sc_sigio); if (!ISSET(sc->sc_flags, TUN_DEAD)) { @@ -627,6 +636,51 @@ tapioctl(dev_t dev, u_long cmd, caddr_t return (tun_dev_ioctl(dev, cmd, data)); } +static int +tun_set_capabilities(struct tun_softc *sc, const struct tun_capabilities *cap) +{ + if (ISSET(cap->tun_if_capabilities, ~TUN_IF_CAPS)) + return (EINVAL); + + KERNEL_ASSERT_LOCKED(); + SET(sc->sc_flags, TUN_HDR); + + NET_LOCK(); + CLR(sc->sc_if.if_capabilities, TUN_IF_CAPS); + SET(sc->sc_if.if_capabilities, cap->tun_if_capabilities); + NET_UNLOCK(); + 
return (0); +} + +static int +tun_get_capabilities(struct tun_softc *sc, struct tun_capabilities *cap) +{ + int error = 0; + + NET_LOCK_SHARED(); + if (ISSET(sc->sc_flags, TUN_HDR)) { + cap->tun_if_capabilities = + (sc->sc_if.if_capabilities & TUN_IF_CAPS); + } else + error = ENODEV; + NET_UNLOCK_SHARED(); + + return (error); +} + +static int +tun_del_capabilities(struct tun_softc *sc) +{ + NET_LOCK(); + CLR(sc->sc_if.if_capabilities, TUN_IF_CAPS); + NET_UNLOCK(); + + KERNEL_ASSERT_LOCKED(); + CLR(sc->sc_flags, TUN_HDR); + + return (0); +} + int tun_dev_ioctl(dev_t dev, u_long cmd, void *data) { @@ -678,6 +732,18 @@ tun_dev_ioctl(dev_t dev, u_long cmd, voi } break; + case TUNSCAP: + error = tun_set_capabilities(sc, + (const struct tun_capabilities *)data); + break; + case TUNGCAP: + error = tun_get_capabilities(sc, + (struct tun_capabilities *)data); + break; + case TUNDCAP: + error = tun_del_capabilities(sc); + break; + case FIONBIO: break; case FIOASYNC: @@ -745,6 +811,7 @@ tun_dev_read(dev_t dev, struct uio *uio, struct tun_softc *sc; struct ifnet *ifp; struct mbuf *m, *m0; + size_t len; int error = 0; sc = tun_get(dev); @@ -760,12 +827,49 @@ tun_dev_read(dev_t dev, struct uio *uio, #if NBPFILTER > 0 if (ifp->if_bpf) - bpf_mtap(ifp->if_bpf, m0, BPF_DIRECTION_OUT); + (*ifp->if_bpf_mtap)(ifp->if_bpf, m0, BPF_DIRECTION_OUT); #endif + if (ISSET(sc->sc_flags, TUN_HDR)) { + struct tun_hdr th; + + KASSERT(ISSET(m0->m_flags, M_PKTHDR)); + + th.th_flags = 0; + if (ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) + SET(th.th_flags, TUN_H_IPV4_CSUM); + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) + SET(th.th_flags, TUN_H_TCP_CSUM); + if (ISSET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) + SET(th.th_flags, TUN_H_UDP_CSUM); + if (ISSET(m0->m_pkthdr.csum_flags, M_ICMP_CSUM_OUT)) + SET(th.th_flags, TUN_H_ICMP_CSUM); + + th.th_pad = 0; + + th.th_vtag = 0; + if (ISSET(m0->m_flags, M_VLANTAG)) { + SET(th.th_flags, TUN_H_VTAG); + th.th_vtag = m0->m_pkthdr.ether_vtag; + } + + 
th.th_mss = 0; + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO)) { + SET(th.th_flags, TUN_H_TCP_MSS); + th.th_mss = m0->m_pkthdr.ph_mss; + } + + len = ulmin(uio->uio_resid, sizeof(th)); + if (len > 0) { + error = uiomove(&th, len, uio); + if (error != 0) + goto free; + } + } + m = m0; while (uio->uio_resid > 0) { - size_t len = ulmin(uio->uio_resid, m->m_len); + len = ulmin(uio->uio_resid, m->m_len); if (len > 0) { error = uiomove(mtod(m, void *), len, uio); if (error != 0) @@ -777,6 +881,7 @@ tun_dev_read(dev_t dev, struct uio *uio, break; } +free: m_freem(m0); put: @@ -807,6 +912,8 @@ tun_dev_write(dev_t dev, struct uio *uio struct mbuf *m0; int error = 0; size_t mlen; + size_t hlen; + struct tun_hdr th; sc = tun_get(dev); if (sc == NULL) @@ -814,8 +921,11 @@ tun_dev_write(dev_t dev, struct uio *uio ifp = &sc->sc_if; - if (uio->uio_resid < ifp->if_hdrlen || - uio->uio_resid > (ifp->if_hdrlen + ifp->if_hardmtu)) { + hlen = ifp->if_hdrlen; + if (ISSET(sc->sc_flags, TUN_HDR)) + hlen += sizeof(th); + if (uio->uio_resid < hlen || + uio->uio_resid > (hlen + ifp->if_hardmtu)) { error = EMSGSIZE; goto put; } @@ -840,6 +950,39 @@ tun_dev_write(dev_t dev, struct uio *uio m0->m_pkthdr.len = m0->m_len = mlen; m_adj(m0, align); + if (ISSET(sc->sc_flags, TUN_HDR)) { + error = uiomove(&th, sizeof(th), uio); + if (error != 0) + goto drop; + + if (ISSET(th.th_flags, TUN_H_IPV4_CSUM)) { + SET(m0->m_pkthdr.csum_flags, + M_IPV4_CSUM_OUT | M_IPV4_CSUM_IN_OK); + } + if (ISSET(th.th_flags, TUN_H_TCP_CSUM)) { + SET(m0->m_pkthdr.csum_flags, + M_TCP_CSUM_OUT | M_TCP_CSUM_IN_OK); + } + if (ISSET(th.th_flags, TUN_H_UDP_CSUM)) { + SET(m0->m_pkthdr.csum_flags, + M_UDP_CSUM_OUT | M_UDP_CSUM_IN_OK); + } + if (ISSET(th.th_flags, TUN_H_ICMP_CSUM)) { + SET(m0->m_pkthdr.csum_flags, + M_ICMP_CSUM_OUT | M_ICMP_CSUM_IN_OK); + } + + if (ISSET(th.th_flags, TUN_H_VTAG)) { + SET(m0->m_flags, M_VLANTAG); + m0->m_pkthdr.ether_vtag = th.th_vtag; + } + + if (ISSET(th.th_flags, TUN_H_TCP_MSS)) { + 
SET(m0->m_pkthdr.csum_flags, M_TCP_TSO);
+			m0->m_pkthdr.ph_mss = th.th_mss;
+		}
+	}
+
 	error = uiomove(mtod(m0, void *), m0->m_len, uio);
 	if (error != 0)
 		goto drop;
@@ -905,15 +1048,12 @@ int
 tun_dev_kqfilter(dev_t dev, struct knote *kn)
 {
 	struct tun_softc *sc;
-	struct ifnet *ifp;
 	struct klist *klist;
 	int error = 0;
 
 	sc = tun_get(dev);
 	if (sc == NULL)
 		return (ENXIO);
-
-	ifp = &sc->sc_if;
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
Index: sys/net/if_tun.h
===================================================================
RCS file: /cvs/src/sys/net/if_tun.h,v
diff -u -p -r1.15 if_tun.h
--- sys/net/if_tun.h	6 Feb 2007 10:49:40 -0000	1.15
+++ sys/net/if_tun.h	5 Nov 2024 12:17:57 -0000
@@ -31,6 +31,48 @@
 #include 
 
+/*
+ * tun_hdr is built out of uint16_t fields so it can sit on a 2
+ * byte boundary, and is a multiple of 4 bytes. This allows it to be
+ * placed directly in front of Ethernet headers on tap(4) interfaces,
+ * and dereferenced directly, while also maintaining the alignment
+ * of the payload on both tun(4) and tap(4) interfaces.
+ *
+ * Userland can request the use of the tun_hdr using the TUNSCAP
+ * ioctl. This ioctl also allows userland to specify which "offload"
+ * capabilities it is able to accept in packets it will read from the
+ * kernel. It is valid to enable tun_hdr without enabling any
+ * interface offload capabilities.
+ *
+ * Once the tun_hdr is enabled, userland can write packets into the
+ * kernel with any of the supported features. tun(4)/tap(4) reads
+ * will unconditionally handle any features specified on the packet,
+ * regardless of what capabilities were specified by the TUNSCAP
+ * ioctl.
+ *
+ * The tun_hdr can be read from one interface and written directly
+ * to another without interpretation or modification.
+ *
+ * Use of tun_hdr and the associated capabilities are reset when a
+ * tun(4)/tap(4) device is closed.
+ */ + +struct tun_hdr { + uint16_t th_flags; +#define TUN_H_VTAG (1 << 0) /* th_vtag is set */ +#define TUN_H_TCP_MSS (1 << 1) /* Cut TCP frame up by th_mss */ + +#define TUN_H_IPV4_CSUM (1 << 8) +#define TUN_H_TCP_CSUM (1 << 9) +#define TUN_H_UDP_CSUM (1 << 10) +#define TUN_H_ICMP_CSUM (1 << 11) + + uint16_t th_pad; + + uint16_t th_vtag; + uint16_t th_mss; +}; + #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_RCOLL 0x0004 /* unused */ @@ -67,5 +109,13 @@ struct tuninfo { /* ioctl's for get/set debug */ #define TUNSDEBUG _IOW('t', 94, int) #define TUNGDEBUG _IOR('t', 95, int) + +struct tun_capabilities { + uint32_t tun_if_capabilities; /* IFCAP_* from net/if.h */ +}; + +#define TUNSCAP _IOW('t', 96, struct tun_capabilities) +#define TUNGCAP _IOR('t', 96, struct tun_capabilities) +#define TUNDCAP _IO('t', 96) #endif /* _NET_IF_TUN_H_ */ Index: usr.sbin/vmd/vionet.c =================================================================== RCS file: /cvs/src/usr.sbin/vmd/vionet.c,v diff -u -p -r1.17 vionet.c --- usr.sbin/vmd/vionet.c 26 Sep 2024 01:45:13 -0000 1.17 +++ usr.sbin/vmd/vionet.c 5 Nov 2024 12:17:57 -0000 @@ -17,11 +17,13 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include +#include #include #include #include +#include #include #include @@ -33,6 +35,10 @@ #include #include #include +#include + +#include +#include #include "atomicio.h" #include "virtio.h" @@ -101,13 +107,7 @@ vionet_main(int fd, int fd_vmm) struct vm_create_params *vcp; ssize_t sz; int ret; - - /* - * stdio - needed for read/write to disk fds and channels to the vm. - * vmm + proc - needed to create shared vm mappings. - */ - if (pledge("stdio vmm proc", NULL) == -1) - fatal("pledge"); + struct tun_capabilities tcap; /* Initialize iovec arrays. 
*/ memset(iov_rx, 0, sizeof(iov_rx)); @@ -132,6 +132,21 @@ vionet_main(int fd, int fd_vmm) ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd, dev.async_fd, fd_vmm); + /* + * IFCAPs should be tweaked based on feature negotiation + * with the guest. + */ + tcap.tun_if_capabilities = 0; + if (ioctl(vionet->data_fd, TUNSCAP, &tcap) == -1) + fatal("tap(4) TUNSCAP"); + + /* + * stdio - needed for read/write to disk fds and channels to the vm. + * vmm + proc - needed to create shared vm mappings. + */ + if (pledge("stdio vmm proc", NULL) == -1) + fatal("pledge"); + /* Receive our vm information from the vm process. */ memset(&vm, 0, sizeof(vm)); sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm)); @@ -410,7 +425,7 @@ vionet_rx(struct vionet_dev *dev, int fd goto reset; } - iov = &iov_rx[0]; + iov = &iov_rx[1]; iov_cnt = 1; /* @@ -483,8 +498,12 @@ vionet_rx(struct vionet_dev *dev, int fd if (dev->lockedmac || fd != dev->data_fd) sz = vionet_rx_copy(dev, fd, iov_rx, iov_cnt, chain_len); - else + else { + struct tun_hdr th; + iov_rx[0].iov_base = &th; + iov_rx[0].iov_len = sizeof(th); sz = vionet_rx_zerocopy(dev, fd, iov_rx, iov_cnt); + } if (sz == -1) goto reset; if (sz == 0) /* No packets, so bail out for now. */ @@ -531,7 +550,7 @@ ssize_t vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov, int iov_cnt, size_t chain_len) { - static uint8_t buf[VIONET_HARD_MTU]; + static uint8_t buf[sizeof(struct tun_hdr) + VIONET_HARD_MTU]; struct packet *pkt = NULL; struct ether_header *eh = NULL; uint8_t *payload = buf; @@ -539,9 +558,10 @@ vionet_rx_copy(struct vionet_dev *dev, i ssize_t sz; /* If reading from the tap(4), try to right-size the read. 
*/ - if (fd == dev->data_fd) - nbytes = MIN(chain_len, VIONET_HARD_MTU); - else if (fd == pipe_inject[READ]) + if (fd == dev->data_fd) { + nbytes = sizeof(struct tun_hdr) + + MIN(chain_len, VIONET_HARD_MTU); + } else if (fd == pipe_inject[READ]) nbytes = sizeof(struct packet); else { log_warnx("%s: invalid fd: %d", __func__, fd); @@ -560,10 +580,19 @@ vionet_rx_copy(struct vionet_dev *dev, i return (-1); } return (0); - } else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) { + } else if (fd == dev->data_fd) { + if ((size_t)sz < sizeof(struct tun_hdr)) { + log_warnx("%s: short tun_hdr", __func__); + return (0); + } + payload += sizeof(struct tun_hdr); + sz -= sizeof(struct tun_hdr); + /* If reading the tap(4), we should get valid ethernet. */ - log_warnx("%s: invalid packet size", __func__); - return (0); + if (sz < VIONET_MIN_TXLEN) { + log_warnx("%s: invalid packet size", __func__); + return (0); + } } else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) { log_warnx("%s: invalid injected packet object (sz=%ld)", __func__, sz); @@ -725,6 +754,8 @@ vionet_tx(struct virtio_dev *dev) struct iovec *iov; struct packet pkt; uint8_t status = 0; + struct virtio_net_hdr vh, *vhp; + struct tun_hdr th; status = vionet->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK; @@ -752,8 +783,10 @@ vionet_tx(struct virtio_dev *dev) goto reset; } - iov = &iov_tx[0]; - iov_cnt = 0; + /* the 0th slot will by used by the tun_hdr */ + + iov = &iov_tx[1]; + iov_cnt = 1; chain_len = 0; /* @@ -761,18 +794,23 @@ vionet_tx(struct virtio_dev *dev) * descriptor sized to the virtio_net_hdr. However, the framing * is not guaranteed, so check for packet data. */ - iov->iov_len = desc->len; - if (iov->iov_len < sizeof(struct virtio_net_hdr)) { + if (desc->len < sizeof(vh)) { log_warnx("%s: invalid descriptor length", __func__); goto reset; - } else if (iov->iov_len > sizeof(struct virtio_net_hdr)) { - /* Chop off the virtio header, leaving packet data. 
*/ - iov->iov_len -= sizeof(struct virtio_net_hdr); - chain_len += iov->iov_len; - iov->iov_base = hvaddr_mem(desc->addr + - sizeof(struct virtio_net_hdr), iov->iov_len); + } + + /* Chop the virtio net header off */ + vhp = hvaddr_mem(desc->addr, sizeof(*vhp)); + if (vhp == NULL) + goto reset; + + iov->iov_len = desc->len - sizeof(*vhp); + if (iov->iov_len > 0) { + iov->iov_base = hvaddr_mem(desc->addr + sizeof(*vhp), + iov->iov_len); if (iov->iov_base == NULL) goto reset; + chain_len += iov->iov_len; iov_cnt++; } @@ -816,7 +854,7 @@ vionet_tx(struct virtio_dev *dev) * descriptor with packet data contains a large enough buffer * for this inspection. */ - iov = &iov_tx[0]; + iov = &iov_tx[1]; if (vionet->lockedmac) { if (iov->iov_len < ETHER_HDR_LEN) { log_warnx("%s: insufficient header data", @@ -841,6 +879,18 @@ vionet_tx(struct virtio_dev *dev) goto drop; } } + + memset(&th, 0, sizeof(th)); + + /* + * if we look at more of vhp we might need to copy + * it so it's aligned properly + */ + if (vhp->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) + th.th_flags |= TUN_H_TCP_CSUM | TUN_H_UDP_CSUM; + + iov_tx[0].iov_base = &th; + iov_tx[0].iov_len = sizeof(th); /* Write our packet to the tap(4). 
*/
 	sz = writev(vionet->data_fd, iov_tx, iov_cnt);
Index: usr.sbin/vmd/virtio.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
diff -u -p -r1.116 virtio.c
--- usr.sbin/vmd/virtio.c	26 Sep 2024 01:45:13 -0000	1.116
+++ usr.sbin/vmd/virtio.c	5 Nov 2024 12:17:57 -0000
@@ -55,6 +55,7 @@ SLIST_HEAD(virtio_dev_head, virtio_dev) 
 
 #define MAXPHYS	(64 * 1024)	/* max raw I/O transfer size */
 
+#define VIRTIO_NET_F_CSUM	(1<<0)
 #define VIRTIO_NET_F_MAC	(1<<5)
 
 #define VMMCI_F_TIMESYNC	(1<<0)
@@ -579,6 +580,9 @@ virtio_init(struct vmd_vm *vm, int child
 			/* MAC address has been assigned by the parent */
 			memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
 			dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
+
+			/* offload checksum to the kernel */
+			dev->vionet.cfg.device_feature |= VIRTIO_NET_F_CSUM;
 
 			dev->vionet.lockedmac =
 			    vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
Index: usr.sbin/vmd/virtio.h
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v
diff -u -p -r1.52 virtio.h
--- usr.sbin/vmd/virtio.h	10 Jul 2024 09:27:33 -0000	1.52
+++ usr.sbin/vmd/virtio.h	5 Nov 2024 12:17:57 -0000
@@ -307,6 +307,9 @@ struct virtio_net_hdr {
 	/* uint16_t num_buffers; */
 };
 
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	/* flags */
+#define VIRTIO_NET_HDR_F_DATA_VALID	2	/* flags */
+
 enum vmmci_cmd {
 	VMMCI_NONE = 0,
 	VMMCI_SHUTDOWN,