Index: Makefile =================================================================== RCS file: /cvs/src/usr.sbin/vmd/Makefile,v retrieving revision 1.16 diff -u -p -r1.16 Makefile --- Makefile 3 Jul 2017 22:21:47 -0000 1.16 +++ Makefile 25 Aug 2017 02:51:41 -0000 @@ -7,17 +7,21 @@ SRCS= vmd.c control.c log.c priv.c proc SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c SRCS+= ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c packet.c SRCS+= parse.y atomicio.c +SRCS+= task.c CFLAGS+= -Wall -I${.CURDIR} CFLAGS+= -Wstrict-prototypes -Wmissing-prototypes CFLAGS+= -Wmissing-declarations CFLAGS+= -Wshadow -Wpointer-arith -Wcast-qual CFLAGS+= -Wsign-compare +CFLAGS+= -fno-omit-frame-pointer LDADD+= -lutil -lpthread -levent DPADD+= ${LIBUTIL} ${LIBEVENT} YFLAGS= + +DEBUG=-g .else Index: i8259.c =================================================================== RCS file: /cvs/src/usr.sbin/vmd/i8259.c,v retrieving revision 1.14 diff -u -p -r1.14 i8259.c --- i8259.c 8 May 2017 09:08:40 -0000 1.14 +++ i8259.c 25 Aug 2017 02:51:41 -0000 @@ -24,10 +24,12 @@ #include #include +#include #include "proc.h" #include "i8259.h" #include "vmm.h" #include "atomicio.h" +#include "vmd.h" struct i8259 { uint8_t irr; @@ -51,6 +53,7 @@ struct i8259 { /* Master and slave PICs */ struct i8259 pics[2]; +pthread_mutex_t pic_mtx; /* * i8259_pic_name @@ -85,6 +88,9 @@ i8259_init(void) memset(&pics, 0, sizeof(pics)); pics[MASTER].cur_icw = 1; pics[SLAVE].cur_icw = 1; + + if (pthread_mutex_init(&pic_mtx, NULL) != 0) + fatalx("unable to create pic mutex"); } /* @@ -98,17 +104,15 @@ i8259_init(void) uint8_t i8259_is_pending(void) { - uint8_t pending = 0; uint8_t master_pending; uint8_t slave_pending; + mutex_lock(&pic_mtx); master_pending = pics[MASTER].irr & ~(pics[MASTER].imr | (1 << 2)); slave_pending = pics[SLAVE].irr & ~pics[SLAVE].imr; + mutex_unlock(&pic_mtx); - if (master_pending || slave_pending) - pending = 1; - - return pending; + return (master_pending || slave_pending); } /* @@ -120,8 +124,8 @@ i8259_is_pending(void) * Return values: * interrupt vector to inject, 0xFFFF if no irq pending */ -uint16_t -i8259_ack(void) +static uint16_t +_i8259_ack(void) { uint8_t high_prio_m, high_prio_s; uint8_t i; @@ -195,6 +199,18 @@ i8259_ack(void) return (0xFFFF); } +uint16_t +i8259_ack(void) +{ + uint16_t rv; + + mutex_lock(&pic_mtx); + rv = _i8259_ack(); + mutex_unlock(&pic_mtx); + + return (rv); +} + /* * i8259_assert_irq * @@ -206,23 +222,24 @@ i8259_ack(void) void i8259_assert_irq(uint8_t irq) { + mutex_lock(&pic_mtx); if (irq <= 7) { - if (pics[MASTER].imr & (1 << irq)) - return; - - pics[MASTER].irr |= (1 << irq); - pics[MASTER].asserted = 1; + if (!ISSET(pics[MASTER].imr, 1 << irq)) { + SET(pics[MASTER].irr, 1 << irq); + pics[MASTER].asserted = 1; + } } else { - if (pics[SLAVE].imr & (1 << (irq - 8))) - return; - - pics[SLAVE].irr |= (1 << (irq - 8)); - pics[SLAVE].asserted = 1; - - /* Assert cascade IRQ on master PIC */ - pics[MASTER].irr |= (1 << 2); - pics[MASTER].asserted = 1; + irq -= 8; + if (!ISSET(pics[SLAVE].imr, 1 << irq)) { + SET(pics[SLAVE].irr, irq); + pics[SLAVE].asserted = 1; + + /* Assert cascade IRQ on master PIC */ + SET(pics[MASTER].irr, 1 << 2); + pics[MASTER].asserted = 1; + } } + mutex_unlock(&pic_mtx); } /* @@ -236,15 +253,18 @@ i8259_assert_irq(uint8_t irq) void i8259_deassert_irq(uint8_t irq) { + mutex_lock(&pic_mtx); if (irq <= 7) - pics[MASTER].irr &= ~(1 << irq); + CLR(pics[MASTER].irr, 1 << irq); else { - pics[SLAVE].irr &= ~(1 << (irq - 8)); + irq -= 8; + CLR(pics[SLAVE].irr, 1 
<< irq); /* Deassert cascade IRQ on master if no IRQs on slave */ if (pics[SLAVE].irr == 0) - pics[MASTER].irr &= ~(1 << 2); + CLR(pics[MASTER].irr, 1 << 2); } + mutex_unlock(&pic_mtx); } /* @@ -605,6 +625,7 @@ i8259_io_read(union vm_exit *vei) { uint16_t port = vei->vei.vei_port; uint8_t n = 0; + uint8_t rv; switch (port) { case IO_ICU1: @@ -619,10 +640,14 @@ i8259_io_read(union vm_exit *vei) fatal("%s: invalid port 0x%x", __func__, port); } + mutex_lock(&pic_mtx); if (port == IO_ICU1 + 1 || port == IO_ICU2 + 1) - return i8259_read_datareg(n); + rv = i8259_read_datareg(n); else - return i8259_read_cmdreg(n); + rv = i8259_read_cmdreg(n); + mutex_unlock(&pic_mtx); + + return (rv); } /* @@ -641,11 +666,13 @@ vcpu_exit_i8259(struct vm_run_params *vr { union vm_exit *vei = vrp->vrp_exit; + mutex_lock(&pic_mtx); if (vei->vei.vei_dir == VEI_DIR_OUT) { i8259_io_write(vei); } else { set_return_data(vei, i8259_io_read(vei)); } + mutex_unlock(&pic_mtx); return (0xFF); } Index: task.c =================================================================== RCS file: task.c diff -N task.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ task.c 25 Aug 2017 02:51:41 -0000 @@ -0,0 +1,156 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2017 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "task.h" + +#define ISSET(_v, _m) ((_v) & (_m)) +#define SET(_v, _m) ((_v) |= (_m)) +#define CLR(_v, _m) ((_v) &= ~(_m)) + +struct taskq { + pthread_t thread; + struct task_list list; + pthread_mutex_t mtx; + pthread_cond_t cv; +}; + +#define TASK_ONQUEUE (1 << 0) + +static void *taskq_run(void *); + +struct taskq * +taskq_create(const char *name) +{ + struct taskq *tq; + int error; + + tq = malloc(sizeof(*tq)); + if (tq == NULL) + return (NULL); + + TAILQ_INIT(&tq->list); + + error = pthread_mutex_init(&tq->mtx, NULL); + if (error != 0) + goto free; + + error = pthread_cond_init(&tq->cv, NULL); + if (error != 0) + goto mtx; + + error = pthread_create(&tq->thread, NULL, taskq_run, tq); + if (error != 0) + goto cv; + + pthread_set_name_np(tq->thread, name); + + return (tq); + +cv: + pthread_cond_destroy(&tq->cv); +mtx: + pthread_mutex_destroy(&tq->mtx); /* can this really fail? 
*/ +free: + free(tq); + + errno = error; + return (NULL); +} + +static void * +taskq_run(void *tqarg) +{ + struct taskq *tq = tqarg; + struct task *t; + + void (*t_func)(void *); + void *t_arg; + + for (;;) { + pthread_mutex_lock(&tq->mtx); + while ((t = TAILQ_FIRST(&tq->list)) == NULL) + pthread_cond_wait(&tq->cv, &tq->mtx); + + TAILQ_REMOVE(&tq->list, t, t_entry); + CLR(t->t_flags, TASK_ONQUEUE); + + t_func = t->t_func; + t_arg = t->t_arg; + + pthread_mutex_unlock(&tq->mtx); + + (*t_func)(t_arg); + } + + return (NULL); +} + +void +task_set(struct task *t, void (*fn)(void *), void *arg) +{ + t->t_func = fn; + t->t_arg = arg; + t->t_flags = 0; +} + +int +task_add(struct taskq *tq, struct task *t) +{ + int rv = 1; + + if (ISSET(t->t_flags, TASK_ONQUEUE)) + return (0); + + pthread_mutex_lock(&tq->mtx); + if (ISSET(t->t_flags, TASK_ONQUEUE)) + rv = 0; + else { + SET(t->t_flags, TASK_ONQUEUE); + TAILQ_INSERT_TAIL(&tq->list, t, t_entry); + pthread_cond_signal(&tq->cv); + } + pthread_mutex_unlock(&tq->mtx); + + return (rv); +} + +int +task_del(struct taskq *tq, struct task *t) +{ + int rv = 1; + + if (!ISSET(t->t_flags, TASK_ONQUEUE)) + return (0); + + pthread_mutex_lock(&tq->mtx); + if (!ISSET(t->t_flags, TASK_ONQUEUE)) + rv = 0; + else { + TAILQ_REMOVE(&tq->list, t, t_entry); + CLR(t->t_flags, TASK_ONQUEUE); + } + pthread_mutex_unlock(&tq->mtx); + + return (rv); +} Index: task.h =================================================================== RCS file: task.h diff -N task.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ task.h 25 Aug 2017 02:51:41 -0000 @@ -0,0 +1,43 @@ +/* $OpenBSD: task.h,v 1.11 2016/06/07 07:53:33 mpi Exp $ */ + +/* + * Copyright (c) 2013 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _TASK_H_ +#define _TASK_H_ + +#include + +struct taskq; + +struct task { + TAILQ_ENTRY(task) t_entry; + void (*t_func)(void *); + void *t_arg; + unsigned int t_flags; +}; + +TAILQ_HEAD(task_list, task); + +#define TASK_INITIALIZER(_f, _a) {{ NULL, NULL }, (_f), (_a), 0 } + +struct taskq *taskq_create(const char *); + +void task_set(struct task *, void (*)(void *), void *); +int task_add(struct taskq *, struct task *); +int task_del(struct taskq *, struct task *); + +#endif /* _TASK_H_ */ Index: virtio.c =================================================================== RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v retrieving revision 1.52 diff -u -p -r1.52 virtio.c --- virtio.c 20 Aug 2017 05:16:58 -0000 1.52 +++ virtio.c 25 Aug 2017 02:51:41 -0000 @@ -43,14 +43,48 @@ #include "virtio.h" #include "loadfile.h" #include "atomicio.h" +#include "task.h" + +#ifndef MIN +#define MIN(_a, _b) ((_a) < (_b) ? 
(_a) : (_b)) +#endif + +#ifndef nitems +#define nitems(_a) (sizeof(_a) / sizeof((_a)[0])) +#endif extern char *__progname; +struct vioblk_queue { + struct vioblk_dev *dev; + void *ring; + struct virtio_vq_info vq; + struct task t; + struct event ev; +}; + +struct vioblk_dev { + struct virtio_io_cfg cfg; + + struct vioblk_queue q[VIRTIO_MAX_QUEUES]; + + int fd; + uint64_t sz; + uint32_t max_xfer; + + uint32_t vm_id; + int irq; +}; + struct viornd_dev viornd; struct vioblk_dev *vioblk; struct vionet_dev *vionet; struct vmmci_dev vmmci; +struct taskq *iotq; +int iofd; +struct event ioev; + int nr_vionet; int nr_vioblk; @@ -62,13 +96,12 @@ int nr_vioblk; #define VMMCI_F_ACK (1<<1) #define VMMCI_F_SYNCRTC (1<<2) -struct ioinfo { - uint8_t *buf; - ssize_t len; - off_t offset; - int fd; - int error; -}; +int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); +int vioblk_dump(int); +int vioblk_restore(int, struct vm_create_params *, int *); +void vioblk_update_qs(struct vioblk_dev *); +void vioblk_update_qa(struct vioblk_dev *); +int vioblk_notifyq(struct vioblk_dev *); const char * vioblk_cmd_name(uint32_t type) @@ -85,6 +118,7 @@ vioblk_cmd_name(uint32_t type) } } +#if 0 static void dump_descriptor_chain(struct vring_desc *desc, int16_t dxx) { @@ -108,6 +142,7 @@ dump_descriptor_chain(struct vring_desc desc[dxx].flags, desc[dxx].next); } +#endif static const char * virtio_reg_name(uint8_t reg) @@ -319,7 +354,10 @@ vioblk_update_qa(struct vioblk_dev *dev) if (dev->cfg.queue_select > 0) return; - dev->vq[dev->cfg.queue_select].qa = dev->cfg.queue_address; + dev->q[dev->cfg.queue_select].vq.qa = dev->cfg.queue_address; + dev->q[dev->cfg.queue_select].ring = vaddr_mem( + dev->cfg.queue_address * VIRTIO_PAGE_SIZE, + vring_size(VIOBLK_QUEUE_SIZE)); } void @@ -332,375 +370,201 @@ vioblk_update_qs(struct vioblk_dev *dev) } /* Update queue address/size based on queue select */ - dev->cfg.queue_address = dev->vq[dev->cfg.queue_select].qa; - dev->cfg.queue_size = dev->vq[dev->cfg.queue_select].qs; + dev->cfg.queue_address = dev->q[dev->cfg.queue_select].vq.qa; + dev->cfg.queue_size = dev->q[dev->cfg.queue_select].vq.qs; } -static void -vioblk_free_info(struct ioinfo *info) +static int +vioblk_complete(struct vring_desc *desc, uint8_t ds) { - if (!info) - return; - free(info->buf); - free(info); + if (write_mem(desc->addr, &ds, MIN(desc->len, sizeof(ds)))) { + log_warnx("can't write device status data @ " + "0x%llx", desc->addr); + } + + return (0); } -static struct ioinfo * -vioblk_start_read(struct vioblk_dev *dev, off_t sector, ssize_t sz) +static int +vioblk_io_skip(struct vring_desc *vring, struct vring_desc *desc) { - struct ioinfo *info; + unsigned int idx; - info = calloc(1, sizeof(*info)); - if (!info) - goto nomem; - info->buf = malloc(sz); - if (info->buf == NULL) - goto nomem; - info->len = sz; - info->offset = sector * VIRTIO_BLK_SECTOR_SIZE; - info->fd = dev->fd; - - return info; + do { + idx = desc->next & VIOBLK_QUEUE_MASK; + desc = &vring[idx]; + } while (ISSET(desc->flags, VRING_DESC_F_NEXT)); -nomem: - free(info); - log_warn("malloc errror vioblk read"); - return (NULL); + return (vioblk_complete(desc, VIRTIO_BLK_S_IOERR)); } - -static const uint8_t * -vioblk_finish_read(struct ioinfo *info) -{ - if (pread(info->fd, info->buf, info->len, info->offset) != info->len) { - info->error = errno; - log_warn("vioblk read error"); - return NULL; +static int +vioblk_io(struct vioblk_dev *dev, + ssize_t (*piov)(int, const struct iovec *, int, off_t), + const struct 
virtio_blk_req_hdr *hdr, + struct vring_desc *desc, + struct vring_desc *vring) +{ + struct iovec iov[128]; + int cnt, iovcnt = 0; + unsigned int idx; + ssize_t rv; + + idx = desc->next & VIOBLK_QUEUE_MASK; + desc = &vring[idx]; + + if (!ISSET(desc->flags, VRING_DESC_F_NEXT)) { + log_warnx("unchained vioblk data descriptor " + "received (idx %u)", idx); + return (-1); } - return info->buf; -} - -static struct ioinfo * -vioblk_start_write(struct vioblk_dev *dev, off_t sector, paddr_t addr, size_t len) -{ - struct ioinfo *info; - - info = calloc(1, sizeof(*info)); - if (!info) - goto nomem; - info->buf = malloc(len); - if (info->buf == NULL) - goto nomem; - info->len = len; - info->offset = sector * VIRTIO_BLK_SECTOR_SIZE; - info->fd = dev->fd; + do { + cnt = iovec_mem(desc->addr, desc->len, + iov + iovcnt, nitems(iov) - iovcnt); + if (cnt == -1) { + log_warnx("invalid dma address 0x%016llx", + desc->addr); + return vioblk_io_skip(vring, desc); + } + + iovcnt += cnt; + if (iovcnt == nitems(iov)) { + log_warnx("%s: iov is too small", __func__); + return vioblk_io_skip(vring, desc); + } + + idx = desc->next & VIOBLK_QUEUE_MASK; + desc = &vring[idx]; + } while (ISSET(desc->flags, VRING_DESC_F_NEXT)); - if (read_mem(addr, info->buf, len)) { - vioblk_free_info(info); - return NULL; - } + do { + rv = (*piov)(dev->fd, iov, iovcnt, + hdr->sector * VIRTIO_BLK_SECTOR_SIZE); + } while (rv == -1 && errno == EINTR); - return info; + if (rv == -1) + log_warn("boo hiss @ %llu", hdr->sector); -nomem: - free(info); - log_warn("malloc errror vioblk write"); - return (NULL); + return vioblk_complete(desc, + rv == -1 ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK); } -static int -vioblk_finish_write(struct ioinfo *info) +static void +vioblk_ev(int fd, short events, void *arg) { - if (pwrite(info->fd, info->buf, info->len, info->offset) != info->len) { - log_warn("vioblk write error"); - return EIO; - } - return 0; + static const struct timeval tv = { 0, 1 }; + struct vioblk_queue *queue = arg; + struct vioblk_dev *dev = queue->dev; + + vcpu_assert_pic_irq(dev->vm_id, 0, dev->irq); + + evtimer_add(&queue->ev, &tv); } /* * XXX in various cases, ds should be set to VIRTIO_BLK_S_IOERR, if we can * XXX cant trust ring data from VM, be extra cautious. */ -int -vioblk_notifyq(struct vioblk_dev *dev) +static void +vioblk_q(void *arg) { - uint64_t q_gpa; - uint32_t vr_sz; - uint16_t idx, cmd_desc_idx, secdata_desc_idx, ds_desc_idx; - uint8_t ds; - int ret; - off_t secbias; + struct vioblk_queue *queue = arg; + struct vioblk_dev *dev = queue->dev; + uint16_t cmd_desc_idx, desc_idx; char *vr; - struct vring_desc *desc, *cmd_desc, *secdata_desc, *ds_desc; + struct vring_desc *desc, *cmd_desc; struct vring_avail *avail; struct vring_used *used; struct virtio_blk_req_hdr cmd; + unsigned int prod, cons; + uint8_t ds; - ret = 0; - - /* Invalid queue? 
*/ - if (dev->cfg.queue_notify > 0) - return (0); - - vr_sz = vring_size(VIOBLK_QUEUE_SIZE); - q_gpa = dev->vq[dev->cfg.queue_notify].qa; - q_gpa = q_gpa * VIRTIO_PAGE_SIZE; - - vr = calloc(1, vr_sz); - if (vr == NULL) { - log_warn("calloc error getting vioblk ring"); - return (0); - } - - if (read_mem(q_gpa, vr, vr_sz)) { - log_warnx("error reading gpa 0x%llx", q_gpa); - goto out; - } + vr = queue->ring; + if (vr == NULL) + return; /* Compute offsets in ring of descriptors, avail ring, and used ring */ desc = (struct vring_desc *)(vr); - avail = (struct vring_avail *)(vr + - dev->vq[dev->cfg.queue_notify].vq_availoffset); - used = (struct vring_used *)(vr + - dev->vq[dev->cfg.queue_notify].vq_usedoffset); + avail = (struct vring_avail *)(vr + queue->vq.vq_availoffset); + used = (struct vring_used *)(vr + queue->vq.vq_usedoffset); - idx = dev->vq[dev->cfg.queue_notify].last_avail & VIOBLK_QUEUE_MASK; + cons = queue->vq.last_avail & VIOBLK_QUEUE_MASK; + prod = avail->idx & VIOBLK_QUEUE_MASK; - if ((avail->idx & VIOBLK_QUEUE_MASK) == idx) { - log_warnx("vioblk queue notify - nothing to do?"); - goto out; - } - - while (idx != (avail->idx & VIOBLK_QUEUE_MASK)) { + if (cons == prod) + return; - cmd_desc_idx = avail->ring[idx] & VIOBLK_QUEUE_MASK; + do { + cmd_desc_idx = avail->ring[cons] & VIOBLK_QUEUE_MASK; cmd_desc = &desc[cmd_desc_idx]; - if ((cmd_desc->flags & VRING_DESC_F_NEXT) == 0) { + if (!ISSET(cmd_desc->flags, VRING_DESC_F_NEXT)) { log_warnx("unchained vioblk cmd descriptor received " "(idx %d)", cmd_desc_idx); - goto out; + break; } /* Read command from descriptor ring */ if (read_mem(cmd_desc->addr, &cmd, cmd_desc->len)) { log_warnx("vioblk: command read_mem error @ 0x%llx", cmd_desc->addr); - goto out; + break; } switch (cmd.type) { case VIRTIO_BLK_T_IN: - /* first descriptor */ - secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; - secdata_desc = &desc[secdata_desc_idx]; - - if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) { - log_warnx("unchained vioblk data descriptor " - "received (idx %d)", cmd_desc_idx); - goto out; - } - - secbias = 0; - do { - struct ioinfo *info; - const uint8_t *secdata; - - info = vioblk_start_read(dev, cmd.sector + secbias, - (ssize_t)secdata_desc->len); - - /* read the data (use current data descriptor) */ - secdata = vioblk_finish_read(info); - if (secdata == NULL) { - vioblk_free_info(info); - log_warnx("vioblk: block read error, " - "sector %lld", cmd.sector); - goto out; - } - - if (write_mem(secdata_desc->addr, secdata, - secdata_desc->len)) { - log_warnx("can't write sector " - "data to gpa @ 0x%llx", - secdata_desc->addr); - dump_descriptor_chain(desc, cmd_desc_idx); - vioblk_free_info(info); - goto out; - } - - vioblk_free_info(info); - - secbias += (secdata_desc->len / VIRTIO_BLK_SECTOR_SIZE); - secdata_desc_idx = secdata_desc->next & - VIOBLK_QUEUE_MASK; - secdata_desc = &desc[secdata_desc_idx]; - } while (secdata_desc->flags & VRING_DESC_F_NEXT); - - ds_desc_idx = secdata_desc_idx; - ds_desc = secdata_desc; + if (vioblk_io(dev, preadv, &cmd, cmd_desc, desc) != 0) + goto fail; - ds = VIRTIO_BLK_S_OK; - if (write_mem(ds_desc->addr, &ds, ds_desc->len)) { - log_warnx("can't write device status data @ " - "0x%llx", ds_desc->addr); - dump_descriptor_chain(desc, cmd_desc_idx); - goto out; - } - - ret = 1; - dev->cfg.isr_status = 1; - used->ring[used->idx & VIOBLK_QUEUE_MASK].id = cmd_desc_idx; - used->ring[used->idx & VIOBLK_QUEUE_MASK].len = cmd_desc->len; - used->idx++; - - dev->vq[dev->cfg.queue_notify].last_avail = avail->idx & - 
VIOBLK_QUEUE_MASK; - - if (write_mem(q_gpa, vr, vr_sz)) { - log_warnx("vioblk: error writing vio ring"); - } break; case VIRTIO_BLK_T_OUT: - secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; - secdata_desc = &desc[secdata_desc_idx]; + if (vioblk_io(dev, pwritev, &cmd, cmd_desc, desc) != 0) + goto fail; - if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) { - log_warnx("wr vioblk: unchained vioblk data " - "descriptor received (idx %d)", - cmd_desc_idx); - goto out; - } - - if (secdata_desc->len > dev->max_xfer) { - log_warnx("%s: invalid read size %d requested", - __func__, secdata_desc->len); - goto out; - } - - secbias = 0; - do { - struct ioinfo *info; - - info = vioblk_start_write(dev, cmd.sector + secbias, - secdata_desc->addr, secdata_desc->len); - - if (info == NULL) { - log_warnx("wr vioblk: can't read " - "sector data @ 0x%llx", - secdata_desc->addr); - dump_descriptor_chain(desc, - cmd_desc_idx); - goto out; - } - - if (vioblk_finish_write(info)) { - log_warnx("wr vioblk: disk write " - "error"); - vioblk_free_info(info); - goto out; - } - - vioblk_free_info(info); - - secbias += secdata_desc->len / - VIRTIO_BLK_SECTOR_SIZE; - - secdata_desc_idx = secdata_desc->next & - VIOBLK_QUEUE_MASK; - secdata_desc = &desc[secdata_desc_idx]; - } while (secdata_desc->flags & VRING_DESC_F_NEXT); - - ds_desc_idx = secdata_desc_idx; - ds_desc = secdata_desc; - - ds = VIRTIO_BLK_S_OK; - if (write_mem(ds_desc->addr, &ds, ds_desc->len)) { - log_warnx("wr vioblk: can't write device " - "status data @ 0x%llx", ds_desc->addr); - dump_descriptor_chain(desc, cmd_desc_idx); - goto out; - } - - ret = 1; - dev->cfg.isr_status = 1; - used->ring[used->idx & VIOBLK_QUEUE_MASK].id = - cmd_desc_idx; - used->ring[used->idx & VIOBLK_QUEUE_MASK].len = - cmd_desc->len; - used->idx++; - - dev->vq[dev->cfg.queue_notify].last_avail = avail->idx & - VIOBLK_QUEUE_MASK; - if (write_mem(q_gpa, vr, vr_sz)) - log_warnx("wr vioblk: error writing vio ring"); break; + case VIRTIO_BLK_T_FLUSH: case VIRTIO_BLK_T_FLUSH_OUT: - ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; - ds_desc = &desc[ds_desc_idx]; - ds = VIRTIO_BLK_S_OK; - if (write_mem(ds_desc->addr, &ds, ds_desc->len)) { - log_warnx("fl vioblk: can't write device status " - "data @ 0x%llx", ds_desc->addr); - dump_descriptor_chain(desc, cmd_desc_idx); - goto out; - } + if (fsync(dev->fd) == -1) + ds = VIRTIO_BLK_S_IOERR; - ret = 1; - dev->cfg.isr_status = 1; - used->ring[used->idx & VIOBLK_QUEUE_MASK].id = - cmd_desc_idx; - used->ring[used->idx & VIOBLK_QUEUE_MASK].len = - cmd_desc->len; - used->idx++; + desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; + desc = &desc[desc_idx]; - dev->vq[dev->cfg.queue_notify].last_avail = avail->idx & - VIOBLK_QUEUE_MASK; - if (write_mem(q_gpa, vr, vr_sz)) { - log_warnx("fl vioblk: error writing vio ring"); - } + vioblk_complete(desc, ds); break; + default: log_warnx("%s: unsupported command 0x%x", __func__, cmd.type); - ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; - ds_desc = &desc[ds_desc_idx]; - - ds = VIRTIO_BLK_S_UNSUPP; - if (write_mem(ds_desc->addr, &ds, ds_desc->len)) { - log_warnx("%s: get id : can't write device " - "status data @ 0x%llx", __func__, - ds_desc->addr); - dump_descriptor_chain(desc, cmd_desc_idx); - goto out; - } - - ret = 1; - dev->cfg.isr_status = 1; - used->ring[used->idx & VIOBLK_QUEUE_MASK].id = - cmd_desc_idx; - used->ring[used->idx & VIOBLK_QUEUE_MASK].len = - cmd_desc->len; - used->idx++; - - dev->vq[dev->cfg.queue_notify].last_avail = avail->idx & - VIOBLK_QUEUE_MASK; - if (write_mem(q_gpa, vr, 
vr_sz)) { - log_warnx("%s: get id : error writing vio ring", - __func__); - } + vioblk_io_skip(desc, cmd_desc); break; } - idx = (idx + 1) & VIOBLK_QUEUE_MASK; - } -out: - free(vr); - return (ret); + used->ring[used->idx & VIOBLK_QUEUE_MASK].id = cmd_desc_idx; + used->ring[used->idx & VIOBLK_QUEUE_MASK].len = cmd_desc->len; + __asm("" ::: "memory"); + used->idx++; + + cons++; + cons &= VIOBLK_QUEUE_MASK; + } while (cons != prod); + +fail: + queue->vq.last_avail = cons; + + __asm("" ::: "memory"); +#if 1 + vcpu_assert_pic_irq(dev->vm_id, 0, dev->irq); +#else + char buf[1] = { dev - vioblk }; + write(iofd, buf, sizeof(buf)); +#endif + + dev->cfg.isr_status = 1; } int @@ -733,8 +597,12 @@ virtio_blk_io(int dir, uint16_t reg, uin break; case VIRTIO_CONFIG_QUEUE_NOTIFY: dev->cfg.queue_notify = *data; - if (vioblk_notifyq(dev)) - *intr = 1; +#if 1 + task_add(iotq, &dev->q[0].t); +// vioblk_ev(0, 0, &dev->q[0]); +#else + vioblk_q(&dev->q[0]); +#endif break; case VIRTIO_CONFIG_DEVICE_STATUS: dev->cfg.device_status = *data; @@ -748,7 +616,7 @@ virtio_blk_io(int dir, uint16_t reg, uin dev->cfg.queue_select = 0; dev->cfg.queue_notify = 0; dev->cfg.isr_status = 0; - dev->vq[0].last_avail = 0; + dev->q[0].vq.last_avail = 0; } break; default: @@ -1661,6 +1529,32 @@ vmmci_io(int dir, uint16_t reg, uint32_t return (0); } +static void +vioblk_intr(int fd, short events, void *null) +{ + char buf[64]; + ssize_t rv; + unsigned int i; + +log_warnx("%s", __func__); +intr: + rv = read(fd, buf, sizeof(buf)); + if (rv == -1) { + switch (errno) { + case EINTR: + goto intr; + default: + fatal("intr read"); + break; + } + } + for (i = 0; i < rv; i++) { + unsigned int n = buf[i]; + struct vioblk_dev *dev = &vioblk[n]; + vcpu_assert_pic_irq(dev->vm_id, 0, dev->irq); + } +} + void virtio_init(struct vmd_vm *vm, int *child_disks, int *child_taps) { @@ -1697,6 +1591,8 @@ virtio_init(struct vmd_vm *vm, int *chil + sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE)); if (vcp->vcp_ndisks > 0) { + int pfd[2]; + nr_vioblk = vcp->vcp_ndisks; vioblk = calloc(vcp->vcp_ndisks, sizeof(struct vioblk_dev)); if (vioblk == NULL) { @@ -1705,6 +1601,17 @@ virtio_init(struct vmd_vm *vm, int *chil return; } + iotq = taskq_create("iotq"); + if (iotq == NULL) + fatalx("unable to create io taskq"); + + if (pipe(pfd) == -1) + fatal("io pipe"); + + iofd = pfd[1]; + event_set(&ioev, pfd[0], EV_READ|EV_PERSIST, vioblk_intr, NULL); + event_add(&ioev, NULL); + /* One virtio block device for each disk defined in vcp */ for (i = 0; i < vcp->vcp_ndisks; i++) { if ((sz = lseek(child_disks[i], 0, SEEK_END)) == -1) @@ -1726,17 +1633,28 @@ virtio_init(struct vmd_vm *vm, int *chil "device", __progname); return; } - vioblk[i].vq[0].qs = VIOBLK_QUEUE_SIZE; - vioblk[i].vq[0].vq_availoffset = + + vioblk[i].q[0].dev = &vioblk[i]; + vioblk[i].q[0].ring = NULL; + vioblk[i].q[0].vq.qs = VIOBLK_QUEUE_SIZE; + vioblk[i].q[0].vq.vq_availoffset = sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE; - vioblk[i].vq[0].vq_usedoffset = VIRTQUEUE_ALIGN( + vioblk[i].q[0].vq.vq_usedoffset = VIRTQUEUE_ALIGN( sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE)); - vioblk[i].vq[0].last_avail = 0; + vioblk[i].q[0].vq.last_avail = 0; vioblk[i].fd = child_disks[i]; vioblk[i].sz = sz / 512; vioblk[i].cfg.device_feature = VIRTIO_BLK_F_SIZE_MAX; vioblk[i].max_xfer = 1048576; + + vioblk[i].vm_id = vcp->vcp_id; + vioblk[i].irq = pci_get_dev_irq(id); + + evtimer_set(&vioblk[0].q[0].ev, vioblk_ev, + &vioblk[0].q[0]); + + task_set(&vioblk[i].q[0].t, vioblk_q, 
&vioblk[i].q[0]); } } Index: virtio.h =================================================================== RCS file: /cvs/src/usr.sbin/vmd/virtio.h,v retrieving revision 1.20 diff -u -p -r1.20 virtio.h --- virtio.h 12 Aug 2017 20:24:57 -0000 1.20 +++ virtio.h 25 Aug 2017 02:51:41 -0000 @@ -97,16 +97,6 @@ struct viornd_dev { struct virtio_vq_info vq[VIRTIO_MAX_QUEUES]; }; -struct vioblk_dev { - struct virtio_io_cfg cfg; - - struct virtio_vq_info vq[VIRTIO_MAX_QUEUES]; - - int fd; - uint64_t sz; - uint32_t max_xfer; -}; - struct vionet_dev { pthread_mutex_t mutex; struct event event; @@ -171,13 +161,6 @@ int viornd_restore(int); void viornd_update_qs(void); void viornd_update_qa(void); int viornd_notifyq(void); - -int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); -int vioblk_dump(int); -int vioblk_restore(int, struct vm_create_params *, int *); -void vioblk_update_qs(struct vioblk_dev *); -void vioblk_update_qa(struct vioblk_dev *); -int vioblk_notifyq(struct vioblk_dev *); int virtio_net_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); int vionet_dump(int); Index: vm.c =================================================================== RCS file: /cvs/src/usr.sbin/vmd/vm.c,v retrieving revision 1.24 diff -u -p -r1.24 vm.c --- vm.c 20 Aug 2017 21:15:32 -0000 1.24 +++ vm.c 25 Aug 2017 02:51:41 -0000 @@ -1561,6 +1561,29 @@ find_gpa_range(struct vm_create_params * return (vmr); } +void * +vaddr_mem(paddr_t gpa, size_t len) +{ + struct vm_create_params *vcp = ¤t_vm->vm_params.vmc_params; + size_t i; + struct vm_mem_range *vmr; + paddr_t gpend = gpa + len; + + /* Find the first vm_mem_range that contains gpa */ + for (i = 0; i < vcp->vcp_nmemranges; i++) { + vmr = &vcp->vcp_memranges[i]; + if (gpa < vmr->vmr_gpa) + continue; + + if (gpend >= vmr->vmr_gpa + vmr->vmr_size) + continue; + + return ((char *)vmr->vmr_va + (gpa - vmr->vmr_gpa)); + } + + return (NULL); +} + /* * write_mem * @@ -1658,6 +1681,43 @@ read_mem(paddr_t src, void *buf, size_t return (0); } +int +iovec_mem(paddr_t src, size_t len, struct iovec *iov, int iovcnt) +{ + size_t n, off; + struct vm_mem_range *vmr; + int niov = 0; + + vmr = find_gpa_range(¤t_vm->vm_params.vmc_params, src, len); + if (vmr == NULL) { + errno = EINVAL; + return (-1); + } + + off = src - vmr->vmr_gpa; + while (len > 0) { + if (niov == iovcnt) { + errno = ENOMEM; + return (-1); + } + + n = vmr->vmr_size - off; + if (len < n) + n = len; + + iov[niov].iov_base = (char *)vmr->vmr_va + off; + iov[niov].iov_len = n; + + niov++; + + len -= n; + off = 0; + vmr++; + } + + return (niov); +} + /* * vcpu_assert_pic_irq * @@ -1676,12 +1736,12 @@ vcpu_assert_pic_irq(uint32_t vm_id, uint i8259_assert_irq(irq); if (i8259_is_pending()) { - if (vcpu_pic_intr(vm_id, vcpu_id, 1)) - fatalx("%s: can't assert INTR", __func__); - ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]); if (ret) fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret); + + if (vcpu_pic_intr(vm_id, vcpu_id, 1)) + fatalx("%s: can't assert INTR", __func__); vcpu_hlt[vcpu_id] = 0; ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]); Index: vmd.h =================================================================== RCS file: /cvs/src/usr.sbin/vmd/vmd.h,v retrieving revision 1.60 diff -u -p -r1.60 vmd.h --- vmd.h 20 Aug 2017 21:15:32 -0000 1.60 +++ vmd.h 25 Aug 2017 02:51:41 -0000 @@ -35,6 +35,10 @@ #ifndef VMD_H #define VMD_H +#define SET(_v, _m) ((_v) |= (_m)) +#define CLR(_v, _m) ((_v) &= ~(_m)) +#define ISSET(_v, _m) ((_v) & (_m)) + #define VMD_USER "_vmd" #define VMD_CONF 
"/etc/vm.conf" #define SOCKET_NAME "/var/run/vmd.sock" @@ -320,10 +324,14 @@ int vm_priv_brconfig(struct privsep *, uint32_t vm_priv_addr(struct address *, uint32_t, int, int); /* vmm.c */ +struct iovec; + void vmm(struct privsep *, struct privsep_proc *); void vmm_shutdown(void); +void *vaddr_mem(paddr_t, size_t); int write_mem(paddr_t, const void *buf, size_t); int read_mem(paddr_t, void *buf, size_t); +int iovec_mem(paddr_t, size_t, struct iovec *, int); int opentap(char *); int fd_hasdata(int); void mutex_lock(pthread_mutex_t *);