From: Michael S. Tsirkin on
On Tue, Apr 06, 2010 at 01:46:56PM +0800, Xin, Xiaohui wrote:
> Michael,
> > >>> For the write logging, do you have a function in hand that we can
> > >>> recompute the log? If that, I think I can use it to recompute the
> > >>>log info when the logging is suddenly enabled.
> > >>> For the outstanding requests, do you mean all the user buffers have
> > >>>submitted before the logging ioctl changed? That may be a lot, and
> > >> >some of them are still in NIC ring descriptors. Waiting them to be
> > >>>finished may be need some time. I think when logging ioctl changed,
> > >> >then the logging is changed just after that is also reasonable.
>
> > >>The key point is that after loggin ioctl returns, any
> > >>subsequent change to memory must be logged. It does not
> > >>matter when was the request submitted, otherwise we will
> > >>get memory corruption on migration.
>
> > >The change to memory happens when vhost_add_used_and_signal(), right?
> > >So after ioctl returns, just recompute the log info to the events in the async queue,
> > >is ok. Since the ioctl and write log operations are all protected by vq->mutex.
>
> >> Thanks
> >> Xiaohui
>
> >Yes, I think this will work.
>
> Thanks, so do you have the function to recompute the log info in your hand that I can
> use? I have weakly remembered that you have noticed it before some time.

Doesn't just rerunning vhost_get_vq_desc work?

> > > Thanks
> > > Xiaohui
> > >
> > > drivers/vhost/net.c | 189 +++++++++++++++++++++++++++++++++++++++++++++++--
> > > drivers/vhost/vhost.h | 10 +++
> > > 2 files changed, 192 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index 22d5fef..2aafd90 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -17,11 +17,13 @@
> > > #include <linux/workqueue.h>
> > > #include <linux/rcupdate.h>
> > > #include <linux/file.h>
> > > +#include <linux/aio.h>
> > >
> > > #include <linux/net.h>
> > > #include <linux/if_packet.h>
> > > #include <linux/if_arp.h>
> > > #include <linux/if_tun.h>
> > > +#include <linux/mpassthru.h>
> > >
> > > #include <net/sock.h>
> > >
> > > @@ -47,6 +49,7 @@ struct vhost_net {
> > > struct vhost_dev dev;
> > > struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> > > struct vhost_poll poll[VHOST_NET_VQ_MAX];
> > > + struct kmem_cache *cache;
> > > /* Tells us whether we are polling a socket for TX.
> > > * We only do this when socket buffer fills up.
> > > * Protected by tx vq lock. */
> > > @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
> > > net->tx_poll_state = VHOST_NET_POLL_STARTED;
> > > }
> > >
> > > +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + unsigned long flags;
> > > +
> > > + spin_lock_irqsave(&vq->notify_lock, flags);
> > > + if (!list_empty(&vq->notifier)) {
> > > + iocb = list_first_entry(&vq->notifier,
> > > + struct kiocb, ki_list);
> > > + list_del(&iocb->ki_list);
> > > + }
> > > + spin_unlock_irqrestore(&vq->notify_lock, flags);
> > > + return iocb;
> > > +}
> > > +
> > > +static void handle_async_rx_events_notify(struct vhost_net *net,
> > > + struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + struct vhost_log *vq_log = NULL;
> > > + int rx_total_len = 0;
> > > + int log, size;
> > > +
> > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > + return;
> > > +
> > > + if (vq->receiver)
> > > + vq->receiver(vq);
> > > +
> > > + vq_log = unlikely(vhost_has_feature(
> > > + &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > + vhost_add_used_and_signal(&net->dev, vq,
> > > + iocb->ki_pos, iocb->ki_nbytes);
> > > + log = (int)iocb->ki_user_data;
> > > + size = iocb->ki_nbytes;
> > > + rx_total_len += iocb->ki_nbytes;
> > > +
> > > + if (iocb->ki_dtor)
> > > + iocb->ki_dtor(iocb);
> > > + kmem_cache_free(net->cache, iocb);
> > > +
> > > + if (unlikely(vq_log))
> > > + vhost_log_write(vq, vq_log, log, size);
> > > + if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> > > + vhost_poll_queue(&vq->poll);
> > > + break;
> > > + }
> > > + }
> > > +}
> > > +
> > > +static void handle_async_tx_events_notify(struct vhost_net *net,
> > > + struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + int tx_total_len = 0;
> > > +
> > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > + return;
> > > +
> > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > + vhost_add_used_and_signal(&net->dev, vq,
> > > + iocb->ki_pos, 0);
> > > + tx_total_len += iocb->ki_nbytes;
> > > +
> > > + if (iocb->ki_dtor)
> > > + iocb->ki_dtor(iocb);
> > > +
> > > + kmem_cache_free(net->cache, iocb);
> > > + if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
> > > + vhost_poll_queue(&vq->poll);
> > > + break;
> > > + }
> > > + }
> > > +}
> > > +
> > > /* Expects to be always run from workqueue - which acts as
> > > * read-size critical section for our kind of RCU. */
> > > static void handle_tx(struct vhost_net *net)
> > > {
> > > struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> > > + struct kiocb *iocb = NULL;
> > > unsigned head, out, in, s;
> > > struct msghdr msg = {
> > > .msg_name = NULL,
> > > @@ -124,6 +204,8 @@ static void handle_tx(struct vhost_net *net)
> > > tx_poll_stop(net);
> > > hdr_size = vq->hdr_size;
> > >
> > > + handle_async_tx_events_notify(net, vq);
> > > +
> > > for (;;) {
> > > head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > > ARRAY_SIZE(vq->iov),
> > > @@ -151,6 +233,15 @@ static void handle_tx(struct vhost_net *net)
> > > /* Skip header. TODO: support TSO. */
> > > s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
> > > msg.msg_iovlen = out;
> > > +
> > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > + iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> > > + if (!iocb)
> > > + break;
> > > + iocb->ki_pos = head;
> > > + iocb->private = (void *)vq;
> > > + }
> > > +
> > > len = iov_length(vq->iov, out);
> > > /* Sanity check */
> > > if (!len) {
> > > @@ -160,12 +251,16 @@ static void handle_tx(struct vhost_net *net)
> > > break;
> > > }
> > > /* TODO: Check specific error and bomb out unless ENOBUFS? */
> > > - err = sock->ops->sendmsg(NULL, sock, &msg, len);
> > > + err = sock->ops->sendmsg(iocb, sock, &msg, len);
> > > if (unlikely(err < 0)) {
> > > vhost_discard_vq_desc(vq);
> > > tx_poll_start(net, sock);
> > > break;
> > > }
> > > +
> > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> > > + continue;
> > > +
> > > if (err != len)
> > > pr_err("Truncated TX packet: "
> > > " len %d != %zd\n", err, len);
> > > @@ -177,6 +272,8 @@ static void handle_tx(struct vhost_net *net)
> > > }
> > > }
> > >
> > > + handle_async_tx_events_notify(net, vq);
> > > +
> > > mutex_unlock(&vq->mutex);
> > > unuse_mm(net->dev.mm);
> > > }
> > > @@ -186,6 +283,7 @@ static void handle_tx(struct vhost_net *net)
> > > static void handle_rx(struct vhost_net *net)
> > > {
> > > struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> > > + struct kiocb *iocb = NULL;
> > > unsigned head, out, in, log, s;
> > > struct vhost_log *vq_log;
> > > struct msghdr msg = {
> > > @@ -206,7 +304,8 @@ static void handle_rx(struct vhost_net *net)
> > > int err;
> > > size_t hdr_size;
> > > struct socket *sock = rcu_dereference(vq->private_data);
> > > - if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> > > + if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
> > > + vq->link_state == VHOST_VQ_LINK_SYNC))
> > > return;
> > >
> > > use_mm(net->dev.mm);
> > > @@ -214,9 +313,18 @@ static void handle_rx(struct vhost_net *net)
> > > vhost_disable_notify(vq);
> > > hdr_size = vq->hdr_size;
> > >
> > > - vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> > > + /* In async cases, for write logging, the simple way is to get
> > > + * the log info always, and really logging is decided later.
> > > + * Thus, when logging enabled, we can get log, and when logging
> > > + * disabled, we can get log disabled accordingly.
> > > + */
> > > +
> > > + vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
> > > + (vq->link_state == VHOST_VQ_LINK_ASYNC) ?
> > > vq->log : NULL;
> > >
> > > + handle_async_rx_events_notify(net, vq);
> > > +
> > > for (;;) {
> > > head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > > ARRAY_SIZE(vq->iov),
> > > @@ -245,6 +353,14 @@ static void handle_rx(struct vhost_net *net)
> > > s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> > > msg.msg_iovlen = in;
> > > len = iov_length(vq->iov, in);
> > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > + iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> > > + if (!iocb)
> > > + break;
> > > + iocb->private = vq;
> > > + iocb->ki_pos = head;
> > > + iocb->ki_user_data = log;
> > > + }
> > > /* Sanity check */
> > > if (!len) {
> > > vq_err(vq, "Unexpected header len for RX: "
> > > @@ -252,13 +368,18 @@ static void handle_rx(struct vhost_net *net)
> > > iov_length(vq->hdr, s), hdr_size);
> > > break;
> > > }
> > > - err = sock->ops->recvmsg(NULL, sock, &msg,
> > > +
> > > + err = sock->ops->recvmsg(iocb, sock, &msg,
> > > len, MSG_DONTWAIT | MSG_TRUNC);
> > > /* TODO: Check specific error and bomb out unless EAGAIN? */
> > > if (err < 0) {
> > > vhost_discard_vq_desc(vq);
> > > break;
> > > }
> > > +
> > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> > > + continue;
> > > +
> > > /* TODO: Should check and handle checksum. */
> > > if (err > len) {
> > > pr_err("Discarded truncated rx packet: "
> > > @@ -284,10 +405,13 @@ static void handle_rx(struct vhost_net *net)
> > > }
> > > }
> > >
> > > + handle_async_rx_events_notify(net, vq);
> > > +
> > > mutex_unlock(&vq->mutex);
> > > unuse_mm(net->dev.mm);
> > > }
> > >
> > > +
> > > static void handle_tx_kick(struct work_struct *work)
> > > {
> > > struct vhost_virtqueue *vq;
> > > @@ -338,6 +462,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
> > > vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
> > > vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
> > > n->tx_poll_state = VHOST_NET_POLL_DISABLED;
> > > + n->cache = NULL;
> > > return 0;
> > > }
> > >
> > > @@ -398,6 +523,17 @@ static void vhost_net_flush(struct vhost_net *n)
> > > vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
> > > }
> > >
> > > +static void vhost_notifier_cleanup(struct vhost_net *n)
> > > +{
> > > + struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
> > > + struct kiocb *iocb = NULL;
> > > + if (n->cache) {
> > > + while ((iocb = notify_dequeue(vq)) != NULL)
> > > + kmem_cache_free(n->cache, iocb);
> > > + kmem_cache_destroy(n->cache);
> > > + }
> > > +}
> > > +
> > > static int vhost_net_release(struct inode *inode, struct file *f)
> > > {
> > > struct vhost_net *n = f->private_data;
> > > @@ -414,6 +550,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
> > > /* We do an extra flush before freeing memory,
> > > * since jobs can re-queue themselves. */
> > > vhost_net_flush(n);
> > > + vhost_notifier_cleanup(n);
> > > kfree(n);
> > > return 0;
> > > }
> > > @@ -462,7 +599,19 @@ static struct socket *get_tun_socket(int fd)
> > > return sock;
> > > }
> > >
> > > -static struct socket *get_socket(int fd)
> > > +static struct socket *get_mp_socket(int fd)
> > > +{
> > > + struct file *file = fget(fd);
> > > + struct socket *sock;
> > > + if (!file)
> > > + return ERR_PTR(-EBADF);
> > > + sock = mp_get_socket(file);
> > > + if (IS_ERR(sock))
> > > + fput(file);
> > > + return sock;
> > > +}
> > > +
> > > +static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
> > > {
> > > struct socket *sock;
> > > if (fd == -1)
> > > @@ -473,9 +622,31 @@ static struct socket *get_socket(int fd)
> > > sock = get_tun_socket(fd);
> > > if (!IS_ERR(sock))
> > > return sock;
> > > + sock = get_mp_socket(fd);
> > > + if (!IS_ERR(sock)) {
> > > + vq->link_state = VHOST_VQ_LINK_ASYNC;
> > > + return sock;
> > > + }
> > > return ERR_PTR(-ENOTSOCK);
> > > }
> > >
> > > +static void vhost_init_link_state(struct vhost_net *n, int index)
> > > +{
> > > + struct vhost_virtqueue *vq = n->vqs + index;
> > > +
> > > + WARN_ON(!mutex_is_locked(&vq->mutex));
> > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > + vq->receiver = NULL;
> > > + INIT_LIST_HEAD(&vq->notifier);
> > > + spin_lock_init(&vq->notify_lock);
> > > + if (!n->cache) {
> > > + n->cache = kmem_cache_create("vhost_kiocb",
> > > + sizeof(struct kiocb), 0,
> > > + SLAB_HWCACHE_ALIGN, NULL);
> > > + }
> > > + }
> > > +}
> > > +
> > > static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > {
> > > struct socket *sock, *oldsock;
> > > @@ -493,12 +664,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > }
> > > vq = n->vqs + index;
> > > mutex_lock(&vq->mutex);
> > > - sock = get_socket(fd);
> > > + vq->link_state = VHOST_VQ_LINK_SYNC;
> > > + sock = get_socket(vq, fd);
> > > if (IS_ERR(sock)) {
> > > r = PTR_ERR(sock);
> > > goto err;
> > > }
> > >
> > > + vhost_init_link_state(n, index);
> > > +
> > > /* start polling new socket */
> > > oldsock = vq->private_data;
> > > if (sock == oldsock)
> > > @@ -507,8 +681,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > vhost_net_disable_vq(n, vq);
> > > rcu_assign_pointer(vq->private_data, sock);
> > > vhost_net_enable_vq(n, vq);
> > > - mutex_unlock(&vq->mutex);
> > > done:
> > > + mutex_unlock(&vq->mutex);
> > > mutex_unlock(&n->dev.mutex);
> > > if (oldsock) {
> > > vhost_net_flush_vq(n, index);
> > > @@ -516,6 +690,7 @@ done:
> > > }
> > > return r;
> > > err:
> > > + mutex_unlock(&vq->mutex);
> > > mutex_unlock(&n->dev.mutex);
> > > return r;
> > > }
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index d1f0453..cffe39a 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -43,6 +43,11 @@ struct vhost_log {
> > > u64 len;
> > > };
> > >
> > > +enum vhost_vq_link_state {
> > > + VHOST_VQ_LINK_SYNC = 0,
> > > + VHOST_VQ_LINK_ASYNC = 1,
> > > +};
> > > +
> > > /* The virtqueue structure describes a queue attached to a device. */
> > > struct vhost_virtqueue {
> > > struct vhost_dev *dev;
> > > @@ -96,6 +101,11 @@ struct vhost_virtqueue {
> > > /* Log write descriptors */
> > > void __user *log_base;
> > > struct vhost_log log[VHOST_NET_MAX_SG];
> > > + /*Differiate async socket for 0-copy from normal*/
> > > + enum vhost_vq_link_state link_state;
> > > + struct list_head notifier;
> > > + spinlock_t notify_lock;
> > > + void (*receiver)(struct vhost_virtqueue *);
> > > };
> > >
> > > struct vhost_dev {
> > > --
> > > 1.5.4.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Xin, Xiaohui on
Michael,
> > >>>> For the write logging, do you have a function in hand that we can
> > >>>> recompute the log? If that, I think I can use it to recompute the
> > >>>>log info when the logging is suddenly enabled.
> > >>>> For the outstanding requests, do you mean all the user buffers have
> > >>>>submitted before the logging ioctl changed? That may be a lot, and
> > >> >>some of them are still in NIC ring descriptors. Waiting them to be
> > >>>>finished may be need some time. I think when logging ioctl changed,
> > >> >>then the logging is changed just after that is also reasonable.

> > >>>The key point is that after loggin ioctl returns, any
> > >>>subsequent change to memory must be logged. It does not
> > >>>matter when was the request submitted, otherwise we will
> > >>>get memory corruption on migration.

> > >>The change to memory happens when vhost_add_used_and_signal(), right?
> > >>So after ioctl returns, just recompute the log info to the events in the async queue,
> > >>is ok. Since the ioctl and write log operations are all protected by vq->mutex.

> >>> Thanks
> >> >Xiaohui

> >>Yes, I think this will work.

>> Thanks, so do you have the function to recompute the log info in your hand that I can
>>use? I have weakly remembered that you have noticed it before some time.

>Doesn't just rerunning vhost_get_vq_desc work?

Am I missing something here?
The vhost_get_vq_desc() looks in vq, and finds the first available buffers, and converts it
to an iovec. I think the first available buffer is not the buffers in the async queue, so I
think rerunning vhost_get_vq_desc() cannot work.

Thanks
Xiaohui

> > > Thanks
> > > Xiaohui
> > >
> > > drivers/vhost/net.c | 189 +++++++++++++++++++++++++++++++++++++++++++++++--
> > > drivers/vhost/vhost.h | 10 +++
> > > 2 files changed, 192 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index 22d5fef..2aafd90 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -17,11 +17,13 @@
> > > #include <linux/workqueue.h>
> > > #include <linux/rcupdate.h>
> > > #include <linux/file.h>
> > > +#include <linux/aio.h>
> > >
> > > #include <linux/net.h>
> > > #include <linux/if_packet.h>
> > > #include <linux/if_arp.h>
> > > #include <linux/if_tun.h>
> > > +#include <linux/mpassthru.h>
> > >
> > > #include <net/sock.h>
> > >
> > > @@ -47,6 +49,7 @@ struct vhost_net {
> > > struct vhost_dev dev;
> > > struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> > > struct vhost_poll poll[VHOST_NET_VQ_MAX];
> > > + struct kmem_cache *cache;
> > > /* Tells us whether we are polling a socket for TX.
> > > * We only do this when socket buffer fills up.
> > > * Protected by tx vq lock. */
> > > @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
> > > net->tx_poll_state = VHOST_NET_POLL_STARTED;
> > > }
> > >
> > > +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + unsigned long flags;
> > > +
> > > + spin_lock_irqsave(&vq->notify_lock, flags);
> > > + if (!list_empty(&vq->notifier)) {
> > > + iocb = list_first_entry(&vq->notifier,
> > > + struct kiocb, ki_list);
> > > + list_del(&iocb->ki_list);
> > > + }
> > > + spin_unlock_irqrestore(&vq->notify_lock, flags);
> > > + return iocb;
> > > +}
> > > +
> > > +static void handle_async_rx_events_notify(struct vhost_net *net,
> > > + struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + struct vhost_log *vq_log = NULL;
> > > + int rx_total_len = 0;
> > > + int log, size;
> > > +
> > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > + return;
> > > +
> > > + if (vq->receiver)
> > > + vq->receiver(vq);
> > > +
> > > + vq_log = unlikely(vhost_has_feature(
> > > + &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > + vhost_add_used_and_signal(&net->dev, vq,
> > > + iocb->ki_pos, iocb->ki_nbytes);
> > > + log = (int)iocb->ki_user_data;
> > > + size = iocb->ki_nbytes;
> > > + rx_total_len += iocb->ki_nbytes;
> > > +
> > > + if (iocb->ki_dtor)
> > > + iocb->ki_dtor(iocb);
> > > + kmem_cache_free(net->cache, iocb);
> > > +
> > > + if (unlikely(vq_log))
> > > + vhost_log_write(vq, vq_log, log, size);
> > > + if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> > > + vhost_poll_queue(&vq->poll);
> > > + break;
> > > + }
> > > + }
> > > +}
> > > +
> > > +static void handle_async_tx_events_notify(struct vhost_net *net,
> > > + struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + int tx_total_len = 0;
> > > +
> > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > + return;
> > > +
> > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > + vhost_add_used_and_signal(&net->dev, vq,
> > > + iocb->ki_pos, 0);
> > > + tx_total_len += iocb->ki_nbytes;
> > > +
> > > + if (iocb->ki_dtor)
> > > + iocb->ki_dtor(iocb);
> > > +
> > > + kmem_cache_free(net->cache, iocb);
> > > + if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
> > > + vhost_poll_queue(&vq->poll);
> > > + break;
> > > + }
> > > + }
> > > +}
> > > +
> > > /* Expects to be always run from workqueue - which acts as
> > > * read-size critical section for our kind of RCU. */
> > > static void handle_tx(struct vhost_net *net)
> > > {
> > > struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> > > + struct kiocb *iocb = NULL;
> > > unsigned head, out, in, s;
> > > struct msghdr msg = {
> > > .msg_name = NULL,
> > > @@ -124,6 +204,8 @@ static void handle_tx(struct vhost_net *net)
> > > tx_poll_stop(net);
> > > hdr_size = vq->hdr_size;
> > >
> > > + handle_async_tx_events_notify(net, vq);
> > > +
> > > for (;;) {
> > > head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > > ARRAY_SIZE(vq->iov),
> > > @@ -151,6 +233,15 @@ static void handle_tx(struct vhost_net *net)
> > > /* Skip header. TODO: support TSO. */
> > > s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
> > > msg.msg_iovlen = out;
> > > +
> > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > + iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> > > + if (!iocb)
> > > + break;
> > > + iocb->ki_pos = head;
> > > + iocb->private = (void *)vq;
> > > + }
> > > +
> > > len = iov_length(vq->iov, out);
> > > /* Sanity check */
> > > if (!len) {
> > > @@ -160,12 +251,16 @@ static void handle_tx(struct vhost_net *net)
> > > break;
> > > }
> > > /* TODO: Check specific error and bomb out unless ENOBUFS? */
> > > - err = sock->ops->sendmsg(NULL, sock, &msg, len);
> > > + err = sock->ops->sendmsg(iocb, sock, &msg, len);
> > > if (unlikely(err < 0)) {
> > > vhost_discard_vq_desc(vq);
> > > tx_poll_start(net, sock);
> > > break;
> > > }
> > > +
> > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> > > + continue;
> > > +
> > > if (err != len)
> > > pr_err("Truncated TX packet: "
> > > " len %d != %zd\n", err, len);
> > > @@ -177,6 +272,8 @@ static void handle_tx(struct vhost_net *net)
> > > }
> > > }
> > >
> > > + handle_async_tx_events_notify(net, vq);
> > > +
> > > mutex_unlock(&vq->mutex);
> > > unuse_mm(net->dev.mm);
> > > }
> > > @@ -186,6 +283,7 @@ static void handle_tx(struct vhost_net *net)
> > > static void handle_rx(struct vhost_net *net)
> > > {
> > > struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> > > + struct kiocb *iocb = NULL;
> > > unsigned head, out, in, log, s;
> > > struct vhost_log *vq_log;
> > > struct msghdr msg = {
> > > @@ -206,7 +304,8 @@ static void handle_rx(struct vhost_net *net)
> > > int err;
> > > size_t hdr_size;
> > > struct socket *sock = rcu_dereference(vq->private_data);
> > > - if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> > > + if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
> > > + vq->link_state == VHOST_VQ_LINK_SYNC))
> > > return;
> > >
> > > use_mm(net->dev.mm);
> > > @@ -214,9 +313,18 @@ static void handle_rx(struct vhost_net *net)
> > > vhost_disable_notify(vq);
> > > hdr_size = vq->hdr_size;
> > >
> > > - vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> > > + /* In async cases, for write logging, the simple way is to get
> > > + * the log info always, and really logging is decided later.
> > > + * Thus, when logging enabled, we can get log, and when logging
> > > + * disabled, we can get log disabled accordingly.
> > > + */
> > > +
> > > + vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
> > > + (vq->link_state == VHOST_VQ_LINK_ASYNC) ?
> > > vq->log : NULL;
> > >
> > > + handle_async_rx_events_notify(net, vq);
> > > +
> > > for (;;) {
> > > head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > > ARRAY_SIZE(vq->iov),
> > > @@ -245,6 +353,14 @@ static void handle_rx(struct vhost_net *net)
> > > s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> > > msg.msg_iovlen = in;
> > > len = iov_length(vq->iov, in);
> > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > + iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> > > + if (!iocb)
> > > + break;
> > > + iocb->private = vq;
> > > + iocb->ki_pos = head;
> > > + iocb->ki_user_data = log;
> > > + }
> > > /* Sanity check */
> > > if (!len) {
> > > vq_err(vq, "Unexpected header len for RX: "
> > > @@ -252,13 +368,18 @@ static void handle_rx(struct vhost_net *net)
> > > iov_length(vq->hdr, s), hdr_size);
> > > break;
> > > }
> > > - err = sock->ops->recvmsg(NULL, sock, &msg,
> > > +
> > > + err = sock->ops->recvmsg(iocb, sock, &msg,
> > > len, MSG_DONTWAIT | MSG_TRUNC);
> > > /* TODO: Check specific error and bomb out unless EAGAIN? */
> > > if (err < 0) {
> > > vhost_discard_vq_desc(vq);
> > > break;
> > > }
> > > +
> > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> > > + continue;
> > > +
> > > /* TODO: Should check and handle checksum. */
> > > if (err > len) {
> > > pr_err("Discarded truncated rx packet: "
> > > @@ -284,10 +405,13 @@ static void handle_rx(struct vhost_net *net)
> > > }
> > > }
> > >
> > > + handle_async_rx_events_notify(net, vq);
> > > +
> > > mutex_unlock(&vq->mutex);
> > > unuse_mm(net->dev.mm);
> > > }
> > >
> > > +
> > > static void handle_tx_kick(struct work_struct *work)
> > > {
> > > struct vhost_virtqueue *vq;
> > > @@ -338,6 +462,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
> > > vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
> > > vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
> > > n->tx_poll_state = VHOST_NET_POLL_DISABLED;
> > > + n->cache = NULL;
> > > return 0;
> > > }
> > >
> > > @@ -398,6 +523,17 @@ static void vhost_net_flush(struct vhost_net *n)
> > > vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
> > > }
> > >
> > > +static void vhost_notifier_cleanup(struct vhost_net *n)
> > > +{
> > > + struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
> > > + struct kiocb *iocb = NULL;
> > > + if (n->cache) {
> > > + while ((iocb = notify_dequeue(vq)) != NULL)
> > > + kmem_cache_free(n->cache, iocb);
> > > + kmem_cache_destroy(n->cache);
> > > + }
> > > +}
> > > +
> > > static int vhost_net_release(struct inode *inode, struct file *f)
> > > {
> > > struct vhost_net *n = f->private_data;
> > > @@ -414,6 +550,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
> > > /* We do an extra flush before freeing memory,
> > > * since jobs can re-queue themselves. */
> > > vhost_net_flush(n);
> > > + vhost_notifier_cleanup(n);
> > > kfree(n);
> > > return 0;
> > > }
> > > @@ -462,7 +599,19 @@ static struct socket *get_tun_socket(int fd)
> > > return sock;
> > > }
> > >
> > > -static struct socket *get_socket(int fd)
> > > +static struct socket *get_mp_socket(int fd)
> > > +{
> > > + struct file *file = fget(fd);
> > > + struct socket *sock;
> > > + if (!file)
> > > + return ERR_PTR(-EBADF);
> > > + sock = mp_get_socket(file);
> > > + if (IS_ERR(sock))
> > > + fput(file);
> > > + return sock;
> > > +}
> > > +
> > > +static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
> > > {
> > > struct socket *sock;
> > > if (fd == -1)
> > > @@ -473,9 +622,31 @@ static struct socket *get_socket(int fd)
> > > sock = get_tun_socket(fd);
> > > if (!IS_ERR(sock))
> > > return sock;
> > > + sock = get_mp_socket(fd);
> > > + if (!IS_ERR(sock)) {
> > > + vq->link_state = VHOST_VQ_LINK_ASYNC;
> > > + return sock;
> > > + }
> > > return ERR_PTR(-ENOTSOCK);
> > > }
> > >
> > > +static void vhost_init_link_state(struct vhost_net *n, int index)
> > > +{
> > > + struct vhost_virtqueue *vq = n->vqs + index;
> > > +
> > > + WARN_ON(!mutex_is_locked(&vq->mutex));
> > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > + vq->receiver = NULL;
> > > + INIT_LIST_HEAD(&vq->notifier);
> > > + spin_lock_init(&vq->notify_lock);
> > > + if (!n->cache) {
> > > + n->cache = kmem_cache_create("vhost_kiocb",
> > > + sizeof(struct kiocb), 0,
> > > + SLAB_HWCACHE_ALIGN, NULL);
> > > + }
> > > + }
> > > +}
> > > +
> > > static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > {
> > > struct socket *sock, *oldsock;
> > > @@ -493,12 +664,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > }
> > > vq = n->vqs + index;
> > > mutex_lock(&vq->mutex);
> > > - sock = get_socket(fd);
> > > + vq->link_state = VHOST_VQ_LINK_SYNC;
> > > + sock = get_socket(vq, fd);
> > > if (IS_ERR(sock)) {
> > > r = PTR_ERR(sock);
> > > goto err;
> > > }
> > >
> > > + vhost_init_link_state(n, index);
> > > +
> > > /* start polling new socket */
> > > oldsock = vq->private_data;
> > > if (sock == oldsock)
> > > @@ -507,8 +681,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > vhost_net_disable_vq(n, vq);
> > > rcu_assign_pointer(vq->private_data, sock);
> > > vhost_net_enable_vq(n, vq);
> > > - mutex_unlock(&vq->mutex);
> > > done:
> > > + mutex_unlock(&vq->mutex);
> > > mutex_unlock(&n->dev.mutex);
> > > if (oldsock) {
> > > vhost_net_flush_vq(n, index);
> > > @@ -516,6 +690,7 @@ done:
> > > }
> > > return r;
> > > err:
> > > + mutex_unlock(&vq->mutex);
> > > mutex_unlock(&n->dev.mutex);
> > > return r;
> > > }
> > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > index d1f0453..cffe39a 100644
> > > --- a/drivers/vhost/vhost.h
> > > +++ b/drivers/vhost/vhost.h
> > > @@ -43,6 +43,11 @@ struct vhost_log {
> > > u64 len;
> > > };
> > >
> > > +enum vhost_vq_link_state {
> > > + VHOST_VQ_LINK_SYNC = 0,
> > > + VHOST_VQ_LINK_ASYNC = 1,
> > > +};
> > > +
> > > /* The virtqueue structure describes a queue attached to a device. */
> > > struct vhost_virtqueue {
> > > struct vhost_dev *dev;
> > > @@ -96,6 +101,11 @@ struct vhost_virtqueue {
> > > /* Log write descriptors */
> > > void __user *log_base;
> > > struct vhost_log log[VHOST_NET_MAX_SG];
> > > + /*Differiate async socket for 0-copy from normal*/
> > > + enum vhost_vq_link_state link_state;
> > > + struct list_head notifier;
> > > + spinlock_t notify_lock;
> > > + void (*receiver)(struct vhost_virtqueue *);
> > > };
> > >
> > > struct vhost_dev {
> > > --
> > > 1.5.4.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Michael S. Tsirkin on
On Wed, Apr 07, 2010 at 09:36:36AM +0800, Xin, Xiaohui wrote:
> Michael,
> > > >>>> For the write logging, do you have a function in hand that we can
> > > >>>> recompute the log? If that, I think I can use it to recompute the
> > > >>>>log info when the logging is suddenly enabled.
> > > >>>> For the outstanding requests, do you mean all the user buffers have
> > > >>>>submitted before the logging ioctl changed? That may be a lot, and
> > > >> >>some of them are still in NIC ring descriptors. Waiting them to be
> > > >>>>finished may be need some time. I think when logging ioctl changed,
> > > >> >>then the logging is changed just after that is also reasonable.
>
> > > >>>The key point is that after loggin ioctl returns, any
> > > >>>subsequent change to memory must be logged. It does not
> > > >>>matter when was the request submitted, otherwise we will
> > > >>>get memory corruption on migration.
>
> > > >>The change to memory happens when vhost_add_used_and_signal(), right?
> > > >>So after ioctl returns, just recompute the log info to the events in the async queue,
> > > >>is ok. Since the ioctl and write log operations are all protected by vq->mutex.
>
> > >>> Thanks
> > >> >Xiaohui
>
> > >>Yes, I think this will work.
>
> >> Thanks, so do you have the function to recompute the log info in your hand that I can
> >>use? I have weakly remembered that you have noticed it before some time.
>
> >Doesn't just rerunning vhost_get_vq_desc work?
>
> Am I missing something here?
> The vhost_get_vq_desc() looks in vq, and finds the first available buffers, and converts it
> to an iovec. I think the first available buffer is not the buffers in the async queue, so I
> think rerunning vhost_get_vq_desc() cannot work.
>
> Thanks
> Xiaohui

Right, but we can move the head back, so we'll find the same buffers
again, or add a variant of vhost_get_vq_desc that will process
descriptors already consumed.

> > > > Thanks
> > > > Xiaohui
> > > >
> > > > drivers/vhost/net.c | 189 +++++++++++++++++++++++++++++++++++++++++++++++--
> > > > drivers/vhost/vhost.h | 10 +++
> > > > 2 files changed, 192 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > > index 22d5fef..2aafd90 100644
> > > > --- a/drivers/vhost/net.c
> > > > +++ b/drivers/vhost/net.c
> > > > @@ -17,11 +17,13 @@
> > > > #include <linux/workqueue.h>
> > > > #include <linux/rcupdate.h>
> > > > #include <linux/file.h>
> > > > +#include <linux/aio.h>
> > > >
> > > > #include <linux/net.h>
> > > > #include <linux/if_packet.h>
> > > > #include <linux/if_arp.h>
> > > > #include <linux/if_tun.h>
> > > > +#include <linux/mpassthru.h>
> > > >
> > > > #include <net/sock.h>
> > > >
> > > > @@ -47,6 +49,7 @@ struct vhost_net {
> > > > struct vhost_dev dev;
> > > > struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> > > > struct vhost_poll poll[VHOST_NET_VQ_MAX];
> > > > + struct kmem_cache *cache;
> > > > /* Tells us whether we are polling a socket for TX.
> > > > * We only do this when socket buffer fills up.
> > > > * Protected by tx vq lock. */
> > > > @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
> > > > net->tx_poll_state = VHOST_NET_POLL_STARTED;
> > > > }
> > > >
> > > > +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> > > > +{
> > > > + struct kiocb *iocb = NULL;
> > > > + unsigned long flags;
> > > > +
> > > > + spin_lock_irqsave(&vq->notify_lock, flags);
> > > > + if (!list_empty(&vq->notifier)) {
> > > > + iocb = list_first_entry(&vq->notifier,
> > > > + struct kiocb, ki_list);
> > > > + list_del(&iocb->ki_list);
> > > > + }
> > > > + spin_unlock_irqrestore(&vq->notify_lock, flags);
> > > > + return iocb;
> > > > +}
> > > > +
> > > > +static void handle_async_rx_events_notify(struct vhost_net *net,
> > > > + struct vhost_virtqueue *vq)
> > > > +{
> > > > + struct kiocb *iocb = NULL;
> > > > + struct vhost_log *vq_log = NULL;
> > > > + int rx_total_len = 0;
> > > > + int log, size;
> > > > +
> > > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > > + return;
> > > > +
> > > > + if (vq->receiver)
> > > > + vq->receiver(vq);
> > > > +
> > > > + vq_log = unlikely(vhost_has_feature(
> > > > + &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> > > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > > + vhost_add_used_and_signal(&net->dev, vq,
> > > > + iocb->ki_pos, iocb->ki_nbytes);
> > > > + log = (int)iocb->ki_user_data;
> > > > + size = iocb->ki_nbytes;
> > > > + rx_total_len += iocb->ki_nbytes;
> > > > +
> > > > + if (iocb->ki_dtor)
> > > > + iocb->ki_dtor(iocb);
> > > > + kmem_cache_free(net->cache, iocb);
> > > > +
> > > > + if (unlikely(vq_log))
> > > > + vhost_log_write(vq, vq_log, log, size);
> > > > + if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> > > > + vhost_poll_queue(&vq->poll);
> > > > + break;
> > > > + }
> > > > + }
> > > > +}
> > > > +
> > > > +static void handle_async_tx_events_notify(struct vhost_net *net,
> > > > + struct vhost_virtqueue *vq)
> > > > +{
> > > > + struct kiocb *iocb = NULL;
> > > > + int tx_total_len = 0;
> > > > +
> > > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > > + return;
> > > > +
> > > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > > + vhost_add_used_and_signal(&net->dev, vq,
> > > > + iocb->ki_pos, 0);
> > > > + tx_total_len += iocb->ki_nbytes;
> > > > +
> > > > + if (iocb->ki_dtor)
> > > > + iocb->ki_dtor(iocb);
> > > > +
> > > > + kmem_cache_free(net->cache, iocb);
> > > > + if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
> > > > + vhost_poll_queue(&vq->poll);
> > > > + break;
> > > > + }
> > > > + }
> > > > +}
> > > > +
> > > > /* Expects to be always run from workqueue - which acts as
> > > > * read-size critical section for our kind of RCU. */
> > > > static void handle_tx(struct vhost_net *net)
> > > > {
> > > > struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> > > > + struct kiocb *iocb = NULL;
> > > > unsigned head, out, in, s;
> > > > struct msghdr msg = {
> > > > .msg_name = NULL,
> > > > @@ -124,6 +204,8 @@ static void handle_tx(struct vhost_net *net)
> > > > tx_poll_stop(net);
> > > > hdr_size = vq->hdr_size;
> > > >
> > > > + handle_async_tx_events_notify(net, vq);
> > > > +
> > > > for (;;) {
> > > > head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > > > ARRAY_SIZE(vq->iov),
> > > > @@ -151,6 +233,15 @@ static void handle_tx(struct vhost_net *net)
> > > > /* Skip header. TODO: support TSO. */
> > > > s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
> > > > msg.msg_iovlen = out;
> > > > +
> > > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > > + iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> > > > + if (!iocb)
> > > > + break;
> > > > + iocb->ki_pos = head;
> > > > + iocb->private = (void *)vq;
> > > > + }
> > > > +
> > > > len = iov_length(vq->iov, out);
> > > > /* Sanity check */
> > > > if (!len) {
> > > > @@ -160,12 +251,16 @@ static void handle_tx(struct vhost_net *net)
> > > > break;
> > > > }
> > > > /* TODO: Check specific error and bomb out unless ENOBUFS? */
> > > > - err = sock->ops->sendmsg(NULL, sock, &msg, len);
> > > > + err = sock->ops->sendmsg(iocb, sock, &msg, len);
> > > > if (unlikely(err < 0)) {
> > > > vhost_discard_vq_desc(vq);
> > > > tx_poll_start(net, sock);
> > > > break;
> > > > }
> > > > +
> > > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> > > > + continue;
> > > > +
> > > > if (err != len)
> > > > pr_err("Truncated TX packet: "
> > > > " len %d != %zd\n", err, len);
> > > > @@ -177,6 +272,8 @@ static void handle_tx(struct vhost_net *net)
> > > > }
> > > > }
> > > >
> > > > + handle_async_tx_events_notify(net, vq);
> > > > +
> > > > mutex_unlock(&vq->mutex);
> > > > unuse_mm(net->dev.mm);
> > > > }
> > > > @@ -186,6 +283,7 @@ static void handle_tx(struct vhost_net *net)
> > > > static void handle_rx(struct vhost_net *net)
> > > > {
> > > > struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> > > > + struct kiocb *iocb = NULL;
> > > > unsigned head, out, in, log, s;
> > > > struct vhost_log *vq_log;
> > > > struct msghdr msg = {
> > > > @@ -206,7 +304,8 @@ static void handle_rx(struct vhost_net *net)
> > > > int err;
> > > > size_t hdr_size;
> > > > struct socket *sock = rcu_dereference(vq->private_data);
> > > > - if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> > > > + if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
> > > > + vq->link_state == VHOST_VQ_LINK_SYNC))
> > > > return;
> > > >
> > > > use_mm(net->dev.mm);
> > > > @@ -214,9 +313,18 @@ static void handle_rx(struct vhost_net *net)
> > > > vhost_disable_notify(vq);
> > > > hdr_size = vq->hdr_size;
> > > >
> > > > - vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> > > > + /* In async cases, for write logging, the simple way is to get
> > > > + * the log info always, and really logging is decided later.
> > > > + * Thus, when logging enabled, we can get log, and when logging
> > > > + * disabled, we can get log disabled accordingly.
> > > > + */
> > > > +
> > > > + vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
> > > > + (vq->link_state == VHOST_VQ_LINK_ASYNC) ?
> > > > vq->log : NULL;
> > > >
> > > > + handle_async_rx_events_notify(net, vq);
> > > > +
> > > > for (;;) {
> > > > head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > > > ARRAY_SIZE(vq->iov),
> > > > @@ -245,6 +353,14 @@ static void handle_rx(struct vhost_net *net)
> > > > s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> > > > msg.msg_iovlen = in;
> > > > len = iov_length(vq->iov, in);
> > > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > > + iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> > > > + if (!iocb)
> > > > + break;
> > > > + iocb->private = vq;
> > > > + iocb->ki_pos = head;
> > > > + iocb->ki_user_data = log;
> > > > + }
> > > > /* Sanity check */
> > > > if (!len) {
> > > > vq_err(vq, "Unexpected header len for RX: "
> > > > @@ -252,13 +368,18 @@ static void handle_rx(struct vhost_net *net)
> > > > iov_length(vq->hdr, s), hdr_size);
> > > > break;
> > > > }
> > > > - err = sock->ops->recvmsg(NULL, sock, &msg,
> > > > +
> > > > + err = sock->ops->recvmsg(iocb, sock, &msg,
> > > > len, MSG_DONTWAIT | MSG_TRUNC);
> > > > /* TODO: Check specific error and bomb out unless EAGAIN? */
> > > > if (err < 0) {
> > > > vhost_discard_vq_desc(vq);
> > > > break;
> > > > }
> > > > +
> > > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> > > > + continue;
> > > > +
> > > > /* TODO: Should check and handle checksum. */
> > > > if (err > len) {
> > > > pr_err("Discarded truncated rx packet: "
> > > > @@ -284,10 +405,13 @@ static void handle_rx(struct vhost_net *net)
> > > > }
> > > > }
> > > >
> > > > + handle_async_rx_events_notify(net, vq);
> > > > +
> > > > mutex_unlock(&vq->mutex);
> > > > unuse_mm(net->dev.mm);
> > > > }
> > > >
> > > > +
> > > > static void handle_tx_kick(struct work_struct *work)
> > > > {
> > > > struct vhost_virtqueue *vq;
> > > > @@ -338,6 +462,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
> > > > vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
> > > > vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
> > > > n->tx_poll_state = VHOST_NET_POLL_DISABLED;
> > > > + n->cache = NULL;
> > > > return 0;
> > > > }
> > > >
> > > > @@ -398,6 +523,17 @@ static void vhost_net_flush(struct vhost_net *n)
> > > > vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
> > > > }
> > > >
> > > > +static void vhost_notifier_cleanup(struct vhost_net *n)
> > > > +{
> > > > + struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
> > > > + struct kiocb *iocb = NULL;
> > > > + if (n->cache) {
> > > > + while ((iocb = notify_dequeue(vq)) != NULL)
> > > > + kmem_cache_free(n->cache, iocb);
> > > > + kmem_cache_destroy(n->cache);
> > > > + }
> > > > +}
> > > > +
> > > > static int vhost_net_release(struct inode *inode, struct file *f)
> > > > {
> > > > struct vhost_net *n = f->private_data;
> > > > @@ -414,6 +550,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
> > > > /* We do an extra flush before freeing memory,
> > > > * since jobs can re-queue themselves. */
> > > > vhost_net_flush(n);
> > > > + vhost_notifier_cleanup(n);
> > > > kfree(n);
> > > > return 0;
> > > > }
> > > > @@ -462,7 +599,19 @@ static struct socket *get_tun_socket(int fd)
> > > > return sock;
> > > > }
> > > >
> > > > -static struct socket *get_socket(int fd)
> > > > +static struct socket *get_mp_socket(int fd)
> > > > +{
> > > > + struct file *file = fget(fd);
> > > > + struct socket *sock;
> > > > + if (!file)
> > > > + return ERR_PTR(-EBADF);
> > > > + sock = mp_get_socket(file);
> > > > + if (IS_ERR(sock))
> > > > + fput(file);
> > > > + return sock;
> > > > +}
> > > > +
> > > > +static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
> > > > {
> > > > struct socket *sock;
> > > > if (fd == -1)
> > > > @@ -473,9 +622,31 @@ static struct socket *get_socket(int fd)
> > > > sock = get_tun_socket(fd);
> > > > if (!IS_ERR(sock))
> > > > return sock;
> > > > + sock = get_mp_socket(fd);
> > > > + if (!IS_ERR(sock)) {
> > > > + vq->link_state = VHOST_VQ_LINK_ASYNC;
> > > > + return sock;
> > > > + }
> > > > return ERR_PTR(-ENOTSOCK);
> > > > }
> > > >
> > > > +static void vhost_init_link_state(struct vhost_net *n, int index)
> > > > +{
> > > > + struct vhost_virtqueue *vq = n->vqs + index;
> > > > +
> > > > + WARN_ON(!mutex_is_locked(&vq->mutex));
> > > > + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > > > + vq->receiver = NULL;
> > > > + INIT_LIST_HEAD(&vq->notifier);
> > > > + spin_lock_init(&vq->notify_lock);
> > > > + if (!n->cache) {
> > > > + n->cache = kmem_cache_create("vhost_kiocb",
> > > > + sizeof(struct kiocb), 0,
> > > > + SLAB_HWCACHE_ALIGN, NULL);
> > > > + }
> > > > + }
> > > > +}
> > > > +
> > > > static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > > {
> > > > struct socket *sock, *oldsock;
> > > > @@ -493,12 +664,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > > }
> > > > vq = n->vqs + index;
> > > > mutex_lock(&vq->mutex);
> > > > - sock = get_socket(fd);
> > > > + vq->link_state = VHOST_VQ_LINK_SYNC;
> > > > + sock = get_socket(vq, fd);
> > > > if (IS_ERR(sock)) {
> > > > r = PTR_ERR(sock);
> > > > goto err;
> > > > }
> > > >
> > > > + vhost_init_link_state(n, index);
> > > > +
> > > > /* start polling new socket */
> > > > oldsock = vq->private_data;
> > > > if (sock == oldsock)
> > > > @@ -507,8 +681,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> > > > vhost_net_disable_vq(n, vq);
> > > > rcu_assign_pointer(vq->private_data, sock);
> > > > vhost_net_enable_vq(n, vq);
> > > > - mutex_unlock(&vq->mutex);
> > > > done:
> > > > + mutex_unlock(&vq->mutex);
> > > > mutex_unlock(&n->dev.mutex);
> > > > if (oldsock) {
> > > > vhost_net_flush_vq(n, index);
> > > > @@ -516,6 +690,7 @@ done:
> > > > }
> > > > return r;
> > > > err:
> > > > + mutex_unlock(&vq->mutex);
> > > > mutex_unlock(&n->dev.mutex);
> > > > return r;
> > > > }
> > > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > > > index d1f0453..cffe39a 100644
> > > > --- a/drivers/vhost/vhost.h
> > > > +++ b/drivers/vhost/vhost.h
> > > > @@ -43,6 +43,11 @@ struct vhost_log {
> > > > u64 len;
> > > > };
> > > >
> > > > +enum vhost_vq_link_state {
> > > > + VHOST_VQ_LINK_SYNC = 0,
> > > > + VHOST_VQ_LINK_ASYNC = 1,
> > > > +};
> > > > +
> > > > /* The virtqueue structure describes a queue attached to a device. */
> > > > struct vhost_virtqueue {
> > > > struct vhost_dev *dev;
> > > > @@ -96,6 +101,11 @@ struct vhost_virtqueue {
> > > > /* Log write descriptors */
> > > > void __user *log_base;
> > > > struct vhost_log log[VHOST_NET_MAX_SG];
> > > > + /*Differiate async socket for 0-copy from normal*/
> > > > + enum vhost_vq_link_state link_state;
> > > > + struct list_head notifier;
> > > > + spinlock_t notify_lock;
> > > > + void (*receiver)(struct vhost_virtqueue *);
> > > > };
> > > >
> > > > struct vhost_dev {
> > > > --
> > > > 1.5.4.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Michael S. Tsirkin on
On Fri, Apr 23, 2010 at 03:08:33PM +0800, xiaohui.xin(a)intel.com wrote:
> From: Xin Xiaohui <xiaohui.xin(a)intel.com>
>
> The vhost-net backend now only supports synchronous send/recv
> operations. The patch provides multiple submits and asynchronous
> notifications. This is needed for zero-copy case.
>
> Signed-off-by: Xin Xiaohui <xiaohui.xin(a)intel.com>
> ---
>
> Michael,
> >>>Can't vhost supply a kiocb completion callback that will handle the list?
> >>Yes, thanks. And with it I also remove the vq->receivr finally.
> >>Thanks
> >>Xiaohui
>
> >Nice progress. I commented on some minor issues below.
> >Thanks!
>
> The updated patch addressed your comments on the minor issues.
> Thanks!
>
> Thanks
> Xiaohui
>
> drivers/vhost/net.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++-
> drivers/vhost/vhost.c | 120 ++++++++++++++-----------
> drivers/vhost/vhost.h | 14 +++
> 3 files changed, 314 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 38989d1..18f6c41 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -23,6 +23,8 @@
> #include <linux/if_arp.h>
> #include <linux/if_tun.h>
> #include <linux/if_macvlan.h>
> +#include <linux/mpassthru.h>
> +#include <linux/aio.h>
>
> #include <net/sock.h>
>
> @@ -48,6 +50,7 @@ struct vhost_net {
> struct vhost_dev dev;
> struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> struct vhost_poll poll[VHOST_NET_VQ_MAX];
> + struct kmem_cache *cache;
> /* Tells us whether we are polling a socket for TX.
> * We only do this when socket buffer fills up.
> * Protected by tx vq lock. */
> @@ -92,11 +95,138 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
> net->tx_poll_state = VHOST_NET_POLL_STARTED;
> }
>
> +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> +{
> + struct kiocb *iocb = NULL;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&vq->notify_lock, flags);
> + if (!list_empty(&vq->notifier)) {
> + iocb = list_first_entry(&vq->notifier,
> + struct kiocb, ki_list);
> + list_del(&iocb->ki_list);
> + }
> + spin_unlock_irqrestore(&vq->notify_lock, flags);
> + return iocb;
> +}
> +
> +static void handle_iocb(struct kiocb *iocb)
> +{
> + struct vhost_virtqueue *vq = iocb->private;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&vq->notify_lock, flags);
> + list_add_tail(&iocb->ki_list, &vq->notifier);
> + spin_unlock_irqrestore(&vq->notify_lock, flags);

Don't we need to wake up the wq as well?

> +}
> +
> +static int is_async_vq(struct vhost_virtqueue *vq)
> +{
> + return (vq->link_state == VHOST_VQ_LINK_ASYNC);

() not needed

> +}
> +
> +static void handle_async_rx_events_notify(struct vhost_net *net,
> + struct vhost_virtqueue *vq,
> + struct socket *sock)
> +{
> + struct kiocb *iocb = NULL;
> + struct vhost_log *vq_log = NULL;
> + int rx_total_len = 0;
> + unsigned int head, log, in, out;
> + int size;
> +
> + if (!is_async_vq(vq))
> + return;
> +
> + if (sock->sk->sk_data_ready)
> + sock->sk->sk_data_ready(sock->sk, 0);
> +
> + vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> + vq->log : NULL;
> +
> + while ((iocb = notify_dequeue(vq)) != NULL) {
> + vhost_add_used_and_signal(&net->dev, vq,
> + iocb->ki_pos, iocb->ki_nbytes);
> + size = iocb->ki_nbytes;
> + head = iocb->ki_pos;
> + rx_total_len += iocb->ki_nbytes;
> +
> + if (iocb->ki_dtor)
> + iocb->ki_dtor(iocb);

I am confused by the above. Isn't ki_dtor handle_iocb?
Why is it called here?

> + kmem_cache_free(net->cache, iocb);
> +
> + /* when log is enabled, recomputing the log info is needed,
> + * since these buffers are in async queue, and may not get
> + * the log info before.
> + */
> + if (unlikely(vq_log)) {
> + if (!log)

log is uninitialized now?

> + __vhost_get_vq_desc(&net->dev, vq, vq->iov,
> + ARRAY_SIZE(vq->iov),
> + &out, &in, vq_log,
> + &log, head);
> + vhost_log_write(vq, vq_log, log, size);
> + }
> + if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> + vhost_poll_queue(&vq->poll);
> + break;
> + }
> + }
> +}
> +
> +static void handle_async_tx_events_notify(struct vhost_net *net,
> + struct vhost_virtqueue *vq)
> +{
> + struct kiocb *iocb = NULL;
> + int tx_total_len = 0;
> +
> + if (!is_async_vq(vq))
> + return;
> +
> + while ((iocb = notify_dequeue(vq)) != NULL) {

Please just write this as while (((iocb = notify_dequeue(vq)))
above as well

> + vhost_add_used_and_signal(&net->dev, vq,
> + iocb->ki_pos, 0);

pls indent continuation lines to the roght of (
above as well

> + tx_total_len += iocb->ki_nbytes;
> +
> + if (iocb->ki_dtor)
> + iocb->ki_dtor(iocb);

same question as above

> +
> + kmem_cache_free(net->cache, iocb);
> + if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
> + vhost_poll_queue(&vq->poll);
> + break;
> + }
> + }
> +}
> +
> +static struct kiocb *create_iocb(struct vhost_net *net,
> + struct vhost_virtqueue *vq,
> + unsigned head)
> +{
> + struct kiocb *iocb = NULL;
> +
> + if (!is_async_vq(vq))
> + return NULL;
> +
> + iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
> + if (!iocb)
> + return NULL;
> + iocb->private = vq;
> + iocb->ki_pos = head;
> + iocb->ki_dtor = handle_iocb;

So, dtor calls handle_iocb, but what causes vhost
to wake-up is really poll, right?


> + if (vq == &net->dev.vqs[VHOST_NET_VQ_RX]) {
> + iocb->ki_user_data = vq->num;

Is the above used?

> + iocb->ki_iovec = vq->hdr;
> + }
> + return iocb;
> +}
> +
> /* Expects to be always run from workqueue - which acts as
> * read-size critical section for our kind of RCU. */
> static void handle_tx(struct vhost_net *net)
> {
> struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> + struct kiocb *iocb = NULL;

Why do we need to init iocb to NULL?

> unsigned head, out, in, s;
> struct msghdr msg = {
> .msg_name = NULL,
> @@ -129,6 +259,8 @@ static void handle_tx(struct vhost_net *net)
> tx_poll_stop(net);
> hdr_size = vq->hdr_size;
>
> + handle_async_tx_events_notify(net, vq);
> +
> for (;;) {
> head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> ARRAY_SIZE(vq->iov),
> @@ -156,6 +288,13 @@ static void handle_tx(struct vhost_net *net)
> /* Skip header. TODO: support TSO. */
> s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
> msg.msg_iovlen = out;
> +
> + if (is_async_vq(vq)) {
> + iocb = create_iocb(net, vq, head);
> + if (!iocb)
> + break;
> + }
> +
> len = iov_length(vq->iov, out);
> /* Sanity check */
> if (!len) {
> @@ -165,12 +304,18 @@ static void handle_tx(struct vhost_net *net)
> break;
> }
> /* TODO: Check specific error and bomb out unless ENOBUFS? */
> - err = sock->ops->sendmsg(NULL, sock, &msg, len);
> + err = sock->ops->sendmsg(iocb, sock, &msg, len);
> if (unlikely(err < 0)) {
> + if (is_async_vq(vq))
> + kmem_cache_free(net->cache, iocb);
> vhost_discard_vq_desc(vq);
> tx_poll_start(net, sock);
> break;
> }
> +
> + if (is_async_vq(vq))
> + continue;
> +
> if (err != len)
> pr_err("Truncated TX packet: "
> " len %d != %zd\n", err, len);
> @@ -182,6 +327,8 @@ static void handle_tx(struct vhost_net *net)
> }
> }
>
> + handle_async_tx_events_notify(net, vq);
> +
> mutex_unlock(&vq->mutex);
> unuse_mm(net->dev.mm);
> }
> @@ -191,6 +338,7 @@ static void handle_tx(struct vhost_net *net)
> static void handle_rx(struct vhost_net *net)
> {
> struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> + struct kiocb *iocb = NULL;
> unsigned head, out, in, log, s;
> struct vhost_log *vq_log;
> struct msghdr msg = {
> @@ -211,7 +359,8 @@ static void handle_rx(struct vhost_net *net)
> int err;
> size_t hdr_size;
> struct socket *sock = rcu_dereference(vq->private_data);
> - if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> + if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
> + vq->link_state == VHOST_VQ_LINK_SYNC))
> return;
>
> use_mm(net->dev.mm);
> @@ -219,9 +368,17 @@ static void handle_rx(struct vhost_net *net)
> vhost_disable_notify(vq);
> hdr_size = vq->hdr_size;
>
> + /* In async cases, when write log is enabled, in case the submitted
> + * buffers did not get log info before the log enabling, so we'd
> + * better recompute the log info when needed. We do this in
> + * handle_async_rx_events_notify().
> + */
> +
> vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> vq->log : NULL;
>
> + handle_async_rx_events_notify(net, vq, sock);
> +
> for (;;) {
> head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> ARRAY_SIZE(vq->iov),
> @@ -250,6 +407,13 @@ static void handle_rx(struct vhost_net *net)
> s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> msg.msg_iovlen = in;
> len = iov_length(vq->iov, in);
> +
> + if (is_async_vq(vq)) {
> + iocb = create_iocb(net, vq, head);
> + if (!iocb)
> + break;
> + }
> +
> /* Sanity check */
> if (!len) {
> vq_err(vq, "Unexpected header len for RX: "
> @@ -257,13 +421,20 @@ static void handle_rx(struct vhost_net *net)
> iov_length(vq->hdr, s), hdr_size);
> break;
> }
> - err = sock->ops->recvmsg(NULL, sock, &msg,
> +
> + err = sock->ops->recvmsg(iocb, sock, &msg,
> len, MSG_DONTWAIT | MSG_TRUNC);
> /* TODO: Check specific error and bomb out unless EAGAIN? */
> if (err < 0) {
> + if (is_async_vq(vq))
> + kmem_cache_free(net->cache, iocb);
> vhost_discard_vq_desc(vq);
> break;
> }
> +
> + if (is_async_vq(vq))
> + continue;
> +
> /* TODO: Should check and handle checksum. */
> if (err > len) {
> pr_err("Discarded truncated rx packet: "
> @@ -289,6 +460,8 @@ static void handle_rx(struct vhost_net *net)
> }
> }
>
> + handle_async_rx_events_notify(net, vq, sock);
> +
> mutex_unlock(&vq->mutex);
> unuse_mm(net->dev.mm);
> }
> @@ -342,6 +515,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
> vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
> vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
> n->tx_poll_state = VHOST_NET_POLL_DISABLED;
> + n->cache = NULL;
>
> f->private_data = n;
>
> @@ -405,6 +579,18 @@ static void vhost_net_flush(struct vhost_net *n)
> vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
> }
>
> +static void vhost_async_cleanup(struct vhost_net *n)
> +{
> + /* clean the notifier */
> + struct vhost_virtqueue *vq = &n->dev.vqs[VHOST_NET_VQ_RX];
> + struct kiocb *iocb = NULL;
> + if (n->cache) {
> + while ((iocb = notify_dequeue(vq)) != NULL)
> + kmem_cache_free(n->cache, iocb);
> + kmem_cache_destroy(n->cache);
> + }
> +}
> +
> static int vhost_net_release(struct inode *inode, struct file *f)
> {
> struct vhost_net *n = f->private_data;
> @@ -421,6 +607,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
> /* We do an extra flush before freeing memory,
> * since jobs can re-queue themselves. */
> vhost_net_flush(n);
> + vhost_async_cleanup(n);
> kfree(n);
> return 0;
> }
> @@ -472,21 +659,58 @@ static struct socket *get_tap_socket(int fd)
> return sock;
> }
>
> -static struct socket *get_socket(int fd)
> +static struct socket *get_mp_socket(int fd)
> +{
> + struct file *file = fget(fd);
> + struct socket *sock;
> + if (!file)
> + return ERR_PTR(-EBADF);
> + sock = mp_get_socket(file);
> + if (IS_ERR(sock))
> + fput(file);
> + return sock;
> +}
> +
> +static struct socket *get_socket(struct vhost_virtqueue *vq, int fd,
> + enum vhost_vq_link_state *state)
> {
> struct socket *sock;
> /* special case to disable backend */
> if (fd == -1)
> return NULL;
> +
> + *state = VHOST_VQ_LINK_SYNC;
> +
> sock = get_raw_socket(fd);
> if (!IS_ERR(sock))
> return sock;
> sock = get_tap_socket(fd);
> if (!IS_ERR(sock))
> return sock;
> + sock = get_mp_socket(fd);
> + if (!IS_ERR(sock)) {
> + *state = VHOST_VQ_LINK_ASYNC;
> + return sock;
> + }
> return ERR_PTR(-ENOTSOCK);
> }
>
> +static void vhost_init_link_state(struct vhost_net *n, int index)

so let's pass link state as parameter, and set it in this function.
And maybe pass in vq, no need for index tricks.

> +{
> + struct vhost_virtqueue *vq = n->vqs + index;
> +
> + WARN_ON(!mutex_is_locked(&vq->mutex));

there's a single place of call, I don't think we need this check.

> + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> + INIT_LIST_HEAD(&vq->notifier);
> + spin_lock_init(&vq->notify_lock);
> + if (!n->cache) {
> + n->cache = kmem_cache_create("vhost_kiocb",

vhost_net_kiocb a better name

> + sizeof(struct kiocb), 0,
> + SLAB_HWCACHE_ALIGN, NULL);
> + }

no need for {} for single statement if.

> + }
> +}
> +
> static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> {
> struct socket *sock, *oldsock;
> @@ -510,12 +734,14 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> r = -EFAULT;
> goto err_vq;
> }
> - sock = get_socket(fd);
> + sock = get_socket(vq, fd, &vq->link_state);
> if (IS_ERR(sock)) {
> r = PTR_ERR(sock);
> goto err_vq;
> }
>
> + vhost_init_link_state(n, index);
> +
> /* start polling new socket */
> oldsock = vq->private_data;
> if (sock == oldsock)
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 3f10194..add77d3 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -860,61 +860,17 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
> return 0;
> }
>
> -/* This looks in the virtqueue and for the first available buffer, and converts
> - * it to an iovec for convenient access. Since descriptors consist of some
> - * number of output then some number of input descriptors, it's actually two
> - * iovecs, but we pack them into one and note how many of each there were.
> - *
> - * This function returns the descriptor number found, or vq->num (which
> - * is never a valid descriptor number) if none was found. */
> -unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
> - struct iovec iov[], unsigned int iov_size,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num)
> +/* This computes the log info according to the index of buffer */
> +unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
> + struct iovec iov[], unsigned int iov_size,
> + unsigned int *out_num, unsigned int *in_num,
> + struct vhost_log *log, unsigned int *log_num,
> + unsigned int head)
> {
> struct vring_desc desc;
> unsigned int i, head, found = 0;
> - u16 last_avail_idx;
> - int ret;
> -
> - /* Check it isn't doing very strange things with descriptor numbers. */
> - last_avail_idx = vq->last_avail_idx;
> - if (get_user(vq->avail_idx, &vq->avail->idx)) {
> - vq_err(vq, "Failed to access avail idx at %p\n",
> - &vq->avail->idx);
> - return vq->num;
> - }
> -
> - if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
> - vq_err(vq, "Guest moved used index from %u to %u",
> - last_avail_idx, vq->avail_idx);
> - return vq->num;
> - }
> -
> - /* If there's nothing new since last we looked, return invalid. */
> - if (vq->avail_idx == last_avail_idx)
> - return vq->num;
> + unsigned int ret;
>
> - /* Only get avail ring entries after they have been exposed by guest. */
> - smp_rmb();
> -
> - /* Grab the next descriptor number they're advertising, and increment
> - * the index we've seen. */
> - if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
> - vq_err(vq, "Failed to read head: idx %d address %p\n",
> - last_avail_idx,
> - &vq->avail->ring[last_avail_idx % vq->num]);
> - return vq->num;
> - }
> -
> - /* If their number is silly, that's an error. */
> - if (head >= vq->num) {
> - vq_err(vq, "Guest says index %u > %u is available",
> - head, vq->num);
> - return vq->num;
> - }
> -
> - /* When we start there are none of either input nor output. */
> *out_num = *in_num = 0;
> if (unlikely(log))
> *log_num = 0;
> @@ -978,8 +934,70 @@ unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
> *out_num += ret;
> }
> } while ((i = next_desc(&desc)) != -1);
> + return head;
> +}
> +
> +/* This looks in the virtqueue and for the first available buffer, and converts
> + * it to an iovec for convenient access. Since descriptors consist of some
> + * number of output then some number of input descriptors, it's actually two
> + * iovecs, but we pack them into one and note how many of each there were.
> + *
> + * This function returns the descriptor number found, or vq->num (which
> + * is never a valid descriptor number) if none was found. */
> +unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
> + struct iovec iov[], unsigned int iov_size,
> + unsigned int *out_num, unsigned int *in_num,
> + struct vhost_log *log, unsigned int *log_num)
> +{
> + struct vring_desc desc;
> + unsigned int i, head, found = 0;
> + u16 last_avail_idx;
> + unsigned int ret;
> +
> + /* Check it isn't doing very strange things with descriptor numbers. */
> + last_avail_idx = vq->last_avail_idx;
> + if (get_user(vq->avail_idx, &vq->avail->idx)) {
> + vq_err(vq, "Failed to access avail idx at %p\n",
> + &vq->avail->idx);
> + return vq->num;
> + }
> +
> + if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
> + vq_err(vq, "Guest moved used index from %u to %u",
> + last_avail_idx, vq->avail_idx);
> + return vq->num;
> + }
> +
> + /* If there's nothing new since last we looked, return invalid. */
> + if (vq->avail_idx == last_avail_idx)
> + return vq->num;
> +
> + /* Only get avail ring entries after they have been exposed by guest. */
> + rmb();
> +
> + /* Grab the next descriptor number they're advertising, and increment
> + * the index we've seen. */
> + if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
> + vq_err(vq, "Failed to read head: idx %d address %p\n",
> + last_avail_idx,
> + &vq->avail->ring[last_avail_idx % vq->num]);
> + return vq->num;
> + }
> +
> + /* If their number is silly, that's an error. */
> + if (head >= vq->num) {
> + vq_err(vq, "Guest says index %u > %u is available",
> + head, vq->num);
> + return vq->num;
> + }
> +
> + ret = __vhost_get_vq_desc(dev, vq, iov, iov_size,
> + out_num, in_num,
> + log, log_num, head);
>
> /* On success, increment avail index. */
> + if (ret == vq->num)
> + return ret;
> vq->last_avail_idx++;
> return head;
> }
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 44591ba..3c9cbce 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -43,6 +43,11 @@ struct vhost_log {
> u64 len;
> };
>
> +enum vhost_vq_link_state {
> + VHOST_VQ_LINK_SYNC = 0,
> + VHOST_VQ_LINK_ASYNC = 1,
> +};
> +
> /* The virtqueue structure describes a queue attached to a device. */
> struct vhost_virtqueue {
> struct vhost_dev *dev;
> @@ -96,6 +101,10 @@ struct vhost_virtqueue {
> /* Log write descriptors */
> void __user *log_base;
> struct vhost_log log[VHOST_NET_MAX_SG];
> + /* Differiate async socket for 0-copy from normal */
> + enum vhost_vq_link_state link_state;
> + struct list_head notifier;
> + spinlock_t notify_lock;
> };
>
> struct vhost_dev {
> @@ -124,6 +133,11 @@ unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
> struct iovec iov[], unsigned int iov_count,
> unsigned int *out_num, unsigned int *in_num,
> struct vhost_log *log, unsigned int *log_num);
> +unsigned __vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
> + struct iovec iov[], unsigned int iov_count,
> + unsigned int *out_num, unsigned int *in_num,
> + struct vhost_log *log, unsigned int *log_num,
> + unsigned int head);
> void vhost_discard_vq_desc(struct vhost_virtqueue *);
>
> int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> --
> 1.5.4.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/