diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 23e8861e8b25e..c53ab2609d8cc 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -45,6 +45,12 @@ struct xsk_map {
 	struct xdp_sock __rcu *xsk_map[];
 };
 
+struct local_cq {
+	u32 prod ____cacheline_aligned_in_smp;
+	u32 ring_mask ____cacheline_aligned_in_smp;
+	u64 desc[] ____cacheline_aligned_in_smp;
+};
+
 struct xdp_sock {
 	/* struct sock must be the first member of struct xdp_sock */
 	struct sock sk;
@@ -89,6 +95,8 @@ struct xdp_sock {
 	struct mutex mutex;
 	struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */
 	struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
+	/* Maintain addr(s) of descriptors locally */
+	struct local_cq *lcq;
 };
 
 /*
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f093c3453f64c..3b8720f64eb5c 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,8 +41,6 @@ struct xsk_addrs {
 	u64 addrs[MAX_SKB_FRAGS + 1];
 };
 
-static struct kmem_cache *xsk_tx_generic_cache;
-
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -539,81 +537,87 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 }
 
-static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
+static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr)
 {
+	struct xsk_buff_pool *pool = xs->pool;
+	struct local_cq *lcq = xs->lcq;
 	int ret;
 
 	spin_lock(&pool->cq_cached_prod_lock);
 	ret = xskq_prod_reserve(pool->cq);
 	spin_unlock(&pool->cq_cached_prod_lock);
+	if (!ret)
+		lcq->desc[lcq->prod++ & lcq->ring_mask] = addr;
 
 	return ret;
 }
 
-static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
+#define XSK_DESTRUCTOR_DESCS_SHIFT 8
+#define XSK_DESTRUCTOR_DESCS_MASK \
+	((1ULL << XSK_DESTRUCTOR_DESCS_SHIFT) - 1)
+
+static long xsk_get_destructor_arg(struct sk_buff *skb)
 {
-	return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
+	return (long)skb_shinfo(skb)->destructor_arg;
 }
 
-static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
+static u8 xsk_get_num_desc(struct sk_buff *skb)
 {
-	return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
+	long val = xsk_get_destructor_arg(skb);
+
+	return (u8)val & XSK_DESTRUCTOR_DESCS_MASK;
 }
 
-static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+/* Record the position of the first desc in the local cq */
+static void xsk_skb_destructor_set_addr(struct sk_buff *skb,
+					struct xdp_sock *xs)
 {
-	skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+	long val;
+
+	val = ((xs->lcq->prod - 1) & xs->lcq->ring_mask) << XSK_DESTRUCTOR_DESCS_SHIFT;
+	skb_shinfo(skb)->destructor_arg = (void *)val;
}
 
+/* Only update the lower bits to adjust the number of descriptors the skb
+ * carries. The count stays within MAX_SKB_FRAGS, which easily fits in
+ * these bits, so simply increment the value by one.
+ */
 static void xsk_inc_num_desc(struct sk_buff *skb)
 {
-	struct xsk_addrs *xsk_addr;
+	long val = xsk_get_destructor_arg(skb) + 1;
 
-	if (!xsk_skb_destructor_is_addr(skb)) {
-		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
-		xsk_addr->num_descs++;
-	}
+	skb_shinfo(skb)->destructor_arg = (void *)val;
 }
 
-static u32 xsk_get_num_desc(struct sk_buff *skb)
+static u32 xsk_get_start_addr(struct sk_buff *skb)
 {
-	struct xsk_addrs *xsk_addr;
+	long val = xsk_get_destructor_arg(skb);
 
-	if (xsk_skb_destructor_is_addr(skb))
-		return 1;
+	return val >> XSK_DESTRUCTOR_DESCS_SHIFT;
+}
 
-	xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+static void xsk_cq_write_addr(struct sk_buff *skb, u32 desc_processed)
+{
+	struct xsk_buff_pool *pool = xdp_sk(skb->sk)->pool;
+	u32 idx, pos = xsk_get_start_addr(skb);
+	struct xdp_sock *xs = xdp_sk(skb->sk);
+	u64 addr;
 
-	return xsk_addr->num_descs;
+	idx = xskq_get_prod(pool->cq) + desc_processed;
+	addr = xs->lcq->desc[(pos + desc_processed) & xs->lcq->ring_mask];
+	xskq_prod_write_addr(pool->cq, idx, addr);
 }
 
-static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
-				      struct sk_buff *skb)
+static void xsk_cq_submit_addr_locked(struct sk_buff *skb)
 {
-	u32 num_descs = xsk_get_num_desc(skb);
-	struct xsk_addrs *xsk_addr;
-	u32 descs_processed = 0;
+	struct xsk_buff_pool *pool = xdp_sk(skb->sk)->pool;
+	u8 i, num = xsk_get_num_desc(skb);
 	unsigned long flags;
-	u32 idx, i;
 
 	spin_lock_irqsave(&pool->cq_prod_lock, flags);
-	idx = xskq_get_prod(pool->cq);
-
-	if (unlikely(num_descs > 1)) {
-		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
-
-		for (i = 0; i < num_descs; i++) {
-			xskq_prod_write_addr(pool->cq, idx + descs_processed,
-					     xsk_addr->addrs[i]);
-			descs_processed++;
-		}
-		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
-	} else {
-		xskq_prod_write_addr(pool->cq, idx,
-				     xsk_skb_destructor_get_addr(skb));
-		descs_processed++;
-	}
-	xskq_prod_submit_n(pool->cq, descs_processed);
+	for (i = 0; i < num; i++)
+		xsk_cq_write_addr(skb, i);
+	xskq_prod_submit_n(pool->cq, num);
 	spin_unlock_irqrestore(&pool->cq_prod_lock, flags);
 }
@@ -634,30 +638,23 @@ void xsk_destruct_skb(struct sk_buff *skb)
 			*compl->tx_timestamp = ktime_get_tai_fast_ns();
 	}
 
-	xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);
+	xsk_cq_submit_addr_locked(skb);
 	sock_wfree(skb);
 }
 
-static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
-			      u64 addr)
+static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs)
 {
 	skb->dev = xs->dev;
 	skb->priority = READ_ONCE(xs->sk.sk_priority);
 	skb->mark = READ_ONCE(xs->sk.sk_mark);
 	skb->destructor = xsk_destruct_skb;
-	xsk_skb_destructor_set_addr(skb, addr);
+	xsk_skb_destructor_set_addr(skb, xs);
 }
 
 static void xsk_consume_skb(struct sk_buff *skb)
 {
 	struct xdp_sock *xs = xdp_sk(skb->sk);
 	u32 num_descs = xsk_get_num_desc(skb);
-	struct xsk_addrs *xsk_addr;
-
-	if (unlikely(num_descs > 1)) {
-		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
-		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
-	}
 
 	skb->destructor = sock_wfree;
 	xsk_cq_cancel_locked(xs->pool, num_descs);
@@ -734,33 +731,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 
 		skb_reserve(skb, hr);
 
-		xsk_skb_init_misc(skb, xs, desc->addr);
+		xsk_skb_init_misc(skb, xs);
 		if (desc->options & XDP_TX_METADATA) {
 			err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
 			if (unlikely(err))
 				return ERR_PTR(err);
 		}
-	} else {
-		struct xsk_addrs *xsk_addr;
-
-		if (xsk_skb_destructor_is_addr(skb)) {
-			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
-						     GFP_KERNEL);
-			if (!xsk_addr)
-				return ERR_PTR(-ENOMEM);
-
-			xsk_addr->num_descs = 1;
-			xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
-			skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
-		} else {
-			xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
-		}
-
-		/* in case of -EOVERFLOW that could happen below,
-		 * xsk_consume_skb() will release this node as whole skb
-		 * would be dropped, which implies freeing all list elements
-		 */
-		xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
 	}
 
 	len = desc->len;
@@ -828,7 +804,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 		if (unlikely(err))
 			goto free_err;
 
-		xsk_skb_init_misc(skb, xs, desc->addr);
+		xsk_skb_init_misc(skb, xs);
 
 		if (desc->options & XDP_TX_METADATA) {
 			err = xsk_skb_metadata(skb, buffer, desc, xs->pool, hr);
@@ -837,25 +813,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 		}
 	} else {
 		int nr_frags = skb_shinfo(skb)->nr_frags;
-		struct xsk_addrs *xsk_addr;
 		struct page *page;
 		u8 *vaddr;
 
-		if (xsk_skb_destructor_is_addr(skb)) {
-			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
-						     GFP_KERNEL);
-			if (!xsk_addr) {
-				err = -ENOMEM;
-				goto free_err;
-			}
-
-			xsk_addr->num_descs = 1;
-			xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
-			skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
-		} else {
-			xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
-		}
-
 		if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
 			err = -EOVERFLOW;
 			goto free_err;
@@ -873,8 +833,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
 			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
-
-			xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
 		}
 	}
 
@@ -931,7 +889,7 @@ static int __xsk_generic_xmit(struct sock *sk)
 		 * if there is space in it. This avoids having to implement
 		 * any buffering in the Tx path.
 		 */
-		err = xsk_cq_reserve_locked(xs->pool);
+		err = xsk_cq_reserve_addr_locked(xs, desc.addr);
 		if (err) {
 			err = -EAGAIN;
 			goto out;
@@ -1212,6 +1170,30 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
 	}
 }
 
+/* Initialize the local completion queue for each xsk */
+static int xsk_init_local_cq(struct xdp_sock *xs)
+{
+	struct xsk_queue *cq = xs->pool->cq;
+	size_t size;
+
+	if (!cq || !cq->nentries)
+		return -EINVAL;
+
+	size = struct_size_t(struct local_cq, desc, cq->nentries);
+	xs->lcq = vmalloc(size);
+	if (!xs->lcq)
+		return -ENOMEM;
+	xs->lcq->ring_mask = cq->nentries - 1;
+	xs->lcq->prod = 0;
+
+	return 0;
+}
+
+static void xsk_clear_local_cq(struct xdp_sock *xs)
+{
+	vfree(xs->lcq);
+}
+
 static int xsk_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -1241,6 +1223,7 @@ static int xsk_release(struct socket *sock)
 	xskq_destroy(xs->tx);
 	xskq_destroy(xs->fq_tmp);
 	xskq_destroy(xs->cq_tmp);
+	xsk_clear_local_cq(xs);
 
 	sock_orphan(sk);
 	sock->sk = NULL;
@@ -1360,9 +1343,18 @@ static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr
 				goto out_unlock;
 			}
 
+			err = xsk_init_local_cq(xs);
+			if (err) {
+				xp_destroy(xs->pool);
+				xs->pool = NULL;
+				sockfd_put(sock);
+				goto out_unlock;
+			}
+
 			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
 						   qid);
 			if (err) {
+				xsk_clear_local_cq(xs);
 				xp_destroy(xs->pool);
 				xs->pool = NULL;
 				sockfd_put(sock);
@@ -1380,6 +1372,13 @@ static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr
 			xp_get_pool(umem_xs->pool);
 			xs->pool = umem_xs->pool;
 
+			err = xsk_init_local_cq(xs);
+			if (err) {
+				xp_put_pool(xs->pool);
+				xs->pool = NULL;
+				sockfd_put(sock);
+				goto out_unlock;
+			}
 			/* If underlying shared umem was created without Tx
 			 * ring, allocate Tx descs array that Tx batching API
 			 * utilizes
@@ -1387,6 +1386,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr
 			if (xs->tx && !xs->pool->tx_descs) {
 				err = xp_alloc_tx_descs(xs->pool, xs);
 				if (err) {
+					xsk_clear_local_cq(xs);
 					xp_put_pool(xs->pool);
 					xs->pool = NULL;
 					sockfd_put(sock);
@@ -1409,8 +1409,16 @@ static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr
 			goto out_unlock;
 		}
 
+		err = xsk_init_local_cq(xs);
+		if (err) {
+			xp_destroy(xs->pool);
+			xs->pool = NULL;
+			goto out_unlock;
+		}
+
 		err = xp_assign_dev(xs->pool, dev, qid, flags);
 		if (err) {
+			xsk_clear_local_cq(xs);
 			xp_destroy(xs->pool);
 			xs->pool = NULL;
 			goto out_unlock;
@@ -1934,18 +1942,8 @@ static int __init xsk_init(void)
 	if (err)
 		goto out_pernet;
 
-	xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
-						 sizeof(struct xsk_addrs),
-						 0, SLAB_HWCACHE_ALIGN, NULL);
-	if (!xsk_tx_generic_cache) {
-		err = -ENOMEM;
-		goto out_unreg_notif;
-	}
-
 	return 0;
 
-out_unreg_notif:
-	unregister_netdevice_notifier(&xsk_netdev_notifier);
 out_pernet:
 	unregister_pernet_subsys(&xsk_net_ops);
 out_sk: