16f377873SDavid Wei // SPDX-License-Identifier: GPL-2.0
26f377873SDavid Wei #include <linux/kernel.h>
36f377873SDavid Wei #include <linux/errno.h>
4db070446SPavel Begunkov #include <linux/dma-map-ops.h>
56f377873SDavid Wei #include <linux/mm.h>
634a3e608SPavel Begunkov #include <linux/nospec.h>
76f377873SDavid Wei #include <linux/io_uring.h>
8035af94bSPavel Begunkov #include <linux/netdevice.h>
9035af94bSPavel Begunkov #include <linux/rtnetlink.h>
10bc57c7d3SPavel Begunkov #include <linux/skbuff_ref.h>
11d7af80b2SPavel Begunkov #include <linux/anon_inodes.h>
126f377873SDavid Wei
1334a3e608SPavel Begunkov #include <net/page_pool/helpers.h>
1434a3e608SPavel Begunkov #include <net/page_pool/memory_provider.h>
1534a3e608SPavel Begunkov #include <net/netlink.h>
1659b8b32aSDragos Tatulea #include <net/netdev_queues.h>
17e0793de2SDavid Wei #include <net/netdev_rx_queue.h>
1811ed914bSDavid Wei #include <net/tcp.h>
1911ed914bSDavid Wei #include <net/rps.h>
2034a3e608SPavel Begunkov
21e0793de2SDavid Wei #include <trace/events/page_pool.h>
22e0793de2SDavid Wei
236f377873SDavid Wei #include <uapi/linux/io_uring.h>
246f377873SDavid Wei
256f377873SDavid Wei #include "io_uring.h"
266f377873SDavid Wei #include "kbuf.h"
276f377873SDavid Wei #include "memmap.h"
286f377873SDavid Wei #include "zcrx.h"
29cf96310cSDavid Wei #include "rsrc.h"
306f377873SDavid Wei
/* Area registration flags accepted by io_import_area(); any other bit is rejected. */
#define IO_ZCRX_AREA_SUPPORTED_FLAGS	(IORING_ZCRX_AREA_DMABUF)

/* DMA attributes passed when mapping and unmapping user-memory areas. */
#define IO_DMA_ATTR	(DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
34782dfa32SPavel Begunkov
io_pp_to_ifq(struct page_pool * pp)3570e4f9bfSPavel Begunkov static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
3670e4f9bfSPavel Begunkov {
3770e4f9bfSPavel Begunkov return pp->mp_priv;
3870e4f9bfSPavel Begunkov }
3970e4f9bfSPavel Begunkov
io_zcrx_iov_to_area(const struct net_iov * niov)40a79154aeSPavel Begunkov static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
41a79154aeSPavel Begunkov {
42a79154aeSPavel Begunkov struct net_iov_area *owner = net_iov_owner(niov);
43a79154aeSPavel Begunkov
44a79154aeSPavel Begunkov return container_of(owner, struct io_zcrx_area, nia);
45a79154aeSPavel Begunkov }
46a79154aeSPavel Begunkov
io_zcrx_iov_page(const struct net_iov * niov)47a79154aeSPavel Begunkov static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
48a79154aeSPavel Begunkov {
49a79154aeSPavel Begunkov struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
50d8d135dfSPavel Begunkov unsigned niov_pages_shift;
51a79154aeSPavel Begunkov
521b4dc1ffSPavel Begunkov lockdep_assert(!area->mem.is_dmabuf);
531b4dc1ffSPavel Begunkov
54d8d135dfSPavel Begunkov niov_pages_shift = area->ifq->niov_shift - PAGE_SHIFT;
55d8d135dfSPavel Begunkov return area->mem.pages[net_iov_idx(niov) << niov_pages_shift];
56a79154aeSPavel Begunkov }
57a79154aeSPavel Begunkov
io_area_max_shift(struct io_zcrx_mem * mem)58795663b4SPavel Begunkov static int io_area_max_shift(struct io_zcrx_mem *mem)
59795663b4SPavel Begunkov {
60795663b4SPavel Begunkov struct sg_table *sgt = mem->sgt;
61795663b4SPavel Begunkov struct scatterlist *sg;
62795663b4SPavel Begunkov unsigned shift = -1U;
63795663b4SPavel Begunkov unsigned i;
64795663b4SPavel Begunkov
65795663b4SPavel Begunkov for_each_sgtable_dma_sg(sgt, sg, i)
667120b87bSPavel Begunkov shift = min(shift, __ffs(sg_dma_len(sg)));
67795663b4SPavel Begunkov return shift;
68795663b4SPavel Begunkov }
69795663b4SPavel Begunkov
io_populate_area_dma(struct io_zcrx_ifq * ifq,struct io_zcrx_area * area)7054e89a93SPavel Begunkov static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
715d93f7baSPavel Begunkov struct io_zcrx_area *area)
7254e89a93SPavel Begunkov {
73d8d135dfSPavel Begunkov unsigned niov_size = 1U << ifq->niov_shift;
745d93f7baSPavel Begunkov struct sg_table *sgt = area->mem.sgt;
7554e89a93SPavel Begunkov struct scatterlist *sg;
7654e89a93SPavel Begunkov unsigned i, niov_idx = 0;
7754e89a93SPavel Begunkov
7854e89a93SPavel Begunkov for_each_sgtable_dma_sg(sgt, sg, i) {
7954e89a93SPavel Begunkov dma_addr_t dma = sg_dma_address(sg);
8054e89a93SPavel Begunkov unsigned long sg_len = sg_dma_len(sg);
8154e89a93SPavel Begunkov
82d8d135dfSPavel Begunkov if (WARN_ON_ONCE(sg_len % niov_size))
83d8d135dfSPavel Begunkov return -EINVAL;
84d8d135dfSPavel Begunkov
8554e89a93SPavel Begunkov while (sg_len && niov_idx < area->nia.num_niovs) {
8654e89a93SPavel Begunkov struct net_iov *niov = &area->nia.niovs[niov_idx];
8754e89a93SPavel Begunkov
8854e89a93SPavel Begunkov if (net_mp_niov_set_dma_addr(niov, dma))
8954e89a93SPavel Begunkov return -EFAULT;
90d8d135dfSPavel Begunkov sg_len -= niov_size;
91d8d135dfSPavel Begunkov dma += niov_size;
9254e89a93SPavel Begunkov niov_idx++;
9354e89a93SPavel Begunkov }
9454e89a93SPavel Begunkov }
95d7ae46b4SPavel Begunkov
96d7ae46b4SPavel Begunkov if (WARN_ON_ONCE(niov_idx != area->nia.num_niovs))
97d7ae46b4SPavel Begunkov return -EFAULT;
9854e89a93SPavel Begunkov return 0;
9954e89a93SPavel Begunkov }
10054e89a93SPavel Begunkov
io_release_dmabuf(struct io_zcrx_mem * mem)101a5c98e94SPavel Begunkov static void io_release_dmabuf(struct io_zcrx_mem *mem)
102a5c98e94SPavel Begunkov {
103a5c98e94SPavel Begunkov if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
104a5c98e94SPavel Begunkov return;
105a5c98e94SPavel Begunkov
106a5c98e94SPavel Begunkov if (mem->sgt)
107a5c98e94SPavel Begunkov dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
108a5c98e94SPavel Begunkov DMA_FROM_DEVICE);
109a5c98e94SPavel Begunkov if (mem->attach)
110a5c98e94SPavel Begunkov dma_buf_detach(mem->dmabuf, mem->attach);
111a5c98e94SPavel Begunkov if (mem->dmabuf)
112a5c98e94SPavel Begunkov dma_buf_put(mem->dmabuf);
113a5c98e94SPavel Begunkov
114a5c98e94SPavel Begunkov mem->sgt = NULL;
115a5c98e94SPavel Begunkov mem->attach = NULL;
116a5c98e94SPavel Begunkov mem->dmabuf = NULL;
117a5c98e94SPavel Begunkov }
118a5c98e94SPavel Begunkov
io_import_dmabuf(struct io_zcrx_ifq * ifq,struct io_zcrx_mem * mem,struct io_uring_zcrx_area_reg * area_reg)119a5c98e94SPavel Begunkov static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
120a5c98e94SPavel Begunkov struct io_zcrx_mem *mem,
121a5c98e94SPavel Begunkov struct io_uring_zcrx_area_reg *area_reg)
122a5c98e94SPavel Begunkov {
123a5c98e94SPavel Begunkov unsigned long off = (unsigned long)area_reg->addr;
124a5c98e94SPavel Begunkov unsigned long len = (unsigned long)area_reg->len;
125a5c98e94SPavel Begunkov unsigned long total_size = 0;
126a5c98e94SPavel Begunkov struct scatterlist *sg;
127a5c98e94SPavel Begunkov int dmabuf_fd = area_reg->dmabuf_fd;
128a5c98e94SPavel Begunkov int i, ret;
129a5c98e94SPavel Begunkov
130825f2764SPavel Begunkov if (!ifq->dev)
131825f2764SPavel Begunkov return -EINVAL;
13208ca1409SPavel Begunkov if (off)
13308ca1409SPavel Begunkov return -EINVAL;
134a5c98e94SPavel Begunkov if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
135a5c98e94SPavel Begunkov return -EINVAL;
136a5c98e94SPavel Begunkov
137a5c98e94SPavel Begunkov mem->is_dmabuf = true;
138a5c98e94SPavel Begunkov mem->dmabuf = dma_buf_get(dmabuf_fd);
139a5c98e94SPavel Begunkov if (IS_ERR(mem->dmabuf)) {
140a5c98e94SPavel Begunkov ret = PTR_ERR(mem->dmabuf);
141a5c98e94SPavel Begunkov mem->dmabuf = NULL;
142a5c98e94SPavel Begunkov goto err;
143a5c98e94SPavel Begunkov }
144a5c98e94SPavel Begunkov
145a5c98e94SPavel Begunkov mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
146a5c98e94SPavel Begunkov if (IS_ERR(mem->attach)) {
147a5c98e94SPavel Begunkov ret = PTR_ERR(mem->attach);
148a5c98e94SPavel Begunkov mem->attach = NULL;
149a5c98e94SPavel Begunkov goto err;
150a5c98e94SPavel Begunkov }
151a5c98e94SPavel Begunkov
152a5c98e94SPavel Begunkov mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
153a5c98e94SPavel Begunkov if (IS_ERR(mem->sgt)) {
154a5c98e94SPavel Begunkov ret = PTR_ERR(mem->sgt);
155a5c98e94SPavel Begunkov mem->sgt = NULL;
156a5c98e94SPavel Begunkov goto err;
157a5c98e94SPavel Begunkov }
158a5c98e94SPavel Begunkov
159a5c98e94SPavel Begunkov for_each_sgtable_dma_sg(mem->sgt, sg, i)
160a5c98e94SPavel Begunkov total_size += sg_dma_len(sg);
161a5c98e94SPavel Begunkov
16208ca1409SPavel Begunkov if (total_size != len) {
1637cac633aSPenglei Jiang ret = -EINVAL;
1647cac633aSPenglei Jiang goto err;
1657cac633aSPenglei Jiang }
166a5c98e94SPavel Begunkov
167a5c98e94SPavel Begunkov mem->size = len;
168a5c98e94SPavel Begunkov return 0;
169a5c98e94SPavel Begunkov err:
170a5c98e94SPavel Begunkov io_release_dmabuf(mem);
171a5c98e94SPavel Begunkov return ret;
172a5c98e94SPavel Begunkov }
173a5c98e94SPavel Begunkov
io_count_account_pages(struct page ** pages,unsigned nr_pages)174262ab205SPavel Begunkov static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages)
175262ab205SPavel Begunkov {
176262ab205SPavel Begunkov struct folio *last_folio = NULL;
177262ab205SPavel Begunkov unsigned long res = 0;
178262ab205SPavel Begunkov int i;
179262ab205SPavel Begunkov
180262ab205SPavel Begunkov for (i = 0; i < nr_pages; i++) {
181262ab205SPavel Begunkov struct folio *folio = page_folio(pages[i]);
182262ab205SPavel Begunkov
183262ab205SPavel Begunkov if (folio == last_folio)
184262ab205SPavel Begunkov continue;
185262ab205SPavel Begunkov last_folio = folio;
186a0169c3aSPedro Demarchi Gomes res += folio_nr_pages(folio);
187a5c98e94SPavel Begunkov }
188262ab205SPavel Begunkov return res;
189a5c98e94SPavel Begunkov }
190a5c98e94SPavel Begunkov
io_import_umem(struct io_zcrx_ifq * ifq,struct io_zcrx_mem * mem,struct io_uring_zcrx_area_reg * area_reg)191a5c98e94SPavel Begunkov static int io_import_umem(struct io_zcrx_ifq *ifq,
192a5c98e94SPavel Begunkov struct io_zcrx_mem *mem,
193a5c98e94SPavel Begunkov struct io_uring_zcrx_area_reg *area_reg)
194a5c98e94SPavel Begunkov {
195a5c98e94SPavel Begunkov struct page **pages;
196b84621d9SPavel Begunkov int nr_pages, ret;
197b8d6eb6cSPavel Begunkov bool mapped = false;
198a5c98e94SPavel Begunkov
199a5c98e94SPavel Begunkov if (area_reg->dmabuf_fd)
200a5c98e94SPavel Begunkov return -EINVAL;
201a5c98e94SPavel Begunkov if (!area_reg->addr)
202a5c98e94SPavel Begunkov return -EFAULT;
203a5c98e94SPavel Begunkov pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
204a5c98e94SPavel Begunkov &nr_pages);
205a5c98e94SPavel Begunkov if (IS_ERR(pages))
206a5c98e94SPavel Begunkov return PTR_ERR(pages);
207a5c98e94SPavel Begunkov
208b84621d9SPavel Begunkov ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages,
209417d029dSPavel Begunkov 0, (unsigned long)nr_pages << PAGE_SHIFT,
210b84621d9SPavel Begunkov GFP_KERNEL_ACCOUNT);
21141041562SPavel Begunkov if (ret)
21241041562SPavel Begunkov goto out_err;
213b84621d9SPavel Begunkov
214825f2764SPavel Begunkov if (ifq->dev) {
215b8d6eb6cSPavel Begunkov ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table,
216b8d6eb6cSPavel Begunkov DMA_FROM_DEVICE, IO_DMA_ATTR);
217b8d6eb6cSPavel Begunkov if (ret < 0)
218b8d6eb6cSPavel Begunkov goto out_err;
219b8d6eb6cSPavel Begunkov mapped = true;
220825f2764SPavel Begunkov }
221b8d6eb6cSPavel Begunkov
222262ab205SPavel Begunkov mem->account_pages = io_count_account_pages(pages, nr_pages);
2235c686456SDavid Wei ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages);
22441041562SPavel Begunkov if (ret < 0) {
225262ab205SPavel Begunkov mem->account_pages = 0;
22641041562SPavel Begunkov goto out_err;
22741041562SPavel Begunkov }
228262ab205SPavel Begunkov
2295d93f7baSPavel Begunkov mem->sgt = &mem->page_sg_table;
230a5c98e94SPavel Begunkov mem->pages = pages;
231a5c98e94SPavel Begunkov mem->nr_folios = nr_pages;
232a5c98e94SPavel Begunkov mem->size = area_reg->len;
2336bbd3411SPavel Begunkov return ret;
23441041562SPavel Begunkov out_err:
235b8d6eb6cSPavel Begunkov if (mapped)
236b8d6eb6cSPavel Begunkov dma_unmap_sgtable(ifq->dev, &mem->page_sg_table,
237b8d6eb6cSPavel Begunkov DMA_FROM_DEVICE, IO_DMA_ATTR);
23841041562SPavel Begunkov sg_free_table(&mem->page_sg_table);
23941041562SPavel Begunkov unpin_user_pages(pages, nr_pages);
24041041562SPavel Begunkov kvfree(pages);
24141041562SPavel Begunkov return ret;
242a5c98e94SPavel Begunkov }
243a5c98e94SPavel Begunkov
io_release_area_mem(struct io_zcrx_mem * mem)244782dfa32SPavel Begunkov static void io_release_area_mem(struct io_zcrx_mem *mem)
245782dfa32SPavel Begunkov {
246a5c98e94SPavel Begunkov if (mem->is_dmabuf) {
247a5c98e94SPavel Begunkov io_release_dmabuf(mem);
248a5c98e94SPavel Begunkov return;
249a5c98e94SPavel Begunkov }
250782dfa32SPavel Begunkov if (mem->pages) {
251782dfa32SPavel Begunkov unpin_user_pages(mem->pages, mem->nr_folios);
2525d93f7baSPavel Begunkov sg_free_table(mem->sgt);
2535d93f7baSPavel Begunkov mem->sgt = NULL;
254782dfa32SPavel Begunkov kvfree(mem->pages);
255782dfa32SPavel Begunkov }
256782dfa32SPavel Begunkov }
257782dfa32SPavel Begunkov
io_import_area(struct io_zcrx_ifq * ifq,struct io_zcrx_mem * mem,struct io_uring_zcrx_area_reg * area_reg)258782dfa32SPavel Begunkov static int io_import_area(struct io_zcrx_ifq *ifq,
259782dfa32SPavel Begunkov struct io_zcrx_mem *mem,
260782dfa32SPavel Begunkov struct io_uring_zcrx_area_reg *area_reg)
261782dfa32SPavel Begunkov {
262782dfa32SPavel Begunkov int ret;
263782dfa32SPavel Begunkov
26401464ea4SPavel Begunkov if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
26501464ea4SPavel Begunkov return -EINVAL;
26601464ea4SPavel Begunkov if (area_reg->rq_area_token)
26701464ea4SPavel Begunkov return -EINVAL;
26801464ea4SPavel Begunkov if (area_reg->__resv2[0] || area_reg->__resv2[1])
26901464ea4SPavel Begunkov return -EINVAL;
27001464ea4SPavel Begunkov
271782dfa32SPavel Begunkov ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
272782dfa32SPavel Begunkov if (ret)
273782dfa32SPavel Begunkov return ret;
274782dfa32SPavel Begunkov if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
275782dfa32SPavel Begunkov return -EINVAL;
276782dfa32SPavel Begunkov
277a5c98e94SPavel Begunkov if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
278a5c98e94SPavel Begunkov return io_import_dmabuf(ifq, mem, area_reg);
279a5c98e94SPavel Begunkov return io_import_umem(ifq, mem, area_reg);
280782dfa32SPavel Begunkov }
281db070446SPavel Begunkov
io_zcrx_unmap_area(struct io_zcrx_ifq * ifq,struct io_zcrx_area * area)282b84621d9SPavel Begunkov static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
283b84621d9SPavel Begunkov struct io_zcrx_area *area)
284db070446SPavel Begunkov {
285db070446SPavel Begunkov int i;
286db070446SPavel Begunkov
2874f602f31SPavel Begunkov guard(mutex)(&ifq->pp_lock);
288b84621d9SPavel Begunkov if (!area->is_mapped)
289b84621d9SPavel Begunkov return;
290b84621d9SPavel Begunkov area->is_mapped = false;
2918a628042SPavel Begunkov
29252dcd177SPavel Begunkov if (area->nia.niovs) {
2938a628042SPavel Begunkov for (i = 0; i < area->nia.num_niovs; i++)
2948a628042SPavel Begunkov net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
29552dcd177SPavel Begunkov }
2968a628042SPavel Begunkov
297b84621d9SPavel Begunkov if (area->mem.is_dmabuf) {
298b84621d9SPavel Begunkov io_release_dmabuf(&area->mem);
299b84621d9SPavel Begunkov } else {
300b84621d9SPavel Begunkov dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table,
301db070446SPavel Begunkov DMA_FROM_DEVICE, IO_DMA_ATTR);
302db070446SPavel Begunkov }
303db070446SPavel Begunkov }
30406897ddfSPavel Begunkov
/*
 * Sync @nr buffers in @netmems for device access.  Each buffer is synced for
 * one niov-sized span starting at its DMA address plus the pool's offset.
 * Compiles to an empty function unless both CONFIG_HAS_DMA and
 * CONFIG_DMA_NEED_SYNC are set, and returns early when the pool's device
 * does not need syncing.
 */
static void zcrx_sync_for_device(struct page_pool *pp, struct io_zcrx_ifq *zcrx,
				 netmem_ref *netmems, unsigned nr)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	struct device *dev = pp->p.dev;
	unsigned niov_size, idx;

	if (!dma_dev_need_sync(dev))
		return;
	niov_size = 1U << zcrx->niov_shift;

	for (idx = 0; idx < nr; idx++) {
		dma_addr_t addr = page_pool_get_dma_addr_netmem(netmems[idx]);

		__dma_sync_single_for_device(dev, addr + pp->p.offset,
					     niov_size, pp->p.dma_dir);
	}
#endif
}
324db070446SPavel Begunkov
3256f377873SDavid Wei #define IO_RQ_MAX_ENTRIES 32768
3266f377873SDavid Wei
327931dfae1SPavel Begunkov #define IO_SKBS_PER_CALL_LIMIT 20
328931dfae1SPavel Begunkov
32911ed914bSDavid Wei struct io_zcrx_args {
33011ed914bSDavid Wei struct io_kiocb *req;
33111ed914bSDavid Wei struct io_zcrx_ifq *ifq;
33211ed914bSDavid Wei struct socket *sock;
333931dfae1SPavel Begunkov unsigned nr_skbs;
33411ed914bSDavid Wei };
33511ed914bSDavid Wei
33634a3e608SPavel Begunkov static const struct memory_provider_ops io_uring_pp_zc_ops;
33734a3e608SPavel Begunkov
io_get_user_counter(struct net_iov * niov)33834a3e608SPavel Begunkov static inline atomic_t *io_get_user_counter(struct net_iov *niov)
33934a3e608SPavel Begunkov {
34034a3e608SPavel Begunkov struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
34134a3e608SPavel Begunkov
34234a3e608SPavel Begunkov return &area->user_refs[net_iov_idx(niov)];
34334a3e608SPavel Begunkov }
34434a3e608SPavel Begunkov
io_zcrx_put_niov_uref(struct net_iov * niov)34534a3e608SPavel Begunkov static bool io_zcrx_put_niov_uref(struct net_iov *niov)
34634a3e608SPavel Begunkov {
34734a3e608SPavel Begunkov atomic_t *uref = io_get_user_counter(niov);
348003049b1SKai Aizen int old;
34934a3e608SPavel Begunkov
350003049b1SKai Aizen old = atomic_read(uref);
351003049b1SKai Aizen do {
352003049b1SKai Aizen if (unlikely(old == 0))
35334a3e608SPavel Begunkov return false;
354003049b1SKai Aizen } while (!atomic_try_cmpxchg(uref, &old, old - 1));
355003049b1SKai Aizen
35634a3e608SPavel Begunkov return true;
35734a3e608SPavel Begunkov }
35834a3e608SPavel Begunkov
io_zcrx_get_niov_uref(struct net_iov * niov)35911ed914bSDavid Wei static void io_zcrx_get_niov_uref(struct net_iov *niov)
36011ed914bSDavid Wei {
36111ed914bSDavid Wei atomic_inc(io_get_user_counter(niov));
36211ed914bSDavid Wei }
36311ed914bSDavid Wei
io_fill_zcrx_offsets(struct io_uring_zcrx_offsets * offsets)3640926f94aSDavid Wei static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets)
3650926f94aSDavid Wei {
3660926f94aSDavid Wei offsets->head = offsetof(struct io_uring, head);
3670926f94aSDavid Wei offsets->tail = offsetof(struct io_uring, tail);
3680926f94aSDavid Wei offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
3690926f94aSDavid Wei }
3700926f94aSDavid Wei
/*
 * Set up the refill ring for @ifq inside a freshly created region.
 *
 * @ctx: ring context the region is created for
 * @ifq: interface queue that will own the region and ring pointers
 * @reg: registration request; reg->offsets is filled in here and
 *       reg->rq_entries determines the required size
 * @rd:  region descriptor; must be large enough for the ring header plus
 *       rq_entries rqes
 * @id:  zcrx id, encoded into the mmap offset of the region
 *
 * Returns 0 on success, -EINVAL if @rd is too small, or the error from
 * io_create_region().
 */
static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
				 struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd,
				 u32 id)
{
	u64 mmap_offset;
	size_t off, size;
	void *ptr;
	int ret;

	io_fill_zcrx_offsets(&reg->offsets);
	off = reg->offsets.rqes;
	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
	if (size > rd->size)
		return -EINVAL;

	/* each zcrx id gets its own mmap offset window */
	mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
	mmap_offset += (u64)id << IORING_OFF_ZCRX_SHIFT;

	ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset);
	if (ret < 0)
		return ret;

	/* ring header at the start of the region, rqes at the computed offset */
	ptr = io_region_get_ptr(&ifq->rq_region);
	ifq->rq.ring = (struct io_uring *)ptr;
	ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off);

	/* start with zeroed head and tail */
	memset(ifq->rq.ring, 0, sizeof(*ifq->rq.ring));
	return 0;
}
4026f377873SDavid Wei
io_free_rbuf_ring(struct io_zcrx_ifq * ifq)4036f377873SDavid Wei static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
4046f377873SDavid Wei {
405ebae09bcSPavel Begunkov io_free_region(ifq->user, &ifq->rq_region);
4066a55a0a7SPavel Begunkov ifq->rq.ring = NULL;
4076a55a0a7SPavel Begunkov ifq->rq.rqes = NULL;
4086f377873SDavid Wei }
4096f377873SDavid Wei
io_zcrx_free_area(struct io_zcrx_ifq * ifq,struct io_zcrx_area * area)410edd706edSDavid Wei static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
411edd706edSDavid Wei struct io_zcrx_area *area)
412cf96310cSDavid Wei {
413edd706edSDavid Wei io_zcrx_unmap_area(ifq, area);
414782dfa32SPavel Begunkov io_release_area_mem(&area->mem);
415db070446SPavel Begunkov
416262ab205SPavel Begunkov if (area->mem.account_pages)
4175c686456SDavid Wei io_unaccount_mem(ifq->user, ifq->mm_account,
4186ab39b39SDavid Wei area->mem.account_pages);
419262ab205SPavel Begunkov
420cf96310cSDavid Wei kvfree(area->freelist);
421cf96310cSDavid Wei kvfree(area->nia.niovs);
42234a3e608SPavel Begunkov kvfree(area->user_refs);
423cf96310cSDavid Wei kfree(area);
424cf96310cSDavid Wei }
425cf96310cSDavid Wei
io_zcrx_append_area(struct io_zcrx_ifq * ifq,struct io_zcrx_area * area)426d425f131SPavel Begunkov static int io_zcrx_append_area(struct io_zcrx_ifq *ifq,
427d425f131SPavel Begunkov struct io_zcrx_area *area)
428d425f131SPavel Begunkov {
4295c727ce0SPavel Begunkov bool kern_readable = !area->mem.is_dmabuf;
4305c727ce0SPavel Begunkov
431f0b92207SPavel Begunkov if (WARN_ON_ONCE(ifq->area))
432d425f131SPavel Begunkov return -EINVAL;
4335c727ce0SPavel Begunkov if (WARN_ON_ONCE(ifq->kern_readable != kern_readable))
4345c727ce0SPavel Begunkov return -EINVAL;
4355c727ce0SPavel Begunkov
436d425f131SPavel Begunkov ifq->area = area;
437d425f131SPavel Begunkov return 0;
438d425f131SPavel Begunkov }
439d425f131SPavel Begunkov
io_zcrx_create_area(struct io_zcrx_ifq * ifq,struct io_uring_zcrx_area_reg * area_reg,struct io_uring_zcrx_ifq_reg * reg)440cf96310cSDavid Wei static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
441795663b4SPavel Begunkov struct io_uring_zcrx_area_reg *area_reg,
442795663b4SPavel Begunkov struct io_uring_zcrx_ifq_reg *reg)
443cf96310cSDavid Wei {
444795663b4SPavel Begunkov int buf_size_shift = PAGE_SHIFT;
445cf96310cSDavid Wei struct io_zcrx_area *area;
446782dfa32SPavel Begunkov unsigned nr_iovs;
447782dfa32SPavel Begunkov int i, ret;
448cf96310cSDavid Wei
449795663b4SPavel Begunkov if (reg->rx_buf_len) {
450795663b4SPavel Begunkov if (!is_power_of_2(reg->rx_buf_len) ||
451795663b4SPavel Begunkov reg->rx_buf_len < PAGE_SIZE)
452795663b4SPavel Begunkov return -EINVAL;
453795663b4SPavel Begunkov buf_size_shift = ilog2(reg->rx_buf_len);
454795663b4SPavel Begunkov }
455a9d00848SPavel Begunkov if (!ifq->dev && buf_size_shift != PAGE_SHIFT)
456a9d00848SPavel Begunkov return -EOPNOTSUPP;
457795663b4SPavel Begunkov
458cf96310cSDavid Wei ret = -ENOMEM;
459bf4afc53SLinus Torvalds area = kzalloc_obj(*area);
460cf96310cSDavid Wei if (!area)
461cf96310cSDavid Wei goto err;
462720df231SPavel Begunkov area->ifq = ifq;
463cf96310cSDavid Wei
464782dfa32SPavel Begunkov ret = io_import_area(ifq, &area->mem, area_reg);
465782dfa32SPavel Begunkov if (ret)
466cf96310cSDavid Wei goto err;
467825f2764SPavel Begunkov if (ifq->dev)
468b8d6eb6cSPavel Begunkov area->is_mapped = true;
469782dfa32SPavel Begunkov
470a9d00848SPavel Begunkov if (ifq->dev && buf_size_shift > io_area_max_shift(&area->mem)) {
471795663b4SPavel Begunkov ret = -ERANGE;
472795663b4SPavel Begunkov goto err;
473795663b4SPavel Begunkov }
474795663b4SPavel Begunkov
475795663b4SPavel Begunkov ifq->niov_shift = buf_size_shift;
476d8d135dfSPavel Begunkov nr_iovs = area->mem.size >> ifq->niov_shift;
4775a17131aSPavel Begunkov area->nia.num_niovs = nr_iovs;
478cf96310cSDavid Wei
479782dfa32SPavel Begunkov ret = -ENOMEM;
48069050f8dSKees Cook area->nia.niovs = kvmalloc_objs(area->nia.niovs[0], nr_iovs,
48131bf77dcSPavel Begunkov GFP_KERNEL_ACCOUNT | __GFP_ZERO);
482cf96310cSDavid Wei if (!area->nia.niovs)
483cf96310cSDavid Wei goto err;
484cf96310cSDavid Wei
4855a17131aSPavel Begunkov area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
48631bf77dcSPavel Begunkov GFP_KERNEL_ACCOUNT | __GFP_ZERO);
487cf96310cSDavid Wei if (!area->freelist)
488cf96310cSDavid Wei goto err;
489cf96310cSDavid Wei
49069050f8dSKees Cook area->user_refs = kvmalloc_objs(area->user_refs[0], nr_iovs,
49131bf77dcSPavel Begunkov GFP_KERNEL_ACCOUNT | __GFP_ZERO);
49234a3e608SPavel Begunkov if (!area->user_refs)
49334a3e608SPavel Begunkov goto err;
49434a3e608SPavel Begunkov
4955a17131aSPavel Begunkov for (i = 0; i < nr_iovs; i++) {
49634a3e608SPavel Begunkov struct net_iov *niov = &area->nia.niovs[i];
49734a3e608SPavel Begunkov
498*735a309bSJakub Kicinski net_iov_init(niov, &area->nia, NET_IOV_IOURING);
49934a3e608SPavel Begunkov area->freelist[i] = i;
50034a3e608SPavel Begunkov atomic_set(&area->user_refs[i], 0);
50134a3e608SPavel Begunkov }
50234a3e608SPavel Begunkov
503825f2764SPavel Begunkov if (ifq->dev) {
504b8d6eb6cSPavel Begunkov ret = io_populate_area_dma(ifq, area);
505b8d6eb6cSPavel Begunkov if (ret)
506b8d6eb6cSPavel Begunkov goto err;
507825f2764SPavel Begunkov }
508b8d6eb6cSPavel Begunkov
5095a17131aSPavel Begunkov area->free_count = nr_iovs;
510cf96310cSDavid Wei /* we're only supporting one area per ifq for now */
511cf96310cSDavid Wei area->area_id = 0;
512cf96310cSDavid Wei area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
513cf96310cSDavid Wei spin_lock_init(&area->freelist_lock);
514d425f131SPavel Begunkov
515d425f131SPavel Begunkov ret = io_zcrx_append_area(ifq, area);
516d425f131SPavel Begunkov if (!ret)
517cf96310cSDavid Wei return 0;
518cf96310cSDavid Wei err:
519cf96310cSDavid Wei if (area)
520edd706edSDavid Wei io_zcrx_free_area(ifq, area);
521cf96310cSDavid Wei return ret;
522cf96310cSDavid Wei }
523cf96310cSDavid Wei
io_zcrx_ifq_alloc(struct io_ring_ctx * ctx)5246f377873SDavid Wei static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
5256f377873SDavid Wei {
5266f377873SDavid Wei struct io_zcrx_ifq *ifq;
5276f377873SDavid Wei
528bf4afc53SLinus Torvalds ifq = kzalloc_obj(*ifq);
5296f377873SDavid Wei if (!ifq)
5306f377873SDavid Wei return NULL;
5316f377873SDavid Wei
5326f377873SDavid Wei ifq->if_rxq = -1;
5336a55a0a7SPavel Begunkov spin_lock_init(&ifq->rq.lock);
5344f602f31SPavel Begunkov mutex_init(&ifq->pp_lock);
53575c299a9SDavid Wei refcount_set(&ifq->refs, 1);
53639c9676fSPavel Begunkov refcount_set(&ifq->user_refs, 1);
5376f377873SDavid Wei return ifq;
5386f377873SDavid Wei }
5396f377873SDavid Wei
io_zcrx_drop_netdev(struct io_zcrx_ifq * ifq)540035af94bSPavel Begunkov static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
541035af94bSPavel Begunkov {
54220dda449SPavel Begunkov guard(mutex)(&ifq->pp_lock);
54320dda449SPavel Begunkov
54420dda449SPavel Begunkov if (!ifq->netdev)
54520dda449SPavel Begunkov return;
546035af94bSPavel Begunkov netdev_put(ifq->netdev, &ifq->netdev_tracker);
547035af94bSPavel Begunkov ifq->netdev = NULL;
548035af94bSPavel Begunkov }
549035af94bSPavel Begunkov
io_close_queue(struct io_zcrx_ifq * ifq)550e0793de2SDavid Wei static void io_close_queue(struct io_zcrx_ifq *ifq)
551e0793de2SDavid Wei {
552e0793de2SDavid Wei struct net_device *netdev;
553e0793de2SDavid Wei netdevice_tracker netdev_tracker;
554e0793de2SDavid Wei struct pp_memory_provider_params p = {
555e0793de2SDavid Wei .mp_ops = &io_uring_pp_zc_ops,
556e0793de2SDavid Wei .mp_priv = ifq,
557e0793de2SDavid Wei };
558e0793de2SDavid Wei
55920dda449SPavel Begunkov scoped_guard(mutex, &ifq->pp_lock) {
560e0793de2SDavid Wei netdev = ifq->netdev;
561e0793de2SDavid Wei netdev_tracker = ifq->netdev_tracker;
562e0793de2SDavid Wei ifq->netdev = NULL;
56320dda449SPavel Begunkov }
564e0793de2SDavid Wei
565e0793de2SDavid Wei if (netdev) {
5661e91c98bSDaniel Borkmann if (ifq->if_rxq != -1) {
5671e91c98bSDaniel Borkmann netdev_lock(netdev);
5681e91c98bSDaniel Borkmann netif_mp_close_rxq(netdev, ifq->if_rxq, &p);
5691e91c98bSDaniel Borkmann netdev_unlock(netdev);
5701e91c98bSDaniel Borkmann }
571e0793de2SDavid Wei netdev_put(netdev, &netdev_tracker);
572e0793de2SDavid Wei }
573e0793de2SDavid Wei ifq->if_rxq = -1;
574e0793de2SDavid Wei }
575e0793de2SDavid Wei
io_zcrx_ifq_free(struct io_zcrx_ifq * ifq)5766f377873SDavid Wei static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
5776f377873SDavid Wei {
578e0793de2SDavid Wei io_close_queue(ifq);
579035af94bSPavel Begunkov
580cf96310cSDavid Wei if (ifq->area)
581edd706edSDavid Wei io_zcrx_free_area(ifq, ifq->area);
5825c686456SDavid Wei if (ifq->mm_account)
5835c686456SDavid Wei mmdrop(ifq->mm_account);
584035af94bSPavel Begunkov if (ifq->dev)
585035af94bSPavel Begunkov put_device(ifq->dev);
586cf96310cSDavid Wei
5876f377873SDavid Wei io_free_rbuf_ring(ifq);
5880fcccfd8SPavel Begunkov free_uid(ifq->user);
5894f602f31SPavel Begunkov mutex_destroy(&ifq->pp_lock);
5906f377873SDavid Wei kfree(ifq);
5916f377873SDavid Wei }
5926f377873SDavid Wei
io_put_zcrx_ifq(struct io_zcrx_ifq * ifq)59375c299a9SDavid Wei static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
59475c299a9SDavid Wei {
59575c299a9SDavid Wei if (refcount_dec_and_test(&ifq->refs))
59675c299a9SDavid Wei io_zcrx_ifq_free(ifq);
59775c299a9SDavid Wei }
59875c299a9SDavid Wei
io_zcrx_return_niov_freelist(struct net_iov * niov)599742cb2e1SDavid Wei static void io_zcrx_return_niov_freelist(struct net_iov *niov)
600742cb2e1SDavid Wei {
601742cb2e1SDavid Wei struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
602742cb2e1SDavid Wei
603898ad80dSPavel Begunkov guard(spinlock_bh)(&area->freelist_lock);
604770594e7SPavel Begunkov if (WARN_ON_ONCE(area->free_count >= area->nia.num_niovs))
605770594e7SPavel Begunkov return;
606742cb2e1SDavid Wei area->freelist[area->free_count++] = net_iov_idx(niov);
607742cb2e1SDavid Wei }
608742cb2e1SDavid Wei
zcrx_get_free_niov(struct io_zcrx_area * area)6097df542a6SPavel Begunkov static struct net_iov *zcrx_get_free_niov(struct io_zcrx_area *area)
6107df542a6SPavel Begunkov {
6117df542a6SPavel Begunkov unsigned niov_idx;
6127df542a6SPavel Begunkov
6137df542a6SPavel Begunkov lockdep_assert_held(&area->freelist_lock);
6147df542a6SPavel Begunkov
6157df542a6SPavel Begunkov if (unlikely(!area->free_count))
6167df542a6SPavel Begunkov return NULL;
6177df542a6SPavel Begunkov
6187df542a6SPavel Begunkov niov_idx = area->freelist[--area->free_count];
6197df542a6SPavel Begunkov return &area->nia.niovs[niov_idx];
6207df542a6SPavel Begunkov }
6217df542a6SPavel Begunkov
io_zcrx_return_niov(struct net_iov * niov)622742cb2e1SDavid Wei static void io_zcrx_return_niov(struct net_iov *niov)
623742cb2e1SDavid Wei {
624742cb2e1SDavid Wei netmem_ref netmem = net_iov_to_netmem(niov);
625742cb2e1SDavid Wei
626742cb2e1SDavid Wei if (!niov->desc.pp) {
627742cb2e1SDavid Wei /* copy fallback allocated niovs */
628742cb2e1SDavid Wei io_zcrx_return_niov_freelist(niov);
629742cb2e1SDavid Wei return;
630742cb2e1SDavid Wei }
631742cb2e1SDavid Wei page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false);
632742cb2e1SDavid Wei }
633742cb2e1SDavid Wei
io_zcrx_scrub(struct io_zcrx_ifq * ifq)634742cb2e1SDavid Wei static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
635742cb2e1SDavid Wei {
636742cb2e1SDavid Wei struct io_zcrx_area *area = ifq->area;
637742cb2e1SDavid Wei int i;
638742cb2e1SDavid Wei
639742cb2e1SDavid Wei if (!area)
640742cb2e1SDavid Wei return;
641742cb2e1SDavid Wei
642742cb2e1SDavid Wei /* Reclaim back all buffers given to the user space. */
643742cb2e1SDavid Wei for (i = 0; i < area->nia.num_niovs; i++) {
644742cb2e1SDavid Wei struct net_iov *niov = &area->nia.niovs[i];
645742cb2e1SDavid Wei int nr;
646742cb2e1SDavid Wei
647742cb2e1SDavid Wei if (!atomic_read(io_get_user_counter(niov)))
648742cb2e1SDavid Wei continue;
649742cb2e1SDavid Wei nr = atomic_xchg(io_get_user_counter(niov), 0);
650742cb2e1SDavid Wei if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
651742cb2e1SDavid Wei io_zcrx_return_niov(niov);
652742cb2e1SDavid Wei }
653742cb2e1SDavid Wei }
654742cb2e1SDavid Wei
zcrx_unregister_user(struct io_zcrx_ifq * ifq)655e5361d25SPavel Begunkov static void zcrx_unregister_user(struct io_zcrx_ifq *ifq)
656d7af80b2SPavel Begunkov {
657d7af80b2SPavel Begunkov if (refcount_dec_and_test(&ifq->user_refs)) {
658d7af80b2SPavel Begunkov io_close_queue(ifq);
659d7af80b2SPavel Begunkov io_zcrx_scrub(ifq);
660d7af80b2SPavel Begunkov }
661e5361d25SPavel Begunkov }
662e5361d25SPavel Begunkov
/*
 * Release both references a registered user holds on an ifq: first the user
 * reference (see zcrx_unregister_user()), then the ifq reference itself.
 */
static void zcrx_unregister(struct io_zcrx_ifq *ifq)
{
	zcrx_unregister_user(ifq);

	io_put_zcrx_ifq(ifq);
}
668d7af80b2SPavel Begunkov
io_zcrx_get_region(struct io_ring_ctx * ctx,unsigned int id)66977231d4eSPavel Begunkov struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
67077231d4eSPavel Begunkov unsigned int id)
67177231d4eSPavel Begunkov {
67276f1cc98SPavel Begunkov struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);
67376f1cc98SPavel Begunkov
67477231d4eSPavel Begunkov lockdep_assert_held(&ctx->mmap_lock);
67577231d4eSPavel Begunkov
676ebae09bcSPavel Begunkov return ifq ? &ifq->rq_region : NULL;
67777231d4eSPavel Begunkov }
67877231d4eSPavel Begunkov
zcrx_box_release(struct inode * inode,struct file * file)679d7af80b2SPavel Begunkov static int zcrx_box_release(struct inode *inode, struct file *file)
680d7af80b2SPavel Begunkov {
681d7af80b2SPavel Begunkov struct io_zcrx_ifq *ifq = file->private_data;
682d7af80b2SPavel Begunkov
683d7af80b2SPavel Begunkov if (WARN_ON_ONCE(!ifq))
684d7af80b2SPavel Begunkov return -EFAULT;
685d7af80b2SPavel Begunkov zcrx_unregister(ifq);
686d7af80b2SPavel Begunkov return 0;
687d7af80b2SPavel Begunkov }
688d7af80b2SPavel Begunkov
689d7af80b2SPavel Begunkov static const struct file_operations zcrx_box_fops = {
690d7af80b2SPavel Begunkov .owner = THIS_MODULE,
691d7af80b2SPavel Begunkov .release = zcrx_box_release,
692d7af80b2SPavel Begunkov };
693d7af80b2SPavel Begunkov
/*
 * ZCRX_CTRL_EXPORT: wrap @ifq into an anonymous "[zcrx]" file and install it
 * into the caller's fd table.
 *
 * The export payload in @ctrl must be all zeroes on entry. The new fd number
 * is written to ctrl->zc_export.zcrx_fd and the whole @ctrl is copied back to
 * @arg before the file is created. The file owns one ifq reference and one
 * user reference, both dropped by zcrx_box_release().
 *
 * Returns 0 on success or a negative errno.
 */
static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
		       struct zcrx_ctrl *ctrl, void __user *arg)
{
	struct zcrx_ctrl_export *exp = &ctrl->zc_export;
	struct file *box;
	int fd;

	if (!mem_is_zero(exp, sizeof(*exp)))
		return -EINVAL;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		return fd;

	/* report the fd to user space while it is still only reserved */
	exp->zcrx_fd = fd;
	if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
		put_unused_fd(fd);
		return -EFAULT;
	}

	/* references owned by the file, released in zcrx_box_release() */
	refcount_inc(&ifq->refs);
	refcount_inc(&ifq->user_refs);

	box = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
					ifq, O_CLOEXEC, NULL);
	if (IS_ERR(box)) {
		put_unused_fd(fd);
		zcrx_unregister(ifq);
		return PTR_ERR(box);
	}

	fd_install(fd, box);
	return 0;
}
727d7af80b2SPavel Begunkov
/*
 * Register an already existing zcrx instance, previously exported as a file
 * by zcrx_export(), with @ctx.
 *
 * @ctx: ring to attach the instance to; needs DEFER_TASKRUN and CQE32 or
 *       CQE_MIXED.
 * @arg: user pointer the updated registration struct is copied back to.
 * @reg: kernel copy of the registration request. reg->if_idx carries the
 *       exported file descriptor; if_rxq, rq_entries, area_ptr and
 *       region_ptr must be zero and no flag other than ZCRX_REG_IMPORT may
 *       be set.
 *
 * On success the ring holds one ifq reference and one user reference, the
 * new id is reported through reg->zcrx_id and 0 is returned. Otherwise a
 * negative errno is returned and the references are dropped again.
 */
static int import_zcrx(struct io_ring_ctx *ctx,
		       struct io_uring_zcrx_ifq_reg __user *arg,
		       struct io_uring_zcrx_ifq_reg *reg)
{
	struct io_zcrx_ifq *ifq;
	struct file *file;
	int fd, ret;
	u32 id;

	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
		return -EINVAL;
	if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
		return -EINVAL;
	if (reg->flags & ~ZCRX_REG_IMPORT)
		return -EINVAL;

	/* for imports if_idx is reused to pass the exported fd */
	fd = reg->if_idx;
	CLASS(fd, f)(fd);
	if (fd_empty(f))
		return -EBADF;

	/* only accept files created by zcrx_export() */
	file = fd_file(f);
	if (file->f_op != &zcrx_box_fops || !file->private_data)
		return -EBADF;

	ifq = file->private_data;
	refcount_inc(&ifq->refs);
	refcount_inc(&ifq->user_refs);

	/* reserve an id with a NULL entry; the ifq is stored only at the end */
	scoped_guard(mutex, &ctx->mmap_lock) {
		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
		if (ret)
			goto err;
	}

	reg->zcrx_id = id;
	io_fill_zcrx_offsets(&reg->offsets);
	if (copy_to_user(arg, reg, sizeof(*reg))) {
		ret = -EFAULT;
		goto err_xa_erase;
	}

	/* publish the ifq under the reserved id */
	scoped_guard(mutex, &ctx->mmap_lock) {
		ret = -ENOMEM;
		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
			goto err_xa_erase;
	}

	return 0;
err_xa_erase:
	scoped_guard(mutex, &ctx->mmap_lock)
		xa_erase(&ctx->zcrx_ctxs, id);
err:
	zcrx_unregister(ifq);
	return ret;
}
78600d91481SDavid Wei
zcrx_register_netdev(struct io_zcrx_ifq * ifq,struct io_uring_zcrx_ifq_reg * reg,struct io_uring_zcrx_area_reg * area)78706fc3b6dSPavel Begunkov static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
78806fc3b6dSPavel Begunkov struct io_uring_zcrx_ifq_reg *reg,
78906fc3b6dSPavel Begunkov struct io_uring_zcrx_area_reg *area)
79006fc3b6dSPavel Begunkov {
79106fc3b6dSPavel Begunkov struct pp_memory_provider_params mp_param = {};
79206fc3b6dSPavel Begunkov unsigned if_rxq = reg->if_rxq;
79306fc3b6dSPavel Begunkov int ret;
79406fc3b6dSPavel Begunkov
79506fc3b6dSPavel Begunkov ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns,
79606fc3b6dSPavel Begunkov reg->if_idx);
79706fc3b6dSPavel Begunkov if (!ifq->netdev)
79806fc3b6dSPavel Begunkov return -ENODEV;
79906fc3b6dSPavel Begunkov
80006fc3b6dSPavel Begunkov netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
80106fc3b6dSPavel Begunkov
80291a4855dSLinus Torvalds ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, if_rxq, NETDEV_QUEUE_TYPE_RX);
80306fc3b6dSPavel Begunkov if (!ifq->dev) {
80406fc3b6dSPavel Begunkov ret = -EOPNOTSUPP;
80506fc3b6dSPavel Begunkov goto netdev_put_unlock;
80606fc3b6dSPavel Begunkov }
80706fc3b6dSPavel Begunkov get_device(ifq->dev);
80806fc3b6dSPavel Begunkov
80906fc3b6dSPavel Begunkov ret = io_zcrx_create_area(ifq, area, reg);
81006fc3b6dSPavel Begunkov if (ret)
81106fc3b6dSPavel Begunkov goto netdev_put_unlock;
81206fc3b6dSPavel Begunkov
81306fc3b6dSPavel Begunkov if (reg->rx_buf_len)
81406fc3b6dSPavel Begunkov mp_param.rx_page_size = 1U << ifq->niov_shift;
81506fc3b6dSPavel Begunkov mp_param.mp_ops = &io_uring_pp_zc_ops;
81606fc3b6dSPavel Begunkov mp_param.mp_priv = ifq;
81791a4855dSLinus Torvalds ret = netif_mp_open_rxq(ifq->netdev, if_rxq, &mp_param, NULL);
81806fc3b6dSPavel Begunkov if (ret)
81906fc3b6dSPavel Begunkov goto netdev_put_unlock;
82006fc3b6dSPavel Begunkov
82106fc3b6dSPavel Begunkov ifq->if_rxq = if_rxq;
82206fc3b6dSPavel Begunkov ret = 0;
82306fc3b6dSPavel Begunkov netdev_put_unlock:
82406fc3b6dSPavel Begunkov netdev_unlock(ifq->netdev);
82506fc3b6dSPavel Begunkov return ret;
82606fc3b6dSPavel Begunkov }
82706fc3b6dSPavel Begunkov
/*
 * Register a zero-copy receive interface queue with @ctx.
 *
 * @ctx: ring to register with; needs DEFER_TASKRUN and CQE32 or CQE_MIXED.
 * @arg: user pointer to a struct io_uring_zcrx_ifq_reg. It is read on entry
 *       and written back, together with the region and area descriptors it
 *       points to, on success.
 *
 * Requests with ZCRX_REG_IMPORT set are handed to import_zcrx(). Otherwise a
 * new ifq is allocated, an id is reserved in ctx->zcrx_ctxs, the refill ring
 * is allocated, the buffer area is created (bound to a netdev rx queue
 * unless ZCRX_REG_NODEV is set) and finally the ifq is published under the
 * reserved id.
 *
 * Returns 0 on success or a negative errno. Requires CAP_NET_ADMIN.
 */
int io_register_zcrx(struct io_ring_ctx *ctx,
		     struct io_uring_zcrx_ifq_reg __user *arg)
{
	struct io_uring_zcrx_area_reg area;
	struct io_uring_zcrx_ifq_reg reg;
	struct io_uring_region_desc rd;
	struct io_zcrx_ifq *ifq;
	int ret;
	u32 id;

	/*
	 * 1. Interface queue allocation.
	 * 2. It can observe data destined for sockets of other tasks.
	 */
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* mandatory io_uring features for zc rx */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
		return -EINVAL;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || reg.zcrx_id)
		return -EINVAL;
	if (reg.flags & ~ZCRX_SUPPORTED_REG_FLAGS)
		return -EINVAL;
	if (reg.flags & ZCRX_REG_IMPORT)
		return import_zcrx(ctx, arg, &reg);
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (reg.if_rxq == -1 || !reg.rq_entries)
		return -EINVAL;
	/* a device-less registration must not name a device or queue */
	if ((reg.if_rxq || reg.if_idx) && (reg.flags & ZCRX_REG_NODEV))
		return -EINVAL;
	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
		if (!(ctx->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		reg.rq_entries = IO_RQ_MAX_ENTRIES;
	}
	/* the ring is indexed with (nr_entries - 1) as a mask */
	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);

	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
		return -EFAULT;

	ifq = io_zcrx_ifq_alloc(ctx);
	if (!ifq)
		return -ENOMEM;

	if (ctx->user) {
		get_uid(ctx->user);
		ifq->user = ctx->user;
	}
	if (ctx->mm_account) {
		mmgrab(ctx->mm_account);
		ifq->mm_account = ctx->mm_account;
	}
	ifq->rq.nr_entries = reg.rq_entries;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* preallocate id */
		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
		if (ret)
			goto ifq_free;
	}

	ret = io_allocate_rbuf_ring(ctx, ifq, &reg, &rd, id);
	if (ret)
		goto err;

	ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF);

	if (!(reg.flags & ZCRX_REG_NODEV)) {
		ret = zcrx_register_netdev(ifq, &reg, &area);
		if (ret)
			goto err;
	} else {
		ret = io_zcrx_create_area(ifq, &area, &reg);
		if (ret)
			goto err;
	}

	reg.zcrx_id = id;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* publish ifq */
		ret = -ENOMEM;
		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
			goto err;
	}

	reg.rx_buf_len = 1U << ifq->niov_shift;

	if (copy_to_user(arg, &reg, sizeof(reg)) ||
	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
		ret = -EFAULT;
		goto err;
	}
	return 0;
err:
	/* drop the reserved (or already published) id */
	scoped_guard(mutex, &ctx->mmap_lock)
		xa_erase(&ctx->zcrx_ctxs, id);
ifq_free:
	zcrx_unregister(ifq);
	return ret;
}
9366f377873SDavid Wei
is_zcrx_entry_marked(struct io_ring_ctx * ctx,unsigned long id)937e5361d25SPavel Begunkov static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id)
938e5361d25SPavel Begunkov {
9398ae2837dSPavel Begunkov return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_1);
940e5361d25SPavel Begunkov }
941e5361d25SPavel Begunkov
set_zcrx_entry_mark(struct io_ring_ctx * ctx,unsigned long id)942e5361d25SPavel Begunkov static inline void set_zcrx_entry_mark(struct io_ring_ctx *ctx, unsigned long id)
943e5361d25SPavel Begunkov {
9448ae2837dSPavel Begunkov xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_1);
945e5361d25SPavel Begunkov }
946e5361d25SPavel Begunkov
io_terminate_zcrx(struct io_ring_ctx * ctx)947e5361d25SPavel Begunkov void io_terminate_zcrx(struct io_ring_ctx *ctx)
948e5361d25SPavel Begunkov {
949e5361d25SPavel Begunkov struct io_zcrx_ifq *ifq;
950e5361d25SPavel Begunkov unsigned long id = 0;
951e5361d25SPavel Begunkov
952e5361d25SPavel Begunkov lockdep_assert_held(&ctx->uring_lock);
953e5361d25SPavel Begunkov
954e5361d25SPavel Begunkov while (1) {
955e5361d25SPavel Begunkov scoped_guard(mutex, &ctx->mmap_lock)
956e5361d25SPavel Begunkov ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
957e5361d25SPavel Begunkov if (!ifq)
958e5361d25SPavel Begunkov break;
959e5361d25SPavel Begunkov if (WARN_ON_ONCE(is_zcrx_entry_marked(ctx, id)))
960e5361d25SPavel Begunkov break;
961e5361d25SPavel Begunkov set_zcrx_entry_mark(ctx, id);
962e5361d25SPavel Begunkov id++;
963e5361d25SPavel Begunkov zcrx_unregister_user(ifq);
964e5361d25SPavel Begunkov }
965e5361d25SPavel Begunkov }
966e5361d25SPavel Begunkov
io_unregister_zcrx(struct io_ring_ctx * ctx)9677c713dd0SPavel Begunkov void io_unregister_zcrx(struct io_ring_ctx *ctx)
9681bd95163SDavid Wei {
9691bd95163SDavid Wei struct io_zcrx_ifq *ifq;
9701bd95163SDavid Wei
9711bd95163SDavid Wei lockdep_assert_held(&ctx->uring_lock);
9721bd95163SDavid Wei
9731bd95163SDavid Wei while (1) {
9741bd95163SDavid Wei scoped_guard(mutex, &ctx->mmap_lock) {
9751bd95163SDavid Wei unsigned long id = 0;
9761bd95163SDavid Wei
9771bd95163SDavid Wei ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
978e5361d25SPavel Begunkov if (ifq) {
979e5361d25SPavel Begunkov if (WARN_ON_ONCE(!is_zcrx_entry_marked(ctx, id))) {
980e5361d25SPavel Begunkov ifq = NULL;
981e5361d25SPavel Begunkov break;
982e5361d25SPavel Begunkov }
9831bd95163SDavid Wei xa_erase(&ctx->zcrx_ctxs, id);
9841bd95163SDavid Wei }
985e5361d25SPavel Begunkov }
9861bd95163SDavid Wei if (!ifq)
9871bd95163SDavid Wei break;
988e5361d25SPavel Begunkov io_put_zcrx_ifq(ifq);
9891bd95163SDavid Wei }
9901bd95163SDavid Wei
9911bd95163SDavid Wei xa_destroy(&ctx->zcrx_ctxs);
9921bd95163SDavid Wei }
9931bd95163SDavid Wei
zcrx_rq_entries(struct zcrx_rq * rq)9946a55a0a7SPavel Begunkov static inline u32 zcrx_rq_entries(struct zcrx_rq *rq)
99534a3e608SPavel Begunkov {
99634a3e608SPavel Begunkov u32 entries;
99734a3e608SPavel Begunkov
9986a55a0a7SPavel Begunkov entries = smp_load_acquire(&rq->ring->tail) - rq->cached_head;
9996a55a0a7SPavel Begunkov return min(entries, rq->nr_entries);
100034a3e608SPavel Begunkov }
100134a3e608SPavel Begunkov
zcrx_next_rqe(struct zcrx_rq * rq,unsigned mask)10026a55a0a7SPavel Begunkov static struct io_uring_zcrx_rqe *zcrx_next_rqe(struct zcrx_rq *rq, unsigned mask)
100334a3e608SPavel Begunkov {
10046a55a0a7SPavel Begunkov unsigned int idx = rq->cached_head++ & mask;
100534a3e608SPavel Begunkov
10066a55a0a7SPavel Begunkov return &rq->rqes[idx];
100734a3e608SPavel Begunkov }
100834a3e608SPavel Begunkov
io_parse_rqe(struct io_uring_zcrx_rqe * rqe,struct io_zcrx_ifq * ifq,struct net_iov ** ret_niov)10098fd08d8dSPavel Begunkov static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe,
10108fd08d8dSPavel Begunkov struct io_zcrx_ifq *ifq,
10118fd08d8dSPavel Begunkov struct net_iov **ret_niov)
10128fd08d8dSPavel Begunkov {
1013531bb98aSPavel Begunkov __u64 off = READ_ONCE(rqe->off);
10148fd08d8dSPavel Begunkov unsigned niov_idx, area_idx;
10158fd08d8dSPavel Begunkov struct io_zcrx_area *area;
10168fd08d8dSPavel Begunkov
1017531bb98aSPavel Begunkov area_idx = off >> IORING_ZCRX_AREA_SHIFT;
1018531bb98aSPavel Begunkov niov_idx = (off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift;
10198fd08d8dSPavel Begunkov
10208fd08d8dSPavel Begunkov if (unlikely(rqe->__pad || area_idx))
10218fd08d8dSPavel Begunkov return false;
10228fd08d8dSPavel Begunkov area = ifq->area;
10238fd08d8dSPavel Begunkov
10248fd08d8dSPavel Begunkov if (unlikely(niov_idx >= area->nia.num_niovs))
10258fd08d8dSPavel Begunkov return false;
10268fd08d8dSPavel Begunkov niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
10278fd08d8dSPavel Begunkov
10288fd08d8dSPavel Begunkov *ret_niov = &area->nia.niovs[niov_idx];
10298fd08d8dSPavel Begunkov return true;
10308fd08d8dSPavel Begunkov }
10318fd08d8dSPavel Begunkov
/*
 * Refill @netmems from the refill queue.
 *
 * Consumes up to @to_alloc entries from the ring under rq->lock. An entry
 * ends up in @netmems only if it parses, a user reference could be dropped
 * for it, that was the last page pool reference, and the niov belongs to
 * @pp. The consumed head is published to the shared ring at the end.
 *
 * Returns the number of netmems stored.
 */
static unsigned io_zcrx_ring_refill(struct page_pool *pp,
				    struct io_zcrx_ifq *ifq,
				    netmem_ref *netmems, unsigned to_alloc)
{
	struct zcrx_rq *rq = &ifq->rq;
	unsigned int mask = rq->nr_entries - 1;
	unsigned int todo;
	unsigned filled = 0;

	guard(spinlock_bh)(&rq->lock);

	todo = min_t(unsigned, zcrx_rq_entries(rq), to_alloc);
	if (unlikely(!todo))
		return 0;

	for (; todo; todo--) {
		struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask);
		struct net_iov *niov;
		netmem_ref netmem;

		/* invalid entries are consumed and skipped */
		if (!io_parse_rqe(rqe, ifq, &niov))
			continue;
		if (!io_zcrx_put_niov_uref(niov))
			continue;

		netmem = net_iov_to_netmem(niov);
		if (!page_pool_unref_and_test(netmem))
			continue;

		/* not owned by this pool: return it instead of handing it out */
		if (unlikely(niov->desc.pp != pp)) {
			io_zcrx_return_niov(niov);
			continue;
		}

		netmems[filled++] = netmem;
	}

	smp_store_release(&rq->ring->head, rq->cached_head);
	return filled;
}
107434a3e608SPavel Begunkov
/*
 * Refill @netmems from the area freelist.
 *
 * Takes up to @to_alloc niovs off the freelist under freelist_lock and
 * attaches each of them to @pp. Returns the number of netmems stored.
 */
static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq,
				    netmem_ref *netmems, unsigned to_alloc)
{
	struct io_zcrx_area *area = ifq->area;
	unsigned nr = 0;

	guard(spinlock_bh)(&area->freelist_lock);

	while (nr < to_alloc) {
		struct net_iov *niov = zcrx_get_free_niov(area);

		if (!niov)
			break;
		net_mp_niov_set_page_pool(pp, niov);
		netmems[nr++] = net_iov_to_netmem(niov);
	}
	return nr;
}
109334a3e608SPavel Begunkov
io_pp_zc_alloc_netmems(struct page_pool * pp,gfp_t gfp)109434a3e608SPavel Begunkov static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
109534a3e608SPavel Begunkov {
109670e4f9bfSPavel Begunkov struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
1097c0989138SPavel Begunkov netmem_ref *netmems = pp->alloc.cache;
1098c0989138SPavel Begunkov unsigned to_alloc = PP_ALLOC_CACHE_REFILL;
1099c0989138SPavel Begunkov unsigned allocated;
110034a3e608SPavel Begunkov
110134a3e608SPavel Begunkov /* pp should already be ensuring that */
110248f253d6SPavel Begunkov if (WARN_ON_ONCE(pp->alloc.count))
110348f253d6SPavel Begunkov return 0;
110434a3e608SPavel Begunkov
1105c0989138SPavel Begunkov allocated = io_zcrx_ring_refill(pp, ifq, netmems, to_alloc);
1106c0989138SPavel Begunkov if (likely(allocated))
110734a3e608SPavel Begunkov goto out_return;
110834a3e608SPavel Begunkov
1109c0989138SPavel Begunkov allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc);
1110c0989138SPavel Begunkov if (!allocated)
111134a3e608SPavel Begunkov return 0;
111234a3e608SPavel Begunkov out_return:
111361cfadaaSPavel Begunkov zcrx_sync_for_device(pp, ifq, netmems, allocated);
1114c0989138SPavel Begunkov allocated--;
1115c0989138SPavel Begunkov pp->alloc.count += allocated;
1116c0989138SPavel Begunkov return netmems[allocated];
111734a3e608SPavel Begunkov }
111834a3e608SPavel Begunkov
io_pp_zc_release_netmem(struct page_pool * pp,netmem_ref netmem)111934a3e608SPavel Begunkov static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
112034a3e608SPavel Begunkov {
112134a3e608SPavel Begunkov struct net_iov *niov;
112234a3e608SPavel Begunkov
112334a3e608SPavel Begunkov if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
112434a3e608SPavel Begunkov return false;
112534a3e608SPavel Begunkov
112634a3e608SPavel Begunkov niov = netmem_to_net_iov(netmem);
112734a3e608SPavel Begunkov net_mp_niov_clear_page_pool(niov);
112834a3e608SPavel Begunkov io_zcrx_return_niov_freelist(niov);
112934a3e608SPavel Begunkov return false;
113034a3e608SPavel Begunkov }
113134a3e608SPavel Begunkov
io_pp_zc_init(struct page_pool * pp)113234a3e608SPavel Begunkov static int io_pp_zc_init(struct page_pool *pp)
113334a3e608SPavel Begunkov {
113470e4f9bfSPavel Begunkov struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
113534a3e608SPavel Begunkov
113634a3e608SPavel Begunkov if (WARN_ON_ONCE(!ifq))
113734a3e608SPavel Begunkov return -EINVAL;
1138db070446SPavel Begunkov if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
1139db070446SPavel Begunkov return -EINVAL;
1140db070446SPavel Begunkov if (WARN_ON_ONCE(!pp->dma_map))
114134a3e608SPavel Begunkov return -EOPNOTSUPP;
1142d8d135dfSPavel Begunkov if (pp->p.order + PAGE_SHIFT != ifq->niov_shift)
1143d8d135dfSPavel Begunkov return -EINVAL;
1144db070446SPavel Begunkov if (pp->p.dma_dir != DMA_FROM_DEVICE)
1145db070446SPavel Begunkov return -EOPNOTSUPP;
114634a3e608SPavel Begunkov
114775c299a9SDavid Wei refcount_inc(&ifq->refs);
114834a3e608SPavel Begunkov return 0;
114934a3e608SPavel Begunkov }
115034a3e608SPavel Begunkov
/*
 * ->destroy() of the io_uring memory provider: drop the ifq reference taken
 * in io_pp_zc_init().
 */
static void io_pp_zc_destroy(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	io_put_zcrx_ifq(ifq);
}
115534a3e608SPavel Begunkov
io_pp_nl_fill(void * mp_priv,struct sk_buff * rsp,struct netdev_rx_queue * rxq)115634a3e608SPavel Begunkov static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
115734a3e608SPavel Begunkov struct netdev_rx_queue *rxq)
115834a3e608SPavel Begunkov {
115934a3e608SPavel Begunkov struct nlattr *nest;
116034a3e608SPavel Begunkov int type;
116134a3e608SPavel Begunkov
116234a3e608SPavel Begunkov type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
116334a3e608SPavel Begunkov nest = nla_nest_start(rsp, type);
116434a3e608SPavel Begunkov if (!nest)
116534a3e608SPavel Begunkov return -EMSGSIZE;
116634a3e608SPavel Begunkov nla_nest_end(rsp, nest);
116734a3e608SPavel Begunkov
116834a3e608SPavel Begunkov return 0;
116934a3e608SPavel Begunkov }
117034a3e608SPavel Begunkov
io_pp_uninstall(void * mp_priv,struct netdev_rx_queue * rxq)117134a3e608SPavel Begunkov static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
117234a3e608SPavel Begunkov {
117334a3e608SPavel Begunkov struct pp_memory_provider_params *p = &rxq->mp_params;
117434a3e608SPavel Begunkov struct io_zcrx_ifq *ifq = mp_priv;
117534a3e608SPavel Begunkov
117634a3e608SPavel Begunkov io_zcrx_drop_netdev(ifq);
1177f12ecf5eSPavel Begunkov if (ifq->area)
1178f12ecf5eSPavel Begunkov io_zcrx_unmap_area(ifq, ifq->area);
1179f12ecf5eSPavel Begunkov
118034a3e608SPavel Begunkov p->mp_ops = NULL;
118134a3e608SPavel Begunkov p->mp_priv = NULL;
118234a3e608SPavel Begunkov }
118334a3e608SPavel Begunkov
118434a3e608SPavel Begunkov static const struct memory_provider_ops io_uring_pp_zc_ops = {
118534a3e608SPavel Begunkov .alloc_netmems = io_pp_zc_alloc_netmems,
118634a3e608SPavel Begunkov .release_netmem = io_pp_zc_release_netmem,
118734a3e608SPavel Begunkov .init = io_pp_zc_init,
118834a3e608SPavel Begunkov .destroy = io_pp_zc_destroy,
118934a3e608SPavel Begunkov .nl_fill = io_pp_nl_fill,
119034a3e608SPavel Begunkov .uninstall = io_pp_uninstall,
119134a3e608SPavel Begunkov };
119211ed914bSDavid Wei
/*
 * Pull up to @nr entries from the refill queue into @netmem_array.
 *
 * Parsing stops at the first entry io_parse_rqe() rejects. That entry has
 * already been consumed from the ring but is not stored. The consumed head
 * is then published to the shared ring.
 *
 * Returns the number of netmems stored. No locking is done here.
 */
static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
			      struct io_zcrx_ifq *zcrx, struct zcrx_rq *rq)
{
	unsigned int mask = rq->nr_entries - 1;
	unsigned int parsed = 0;

	nr = min(nr, zcrx_rq_entries(rq));
	while (parsed < nr) {
		struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask);
		struct net_iov *niov;

		if (!io_parse_rqe(rqe, zcrx, &niov))
			break;
		netmem_array[parsed++] = net_iov_to_netmem(niov);
	}

	smp_store_release(&rq->ring->head, rq->cached_head);
	return parsed;
}
1212475eb39bSPavel Begunkov
/* Number of refill queue entries zcrx_flush_rq() handles per lock hold. */
#define ZCRX_FLUSH_BATCH	32
1214475eb39bSPavel Begunkov
/*
 * For each of the @nr netmems drop one user reference and one page pool
 * reference; buffers whose last page pool reference went away are returned
 * via io_zcrx_return_niov().
 */
static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr)
{
	unsigned idx;

	for (idx = 0; idx < nr; idx++) {
		netmem_ref netmem = netmems[idx];
		struct net_iov *niov = netmem_to_net_iov(netmem);

		/* the pool reference is only dropped if a user ref was held */
		if (io_zcrx_put_niov_uref(niov) &&
		    page_pool_unref_and_test(netmem))
			io_zcrx_return_niov(niov);
	}
}
1230475eb39bSPavel Begunkov
zcrx_flush_rq(struct io_ring_ctx * ctx,struct io_zcrx_ifq * zcrx,struct zcrx_ctrl * ctrl)1231475eb39bSPavel Begunkov static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
1232475eb39bSPavel Begunkov struct zcrx_ctrl *ctrl)
1233475eb39bSPavel Begunkov {
1234475eb39bSPavel Begunkov struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush;
1235475eb39bSPavel Begunkov netmem_ref netmems[ZCRX_FLUSH_BATCH];
1236475eb39bSPavel Begunkov unsigned total = 0;
1237475eb39bSPavel Begunkov unsigned nr;
1238475eb39bSPavel Begunkov
1239475eb39bSPavel Begunkov if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv)))
1240475eb39bSPavel Begunkov return -EINVAL;
1241475eb39bSPavel Begunkov
1242475eb39bSPavel Begunkov do {
12436a55a0a7SPavel Begunkov struct zcrx_rq *rq = &zcrx->rq;
12446a55a0a7SPavel Begunkov
12456a55a0a7SPavel Begunkov scoped_guard(spinlock_bh, &rq->lock) {
12466a55a0a7SPavel Begunkov nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx, rq);
1247475eb39bSPavel Begunkov zcrx_return_buffers(netmems, nr);
1248af07330eSPavel Begunkov }
1249af07330eSPavel Begunkov
1250475eb39bSPavel Begunkov total += nr;
1251475eb39bSPavel Begunkov
1252475eb39bSPavel Begunkov if (fatal_signal_pending(current))
1253475eb39bSPavel Begunkov break;
1254475eb39bSPavel Begunkov cond_resched();
12556a55a0a7SPavel Begunkov } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq.nr_entries);
1256475eb39bSPavel Begunkov
1257475eb39bSPavel Begunkov return 0;
1258475eb39bSPavel Begunkov }
1259475eb39bSPavel Begunkov
/*
 * Entry point for zcrx control operations.
 *
 * @ctx:     ring the request was issued on.
 * @arg:     user pointer to a struct zcrx_ctrl.
 * @nr_args: must be 0.
 *
 * Looks up the instance named by ctrl.zcrx_id and dispatches on ctrl.op.
 * Returns the handler's result, -ENXIO for an unknown id and -EOPNOTSUPP for
 * an unknown op.
 */
int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
	struct io_zcrx_ifq *zcrx;
	struct zcrx_ctrl ctrl;

	/* the per-op payloads must all have the same size */
	BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush));

	if (nr_args)
		return -EINVAL;
	if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
		return -EFAULT;
	/*
	 * NOTE(review): non-zero reserved bytes yield -EFAULT here, whereas the
	 * per-op handlers use -EINVAL for the same condition - confirm this is
	 * intended before changing, user space may depend on it.
	 */
	if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv)))
		return -EFAULT;

	zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
	if (!zcrx)
		return -ENXIO;

	switch (ctrl.op) {
	case ZCRX_CTRL_FLUSH_RQ:
		return zcrx_flush_rq(ctx, zcrx, &ctrl);
	case ZCRX_CTRL_EXPORT:
		return zcrx_export(ctx, zcrx, &ctrl, arg);
	default:
		return -EOPNOTSUPP;
	}
}
1287d663976dSPavel Begunkov
io_zcrx_queue_cqe(struct io_kiocb * req,struct net_iov * niov,struct io_zcrx_ifq * ifq,int off,int len)128811ed914bSDavid Wei static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
128911ed914bSDavid Wei struct io_zcrx_ifq *ifq, int off, int len)
129011ed914bSDavid Wei {
1291c986f758SJens Axboe struct io_ring_ctx *ctx = req->ctx;
129211ed914bSDavid Wei struct io_uring_zcrx_cqe *rcqe;
129311ed914bSDavid Wei struct io_zcrx_area *area;
129411ed914bSDavid Wei struct io_uring_cqe *cqe;
129511ed914bSDavid Wei u64 offset;
129611ed914bSDavid Wei
1297c986f758SJens Axboe if (!io_defer_get_uncommited_cqe(ctx, &cqe))
129811ed914bSDavid Wei return false;
129911ed914bSDavid Wei
130011ed914bSDavid Wei cqe->user_data = req->cqe.user_data;
130111ed914bSDavid Wei cqe->res = len;
130211ed914bSDavid Wei cqe->flags = IORING_CQE_F_MORE;
1303c986f758SJens Axboe if (ctx->flags & IORING_SETUP_CQE_MIXED)
1304c986f758SJens Axboe cqe->flags |= IORING_CQE_F_32;
130511ed914bSDavid Wei
130611ed914bSDavid Wei area = io_zcrx_iov_to_area(niov);
1307d8d135dfSPavel Begunkov offset = off + (net_iov_idx(niov) << ifq->niov_shift);
130811ed914bSDavid Wei rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
130911ed914bSDavid Wei rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
131011ed914bSDavid Wei rcqe->__pad = 0;
131111ed914bSDavid Wei return true;
131211ed914bSDavid Wei }
131311ed914bSDavid Wei
io_alloc_fallback_niov(struct io_zcrx_ifq * ifq)131402bb047bSPavel Begunkov static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq)
1315bc57c7d3SPavel Begunkov {
131602bb047bSPavel Begunkov struct io_zcrx_area *area = ifq->area;
1317bc57c7d3SPavel Begunkov struct net_iov *niov = NULL;
1318bc57c7d3SPavel Begunkov
13195c727ce0SPavel Begunkov if (!ifq->kern_readable)
132002bb047bSPavel Begunkov return NULL;
132102bb047bSPavel Begunkov
13227df542a6SPavel Begunkov scoped_guard(spinlock_bh, &area->freelist_lock)
13237df542a6SPavel Begunkov niov = zcrx_get_free_niov(area);
1324bc57c7d3SPavel Begunkov
1325bc57c7d3SPavel Begunkov if (niov)
1326bc57c7d3SPavel Begunkov page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
1327bc57c7d3SPavel Begunkov return niov;
1328bc57c7d3SPavel Begunkov }
1329bc57c7d3SPavel Begunkov
1330e67645bbSPavel Begunkov struct io_copy_cache {
1331e67645bbSPavel Begunkov struct page *page;
1332e67645bbSPavel Begunkov unsigned long offset;
1333e67645bbSPavel Begunkov size_t size;
1334e67645bbSPavel Begunkov };
1335e67645bbSPavel Begunkov
io_copy_page(struct io_copy_cache * cc,struct page * src_page,unsigned int src_offset,size_t len)1336e67645bbSPavel Begunkov static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page,
1337bc57c7d3SPavel Begunkov unsigned int src_offset, size_t len)
1338bc57c7d3SPavel Begunkov {
1339e67645bbSPavel Begunkov size_t copied = 0;
1340e67645bbSPavel Begunkov
1341e67645bbSPavel Begunkov len = min(len, cc->size);
1342e67645bbSPavel Begunkov
1343e67645bbSPavel Begunkov while (len) {
1344e67645bbSPavel Begunkov void *src_addr, *dst_addr;
1345e67645bbSPavel Begunkov struct page *dst_page = cc->page;
1346e67645bbSPavel Begunkov unsigned dst_offset = cc->offset;
1347e67645bbSPavel Begunkov size_t n = len;
1348e67645bbSPavel Begunkov
1349e67645bbSPavel Begunkov if (folio_test_partial_kmap(page_folio(dst_page)) ||
1350e67645bbSPavel Begunkov folio_test_partial_kmap(page_folio(src_page))) {
1351d99c5754SDavid Hildenbrand dst_page += dst_offset / PAGE_SIZE;
1352e67645bbSPavel Begunkov dst_offset = offset_in_page(dst_offset);
1353d99c5754SDavid Hildenbrand src_page += src_offset / PAGE_SIZE;
1354e67645bbSPavel Begunkov src_offset = offset_in_page(src_offset);
1355e67645bbSPavel Begunkov n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset);
1356e67645bbSPavel Begunkov n = min(n, len);
1357e67645bbSPavel Begunkov }
1358e67645bbSPavel Begunkov
1359e67645bbSPavel Begunkov dst_addr = kmap_local_page(dst_page) + dst_offset;
1360e67645bbSPavel Begunkov src_addr = kmap_local_page(src_page) + src_offset;
1361e67645bbSPavel Begunkov
1362e67645bbSPavel Begunkov memcpy(dst_addr, src_addr, n);
1363e67645bbSPavel Begunkov
1364e67645bbSPavel Begunkov kunmap_local(src_addr);
1365e67645bbSPavel Begunkov kunmap_local(dst_addr);
1366e67645bbSPavel Begunkov
1367e67645bbSPavel Begunkov cc->size -= n;
1368e67645bbSPavel Begunkov cc->offset += n;
1369e9a9dcb4SPavel Begunkov src_offset += n;
1370e67645bbSPavel Begunkov len -= n;
1371e67645bbSPavel Begunkov copied += n;
1372e67645bbSPavel Begunkov }
1373e67645bbSPavel Begunkov return copied;
1374e67645bbSPavel Begunkov }
1375e67645bbSPavel Begunkov
io_zcrx_copy_chunk(struct io_kiocb * req,struct io_zcrx_ifq * ifq,struct page * src_page,unsigned int src_offset,size_t len)1376bc57c7d3SPavel Begunkov static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
1377e9a9ddb1SPavel Begunkov struct page *src_page, unsigned int src_offset,
1378e9a9ddb1SPavel Begunkov size_t len)
1379bc57c7d3SPavel Begunkov {
1380bc57c7d3SPavel Begunkov size_t copied = 0;
1381bc57c7d3SPavel Begunkov int ret = 0;
1382bc57c7d3SPavel Begunkov
1383bc57c7d3SPavel Begunkov while (len) {
1384e67645bbSPavel Begunkov struct io_copy_cache cc;
1385bc57c7d3SPavel Begunkov struct net_iov *niov;
1386e67645bbSPavel Begunkov size_t n;
1387bc57c7d3SPavel Begunkov
138802bb047bSPavel Begunkov niov = io_alloc_fallback_niov(ifq);
1389bc57c7d3SPavel Begunkov if (!niov) {
1390bc57c7d3SPavel Begunkov ret = -ENOMEM;
1391bc57c7d3SPavel Begunkov break;
1392bc57c7d3SPavel Begunkov }
1393bc57c7d3SPavel Begunkov
1394e67645bbSPavel Begunkov cc.page = io_zcrx_iov_page(niov);
1395e67645bbSPavel Begunkov cc.offset = 0;
1396e67645bbSPavel Begunkov cc.size = PAGE_SIZE;
1397bc57c7d3SPavel Begunkov
1398e67645bbSPavel Begunkov n = io_copy_page(&cc, src_page, src_offset, len);
1399bc57c7d3SPavel Begunkov
1400e67645bbSPavel Begunkov if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) {
1401bc57c7d3SPavel Begunkov io_zcrx_return_niov(niov);
1402bc57c7d3SPavel Begunkov ret = -ENOSPC;
1403bc57c7d3SPavel Begunkov break;
1404bc57c7d3SPavel Begunkov }
1405bc57c7d3SPavel Begunkov
1406bc57c7d3SPavel Begunkov io_zcrx_get_niov_uref(niov);
1407e67645bbSPavel Begunkov src_offset += n;
1408e67645bbSPavel Begunkov len -= n;
1409e67645bbSPavel Begunkov copied += n;
1410bc57c7d3SPavel Begunkov }
1411bc57c7d3SPavel Begunkov
1412bc57c7d3SPavel Begunkov return copied ? copied : ret;
1413bc57c7d3SPavel Begunkov }
1414bc57c7d3SPavel Begunkov
/*
 * Copy @len bytes starting @off bytes into the page backed fragment @frag
 * via io_zcrx_copy_chunk(). Returns what io_zcrx_copy_chunk() returns.
 */
static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct page *src = skb_frag_page(frag);
	unsigned int src_off = off + skb_frag_off(frag);

	return io_zcrx_copy_chunk(req, ifq, src, src_off, len);
}
1422bc57c7d3SPavel Begunkov
io_zcrx_recv_frag(struct io_kiocb * req,struct io_zcrx_ifq * ifq,const skb_frag_t * frag,int off,int len)142311ed914bSDavid Wei static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
142411ed914bSDavid Wei const skb_frag_t *frag, int off, int len)
142511ed914bSDavid Wei {
142611ed914bSDavid Wei struct net_iov *niov;
1427f0243d2bSPavel Begunkov struct page_pool *pp;
142811ed914bSDavid Wei
142911ed914bSDavid Wei if (unlikely(!skb_frag_is_net_iov(frag)))
1430bc57c7d3SPavel Begunkov return io_zcrx_copy_frag(req, ifq, frag, off, len);
143111ed914bSDavid Wei
143211ed914bSDavid Wei niov = netmem_to_net_iov(frag->netmem);
1433f0243d2bSPavel Begunkov pp = niov->desc.pp;
1434f0243d2bSPavel Begunkov
1435f0243d2bSPavel Begunkov if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq)
143611ed914bSDavid Wei return -EFAULT;
143711ed914bSDavid Wei
143811ed914bSDavid Wei if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
143911ed914bSDavid Wei return -ENOSPC;
144011ed914bSDavid Wei
144111ed914bSDavid Wei /*
144211ed914bSDavid Wei * Prevent it from being recycled while user is accessing it.
144311ed914bSDavid Wei * It has to be done before grabbing a user reference.
144411ed914bSDavid Wei */
144511ed914bSDavid Wei page_pool_ref_netmem(net_iov_to_netmem(niov));
144611ed914bSDavid Wei io_zcrx_get_niov_uref(niov);
144711ed914bSDavid Wei return len;
144811ed914bSDavid Wei }
144911ed914bSDavid Wei
145011ed914bSDavid Wei static int
io_zcrx_recv_skb(read_descriptor_t * desc,struct sk_buff * skb,unsigned int offset,size_t len)145111ed914bSDavid Wei io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
145211ed914bSDavid Wei unsigned int offset, size_t len)
145311ed914bSDavid Wei {
145411ed914bSDavid Wei struct io_zcrx_args *args = desc->arg.data;
145511ed914bSDavid Wei struct io_zcrx_ifq *ifq = args->ifq;
145611ed914bSDavid Wei struct io_kiocb *req = args->req;
145711ed914bSDavid Wei struct sk_buff *frag_iter;
1458bc57c7d3SPavel Begunkov unsigned start, start_off = offset;
145911ed914bSDavid Wei int i, copy, end, off;
146011ed914bSDavid Wei int ret = 0;
146111ed914bSDavid Wei
14626699ec9aSDavid Wei len = min_t(size_t, len, desc->count);
1463fcfd94d6SDavid Wei /*
1464fcfd94d6SDavid Wei * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
1465fcfd94d6SDavid Wei * if desc->count is already 0. This is caused by the if (offset + 1 !=
1466fcfd94d6SDavid Wei * skb->len) check. Return early in this case to break out of
1467fcfd94d6SDavid Wei * __tcp_read_sock().
1468fcfd94d6SDavid Wei */
1469fcfd94d6SDavid Wei if (!len)
1470fcfd94d6SDavid Wei return 0;
1471931dfae1SPavel Begunkov if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
1472931dfae1SPavel Begunkov return -EAGAIN;
1473931dfae1SPavel Begunkov
1474bc57c7d3SPavel Begunkov if (unlikely(offset < skb_headlen(skb))) {
1475bc57c7d3SPavel Begunkov ssize_t copied;
1476bc57c7d3SPavel Begunkov size_t to_copy;
147711ed914bSDavid Wei
1478bc57c7d3SPavel Begunkov to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
1479e9a9ddb1SPavel Begunkov copied = io_zcrx_copy_chunk(req, ifq, virt_to_page(skb->data),
1480e9a9ddb1SPavel Begunkov offset_in_page(skb->data) + offset,
1481e9a9ddb1SPavel Begunkov to_copy);
1482bc57c7d3SPavel Begunkov if (copied < 0) {
1483bc57c7d3SPavel Begunkov ret = copied;
1484bc57c7d3SPavel Begunkov goto out;
1485bc57c7d3SPavel Begunkov }
1486bc57c7d3SPavel Begunkov offset += copied;
1487bc57c7d3SPavel Begunkov len -= copied;
1488bc57c7d3SPavel Begunkov if (!len)
1489bc57c7d3SPavel Begunkov goto out;
1490bc57c7d3SPavel Begunkov if (offset != skb_headlen(skb))
1491bc57c7d3SPavel Begunkov goto out;
1492bc57c7d3SPavel Begunkov }
1493bc57c7d3SPavel Begunkov
1494bc57c7d3SPavel Begunkov start = skb_headlen(skb);
149511ed914bSDavid Wei
149611ed914bSDavid Wei for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
149711ed914bSDavid Wei const skb_frag_t *frag;
149811ed914bSDavid Wei
149911ed914bSDavid Wei if (WARN_ON(start > offset + len))
150011ed914bSDavid Wei return -EFAULT;
150111ed914bSDavid Wei
150211ed914bSDavid Wei frag = &skb_shinfo(skb)->frags[i];
150311ed914bSDavid Wei end = start + skb_frag_size(frag);
150411ed914bSDavid Wei
150511ed914bSDavid Wei if (offset < end) {
150611ed914bSDavid Wei copy = end - offset;
150711ed914bSDavid Wei if (copy > len)
150811ed914bSDavid Wei copy = len;
150911ed914bSDavid Wei
151011ed914bSDavid Wei off = offset - start;
151111ed914bSDavid Wei ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
151211ed914bSDavid Wei if (ret < 0)
151311ed914bSDavid Wei goto out;
151411ed914bSDavid Wei
151511ed914bSDavid Wei offset += ret;
151611ed914bSDavid Wei len -= ret;
151711ed914bSDavid Wei if (len == 0 || ret != copy)
151811ed914bSDavid Wei goto out;
151911ed914bSDavid Wei }
152011ed914bSDavid Wei start = end;
152111ed914bSDavid Wei }
152211ed914bSDavid Wei
152311ed914bSDavid Wei skb_walk_frags(skb, frag_iter) {
152411ed914bSDavid Wei if (WARN_ON(start > offset + len))
152511ed914bSDavid Wei return -EFAULT;
152611ed914bSDavid Wei
152711ed914bSDavid Wei end = start + frag_iter->len;
152811ed914bSDavid Wei if (offset < end) {
152909cfd3c5SPavel Begunkov size_t count;
153009cfd3c5SPavel Begunkov
153111ed914bSDavid Wei copy = end - offset;
153211ed914bSDavid Wei if (copy > len)
153311ed914bSDavid Wei copy = len;
153411ed914bSDavid Wei
153511ed914bSDavid Wei off = offset - start;
153609cfd3c5SPavel Begunkov count = desc->count;
153711ed914bSDavid Wei ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
153809cfd3c5SPavel Begunkov desc->count = count;
153911ed914bSDavid Wei if (ret < 0)
154011ed914bSDavid Wei goto out;
154111ed914bSDavid Wei
154211ed914bSDavid Wei offset += ret;
154311ed914bSDavid Wei len -= ret;
154411ed914bSDavid Wei if (len == 0 || ret != copy)
154511ed914bSDavid Wei goto out;
154611ed914bSDavid Wei }
154711ed914bSDavid Wei start = end;
154811ed914bSDavid Wei }
154911ed914bSDavid Wei
155011ed914bSDavid Wei out:
155111ed914bSDavid Wei if (offset == start_off)
155211ed914bSDavid Wei return ret;
15536699ec9aSDavid Wei desc->count -= (offset - start_off);
155411ed914bSDavid Wei return offset - start_off;
155511ed914bSDavid Wei }
155611ed914bSDavid Wei
io_zcrx_tcp_recvmsg(struct io_kiocb * req,struct io_zcrx_ifq * ifq,struct sock * sk,int flags,unsigned issue_flags,unsigned int * outlen)155711ed914bSDavid Wei static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
155811ed914bSDavid Wei struct sock *sk, int flags,
15596699ec9aSDavid Wei unsigned issue_flags, unsigned int *outlen)
156011ed914bSDavid Wei {
15616699ec9aSDavid Wei unsigned int len = *outlen;
156211ed914bSDavid Wei struct io_zcrx_args args = {
156311ed914bSDavid Wei .req = req,
156411ed914bSDavid Wei .ifq = ifq,
156511ed914bSDavid Wei .sock = sk->sk_socket,
156611ed914bSDavid Wei };
156711ed914bSDavid Wei read_descriptor_t rd_desc = {
15686699ec9aSDavid Wei .count = len ? len : UINT_MAX,
156911ed914bSDavid Wei .arg.data = &args,
157011ed914bSDavid Wei };
157111ed914bSDavid Wei int ret;
157211ed914bSDavid Wei
157311ed914bSDavid Wei lock_sock(sk);
157411ed914bSDavid Wei ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
15756699ec9aSDavid Wei if (len && ret > 0)
15766699ec9aSDavid Wei *outlen = len - ret;
157711ed914bSDavid Wei if (ret <= 0) {
157811ed914bSDavid Wei if (ret < 0 || sock_flag(sk, SOCK_DONE))
157911ed914bSDavid Wei goto out;
158011ed914bSDavid Wei if (sk->sk_err)
158111ed914bSDavid Wei ret = sock_error(sk);
158211ed914bSDavid Wei else if (sk->sk_shutdown & RCV_SHUTDOWN)
158311ed914bSDavid Wei goto out;
158411ed914bSDavid Wei else if (sk->sk_state == TCP_CLOSE)
158511ed914bSDavid Wei ret = -ENOTCONN;
158611ed914bSDavid Wei else
158711ed914bSDavid Wei ret = -EAGAIN;
1588931dfae1SPavel Begunkov } else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
1589931dfae1SPavel Begunkov (issue_flags & IO_URING_F_MULTISHOT)) {
1590931dfae1SPavel Begunkov ret = IOU_REQUEUE;
159111ed914bSDavid Wei } else if (sock_flag(sk, SOCK_DONE)) {
159211ed914bSDavid Wei /* Make it to retry until it finally gets 0. */
159311ed914bSDavid Wei if (issue_flags & IO_URING_F_MULTISHOT)
159411ed914bSDavid Wei ret = IOU_REQUEUE;
159511ed914bSDavid Wei else
159611ed914bSDavid Wei ret = -EAGAIN;
159711ed914bSDavid Wei }
159811ed914bSDavid Wei out:
159911ed914bSDavid Wei release_sock(sk);
160011ed914bSDavid Wei return ret;
160111ed914bSDavid Wei }
160211ed914bSDavid Wei
io_zcrx_recv(struct io_kiocb * req,struct io_zcrx_ifq * ifq,struct socket * sock,unsigned int flags,unsigned issue_flags,unsigned int * len)160311ed914bSDavid Wei int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
160411ed914bSDavid Wei struct socket *sock, unsigned int flags,
16056699ec9aSDavid Wei unsigned issue_flags, unsigned int *len)
160611ed914bSDavid Wei {
160711ed914bSDavid Wei struct sock *sk = sock->sk;
160811ed914bSDavid Wei const struct proto *prot = READ_ONCE(sk->sk_prot);
160911ed914bSDavid Wei
161011ed914bSDavid Wei if (prot->recvmsg != tcp_recvmsg)
161111ed914bSDavid Wei return -EPROTONOSUPPORT;
161211ed914bSDavid Wei
161311ed914bSDavid Wei sock_rps_record_flow(sk);
16146699ec9aSDavid Wei return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
161511ed914bSDavid Wei }
1616