1457c8996SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
24bbd4c77SKirill A. Shutemov #include <linux/kernel.h>
34bbd4c77SKirill A. Shutemov #include <linux/errno.h>
44bbd4c77SKirill A. Shutemov #include <linux/err.h>
54bbd4c77SKirill A. Shutemov #include <linux/spinlock.h>
64bbd4c77SKirill A. Shutemov
74bbd4c77SKirill A. Shutemov #include <linux/mm.h>
889c1905dSVivek Kasireddy #include <linux/memfd.h>
93565fce3SDan Williams #include <linux/memremap.h>
104bbd4c77SKirill A. Shutemov #include <linux/pagemap.h>
114bbd4c77SKirill A. Shutemov #include <linux/rmap.h>
124bbd4c77SKirill A. Shutemov #include <linux/swap.h>
134bbd4c77SKirill A. Shutemov #include <linux/swapops.h>
141507f512SMike Rapoport #include <linux/secretmem.h>
154bbd4c77SKirill A. Shutemov
16174cd4b1SIngo Molnar #include <linux/sched/signal.h>
172667f50eSSteve Capper #include <linux/rwsem.h>
18f30c59e9SAneesh Kumar K.V #include <linux/hugetlb.h>
199a4e9f3bSAneesh Kumar K.V #include <linux/migrate.h>
209a4e9f3bSAneesh Kumar K.V #include <linux/mm_inline.h>
21*4e1d77a8STal Zussman #include <linux/folio_batch.h>
229a4e9f3bSAneesh Kumar K.V #include <linux/sched/mm.h>
23a6e79df9SLorenzo Stoakes #include <linux/shmem_fs.h>
241027e443SKirill A. Shutemov
2533a709b2SDave Hansen #include <asm/mmu_context.h>
261027e443SKirill A. Shutemov #include <asm/tlbflush.h>
272667f50eSSteve Capper
284bbd4c77SKirill A. Shutemov #include "internal.h"
297d0f0f06SKairui Song #include "swap.h"
304bbd4c77SKirill A. Shutemov
sanity_check_pinned_pages(struct page ** pages,unsigned long npages)31b6a2619cSDavid Hildenbrand static inline void sanity_check_pinned_pages(struct page **pages,
32b6a2619cSDavid Hildenbrand unsigned long npages)
33b6a2619cSDavid Hildenbrand {
34b6a2619cSDavid Hildenbrand if (!IS_ENABLED(CONFIG_DEBUG_VM))
35b6a2619cSDavid Hildenbrand return;
36b6a2619cSDavid Hildenbrand
37b6a2619cSDavid Hildenbrand /*
38b6a2619cSDavid Hildenbrand * We only pin anonymous pages if they are exclusive. Once pinned, we
39b6a2619cSDavid Hildenbrand * can no longer turn them possibly shared and PageAnonExclusive() will
40b6a2619cSDavid Hildenbrand * stick around until the page is freed.
41b6a2619cSDavid Hildenbrand *
42b6a2619cSDavid Hildenbrand * We'd like to verify that our pinned anonymous pages are still mapped
43b6a2619cSDavid Hildenbrand * exclusively. The issue with anon THP is that we don't know how
44b6a2619cSDavid Hildenbrand * they are/were mapped when pinning them. However, for anon
45b6a2619cSDavid Hildenbrand * THP we can assume that either the given page (PTE-mapped THP) or
46b6a2619cSDavid Hildenbrand * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
47b6a2619cSDavid Hildenbrand * neither is the case, there is certainly something wrong.
48b6a2619cSDavid Hildenbrand */
49b6a2619cSDavid Hildenbrand for (; npages; npages--, pages++) {
50b6a2619cSDavid Hildenbrand struct page *page = *pages;
51a1268be2SJohn Hubbard struct folio *folio;
52a1268be2SJohn Hubbard
53a1268be2SJohn Hubbard if (!page)
54a1268be2SJohn Hubbard continue;
55a1268be2SJohn Hubbard
56a1268be2SJohn Hubbard folio = page_folio(page);
57b6a2619cSDavid Hildenbrand
58c8070b78SDavid Howells if (is_zero_page(page) ||
59c8070b78SDavid Howells !folio_test_anon(folio))
60b6a2619cSDavid Hildenbrand continue;
61b6a2619cSDavid Hildenbrand if (!folio_test_large(folio) || folio_test_hugetlb(folio))
62792b429dSDavid Hildenbrand VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio);
63b6a2619cSDavid Hildenbrand else
64b6a2619cSDavid Hildenbrand /* Either a PTE-mapped or a PMD-mapped THP. */
65792b429dSDavid Hildenbrand VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) &&
66b6a2619cSDavid Hildenbrand !PageAnonExclusive(page), page);
67b6a2619cSDavid Hildenbrand }
68b6a2619cSDavid Hildenbrand }
69b6a2619cSDavid Hildenbrand
70c24d3732SJann Horn /*
71ece1ed7bSMatthew Wilcox (Oracle) * Return the folio with ref appropriately incremented,
72cd1adf1bSLinus Torvalds * or NULL if that failed.
73a707cdd5SJohn Hubbard */
try_get_folio(struct page * page,int refs)74ece1ed7bSMatthew Wilcox (Oracle) static inline struct folio *try_get_folio(struct page *page, int refs)
75a707cdd5SJohn Hubbard {
76ece1ed7bSMatthew Wilcox (Oracle) struct folio *folio;
77a707cdd5SJohn Hubbard
7859409373SMatthew Wilcox (Oracle) retry:
79ece1ed7bSMatthew Wilcox (Oracle) folio = page_folio(page);
80ece1ed7bSMatthew Wilcox (Oracle) if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
81a707cdd5SJohn Hubbard return NULL;
82fa2690afSYang Shi if (unlikely(!folio_ref_try_add(folio, refs)))
83a707cdd5SJohn Hubbard return NULL;
84c24d3732SJann Horn
85c24d3732SJann Horn /*
86ece1ed7bSMatthew Wilcox (Oracle) * At this point we have a stable reference to the folio; but it
87ece1ed7bSMatthew Wilcox (Oracle) * could be that between calling page_folio() and the refcount
88ece1ed7bSMatthew Wilcox (Oracle) * increment, the folio was split, in which case we'd end up
89ece1ed7bSMatthew Wilcox (Oracle) * holding a reference on a folio that has nothing to do with the page
90c24d3732SJann Horn * we were given anymore.
91ece1ed7bSMatthew Wilcox (Oracle) * So now that the folio is stable, recheck that the page still
92ece1ed7bSMatthew Wilcox (Oracle) * belongs to this folio.
93c24d3732SJann Horn */
94ece1ed7bSMatthew Wilcox (Oracle) if (unlikely(page_folio(page) != folio)) {
95ece1ed7bSMatthew Wilcox (Oracle) folio_put_refs(folio, refs);
9659409373SMatthew Wilcox (Oracle) goto retry;
97c24d3732SJann Horn }
98c24d3732SJann Horn
99ece1ed7bSMatthew Wilcox (Oracle) return folio;
100a707cdd5SJohn Hubbard }
101a707cdd5SJohn Hubbard
gup_put_folio(struct folio * folio,int refs,unsigned int flags)102d8ddc099SMatthew Wilcox (Oracle) static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
1034509b42cSJason Gunthorpe {
1044509b42cSJason Gunthorpe if (flags & FOLL_PIN) {
105c8070b78SDavid Howells if (is_zero_folio(folio))
106c8070b78SDavid Howells return;
107d8ddc099SMatthew Wilcox (Oracle) node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
10831a31da8SDavid Hildenbrand if (folio_has_pincount(folio))
10994688e8eSMatthew Wilcox (Oracle) atomic_sub(refs, &folio->_pincount);
1104509b42cSJason Gunthorpe else
1114509b42cSJason Gunthorpe refs *= GUP_PIN_COUNTING_BIAS;
1124509b42cSJason Gunthorpe }
1134509b42cSJason Gunthorpe
114d8ddc099SMatthew Wilcox (Oracle) folio_put_refs(folio, refs);
1154509b42cSJason Gunthorpe }
1164509b42cSJason Gunthorpe
1173faa52c0SJohn Hubbard /**
118f442fa61SYang Shi * try_grab_folio() - add a folio's refcount by a flag-dependent amount
119f442fa61SYang Shi * @folio: pointer to folio to be grabbed
120f442fa61SYang Shi * @refs: the value to (effectively) add to the folio's refcount
121f442fa61SYang Shi * @flags: gup flags: these are the FOLL_* flag values
1223faa52c0SJohn Hubbard *
1233faa52c0SJohn Hubbard * This might not do anything at all, depending on the flags argument.
1243faa52c0SJohn Hubbard *
1253faa52c0SJohn Hubbard * "grab" names in this file mean, "look at flags to decide whether to use
126f442fa61SYang Shi * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
1273faa52c0SJohn Hubbard *
1283faa52c0SJohn Hubbard * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
129f442fa61SYang Shi * time.
1303faa52c0SJohn Hubbard *
1310f089235SLogan Gunthorpe * Return: 0 for success, or if no action was required (if neither FOLL_PIN
1320f089235SLogan Gunthorpe * nor FOLL_GET was set, nothing is done). A negative error code for failure:
1330f089235SLogan Gunthorpe *
134f442fa61SYang Shi * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not
1350f089235SLogan Gunthorpe * be grabbed.
136f442fa61SYang Shi *
137f442fa61SYang Shi * It is called when we have a stable reference for the folio, typically in
138f442fa61SYang Shi * GUP slow path.
1393faa52c0SJohn Hubbard */
try_grab_folio(struct folio * folio,int refs,unsigned int flags)140f442fa61SYang Shi int __must_check try_grab_folio(struct folio *folio, int refs,
141f442fa61SYang Shi unsigned int flags)
1423faa52c0SJohn Hubbard {
1435fec0719SMatthew Wilcox (Oracle) if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
1440f089235SLogan Gunthorpe return -ENOMEM;
145c36c04c2SJohn Hubbard
14688df6ab2SMatthew Wilcox (Oracle) if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio)))
1474003f107SLogan Gunthorpe return -EREMOTEIO;
148c36c04c2SJohn Hubbard
1495fec0719SMatthew Wilcox (Oracle) if (flags & FOLL_GET)
150f442fa61SYang Shi folio_ref_add(folio, refs);
1515fec0719SMatthew Wilcox (Oracle) else if (flags & FOLL_PIN) {
152c36c04c2SJohn Hubbard /*
153c8070b78SDavid Howells * Don't take a pin on the zero page - it's not going anywhere
154c8070b78SDavid Howells * and it is used in a *lot* of places.
155c8070b78SDavid Howells */
156f442fa61SYang Shi if (is_zero_folio(folio))
157c8070b78SDavid Howells return 0;
158c8070b78SDavid Howells
159c8070b78SDavid Howells /*
160f442fa61SYang Shi * Increment the normal page refcount field at least once,
16178d9d6ceSMatthew Wilcox (Oracle) * so that the page really is pinned.
162c36c04c2SJohn Hubbard */
16331a31da8SDavid Hildenbrand if (folio_has_pincount(folio)) {
164f442fa61SYang Shi folio_ref_add(folio, refs);
165f442fa61SYang Shi atomic_add(refs, &folio->_pincount);
1668ea2979cSMatthew Wilcox (Oracle) } else {
167f442fa61SYang Shi folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS);
1688ea2979cSMatthew Wilcox (Oracle) }
169c36c04c2SJohn Hubbard
170f442fa61SYang Shi node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
171c36c04c2SJohn Hubbard }
172c36c04c2SJohn Hubbard
1730f089235SLogan Gunthorpe return 0;
1743faa52c0SJohn Hubbard }
1753faa52c0SJohn Hubbard
1763faa52c0SJohn Hubbard /**
1773faa52c0SJohn Hubbard * unpin_user_page() - release a dma-pinned page
1783faa52c0SJohn Hubbard * @page: pointer to page to be released
1793faa52c0SJohn Hubbard *
1803faa52c0SJohn Hubbard * Pages that were pinned via pin_user_pages*() must be released via either
1813faa52c0SJohn Hubbard * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
1823faa52c0SJohn Hubbard * that such pages can be separately tracked and uniquely handled. In
1833faa52c0SJohn Hubbard * particular, interactions with RDMA and filesystems need special handling.
1843faa52c0SJohn Hubbard */
unpin_user_page(struct page * page)1853faa52c0SJohn Hubbard void unpin_user_page(struct page *page)
1863faa52c0SJohn Hubbard {
187b6a2619cSDavid Hildenbrand sanity_check_pinned_pages(&page, 1);
188d8ddc099SMatthew Wilcox (Oracle) gup_put_folio(page_folio(page), 1, FOLL_PIN);
1893faa52c0SJohn Hubbard }
1903faa52c0SJohn Hubbard EXPORT_SYMBOL(unpin_user_page);
1913faa52c0SJohn Hubbard
1921101fb8fSDavid Howells /**
1936cc04054SVivek Kasireddy * unpin_folio() - release a dma-pinned folio
1946cc04054SVivek Kasireddy * @folio: pointer to folio to be released
1956cc04054SVivek Kasireddy *
1966cc04054SVivek Kasireddy * Folios that were pinned via memfd_pin_folios() or other similar routines
1976cc04054SVivek Kasireddy * must be released either using unpin_folio() or unpin_folios().
1986cc04054SVivek Kasireddy */
unpin_folio(struct folio * folio)1996cc04054SVivek Kasireddy void unpin_folio(struct folio *folio)
2006cc04054SVivek Kasireddy {
2016cc04054SVivek Kasireddy gup_put_folio(folio, 1, FOLL_PIN);
2026cc04054SVivek Kasireddy }
2036cc04054SVivek Kasireddy EXPORT_SYMBOL_GPL(unpin_folio);
2046cc04054SVivek Kasireddy
2056cc04054SVivek Kasireddy /**
2061101fb8fSDavid Howells * folio_add_pin - Try to get an additional pin on a pinned folio
2071101fb8fSDavid Howells * @folio: The folio to be pinned
2081101fb8fSDavid Howells *
2091101fb8fSDavid Howells * Get an additional pin on a folio we already have a pin on. Makes no change
2101101fb8fSDavid Howells * if the folio is a zero_page.
2111101fb8fSDavid Howells */
folio_add_pin(struct folio * folio)2121101fb8fSDavid Howells void folio_add_pin(struct folio *folio)
2131101fb8fSDavid Howells {
2141101fb8fSDavid Howells if (is_zero_folio(folio))
2151101fb8fSDavid Howells return;
2161101fb8fSDavid Howells
2171101fb8fSDavid Howells /*
2181101fb8fSDavid Howells * Similar to try_grab_folio(): be sure to *also* increment the normal
2191101fb8fSDavid Howells * page refcount field at least once, so that the page really is
2201101fb8fSDavid Howells * pinned.
2211101fb8fSDavid Howells */
22231a31da8SDavid Hildenbrand if (folio_has_pincount(folio)) {
2231101fb8fSDavid Howells WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
2241101fb8fSDavid Howells folio_ref_inc(folio);
2251101fb8fSDavid Howells atomic_inc(&folio->_pincount);
2261101fb8fSDavid Howells } else {
2271101fb8fSDavid Howells WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
2281101fb8fSDavid Howells folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
2291101fb8fSDavid Howells }
2301101fb8fSDavid Howells }
2311101fb8fSDavid Howells
gup_folio_range_next(struct page * start,unsigned long npages,unsigned long i,unsigned int * ntails)232659508f9SMatthew Wilcox (Oracle) static inline struct folio *gup_folio_range_next(struct page *start,
2338f39f5fcSMatthew Wilcox (Oracle) unsigned long npages, unsigned long i, unsigned int *ntails)
234458a4f78SJoao Martins {
235b5ba761aSDavid Hildenbrand struct page *next = start + i;
236659508f9SMatthew Wilcox (Oracle) struct folio *folio = page_folio(next);
237458a4f78SJoao Martins unsigned int nr = 1;
238458a4f78SJoao Martins
239659508f9SMatthew Wilcox (Oracle) if (folio_test_large(folio))
2404c654229SMatthew Wilcox (Oracle) nr = min_t(unsigned int, npages - i,
241659508f9SMatthew Wilcox (Oracle) folio_nr_pages(folio) - folio_page_idx(folio, next));
242458a4f78SJoao Martins
243458a4f78SJoao Martins *ntails = nr;
244659508f9SMatthew Wilcox (Oracle) return folio;
245458a4f78SJoao Martins }
246458a4f78SJoao Martins
/*
 * Look up the folio of @list[@i] and count the run of consecutive entries,
 * starting at @i, that belong to that same folio.
 *
 * @list[@i] itself must not be NULL. Later entries may be: a NULL entry
 * simply ends the run, so callers that skip NULL slots (as
 * unpin_user_pages() does) never have page_folio() applied to one.
 *
 * *@ntails receives the run length (at least 1); the folio is returned.
 */
static inline struct folio *gup_folio_next(struct page **list,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct folio *folio = page_folio(list[i]);
	unsigned long end;

	/* Scan with the same width as @i and @npages. */
	for (end = i + 1; end < npages; end++) {
		if (!list[end] || page_folio(list[end]) != folio)
			break;
	}

	*ntails = end - i;
	return folio;
}
2618745d7f6SJoao Martins
262fc1d8e7cSJohn Hubbard /**
263f1f6a7ddSJohn Hubbard * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
2642d15eb31Sakpm@linux-foundation.org * @pages: array of pages to be maybe marked dirty, and definitely released.
265fc1d8e7cSJohn Hubbard * @npages: number of pages in the @pages array.
2662d15eb31Sakpm@linux-foundation.org * @make_dirty: whether to mark the pages dirty
267fc1d8e7cSJohn Hubbard *
268fc1d8e7cSJohn Hubbard * "gup-pinned page" refers to a page that has had one of the get_user_pages()
269fc1d8e7cSJohn Hubbard * variants called on that page.
270fc1d8e7cSJohn Hubbard *
271fc1d8e7cSJohn Hubbard * For each page in the @pages array, make that page (or its head page, if a
2722d15eb31Sakpm@linux-foundation.org * compound page) dirty, if @make_dirty is true, and if the page was previously
273f1f6a7ddSJohn Hubbard * listed as clean. In any case, releases all pages using unpin_user_page(),
274f1f6a7ddSJohn Hubbard * possibly via unpin_user_pages(), for the non-dirty case.
275fc1d8e7cSJohn Hubbard *
276f1f6a7ddSJohn Hubbard * Please see the unpin_user_page() documentation for details.
277fc1d8e7cSJohn Hubbard *
2782d15eb31Sakpm@linux-foundation.org * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
2792d15eb31Sakpm@linux-foundation.org * required, then the caller should a) verify that this is really correct,
2802d15eb31Sakpm@linux-foundation.org * because _lock() is usually required, and b) hand code it:
281f1f6a7ddSJohn Hubbard * set_page_dirty_lock(), unpin_user_page().
282fc1d8e7cSJohn Hubbard *
283fc1d8e7cSJohn Hubbard */
unpin_user_pages_dirty_lock(struct page ** pages,unsigned long npages,bool make_dirty)284f1f6a7ddSJohn Hubbard void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
2852d15eb31Sakpm@linux-foundation.org bool make_dirty)
286fc1d8e7cSJohn Hubbard {
28712521c76SMatthew Wilcox (Oracle) unsigned long i;
28812521c76SMatthew Wilcox (Oracle) struct folio *folio;
28912521c76SMatthew Wilcox (Oracle) unsigned int nr;
2902d15eb31Sakpm@linux-foundation.org
2912d15eb31Sakpm@linux-foundation.org if (!make_dirty) {
292f1f6a7ddSJohn Hubbard unpin_user_pages(pages, npages);
2932d15eb31Sakpm@linux-foundation.org return;
2942d15eb31Sakpm@linux-foundation.org }
2952d15eb31Sakpm@linux-foundation.org
296b6a2619cSDavid Hildenbrand sanity_check_pinned_pages(pages, npages);
29712521c76SMatthew Wilcox (Oracle) for (i = 0; i < npages; i += nr) {
29812521c76SMatthew Wilcox (Oracle) folio = gup_folio_next(pages, npages, i, &nr);
2992d15eb31Sakpm@linux-foundation.org /*
3002d15eb31Sakpm@linux-foundation.org * Checking PageDirty at this point may race with
3012d15eb31Sakpm@linux-foundation.org * clear_page_dirty_for_io(), but that's OK. Two key
3022d15eb31Sakpm@linux-foundation.org * cases:
3032d15eb31Sakpm@linux-foundation.org *
3042d15eb31Sakpm@linux-foundation.org * 1) This code sees the page as already dirty, so it
3052d15eb31Sakpm@linux-foundation.org * skips the call to set_page_dirty(). That could happen
3062d15eb31Sakpm@linux-foundation.org * because clear_page_dirty_for_io() called
307a929e0d1SKefeng Wang * folio_mkclean(), followed by set_page_dirty().
3082d15eb31Sakpm@linux-foundation.org * However, now the page is going to get written back,
3092d15eb31Sakpm@linux-foundation.org * which meets the original intention of setting it
3102d15eb31Sakpm@linux-foundation.org * dirty, so all is well: clear_page_dirty_for_io() goes
3112d15eb31Sakpm@linux-foundation.org * on to call TestClearPageDirty(), and write the page
3122d15eb31Sakpm@linux-foundation.org * back.
3132d15eb31Sakpm@linux-foundation.org *
3142d15eb31Sakpm@linux-foundation.org * 2) This code sees the page as clean, so it calls
3152d15eb31Sakpm@linux-foundation.org * set_page_dirty(). The page stays dirty, despite being
3162d15eb31Sakpm@linux-foundation.org * written back, so it gets written back again in the
3172d15eb31Sakpm@linux-foundation.org * next writeback cycle. This is harmless.
3182d15eb31Sakpm@linux-foundation.org */
31912521c76SMatthew Wilcox (Oracle) if (!folio_test_dirty(folio)) {
32012521c76SMatthew Wilcox (Oracle) folio_lock(folio);
32112521c76SMatthew Wilcox (Oracle) folio_mark_dirty(folio);
32212521c76SMatthew Wilcox (Oracle) folio_unlock(folio);
32312521c76SMatthew Wilcox (Oracle) }
32412521c76SMatthew Wilcox (Oracle) gup_put_folio(folio, nr, FOLL_PIN);
3252d15eb31Sakpm@linux-foundation.org }
326fc1d8e7cSJohn Hubbard }
327f1f6a7ddSJohn Hubbard EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
328fc1d8e7cSJohn Hubbard
329fc1d8e7cSJohn Hubbard /**
330458a4f78SJoao Martins * unpin_user_page_range_dirty_lock() - release and optionally dirty
331458a4f78SJoao Martins * gup-pinned page range
332458a4f78SJoao Martins *
333458a4f78SJoao Martins * @page: the starting page of a range maybe marked dirty, and definitely released.
334458a4f78SJoao Martins * @npages: number of consecutive pages to release.
335458a4f78SJoao Martins * @make_dirty: whether to mark the pages dirty
336458a4f78SJoao Martins *
337458a4f78SJoao Martins * "gup-pinned page range" refers to a range of pages that has had one of the
338458a4f78SJoao Martins * pin_user_pages() variants called on that page.
339458a4f78SJoao Martins *
340b5ba761aSDavid Hildenbrand * The page range must be truly physically contiguous: the page range
341b5ba761aSDavid Hildenbrand * corresponds to a contiguous PFN range and all pages can be iterated
342b5ba761aSDavid Hildenbrand * naturally.
343b5ba761aSDavid Hildenbrand *
344458a4f78SJoao Martins * For the page ranges defined by [page .. page+npages], make that range (or
345458a4f78SJoao Martins * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
346458a4f78SJoao Martins * page range was previously listed as clean.
347458a4f78SJoao Martins *
348458a4f78SJoao Martins * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
349458a4f78SJoao Martins * required, then the caller should a) verify that this is really correct,
350458a4f78SJoao Martins * because _lock() is usually required, and b) hand code it:
351458a4f78SJoao Martins * set_page_dirty_lock(), unpin_user_page().
352458a4f78SJoao Martins *
353458a4f78SJoao Martins */
unpin_user_page_range_dirty_lock(struct page * page,unsigned long npages,bool make_dirty)354458a4f78SJoao Martins void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
355458a4f78SJoao Martins bool make_dirty)
356458a4f78SJoao Martins {
357659508f9SMatthew Wilcox (Oracle) unsigned long i;
358659508f9SMatthew Wilcox (Oracle) struct folio *folio;
359659508f9SMatthew Wilcox (Oracle) unsigned int nr;
360458a4f78SJoao Martins
361b5ba761aSDavid Hildenbrand VM_WARN_ON_ONCE(!page_range_contiguous(page, npages));
362b5ba761aSDavid Hildenbrand
363659508f9SMatthew Wilcox (Oracle) for (i = 0; i < npages; i += nr) {
364659508f9SMatthew Wilcox (Oracle) folio = gup_folio_range_next(page, npages, i, &nr);
365659508f9SMatthew Wilcox (Oracle) if (make_dirty && !folio_test_dirty(folio)) {
366659508f9SMatthew Wilcox (Oracle) folio_lock(folio);
367659508f9SMatthew Wilcox (Oracle) folio_mark_dirty(folio);
368659508f9SMatthew Wilcox (Oracle) folio_unlock(folio);
369659508f9SMatthew Wilcox (Oracle) }
370659508f9SMatthew Wilcox (Oracle) gup_put_folio(folio, nr, FOLL_PIN);
371458a4f78SJoao Martins }
372458a4f78SJoao Martins }
373458a4f78SJoao Martins EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
374458a4f78SJoao Martins
gup_fast_unpin_user_pages(struct page ** pages,unsigned long npages)37523babe19SDavid Hildenbrand static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages)
376b6a2619cSDavid Hildenbrand {
377b6a2619cSDavid Hildenbrand unsigned long i;
378b6a2619cSDavid Hildenbrand struct folio *folio;
379b6a2619cSDavid Hildenbrand unsigned int nr;
380b6a2619cSDavid Hildenbrand
381b6a2619cSDavid Hildenbrand /*
382b6a2619cSDavid Hildenbrand * Don't perform any sanity checks because we might have raced with
383b6a2619cSDavid Hildenbrand * fork() and some anonymous pages might now actually be shared --
384b6a2619cSDavid Hildenbrand * which is why we're unpinning after all.
385b6a2619cSDavid Hildenbrand */
386b6a2619cSDavid Hildenbrand for (i = 0; i < npages; i += nr) {
387b6a2619cSDavid Hildenbrand folio = gup_folio_next(pages, npages, i, &nr);
388b6a2619cSDavid Hildenbrand gup_put_folio(folio, nr, FOLL_PIN);
389b6a2619cSDavid Hildenbrand }
390b6a2619cSDavid Hildenbrand }
391b6a2619cSDavid Hildenbrand
392458a4f78SJoao Martins /**
393f1f6a7ddSJohn Hubbard * unpin_user_pages() - release an array of gup-pinned pages.
394fc1d8e7cSJohn Hubbard * @pages: array of pages to be marked dirty and released.
395fc1d8e7cSJohn Hubbard * @npages: number of pages in the @pages array.
396fc1d8e7cSJohn Hubbard *
397f1f6a7ddSJohn Hubbard * For each page in the @pages array, release the page using unpin_user_page().
398fc1d8e7cSJohn Hubbard *
399f1f6a7ddSJohn Hubbard * Please see the unpin_user_page() documentation for details.
400fc1d8e7cSJohn Hubbard */
unpin_user_pages(struct page ** pages,unsigned long npages)401f1f6a7ddSJohn Hubbard void unpin_user_pages(struct page **pages, unsigned long npages)
402fc1d8e7cSJohn Hubbard {
40312521c76SMatthew Wilcox (Oracle) unsigned long i;
40412521c76SMatthew Wilcox (Oracle) struct folio *folio;
40512521c76SMatthew Wilcox (Oracle) unsigned int nr;
406fc1d8e7cSJohn Hubbard
407fc1d8e7cSJohn Hubbard /*
408146608bbSJohn Hubbard * If this WARN_ON() fires, then the system *might* be leaking pages (by
409146608bbSJohn Hubbard * leaving them pinned), but probably not. More likely, gup/pup returned
410146608bbSJohn Hubbard * a hard -ERRNO error to the caller, who erroneously passed it here.
411146608bbSJohn Hubbard */
412146608bbSJohn Hubbard if (WARN_ON(IS_ERR_VALUE(npages)))
413146608bbSJohn Hubbard return;
41431b912deSJoao Martins
415b6a2619cSDavid Hildenbrand sanity_check_pinned_pages(pages, npages);
41612521c76SMatthew Wilcox (Oracle) for (i = 0; i < npages; i += nr) {
417a1268be2SJohn Hubbard if (!pages[i]) {
418a1268be2SJohn Hubbard nr = 1;
419a1268be2SJohn Hubbard continue;
420a1268be2SJohn Hubbard }
42112521c76SMatthew Wilcox (Oracle) folio = gup_folio_next(pages, npages, i, &nr);
42212521c76SMatthew Wilcox (Oracle) gup_put_folio(folio, nr, FOLL_PIN);
423fc1d8e7cSJohn Hubbard }
424fc1d8e7cSJohn Hubbard }
425f1f6a7ddSJohn Hubbard EXPORT_SYMBOL(unpin_user_pages);
426fc1d8e7cSJohn Hubbard
4276cc04054SVivek Kasireddy /**
428d3bfbfb1SKundan Kumar * unpin_user_folio() - release pages of a folio
429d3bfbfb1SKundan Kumar * @folio: pointer to folio to be released
430d3bfbfb1SKundan Kumar * @npages: number of pages of same folio
431d3bfbfb1SKundan Kumar *
432d3bfbfb1SKundan Kumar * Release npages of the folio
433d3bfbfb1SKundan Kumar */
unpin_user_folio(struct folio * folio,unsigned long npages)434d3bfbfb1SKundan Kumar void unpin_user_folio(struct folio *folio, unsigned long npages)
435d3bfbfb1SKundan Kumar {
436d3bfbfb1SKundan Kumar gup_put_folio(folio, npages, FOLL_PIN);
437d3bfbfb1SKundan Kumar }
438d3bfbfb1SKundan Kumar EXPORT_SYMBOL(unpin_user_folio);
439d3bfbfb1SKundan Kumar
440d3bfbfb1SKundan Kumar /**
4416cc04054SVivek Kasireddy * unpin_folios() - release an array of gup-pinned folios.
4426cc04054SVivek Kasireddy * @folios: array of folios to be marked dirty and released.
4436cc04054SVivek Kasireddy * @nfolios: number of folios in the @folios array.
4446cc04054SVivek Kasireddy *
4456cc04054SVivek Kasireddy * For each folio in the @folios array, release the folio using gup_put_folio.
4466cc04054SVivek Kasireddy *
4476cc04054SVivek Kasireddy * Please see the unpin_folio() documentation for details.
4486cc04054SVivek Kasireddy */
unpin_folios(struct folio ** folios,unsigned long nfolios)4496cc04054SVivek Kasireddy void unpin_folios(struct folio **folios, unsigned long nfolios)
4506cc04054SVivek Kasireddy {
4516cc04054SVivek Kasireddy unsigned long i = 0, j;
4526cc04054SVivek Kasireddy
4536cc04054SVivek Kasireddy /*
4546cc04054SVivek Kasireddy * If this WARN_ON() fires, then the system *might* be leaking folios
4556cc04054SVivek Kasireddy * (by leaving them pinned), but probably not. More likely, gup/pup
4566cc04054SVivek Kasireddy * returned a hard -ERRNO error to the caller, who erroneously passed
4576cc04054SVivek Kasireddy * it here.
4586cc04054SVivek Kasireddy */
4596cc04054SVivek Kasireddy if (WARN_ON(IS_ERR_VALUE(nfolios)))
4606cc04054SVivek Kasireddy return;
4616cc04054SVivek Kasireddy
4626cc04054SVivek Kasireddy while (i < nfolios) {
4636cc04054SVivek Kasireddy for (j = i + 1; j < nfolios; j++)
4646cc04054SVivek Kasireddy if (folios[i] != folios[j])
4656cc04054SVivek Kasireddy break;
4666cc04054SVivek Kasireddy
4676cc04054SVivek Kasireddy if (folios[i])
4686cc04054SVivek Kasireddy gup_put_folio(folios[i], j - i, FOLL_PIN);
4696cc04054SVivek Kasireddy i = j;
4706cc04054SVivek Kasireddy }
4716cc04054SVivek Kasireddy }
4726cc04054SVivek Kasireddy EXPORT_SYMBOL_GPL(unpin_folios);
4736cc04054SVivek Kasireddy
474a458b76aSAndrea Arcangeli /*
475a458b76aSAndrea Arcangeli * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
476a458b76aSAndrea Arcangeli * lifecycle. Avoid setting the bit unless necessary, or it might cause write
477a458b76aSAndrea Arcangeli * cache bouncing on large SMP machines for concurrent pinned gups.
478a458b76aSAndrea Arcangeli */
mm_set_has_pinned_flag(struct mm_struct * mm)47912e423baSLorenzo Stoakes static inline void mm_set_has_pinned_flag(struct mm_struct *mm)
480a458b76aSAndrea Arcangeli {
48112e423baSLorenzo Stoakes if (!mm_flags_test(MMF_HAS_PINNED, mm))
48212e423baSLorenzo Stoakes mm_flags_set(MMF_HAS_PINNED, mm);
483a458b76aSAndrea Arcangeli }
484a458b76aSAndrea Arcangeli
485050a9adcSChristoph Hellwig #ifdef CONFIG_MMU
486a12083d7SPeter Xu
4878268614bSChristophe Leroy #ifdef CONFIG_HAVE_GUP_FAST
488f442fa61SYang Shi /**
489f442fa61SYang Shi * try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
490f442fa61SYang Shi * @page: pointer to page to be grabbed
491f442fa61SYang Shi * @refs: the value to (effectively) add to the folio's refcount
492f442fa61SYang Shi * @flags: gup flags: these are the FOLL_* flag values.
493f442fa61SYang Shi *
494f442fa61SYang Shi * "grab" names in this file mean, "look at flags to decide whether to use
495f442fa61SYang Shi * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
496f442fa61SYang Shi *
497f442fa61SYang Shi * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
498f442fa61SYang Shi * same time. (That's true throughout the get_user_pages*() and
499f442fa61SYang Shi * pin_user_pages*() APIs.) Cases:
500f442fa61SYang Shi *
501f442fa61SYang Shi * FOLL_GET: folio's refcount will be incremented by @refs.
502f442fa61SYang Shi *
503f442fa61SYang Shi * FOLL_PIN on large folios: folio's refcount will be incremented by
504f442fa61SYang Shi * @refs, and its pincount will be incremented by @refs.
505f442fa61SYang Shi *
506f442fa61SYang Shi * FOLL_PIN on single-page folios: folio's refcount will be incremented by
507f442fa61SYang Shi * @refs * GUP_PIN_COUNTING_BIAS.
508f442fa61SYang Shi *
509f442fa61SYang Shi * Return: The folio containing @page (with refcount appropriately
510f442fa61SYang Shi * incremented) for success, or NULL upon failure. If neither FOLL_GET
511f442fa61SYang Shi * nor FOLL_PIN was set, that's considered failure, and furthermore,
512f442fa61SYang Shi * a likely bug in the caller, so a warning is also emitted.
513f442fa61SYang Shi *
514f442fa61SYang Shi * It uses add ref unless zero to elevate the folio refcount and must be called
515f442fa61SYang Shi * in fast path only.
516f442fa61SYang Shi */
try_grab_folio_fast(struct page * page,int refs,unsigned int flags)517f442fa61SYang Shi static struct folio *try_grab_folio_fast(struct page *page, int refs,
518f442fa61SYang Shi unsigned int flags)
519f442fa61SYang Shi {
520f442fa61SYang Shi struct folio *folio;
521f442fa61SYang Shi
522f442fa61SYang Shi /* Raise warn if it is not called in fast GUP */
523f442fa61SYang Shi VM_WARN_ON_ONCE(!irqs_disabled());
524f442fa61SYang Shi
525f442fa61SYang Shi if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
526f442fa61SYang Shi return NULL;
527f442fa61SYang Shi
528f442fa61SYang Shi if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
529f442fa61SYang Shi return NULL;
530f442fa61SYang Shi
531f442fa61SYang Shi if (flags & FOLL_GET)
532f442fa61SYang Shi return try_get_folio(page, refs);
533f442fa61SYang Shi
534f442fa61SYang Shi /* FOLL_PIN is set */
535f442fa61SYang Shi
536f442fa61SYang Shi /*
537f442fa61SYang Shi * Don't take a pin on the zero page - it's not going anywhere
538f442fa61SYang Shi * and it is used in a *lot* of places.
539f442fa61SYang Shi */
540f442fa61SYang Shi if (is_zero_page(page))
541f442fa61SYang Shi return page_folio(page);
542f442fa61SYang Shi
543f442fa61SYang Shi folio = try_get_folio(page, refs);
544f442fa61SYang Shi if (!folio)
545f442fa61SYang Shi return NULL;
546f442fa61SYang Shi
547f442fa61SYang Shi /*
548f442fa61SYang Shi * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
549f442fa61SYang Shi * right zone, so fail and let the caller fall back to the slow
550f442fa61SYang Shi * path.
551f442fa61SYang Shi */
552f442fa61SYang Shi if (unlikely((flags & FOLL_LONGTERM) &&
553f442fa61SYang Shi !folio_is_longterm_pinnable(folio))) {
554f442fa61SYang Shi folio_put_refs(folio, refs);
555f442fa61SYang Shi return NULL;
556f442fa61SYang Shi }
557f442fa61SYang Shi
558f442fa61SYang Shi /*
559f442fa61SYang Shi * When pinning a large folio, use an exact count to track it.
560f442fa61SYang Shi *
561f442fa61SYang Shi * However, be sure to *also* increment the normal folio
562f442fa61SYang Shi * refcount field at least once, so that the folio really
563f442fa61SYang Shi * is pinned. That's why the refcount from the earlier
564f442fa61SYang Shi * try_get_folio() is left intact.
565f442fa61SYang Shi */
56631a31da8SDavid Hildenbrand if (folio_has_pincount(folio))
567f442fa61SYang Shi atomic_add(refs, &folio->_pincount);
568f442fa61SYang Shi else
569f442fa61SYang Shi folio_ref_add(folio,
570f442fa61SYang Shi refs * (GUP_PIN_COUNTING_BIAS - 1));
571f442fa61SYang Shi /*
572f442fa61SYang Shi * Adjust the pincount before re-checking the PTE for changes.
573f442fa61SYang Shi * This is essentially a smp_mb() and is paired with a memory
574f442fa61SYang Shi * barrier in folio_try_share_anon_rmap_*().
575f442fa61SYang Shi */
576f442fa61SYang Shi smp_mb__after_atomic();
577f442fa61SYang Shi
578f442fa61SYang Shi node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
579f442fa61SYang Shi
580f442fa61SYang Shi return folio;
581f442fa61SYang Shi }
5828268614bSChristophe Leroy #endif /* CONFIG_HAVE_GUP_FAST */
583a12083d7SPeter Xu
584052ccfbcSGuillaume Morin /* Common code for can_follow_write_* */
can_follow_write_common(struct page * page,struct vm_area_struct * vma,unsigned int flags)585052ccfbcSGuillaume Morin static inline bool can_follow_write_common(struct page *page,
586052ccfbcSGuillaume Morin struct vm_area_struct *vma, unsigned int flags)
587052ccfbcSGuillaume Morin {
588052ccfbcSGuillaume Morin /* Maybe FOLL_FORCE is set to override it? */
589052ccfbcSGuillaume Morin if (!(flags & FOLL_FORCE))
590052ccfbcSGuillaume Morin return false;
591052ccfbcSGuillaume Morin
592052ccfbcSGuillaume Morin /* But FOLL_FORCE has no effect on shared mappings */
593052ccfbcSGuillaume Morin if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
594052ccfbcSGuillaume Morin return false;
595052ccfbcSGuillaume Morin
596052ccfbcSGuillaume Morin /* ... or read-only private ones */
597052ccfbcSGuillaume Morin if (!(vma->vm_flags & VM_MAYWRITE))
598052ccfbcSGuillaume Morin return false;
599052ccfbcSGuillaume Morin
600052ccfbcSGuillaume Morin /* ... or already writable ones that just need to take a write fault */
601052ccfbcSGuillaume Morin if (vma->vm_flags & VM_WRITE)
602052ccfbcSGuillaume Morin return false;
603052ccfbcSGuillaume Morin
604052ccfbcSGuillaume Morin /*
605052ccfbcSGuillaume Morin * See can_change_pte_writable(): we broke COW and could map the page
606052ccfbcSGuillaume Morin * writable if we have an exclusive anonymous page ...
607052ccfbcSGuillaume Morin */
608052ccfbcSGuillaume Morin return page && PageAnon(page) && PageAnonExclusive(page);
609052ccfbcSGuillaume Morin }
610052ccfbcSGuillaume Morin
no_page_table(struct vm_area_struct * vma,unsigned int flags,unsigned long address)61169e68b4fSKirill A. Shutemov static struct page *no_page_table(struct vm_area_struct *vma,
612878b0c45SPeter Xu unsigned int flags, unsigned long address)
6134bbd4c77SKirill A. Shutemov {
614878b0c45SPeter Xu if (!(flags & FOLL_DUMP))
615878b0c45SPeter Xu return NULL;
616878b0c45SPeter Xu
6174bbd4c77SKirill A. Shutemov /*
618878b0c45SPeter Xu * When core dumping, we don't want to allocate unnecessary pages or
61969e68b4fSKirill A. Shutemov * page tables. Return error instead of NULL to skip handle_mm_fault,
62069e68b4fSKirill A. Shutemov * then get_dump_page() will return NULL to leave a hole in the dump.
62169e68b4fSKirill A. Shutemov * But we can only make this optimization where a hole would surely
62269e68b4fSKirill A. Shutemov * be zero-filled if handle_mm_fault() actually did handle it.
6234bbd4c77SKirill A. Shutemov */
624878b0c45SPeter Xu if (is_vm_hugetlb_page(vma)) {
625878b0c45SPeter Xu struct hstate *h = hstate_vma(vma);
626878b0c45SPeter Xu
627878b0c45SPeter Xu if (!hugetlbfs_pagecache_present(h, vma, address))
62869e68b4fSKirill A. Shutemov return ERR_PTR(-EFAULT);
629878b0c45SPeter Xu } else if ((vma_is_anonymous(vma) || !vma->vm_ops->fault)) {
630878b0c45SPeter Xu return ERR_PTR(-EFAULT);
631878b0c45SPeter Xu }
632878b0c45SPeter Xu
63369e68b4fSKirill A. Shutemov return NULL;
6344bbd4c77SKirill A. Shutemov }
63569e68b4fSKirill A. Shutemov
6361b167618SPeter Xu #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
637052ccfbcSGuillaume Morin /* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */
can_follow_write_pud(pud_t pud,struct page * page,struct vm_area_struct * vma,unsigned int flags)638052ccfbcSGuillaume Morin static inline bool can_follow_write_pud(pud_t pud, struct page *page,
639052ccfbcSGuillaume Morin struct vm_area_struct *vma,
640052ccfbcSGuillaume Morin unsigned int flags)
641052ccfbcSGuillaume Morin {
642052ccfbcSGuillaume Morin /* If the pud is writable, we can write to the page. */
643052ccfbcSGuillaume Morin if (pud_write(pud))
644052ccfbcSGuillaume Morin return true;
645052ccfbcSGuillaume Morin
646052ccfbcSGuillaume Morin return can_follow_write_common(page, vma, flags);
647052ccfbcSGuillaume Morin }
648052ccfbcSGuillaume Morin
/*
 * Look up the page mapped at @addr by the PUD leaf entry *@pudp and grab
 * it according to @flags.  Must be called with the PUD lock held.
 *
 * Returns the page and sets *@page_mask on success, NULL when the entry is
 * not present or may not be written, or an ERR_PTR() (-EMLINK when
 * unsharing is required, otherwise the try_grab_folio() error).
 */
static struct page *follow_huge_pud(struct vm_area_struct *vma,
				    unsigned long addr, pud_t *pudp,
				    int flags, unsigned long *page_mask)
{
	pud_t entry = *pudp;
	unsigned long pfn = pud_pfn(entry);
	struct page *subpage;
	int err;

	assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));

	if (!pud_present(entry))
		return NULL;

	/* The write check is made against the first page of the mapping. */
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pud(entry, pfn_to_page(pfn), vma, flags))
		return NULL;

	/* Step to the page that actually backs @addr. */
	subpage = pfn_to_page(pfn + ((addr & ~PUD_MASK) >> PAGE_SHIFT));

	if (!pud_write(entry) && gup_must_unshare(vma, flags, subpage))
		return ERR_PTR(-EMLINK);

	err = try_grab_folio(page_folio(subpage), 1, flags);
	if (err)
		return ERR_PTR(err);

	*page_mask = HPAGE_PUD_NR - 1;
	return subpage;
}
6824418c522SPeter Xu
6834418c522SPeter Xu /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
can_follow_write_pmd(pmd_t pmd,struct page * page,struct vm_area_struct * vma,unsigned int flags)6844418c522SPeter Xu static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
6854418c522SPeter Xu struct vm_area_struct *vma,
6864418c522SPeter Xu unsigned int flags)
6874418c522SPeter Xu {
6884418c522SPeter Xu /* If the pmd is writable, we can write to the page. */
6894418c522SPeter Xu if (pmd_write(pmd))
6904418c522SPeter Xu return true;
6914418c522SPeter Xu
692052ccfbcSGuillaume Morin if (!can_follow_write_common(page, vma, flags))
6934418c522SPeter Xu return false;
6944418c522SPeter Xu
6954418c522SPeter Xu /* ... and a write-fault isn't required for other reasons. */
696f38ee285SBarry Song if (pmd_needs_soft_dirty_wp(vma, pmd))
6974418c522SPeter Xu return false;
6984418c522SPeter Xu return !userfaultfd_huge_pmd_wp(vma, pmd);
6994418c522SPeter Xu }
7004418c522SPeter Xu
/*
 * Look up the page mapped at @addr by the PMD leaf entry *@pmd and grab it
 * according to @flags.  Must be called with the PMD lock held.
 *
 * Returns the page and sets *@page_mask on success, NULL when the entry
 * may not be followed, or an ERR_PTR() (-EFAULT for a huge zero page with
 * FOLL_DUMP, -EMLINK when unsharing is required, otherwise the
 * try_grab_folio() error).
 */
static struct page *follow_huge_pmd(struct vm_area_struct *vma,
				    unsigned long addr, pmd_t *pmd,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	pmd_t entry = *pmd;
	struct page *base;
	int err;

	assert_spin_locked(pmd_lockptr(vma->vm_mm, pmd));

	base = pmd_page(entry);
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pmd(entry, base, vma, flags))
		return NULL;

	/* The huge zero page is never included in a dump. */
	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(entry))
		return ERR_PTR(-EFAULT);

	if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
		return NULL;

	if (!pmd_write(entry) && gup_must_unshare(vma, flags, base))
		return ERR_PTR(-EMLINK);

	/* An anonymous page being pinned is expected to be exclusive. */
	VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(base) &&
			     !PageAnonExclusive(base), base);

	err = try_grab_folio(page_folio(base), 1, flags);
	if (err)
		return ERR_PTR(err);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(entry) && (flags & FOLL_TOUCH))
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
#endif	/* CONFIG_TRANSPARENT_HUGEPAGE */

	*page_mask = HPAGE_PMD_NR - 1;

	/* Step to the page that actually backs @addr. */
	return base + ((addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT);
}
7454418c522SPeter Xu
7461b167618SPeter Xu #else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
/* Stub for builds without CONFIG_PGTABLE_HAS_HUGE_LEAVES: never finds a page. */
static struct page *follow_huge_pud(struct vm_area_struct *vma,
		unsigned long addr, pud_t *pudp, int flags,
		unsigned long *page_mask)
{
	return NULL;
}
7534418c522SPeter Xu
/* Stub for builds without CONFIG_PGTABLE_HAS_HUGE_LEAVES: never finds a page. */
static struct page *follow_huge_pmd(struct vm_area_struct *vma,
		unsigned long addr, pmd_t *pmd, unsigned int flags,
		unsigned long *page_mask)
{
	return NULL;
}
7611b167618SPeter Xu #endif /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
7621b167618SPeter Xu
/*
 * Handle a PTE that has no struct page behind it.  With FOLL_TOUCH the
 * entry is marked young (and dirty for FOLL_WRITE); the lookup itself
 * always fails with -EEXIST.
 */
static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	pte_t before, after;

	/* Proper page table entry exists, but no corresponding struct page */
	if (!(flags & FOLL_TOUCH))
		return -EEXIST;

	before = ptep_get(pte);
	after = (flags & FOLL_WRITE) ? pte_mkdirty(before) : before;
	after = pte_mkyoung(after);

	/* Only rewrite the entry when a bit actually changed. */
	if (!pte_same(before, after)) {
		set_pte_at(vma->vm_mm, address, pte, after);
		update_mmu_cache(vma, address, pte);
	}

	return -EEXIST;
}
7831027e443SKirill A. Shutemov
7845535be30SDavid Hildenbrand /* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
can_follow_write_pte(pte_t pte,struct page * page,struct vm_area_struct * vma,unsigned int flags)7855535be30SDavid Hildenbrand static inline bool can_follow_write_pte(pte_t pte, struct page *page,
7865535be30SDavid Hildenbrand struct vm_area_struct *vma,
7875535be30SDavid Hildenbrand unsigned int flags)
78819be0eafSLinus Torvalds {
7895535be30SDavid Hildenbrand /* If the pte is writable, we can write to the page. */
7905535be30SDavid Hildenbrand if (pte_write(pte))
7915535be30SDavid Hildenbrand return true;
7925535be30SDavid Hildenbrand
793052ccfbcSGuillaume Morin if (!can_follow_write_common(page, vma, flags))
7945535be30SDavid Hildenbrand return false;
7955535be30SDavid Hildenbrand
7965535be30SDavid Hildenbrand /* ... and a write-fault isn't required for other reasons. */
797f38ee285SBarry Song if (pte_needs_soft_dirty_wp(vma, pte))
7985535be30SDavid Hildenbrand return false;
7995535be30SDavid Hildenbrand return !userfaultfd_pte_wp(vma, pte);
80019be0eafSLinus Torvalds }
80119be0eafSLinus Torvalds
/*
 * Look up and grab the page mapped by the PTE for @address below @pmd.
 *
 * Returns the page on success, NULL when the PTE may not be followed, or
 * an ERR_PTR(): -EFAULT for special pages under FOLL_DUMP, -EMLINK when
 * unsharing is required, the follow_pfn_pte() result when there is no
 * struct page, or an error from try_grab_folio() /
 * arch_make_folio_accessible().  Absent entries are reported through
 * no_page_table().
 */
static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct folio *folio;
	struct page *page;
	spinlock_t *lock;
	pte_t *ptep, entry;
	int err;

	ptep = pte_offset_map_lock(mm, pmd, address, &lock);
	if (!ptep)
		return no_page_table(vma, flags, address);

	entry = ptep_get(ptep);
	if (!pte_present(entry))
		goto no_page;
	if (pte_protnone(entry) && !gup_can_follow_protnone(vma, flags))
		goto no_page;

	page = vm_normal_page(vma, address, entry);

	/* We only care about anon pages in can_follow_write_pte(). */
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pte(entry, page, vma, flags)) {
		page = NULL;
		goto out;
	}

	if (unlikely(!page)) {
		/* Avoid special (like zero) pages in core dumps */
		if (flags & FOLL_DUMP) {
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		/* Anything but the zero page has no usable struct page. */
		if (!is_zero_pfn(pte_pfn(entry))) {
			err = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(err);
			goto out;
		}

		page = pte_page(entry);
	}
	folio = page_folio(page);

	if (!pte_write(entry) && gup_must_unshare(vma, flags, page)) {
		page = ERR_PTR(-EMLINK);
		goto out;
	}

	/* An anonymous page being pinned is expected to be exclusive. */
	VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
			     !PageAnonExclusive(page), page);

	/* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
	err = try_grab_folio(folio, 1, flags);
	if (unlikely(err)) {
		page = ERR_PTR(err);
		goto out;
	}

	/*
	 * The page has to be made accessible if and only if its content is
	 * going to be accessed (the FOLL_PIN case).  Please see
	 * Documentation/core-api/pin_user_pages.rst for details.
	 */
	if (flags & FOLL_PIN) {
		err = arch_make_folio_accessible(folio);
		if (err) {
			/* Undo the pin taken just above before failing. */
			unpin_user_page(page);
			page = ERR_PTR(err);
			goto out;
		}
	}

	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(entry) && !folio_test_dirty(folio))
			folio_mark_dirty(folio);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * folio_mark_accessed().
		 */
		folio_mark_accessed(folio);
	}
out:
	pte_unmap_unlock(ptep, lock);
	return page;
no_page:
	pte_unmap_unlock(ptep, lock);
	/* A non-empty but unusable entry is simply "no page". */
	return pte_none(entry) ? no_page_table(vma, flags, address) : NULL;
}
8974bbd4c77SKirill A. Shutemov
/*
 * PMD level of the follow_page_mask() walk: descend to the PTE level for
 * a regular page table, or follow (optionally split) a PMD leaf entry.
 */
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmdp = pmd_offset(pudp, address);
	pmd_t entry = pmdp_get_lockless(pmdp);
	struct page *found;
	spinlock_t *lock;

	/* First pass: decisions made on the lockless snapshot. */
	if (pmd_none(entry) || !pmd_present(entry))
		return no_page_table(vma, flags, address);
	if (likely(!pmd_leaf(entry)))
		return follow_page_pte(vma, address, pmdp, flags);

	if (pmd_protnone(entry) && !gup_can_follow_protnone(vma, flags))
		return no_page_table(vma, flags, address);

	/* Second pass: re-read the entry under the PMD lock and re-check. */
	lock = pmd_lock(mm, pmdp);
	entry = *pmdp;
	if (unlikely(!pmd_present(entry))) {
		spin_unlock(lock);
		return no_page_table(vma, flags, address);
	}
	if (unlikely(!pmd_leaf(entry))) {
		spin_unlock(lock);
		return follow_page_pte(vma, address, pmdp, flags);
	}
	if (pmd_trans_huge(entry) && (flags & FOLL_SPLIT_PMD)) {
		spin_unlock(lock);
		split_huge_pmd(vma, pmdp, address);
		/* If pmd was left empty, stuff a page table in there quickly */
		if (pte_alloc(mm, pmdp))
			return ERR_PTR(-ENOMEM);
		return follow_page_pte(vma, address, pmdp, flags);
	}

	found = follow_huge_pmd(vma, address, pmdp, flags, page_mask);
	spin_unlock(lock);
	return found;
}
94169e68b4fSKirill A. Shutemov
/*
 * PUD level of the follow_page_mask() walk: follow a PUD leaf entry under
 * the PUD lock, or descend to the PMD level.
 */
static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	pud_t *pudp = pud_offset(p4dp, address);
	pud_t entry = pudp_get(pudp);
	struct page *found;
	spinlock_t *lock;

	if (!pud_present(entry))
		return no_page_table(vma, flags, address);

	if (pud_leaf(entry)) {
		lock = pud_lock(vma->vm_mm, pudp);
		found = follow_huge_pud(vma, address, pudp, flags, page_mask);
		spin_unlock(lock);

		/* A NULL result is reported like a missing page table. */
		return found ? found : no_page_table(vma, flags, address);
	}

	if (unlikely(pud_bad(entry)))
		return no_page_table(vma, flags, address);

	return follow_pmd_mask(vma, address, pudp, flags, page_mask);
}
969080dbb61SAneesh Kumar K.V
/*
 * P4D level of the follow_page_mask() walk: descend to the PUD level when
 * the entry is present and sane, otherwise report a missing page table.
 */
static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	p4d_t *p4dp = p4d_offset(pgdp, address);
	p4d_t entry = p4dp_get(p4dp);

	/* Leaf entries at this level are rejected at build time. */
	BUILD_BUG_ON(p4d_leaf(entry));

	if (p4d_present(entry) && !p4d_bad(entry))
		return follow_pud_mask(vma, address, p4dp, flags, page_mask);

	return no_page_table(vma, flags, address);
}
986080dbb61SAneesh Kumar K.V
987080dbb61SAneesh Kumar K.V /**
988080dbb61SAneesh Kumar K.V * follow_page_mask - look up a page descriptor from a user-virtual address
989080dbb61SAneesh Kumar K.V * @vma: vm_area_struct mapping @address
990080dbb61SAneesh Kumar K.V * @address: virtual address to look up
991080dbb61SAneesh Kumar K.V * @flags: flags modifying lookup behaviour
992d3f7922bSAlistair Popple * @page_mask: a pointer to output page_mask
993080dbb61SAneesh Kumar K.V *
994080dbb61SAneesh Kumar K.V * @flags can have FOLL_ flags set, defined in <linux/mm.h>
995080dbb61SAneesh Kumar K.V *
996a7f22660SDavid Hildenbrand * When getting an anonymous page and the caller has to trigger unsharing
997a7f22660SDavid Hildenbrand * of a shared anonymous page first, -EMLINK is returned. The caller should
998a7f22660SDavid Hildenbrand * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
999a7f22660SDavid Hildenbrand * relevant with FOLL_PIN and !FOLL_WRITE.
1000a7f22660SDavid Hildenbrand *
1001d3f7922bSAlistair Popple * On output, @page_mask is set according to the size of the page.
100278179556SMike Rapoport *
100378179556SMike Rapoport * Return: the mapped (struct page *), %NULL if no mapping exists, or
1004080dbb61SAneesh Kumar K.V * an error pointer if there is a mapping to something not represented
1005080dbb61SAneesh Kumar K.V * by a page descriptor (see also vm_normal_page()).
1006080dbb61SAneesh Kumar K.V */
follow_page_mask(struct vm_area_struct * vma,unsigned long address,unsigned int flags,unsigned long * page_mask)1007a7030aeaSBharath Vedartham static struct page *follow_page_mask(struct vm_area_struct *vma,
1008080dbb61SAneesh Kumar K.V unsigned long address, unsigned int flags,
1009d3f7922bSAlistair Popple unsigned long *page_mask)
1010080dbb61SAneesh Kumar K.V {
1011080dbb61SAneesh Kumar K.V pgd_t *pgd;
1012080dbb61SAneesh Kumar K.V struct mm_struct *mm = vma->vm_mm;
10139cb28da5SPeter Xu struct page *page;
10149cb28da5SPeter Xu
10159cb28da5SPeter Xu vma_pgtable_walk_begin(vma);
1016080dbb61SAneesh Kumar K.V
1017d3f7922bSAlistair Popple *page_mask = 0;
1018080dbb61SAneesh Kumar K.V pgd = pgd_offset(mm, address);
1019080dbb61SAneesh Kumar K.V
10208268614bSChristophe Leroy if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1021a12083d7SPeter Xu page = no_page_table(vma, flags, address);
1022a12083d7SPeter Xu else
1023d3f7922bSAlistair Popple page = follow_p4d_mask(vma, address, pgd, flags, page_mask);
1024080dbb61SAneesh Kumar K.V
10259cb28da5SPeter Xu vma_pgtable_walk_end(vma);
10269cb28da5SPeter Xu
1027a12083d7SPeter Xu return page;
1028df06b37fSKeith Busch }
1029df06b37fSKeith Busch
get_gate_page(struct mm_struct * mm,unsigned long address,unsigned int gup_flags,struct vm_area_struct ** vma,struct page ** page)1030f2b495caSKirill A. Shutemov static int get_gate_page(struct mm_struct *mm, unsigned long address,
1031f2b495caSKirill A. Shutemov unsigned int gup_flags, struct vm_area_struct **vma,
1032f2b495caSKirill A. Shutemov struct page **page)
1033f2b495caSKirill A. Shutemov {
1034f2b495caSKirill A. Shutemov pgd_t *pgd;
1035c2febafcSKirill A. Shutemov p4d_t *p4d;
1036f2b495caSKirill A. Shutemov pud_t *pud;
1037f2b495caSKirill A. Shutemov pmd_t *pmd;
1038f2b495caSKirill A. Shutemov pte_t *pte;
1039c33c7948SRyan Roberts pte_t entry;
1040f2b495caSKirill A. Shutemov int ret = -EFAULT;
1041f2b495caSKirill A. Shutemov
1042f2b495caSKirill A. Shutemov /* user gate pages are read-only */
1043f2b495caSKirill A. Shutemov if (gup_flags & FOLL_WRITE)
1044f2b495caSKirill A. Shutemov return -EFAULT;
10450cad6736SFeng Lee pgd = pgd_offset(mm, address);
1046b5d1c39fSAndy Lutomirski if (pgd_none(*pgd))
1047b5d1c39fSAndy Lutomirski return -EFAULT;
1048c2febafcSKirill A. Shutemov p4d = p4d_offset(pgd, address);
1049b5d1c39fSAndy Lutomirski if (p4d_none(*p4d))
1050b5d1c39fSAndy Lutomirski return -EFAULT;
1051c2febafcSKirill A. Shutemov pud = pud_offset(p4d, address);
1052b5d1c39fSAndy Lutomirski if (pud_none(*pud))
1053b5d1c39fSAndy Lutomirski return -EFAULT;
1054f2b495caSKirill A. Shutemov pmd = pmd_offset(pud, address);
105584c3fc4eSZi Yan if (!pmd_present(*pmd))
1056f2b495caSKirill A. Shutemov return -EFAULT;
1057f2b495caSKirill A. Shutemov pte = pte_offset_map(pmd, address);
105804dee9e8SHugh Dickins if (!pte)
105904dee9e8SHugh Dickins return -EFAULT;
1060c33c7948SRyan Roberts entry = ptep_get(pte);
1061c33c7948SRyan Roberts if (pte_none(entry))
1062f2b495caSKirill A. Shutemov goto unmap;
1063f2b495caSKirill A. Shutemov *vma = get_gate_vma(mm);
1064f2b495caSKirill A. Shutemov if (!page)
1065f2b495caSKirill A. Shutemov goto out;
1066c33c7948SRyan Roberts *page = vm_normal_page(*vma, address, entry);
1067f2b495caSKirill A. Shutemov if (!*page) {
1068c33c7948SRyan Roberts if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
1069f2b495caSKirill A. Shutemov goto unmap;
1070c33c7948SRyan Roberts *page = pte_page(entry);
1071f2b495caSKirill A. Shutemov }
1072f442fa61SYang Shi ret = try_grab_folio(page_folio(*page), 1, gup_flags);
10730f089235SLogan Gunthorpe if (unlikely(ret))
10748fde12caSLinus Torvalds goto unmap;
1075f2b495caSKirill A. Shutemov out:
1076f2b495caSKirill A. Shutemov ret = 0;
1077f2b495caSKirill A. Shutemov unmap:
1078f2b495caSKirill A. Shutemov pte_unmap(pte);
1079f2b495caSKirill A. Shutemov return ret;
1080f2b495caSKirill A. Shutemov }
1081f2b495caSKirill A. Shutemov
10829a95f3cfSPaul Cassella /*
10839a863a6aSJason Gunthorpe * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not
10849a863a6aSJason Gunthorpe * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set
10859a863a6aSJason Gunthorpe * to 0 and -EBUSY returned.
10869a95f3cfSPaul Cassella */
faultin_page(struct vm_area_struct * vma,unsigned long address,unsigned int flags,bool unshare,int * locked)108764019a2eSPeter Xu static int faultin_page(struct vm_area_struct *vma,
108847872953SJosef Bacik unsigned long address, unsigned int flags, bool unshare,
1089a7f22660SDavid Hildenbrand int *locked)
109016744483SKirill A. Shutemov {
109116744483SKirill A. Shutemov unsigned int fault_flags = 0;
10922b740303SSouptick Joarder vm_fault_t ret;
109316744483SKirill A. Shutemov
109447872953SJosef Bacik if (flags & FOLL_NOFAULT)
109555b8fe70SAndreas Gruenbacher return -EFAULT;
109647872953SJosef Bacik if (flags & FOLL_WRITE)
109716744483SKirill A. Shutemov fault_flags |= FAULT_FLAG_WRITE;
109847872953SJosef Bacik if (flags & FOLL_REMOTE)
10991b2ee126SDave Hansen fault_flags |= FAULT_FLAG_REMOTE;
110047872953SJosef Bacik if (flags & FOLL_UNLOCKABLE) {
110171335f37SPeter Xu fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
110293c5c61dSPeter Xu /*
110393c5c61dSPeter Xu * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
110493c5c61dSPeter Xu * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
110593c5c61dSPeter Xu * That's because some callers may not be prepared to
110693c5c61dSPeter Xu * handle early exits caused by non-fatal signals.
110793c5c61dSPeter Xu */
110847872953SJosef Bacik if (flags & FOLL_INTERRUPTIBLE)
110993c5c61dSPeter Xu fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
111093c5c61dSPeter Xu }
111147872953SJosef Bacik if (flags & FOLL_NOWAIT)
111216744483SKirill A. Shutemov fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
111347872953SJosef Bacik if (flags & FOLL_TRIED) {
11144426e945SPeter Xu /*
11154426e945SPeter Xu * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
11164426e945SPeter Xu * can co-exist
11174426e945SPeter Xu */
1118234b239bSAndres Lagar-Cavilla fault_flags |= FAULT_FLAG_TRIED;
1119234b239bSAndres Lagar-Cavilla }
1120a7f22660SDavid Hildenbrand if (unshare) {
1121a7f22660SDavid Hildenbrand fault_flags |= FAULT_FLAG_UNSHARE;
1122a7f22660SDavid Hildenbrand /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
1123792b429dSDavid Hildenbrand VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE);
1124a7f22660SDavid Hildenbrand }
112516744483SKirill A. Shutemov
1126bce617edSPeter Xu ret = handle_mm_fault(vma, address, fault_flags, NULL);
1127d9272525SPeter Xu
1128d9272525SPeter Xu if (ret & VM_FAULT_COMPLETED) {
1129d9272525SPeter Xu /*
1130d9272525SPeter Xu * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
1131d9272525SPeter Xu * mmap lock in the page fault handler. Sanity check this.
1132d9272525SPeter Xu */
1133d9272525SPeter Xu WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
1134d9272525SPeter Xu *locked = 0;
11359a863a6aSJason Gunthorpe
1136d9272525SPeter Xu /*
1137d9272525SPeter Xu * We should do the same as VM_FAULT_RETRY, but let's not
1138d9272525SPeter Xu * return -EBUSY since that's not reflecting the reality of
1139d9272525SPeter Xu * what has happened - we've just fully completed a page
1140d9272525SPeter Xu * fault, with the mmap lock released. Use -EAGAIN to show
1141d9272525SPeter Xu * that we want to take the mmap lock _again_.
1142d9272525SPeter Xu */
1143d9272525SPeter Xu return -EAGAIN;
1144d9272525SPeter Xu }
1145d9272525SPeter Xu
114616744483SKirill A. Shutemov if (ret & VM_FAULT_ERROR) {
114747872953SJosef Bacik int err = vm_fault_to_errno(ret, flags);
11489a291a7cSJames Morse
11499a291a7cSJames Morse if (err)
11509a291a7cSJames Morse return err;
115116744483SKirill A. Shutemov BUG();
115216744483SKirill A. Shutemov }
115316744483SKirill A. Shutemov
115416744483SKirill A. Shutemov if (ret & VM_FAULT_RETRY) {
11559a863a6aSJason Gunthorpe if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
11564f6da934SPeter Xu *locked = 0;
115716744483SKirill A. Shutemov return -EBUSY;
115816744483SKirill A. Shutemov }
115916744483SKirill A. Shutemov
116016744483SKirill A. Shutemov return 0;
116116744483SKirill A. Shutemov }
116216744483SKirill A. Shutemov
11638ac26843SLorenzo Stoakes /*
11648ac26843SLorenzo Stoakes * Writing to file-backed mappings which require folio dirty tracking using GUP
11658ac26843SLorenzo Stoakes * is a fundamentally broken operation, as kernel write access to GUP mappings
11668ac26843SLorenzo Stoakes * do not adhere to the semantics expected by a file system.
11678ac26843SLorenzo Stoakes *
11688ac26843SLorenzo Stoakes * Consider the following scenario:-
11698ac26843SLorenzo Stoakes *
11708ac26843SLorenzo Stoakes * 1. A folio is written to via GUP which write-faults the memory, notifying
11718ac26843SLorenzo Stoakes * the file system and dirtying the folio.
11728ac26843SLorenzo Stoakes * 2. Later, writeback is triggered, resulting in the folio being cleaned and
11738ac26843SLorenzo Stoakes * the PTE being marked read-only.
11748ac26843SLorenzo Stoakes * 3. The GUP caller writes to the folio, as it is mapped read/write via the
11758ac26843SLorenzo Stoakes * direct mapping.
11768ac26843SLorenzo Stoakes * 4. The GUP caller, now done with the page, unpins it and sets it dirty
11778ac26843SLorenzo Stoakes * (though it does not have to).
11788ac26843SLorenzo Stoakes *
11798ac26843SLorenzo Stoakes * This results in both data being written to a folio without writenotify, and
11808ac26843SLorenzo Stoakes * the folio being dirtied unexpectedly (if the caller decides to do so).
11818ac26843SLorenzo Stoakes */
writable_file_mapping_allowed(struct vm_area_struct * vma,unsigned long gup_flags)11828ac26843SLorenzo Stoakes static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
11838ac26843SLorenzo Stoakes unsigned long gup_flags)
11848ac26843SLorenzo Stoakes {
11858ac26843SLorenzo Stoakes /*
11868ac26843SLorenzo Stoakes * If we aren't pinning then no problematic write can occur. A long term
11878ac26843SLorenzo Stoakes * pin is the most egregious case so this is the case we disallow.
11888ac26843SLorenzo Stoakes */
11898ac26843SLorenzo Stoakes if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
11908ac26843SLorenzo Stoakes (FOLL_PIN | FOLL_LONGTERM))
11918ac26843SLorenzo Stoakes return true;
11928ac26843SLorenzo Stoakes
11938ac26843SLorenzo Stoakes /*
11948ac26843SLorenzo Stoakes * If the VMA does not require dirty tracking then no problematic write
11958ac26843SLorenzo Stoakes * can occur either.
11968ac26843SLorenzo Stoakes */
11978ac26843SLorenzo Stoakes return !vma_needs_dirty_tracking(vma);
11988ac26843SLorenzo Stoakes }
11998ac26843SLorenzo Stoakes
check_vma_flags(struct vm_area_struct * vma,unsigned long gup_flags)1200fa5bb209SKirill A. Shutemov static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
1201fa5bb209SKirill A. Shutemov {
1202fa5bb209SKirill A. Shutemov vm_flags_t vm_flags = vma->vm_flags;
12031b2ee126SDave Hansen int write = (gup_flags & FOLL_WRITE);
12041b2ee126SDave Hansen int foreign = (gup_flags & FOLL_REMOTE);
12058ac26843SLorenzo Stoakes bool vma_anon = vma_is_anonymous(vma);
1206fa5bb209SKirill A. Shutemov
1207fa5bb209SKirill A. Shutemov if (vm_flags & (VM_IO | VM_PFNMAP))
1208fa5bb209SKirill A. Shutemov return -EFAULT;
1209fa5bb209SKirill A. Shutemov
12108ac26843SLorenzo Stoakes if ((gup_flags & FOLL_ANON) && !vma_anon)
12117f7ccc2cSWilly Tarreau return -EFAULT;
12127f7ccc2cSWilly Tarreau
121352650c8bSJason Gunthorpe if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
121452650c8bSJason Gunthorpe return -EOPNOTSUPP;
121552650c8bSJason Gunthorpe
12168977752cSDavid Hildenbrand if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
12178977752cSDavid Hildenbrand return -EOPNOTSUPP;
12188977752cSDavid Hildenbrand
12191507f512SMike Rapoport if (vma_is_secretmem(vma))
12201507f512SMike Rapoport return -EFAULT;
12211507f512SMike Rapoport
12221b2ee126SDave Hansen if (write) {
12238ac26843SLorenzo Stoakes if (!vma_anon &&
12248ac26843SLorenzo Stoakes !writable_file_mapping_allowed(vma, gup_flags))
12258ac26843SLorenzo Stoakes return -EFAULT;
12268ac26843SLorenzo Stoakes
12276beb9958SRick Edgecombe if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
1228fa5bb209SKirill A. Shutemov if (!(gup_flags & FOLL_FORCE))
1229fa5bb209SKirill A. Shutemov return -EFAULT;
1230fa5bb209SKirill A. Shutemov /*
1231fa5bb209SKirill A. Shutemov * We used to let the write,force case do COW in a
1232fa5bb209SKirill A. Shutemov * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
1233fa5bb209SKirill A. Shutemov * set a breakpoint in a read-only mapping of an
1234fa5bb209SKirill A. Shutemov * executable, without corrupting the file (yet only
1235fa5bb209SKirill A. Shutemov * when that file had been opened for writing!).
1236fa5bb209SKirill A. Shutemov * Anon pages in shared mappings are surprising: now
1237fa5bb209SKirill A. Shutemov * just reject it.
1238fa5bb209SKirill A. Shutemov */
123946435364SHugh Dickins if (!is_cow_mapping(vm_flags))
1240fa5bb209SKirill A. Shutemov return -EFAULT;
1241fa5bb209SKirill A. Shutemov }
1242fa5bb209SKirill A. Shutemov } else if (!(vm_flags & VM_READ)) {
1243fa5bb209SKirill A. Shutemov if (!(gup_flags & FOLL_FORCE))
1244fa5bb209SKirill A. Shutemov return -EFAULT;
1245fa5bb209SKirill A. Shutemov /*
1246fa5bb209SKirill A. Shutemov * Is there actually any vma we can reach here which does not
1247fa5bb209SKirill A. Shutemov * have VM_MAYREAD set?
1248fa5bb209SKirill A. Shutemov */
1249fa5bb209SKirill A. Shutemov if (!(vm_flags & VM_MAYREAD))
1250fa5bb209SKirill A. Shutemov return -EFAULT;
1251fa5bb209SKirill A. Shutemov }
1252d61172b4SDave Hansen /*
1253d61172b4SDave Hansen * gups are always data accesses, not instruction
1254d61172b4SDave Hansen * fetches, so execute=false here
1255d61172b4SDave Hansen */
1256d61172b4SDave Hansen if (!arch_vma_access_permitted(vma, write, false, foreign))
125733a709b2SDave Hansen return -EFAULT;
1258fa5bb209SKirill A. Shutemov return 0;
1259fa5bb209SKirill A. Shutemov }
1260fa5bb209SKirill A. Shutemov
12616cd06ab1SLinus Torvalds /*
12626cd06ab1SLinus Torvalds * This is "vma_lookup()", but with a warning if we would have
12636cd06ab1SLinus Torvalds * historically expanded the stack in the GUP code.
12646cd06ab1SLinus Torvalds */
gup_vma_lookup(struct mm_struct * mm,unsigned long addr)12656cd06ab1SLinus Torvalds static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm,
12666cd06ab1SLinus Torvalds unsigned long addr)
12676cd06ab1SLinus Torvalds {
12686cd06ab1SLinus Torvalds #ifdef CONFIG_STACK_GROWSUP
12696cd06ab1SLinus Torvalds return vma_lookup(mm, addr);
12706cd06ab1SLinus Torvalds #else
12716cd06ab1SLinus Torvalds static volatile unsigned long next_warn;
12726cd06ab1SLinus Torvalds struct vm_area_struct *vma;
12736cd06ab1SLinus Torvalds unsigned long now, next;
12746cd06ab1SLinus Torvalds
12756cd06ab1SLinus Torvalds vma = find_vma(mm, addr);
12766cd06ab1SLinus Torvalds if (!vma || (addr >= vma->vm_start))
12776cd06ab1SLinus Torvalds return vma;
12786cd06ab1SLinus Torvalds
12796cd06ab1SLinus Torvalds /* Only warn for half-way relevant accesses */
12806cd06ab1SLinus Torvalds if (!(vma->vm_flags & VM_GROWSDOWN))
12816cd06ab1SLinus Torvalds return NULL;
12826cd06ab1SLinus Torvalds if (vma->vm_start - addr > 65536)
12836cd06ab1SLinus Torvalds return NULL;
12846cd06ab1SLinus Torvalds
12856cd06ab1SLinus Torvalds /* Let's not warn more than once an hour.. */
12866cd06ab1SLinus Torvalds now = jiffies; next = next_warn;
12876cd06ab1SLinus Torvalds if (next && time_before(now, next))
12886cd06ab1SLinus Torvalds return NULL;
12896cd06ab1SLinus Torvalds next_warn = now + 60*60*HZ;
12906cd06ab1SLinus Torvalds
12916cd06ab1SLinus Torvalds /* Let people know things may have changed. */
12926cd06ab1SLinus Torvalds pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n",
12936cd06ab1SLinus Torvalds current->comm, task_pid_nr(current),
12946cd06ab1SLinus Torvalds vma->vm_start, vma->vm_end, addr);
12956cd06ab1SLinus Torvalds dump_stack();
12966cd06ab1SLinus Torvalds return NULL;
12976cd06ab1SLinus Torvalds #endif
12986cd06ab1SLinus Torvalds }
12996cd06ab1SLinus Torvalds
13004bbd4c77SKirill A. Shutemov /**
13014bbd4c77SKirill A. Shutemov * __get_user_pages() - pin user pages in memory
13024bbd4c77SKirill A. Shutemov * @mm: mm_struct of target mm
13034bbd4c77SKirill A. Shutemov * @start: starting user address
13044bbd4c77SKirill A. Shutemov * @nr_pages: number of pages from start to pin
13054bbd4c77SKirill A. Shutemov * @gup_flags: flags modifying pin behaviour
13064bbd4c77SKirill A. Shutemov * @pages: array that receives pointers to the pages pinned.
13074bbd4c77SKirill A. Shutemov * Should be at least nr_pages long. Or NULL, if caller
13084bbd4c77SKirill A. Shutemov * only intends to ensure the pages are faulted in.
1309c1e8d7c6SMichel Lespinasse * @locked: whether we're still with the mmap_lock held
13104bbd4c77SKirill A. Shutemov *
1311d2dfbe47SLiu Xiang * Returns either number of pages pinned (which may be less than the
1312d2dfbe47SLiu Xiang * number requested), or an error. Details about the return value:
1313d2dfbe47SLiu Xiang *
1314d2dfbe47SLiu Xiang * -- If nr_pages is 0, returns 0.
1315d2dfbe47SLiu Xiang * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1316d2dfbe47SLiu Xiang * -- If nr_pages is >0, and some pages were pinned, returns the number of
1317d2dfbe47SLiu Xiang * pages pinned. Again, this may be less than nr_pages.
13182d3a36a4SMichal Hocko * -- 0 return value is possible when the fault would need to be retried.
1319d2dfbe47SLiu Xiang *
1320d2dfbe47SLiu Xiang * The caller is responsible for releasing returned @pages, via put_page().
1321d2dfbe47SLiu Xiang *
1322c1e8d7c6SMichel Lespinasse * Must be called with mmap_lock held. It may be released. See below.
13234bbd4c77SKirill A. Shutemov *
13244bbd4c77SKirill A. Shutemov * __get_user_pages walks a process's page tables and takes a reference to
13254bbd4c77SKirill A. Shutemov * each struct page that each user address corresponds to at a given
13264bbd4c77SKirill A. Shutemov * instant. That is, it takes the page that would be accessed if a user
13274bbd4c77SKirill A. Shutemov * thread accesses the given user virtual address at that instant.
13284bbd4c77SKirill A. Shutemov *
13294bbd4c77SKirill A. Shutemov * This does not guarantee that the page exists in the user mappings when
13304bbd4c77SKirill A. Shutemov * __get_user_pages returns, and there may even be a completely different
13314bbd4c77SKirill A. Shutemov * page there in some cases (eg. if mmapped pagecache has been invalidated
1332c5acf1f6SJongwoo Han * and subsequently re-faulted). However it does guarantee that the page
13334bbd4c77SKirill A. Shutemov * won't be freed completely. And mostly callers simply care that the page
13344bbd4c77SKirill A. Shutemov * contains data that was valid *at some point in time*. Typically, an IO
13354bbd4c77SKirill A. Shutemov * or similar operation cannot guarantee anything stronger anyway because
13364bbd4c77SKirill A. Shutemov * locks can't be held over the syscall boundary.
13374bbd4c77SKirill A. Shutemov *
13384bbd4c77SKirill A. Shutemov * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
13394bbd4c77SKirill A. Shutemov * the page is written to, set_page_dirty (or set_page_dirty_lock, as
13404bbd4c77SKirill A. Shutemov * appropriate) must be called after the page is finished with, and
13414bbd4c77SKirill A. Shutemov * before put_page is called.
13424bbd4c77SKirill A. Shutemov *
13439a863a6aSJason Gunthorpe * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
13449a863a6aSJason Gunthorpe * be released. If this happens *@locked will be set to 0 on return.
13459a95f3cfSPaul Cassella *
13469a863a6aSJason Gunthorpe * A caller using such a combination of @gup_flags must therefore hold the
13479a863a6aSJason Gunthorpe * mmap_lock for reading only, and recognize when it's been released. Otherwise,
13489a863a6aSJason Gunthorpe * it must be held for either reading or writing and will not be released.
13494bbd4c77SKirill A. Shutemov *
13504bbd4c77SKirill A. Shutemov * In most cases, get_user_pages or get_user_pages_fast should be used
13514bbd4c77SKirill A. Shutemov * instead of __get_user_pages. __get_user_pages should be used only if
13524bbd4c77SKirill A. Shutemov * you need some special @gup_flags.
13534bbd4c77SKirill A. Shutemov */
__get_user_pages(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages,int * locked)135464019a2eSPeter Xu static long __get_user_pages(struct mm_struct *mm,
13554bbd4c77SKirill A. Shutemov unsigned long start, unsigned long nr_pages,
13564bbd4c77SKirill A. Shutemov unsigned int gup_flags, struct page **pages,
1357b2cac248SLorenzo Stoakes int *locked)
13584bbd4c77SKirill A. Shutemov {
1359df06b37fSKeith Busch long ret = 0, i = 0;
1360fa5bb209SKirill A. Shutemov struct vm_area_struct *vma = NULL;
1361d3f7922bSAlistair Popple unsigned long page_mask = 0;
13624bbd4c77SKirill A. Shutemov
13634bbd4c77SKirill A. Shutemov if (!nr_pages)
13644bbd4c77SKirill A. Shutemov return 0;
13654bbd4c77SKirill A. Shutemov
1366428e106aSKirill A. Shutemov start = untagged_addr_remote(mm, start);
1367f9652594SAndrey Konovalov
1368ede27b7eSBaoquan He VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
1369ede27b7eSBaoquan He
1370ede27b7eSBaoquan He /* FOLL_GET and FOLL_PIN are mutually exclusive. */
1371ede27b7eSBaoquan He VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
1372ede27b7eSBaoquan He (FOLL_PIN | FOLL_GET));
13734bbd4c77SKirill A. Shutemov
13744bbd4c77SKirill A. Shutemov do {
13754bbd4c77SKirill A. Shutemov struct page *page;
13764bbd4c77SKirill A. Shutemov unsigned int page_increm;
13774bbd4c77SKirill A. Shutemov
1378fa5bb209SKirill A. Shutemov /* first iteration or cross vma bound */
1379fa5bb209SKirill A. Shutemov if (!vma || start >= vma->vm_end) {
1380631426baSDavid Hildenbrand /*
1381631426baSDavid Hildenbrand * MADV_POPULATE_(READ|WRITE) wants to handle VMA
1382631426baSDavid Hildenbrand * lookups+error reporting differently.
1383631426baSDavid Hildenbrand */
1384631426baSDavid Hildenbrand if (gup_flags & FOLL_MADV_POPULATE) {
1385631426baSDavid Hildenbrand vma = vma_lookup(mm, start);
1386631426baSDavid Hildenbrand if (!vma) {
1387631426baSDavid Hildenbrand ret = -ENOMEM;
1388631426baSDavid Hildenbrand goto out;
1389631426baSDavid Hildenbrand }
1390631426baSDavid Hildenbrand if (check_vma_flags(vma, gup_flags)) {
1391631426baSDavid Hildenbrand ret = -EINVAL;
1392631426baSDavid Hildenbrand goto out;
1393631426baSDavid Hildenbrand }
1394631426baSDavid Hildenbrand goto retry;
1395631426baSDavid Hildenbrand }
13966cd06ab1SLinus Torvalds vma = gup_vma_lookup(mm, start);
1397fa5bb209SKirill A. Shutemov if (!vma && in_gate_area(mm, start)) {
1398fa5bb209SKirill A. Shutemov ret = get_gate_page(mm, start & PAGE_MASK,
1399fa5bb209SKirill A. Shutemov gup_flags, &vma,
1400ffe1e786SPeter Xu pages ? &page : NULL);
1401fa5bb209SKirill A. Shutemov if (ret)
140208be37b7SJohn Hubbard goto out;
1403d3f7922bSAlistair Popple page_mask = 0;
1404fa5bb209SKirill A. Shutemov goto next_page;
1405fa5bb209SKirill A. Shutemov }
1406fa5bb209SKirill A. Shutemov
140752650c8bSJason Gunthorpe if (!vma) {
1408df06b37fSKeith Busch ret = -EFAULT;
1409df06b37fSKeith Busch goto out;
1410df06b37fSKeith Busch }
141152650c8bSJason Gunthorpe ret = check_vma_flags(vma, gup_flags);
141252650c8bSJason Gunthorpe if (ret)
141352650c8bSJason Gunthorpe goto out;
1414fa5bb209SKirill A. Shutemov }
1415fa5bb209SKirill A. Shutemov retry:
14164bbd4c77SKirill A. Shutemov /*
1417fa5bb209SKirill A. Shutemov * If we have a pending SIGKILL, don't keep faulting pages and
1418fa5bb209SKirill A. Shutemov * potentially allocating memory.
14194bbd4c77SKirill A. Shutemov */
1420fa45f116SDavidlohr Bueso if (fatal_signal_pending(current)) {
1421d180870dSMichal Hocko ret = -EINTR;
1422df06b37fSKeith Busch goto out;
1423df06b37fSKeith Busch }
14244bbd4c77SKirill A. Shutemov cond_resched();
1425df06b37fSKeith Busch
1426d3f7922bSAlistair Popple page = follow_page_mask(vma, start, gup_flags, &page_mask);
1427a7f22660SDavid Hildenbrand if (!page || PTR_ERR(page) == -EMLINK) {
1428dc21e700SJosef Bacik ret = faultin_page(vma, start, gup_flags,
1429a7f22660SDavid Hildenbrand PTR_ERR(page) == -EMLINK, locked);
143016744483SKirill A. Shutemov switch (ret) {
143116744483SKirill A. Shutemov case 0:
1432fa5bb209SKirill A. Shutemov goto retry;
1433df06b37fSKeith Busch case -EBUSY:
1434d9272525SPeter Xu case -EAGAIN:
1435df06b37fSKeith Busch ret = 0;
1436e4a9bc58SJoe Perches fallthrough;
143716744483SKirill A. Shutemov case -EFAULT:
143816744483SKirill A. Shutemov case -ENOMEM:
143916744483SKirill A. Shutemov case -EHWPOISON:
1440df06b37fSKeith Busch goto out;
14414bbd4c77SKirill A. Shutemov }
1442fa5bb209SKirill A. Shutemov BUG();
14431027e443SKirill A. Shutemov } else if (PTR_ERR(page) == -EEXIST) {
14441027e443SKirill A. Shutemov /*
14451027e443SKirill A. Shutemov * Proper page table entry exists, but no corresponding
144665462462SJohn Hubbard * struct page. If the caller expects **pages to be
144765462462SJohn Hubbard * filled in, bail out now, because that can't be done
144865462462SJohn Hubbard * for this page.
14491027e443SKirill A. Shutemov */
145065462462SJohn Hubbard if (pages) {
145165462462SJohn Hubbard ret = PTR_ERR(page);
145265462462SJohn Hubbard goto out;
145365462462SJohn Hubbard }
14541027e443SKirill A. Shutemov } else if (IS_ERR(page)) {
1455df06b37fSKeith Busch ret = PTR_ERR(page);
1456df06b37fSKeith Busch goto out;
14571027e443SKirill A. Shutemov }
1458ffe1e786SPeter Xu next_page:
1459d3f7922bSAlistair Popple page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
14604bbd4c77SKirill A. Shutemov if (page_increm > nr_pages)
14614bbd4c77SKirill A. Shutemov page_increm = nr_pages;
146257edfcfdSPeter Xu
146357edfcfdSPeter Xu if (pages) {
146457edfcfdSPeter Xu struct page *subpage;
146557edfcfdSPeter Xu unsigned int j;
146657edfcfdSPeter Xu
146757edfcfdSPeter Xu /*
146857edfcfdSPeter Xu * This must be a large folio (and doesn't need to
146957edfcfdSPeter Xu * be the whole folio; it can be part of it), do
147057edfcfdSPeter Xu * the refcount work for all the subpages too.
147157edfcfdSPeter Xu *
147257edfcfdSPeter Xu * NOTE: here the page may not be the head page
147357edfcfdSPeter Xu * e.g. when start addr is not thp-size aligned.
147457edfcfdSPeter Xu * try_grab_folio() should have taken care of tail
147557edfcfdSPeter Xu * pages.
147657edfcfdSPeter Xu */
147757edfcfdSPeter Xu if (page_increm > 1) {
1478f442fa61SYang Shi struct folio *folio = page_folio(page);
147957edfcfdSPeter Xu
148057edfcfdSPeter Xu /*
148157edfcfdSPeter Xu * Since we already hold refcount on the
148257edfcfdSPeter Xu * large folio, this should never fail.
148357edfcfdSPeter Xu */
1484f442fa61SYang Shi if (try_grab_folio(folio, page_increm - 1,
1485dc21e700SJosef Bacik gup_flags)) {
148657edfcfdSPeter Xu /*
148757edfcfdSPeter Xu * Release the 1st page ref if the
148857edfcfdSPeter Xu * folio is problematic, fail hard.
148957edfcfdSPeter Xu */
1490dc21e700SJosef Bacik gup_put_folio(folio, 1, gup_flags);
149157edfcfdSPeter Xu ret = -EFAULT;
149257edfcfdSPeter Xu goto out;
149357edfcfdSPeter Xu }
149457edfcfdSPeter Xu }
149557edfcfdSPeter Xu
149657edfcfdSPeter Xu for (j = 0; j < page_increm; j++) {
1497541541dbSDavid Hildenbrand subpage = page + j;
149857edfcfdSPeter Xu pages[i + j] = subpage;
149957edfcfdSPeter Xu flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
150057edfcfdSPeter Xu flush_dcache_page(subpage);
150157edfcfdSPeter Xu }
150257edfcfdSPeter Xu }
150357edfcfdSPeter Xu
15044bbd4c77SKirill A. Shutemov i += page_increm;
15054bbd4c77SKirill A. Shutemov start += page_increm * PAGE_SIZE;
15064bbd4c77SKirill A. Shutemov nr_pages -= page_increm;
15074bbd4c77SKirill A. Shutemov } while (nr_pages);
1508df06b37fSKeith Busch out:
1509df06b37fSKeith Busch return i ? i : ret;
15104bbd4c77SKirill A. Shutemov }
15114bbd4c77SKirill A. Shutemov
vma_permits_fault(struct vm_area_struct * vma,unsigned int fault_flags)1512771ab430STobias Klauser static bool vma_permits_fault(struct vm_area_struct *vma,
1513771ab430STobias Klauser unsigned int fault_flags)
1514d4925e00SDave Hansen {
151533a709b2SDave Hansen bool write = !!(fault_flags & FAULT_FLAG_WRITE);
15161b2ee126SDave Hansen bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
151733a709b2SDave Hansen vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
1518d4925e00SDave Hansen
1519d4925e00SDave Hansen if (!(vm_flags & vma->vm_flags))
1520d4925e00SDave Hansen return false;
1521d4925e00SDave Hansen
152233a709b2SDave Hansen /*
152333a709b2SDave Hansen * The architecture might have a hardware protection
15241b2ee126SDave Hansen * mechanism other than read/write that can deny access.
1525d61172b4SDave Hansen *
1526d61172b4SDave Hansen * gup always represents data access, not instruction
1527d61172b4SDave Hansen * fetches, so execute=false here:
152833a709b2SDave Hansen */
1529d61172b4SDave Hansen if (!arch_vma_access_permitted(vma, write, false, foreign))
153033a709b2SDave Hansen return false;
153133a709b2SDave Hansen
1532d4925e00SDave Hansen return true;
1533d4925e00SDave Hansen }
1534d4925e00SDave Hansen
1535adc8cb40SSouptick Joarder /**
15364bbd4c77SKirill A. Shutemov * fixup_user_fault() - manually resolve a user page fault
15374bbd4c77SKirill A. Shutemov * @mm: mm_struct of target mm
15384bbd4c77SKirill A. Shutemov * @address: user address
15394bbd4c77SKirill A. Shutemov * @fault_flags:flags to pass down to handle_mm_fault()
1540c1e8d7c6SMichel Lespinasse * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller
1541548b6a1eSMiles Chen * does not allow retry. If NULL, the caller must guarantee
1542548b6a1eSMiles Chen * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
15434bbd4c77SKirill A. Shutemov *
15444bbd4c77SKirill A. Shutemov * This is meant to be called in the specific scenario where for locking reasons
15454bbd4c77SKirill A. Shutemov * we try to access user memory in atomic context (within a pagefault_disable()
15464bbd4c77SKirill A. Shutemov * section), this returns -EFAULT, and we want to resolve the user fault before
15474bbd4c77SKirill A. Shutemov * trying again.
15484bbd4c77SKirill A. Shutemov *
15494bbd4c77SKirill A. Shutemov * Typically this is meant to be used by the futex code.
15504bbd4c77SKirill A. Shutemov *
15514bbd4c77SKirill A. Shutemov * The main difference with get_user_pages() is that this function will
15524bbd4c77SKirill A. Shutemov * unconditionally call handle_mm_fault() which will in turn perform all the
15534bbd4c77SKirill A. Shutemov * necessary SW fixup of the dirty and young bits in the PTE, while
15544a9e1cdaSDominik Dingel * get_user_pages() only guarantees to update these in the struct page.
15554bbd4c77SKirill A. Shutemov *
15564bbd4c77SKirill A. Shutemov * This is important for some architectures where those bits also gate the
15574bbd4c77SKirill A. Shutemov * access permission to the page because they are maintained in software. On
15584bbd4c77SKirill A. Shutemov * such architectures, gup() will not be enough to make a subsequent access
15594bbd4c77SKirill A. Shutemov * succeed.
15604bbd4c77SKirill A. Shutemov *
1561c1e8d7c6SMichel Lespinasse * This function will not return with an unlocked mmap_lock. So it has not the
1562c1e8d7c6SMichel Lespinasse * same semantics wrt the @mm->mmap_lock as does filemap_fault().
15634bbd4c77SKirill A. Shutemov */
fixup_user_fault(struct mm_struct * mm,unsigned long address,unsigned int fault_flags,bool * unlocked)156464019a2eSPeter Xu int fixup_user_fault(struct mm_struct *mm,
15654a9e1cdaSDominik Dingel unsigned long address, unsigned int fault_flags,
15664a9e1cdaSDominik Dingel bool *unlocked)
15674bbd4c77SKirill A. Shutemov {
15684bbd4c77SKirill A. Shutemov struct vm_area_struct *vma;
15698fed2f3cSMiaohe Lin vm_fault_t ret;
15704bbd4c77SKirill A. Shutemov
1571428e106aSKirill A. Shutemov address = untagged_addr_remote(mm, address);
1572f9652594SAndrey Konovalov
15734a9e1cdaSDominik Dingel if (unlocked)
157471335f37SPeter Xu fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
15754a9e1cdaSDominik Dingel
15764a9e1cdaSDominik Dingel retry:
15776cd06ab1SLinus Torvalds vma = gup_vma_lookup(mm, address);
15788d7071afSLinus Torvalds if (!vma)
15794bbd4c77SKirill A. Shutemov return -EFAULT;
15804bbd4c77SKirill A. Shutemov
1581d4925e00SDave Hansen if (!vma_permits_fault(vma, fault_flags))
15824bbd4c77SKirill A. Shutemov return -EFAULT;
15834bbd4c77SKirill A. Shutemov
1584475f4dfcSPeter Xu if ((fault_flags & FAULT_FLAG_KILLABLE) &&
1585475f4dfcSPeter Xu fatal_signal_pending(current))
1586475f4dfcSPeter Xu return -EINTR;
1587475f4dfcSPeter Xu
1588bce617edSPeter Xu ret = handle_mm_fault(vma, address, fault_flags, NULL);
1589d9272525SPeter Xu
1590d9272525SPeter Xu if (ret & VM_FAULT_COMPLETED) {
1591d9272525SPeter Xu /*
1592d9272525SPeter Xu * NOTE: it's a pity that we need to retake the lock here
1593d9272525SPeter Xu * to pair with the unlock() in the callers. Ideally we
1594d9272525SPeter Xu * could tell the callers so they do not need to unlock.
1595d9272525SPeter Xu */
1596d9272525SPeter Xu mmap_read_lock(mm);
1597d9272525SPeter Xu *unlocked = true;
1598d9272525SPeter Xu return 0;
1599d9272525SPeter Xu }
1600d9272525SPeter Xu
16014bbd4c77SKirill A. Shutemov if (ret & VM_FAULT_ERROR) {
16029a291a7cSJames Morse int err = vm_fault_to_errno(ret, 0);
16039a291a7cSJames Morse
16049a291a7cSJames Morse if (err)
16059a291a7cSJames Morse return err;
16064bbd4c77SKirill A. Shutemov BUG();
16074bbd4c77SKirill A. Shutemov }
16084a9e1cdaSDominik Dingel
16094a9e1cdaSDominik Dingel if (ret & VM_FAULT_RETRY) {
1610d8ed45c5SMichel Lespinasse mmap_read_lock(mm);
16114a9e1cdaSDominik Dingel *unlocked = true;
16124a9e1cdaSDominik Dingel fault_flags |= FAULT_FLAG_TRIED;
16134a9e1cdaSDominik Dingel goto retry;
16144a9e1cdaSDominik Dingel }
16154a9e1cdaSDominik Dingel
16164bbd4c77SKirill A. Shutemov return 0;
16174bbd4c77SKirill A. Shutemov }
1618add6a0cdSPaolo Bonzini EXPORT_SYMBOL_GPL(fixup_user_fault);
16194bbd4c77SKirill A. Shutemov
16202d3a36a4SMichal Hocko /*
162193c5c61dSPeter Xu * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is
162293c5c61dSPeter Xu * specified, it'll also respond to generic signals. The caller of GUP
162393c5c61dSPeter Xu * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
162493c5c61dSPeter Xu */
gup_signal_pending(unsigned int flags)162593c5c61dSPeter Xu static bool gup_signal_pending(unsigned int flags)
162693c5c61dSPeter Xu {
162793c5c61dSPeter Xu if (fatal_signal_pending(current))
162893c5c61dSPeter Xu return true;
162993c5c61dSPeter Xu
163093c5c61dSPeter Xu if (!(flags & FOLL_INTERRUPTIBLE))
163193c5c61dSPeter Xu return false;
163293c5c61dSPeter Xu
163393c5c61dSPeter Xu return signal_pending(current);
163493c5c61dSPeter Xu }
163593c5c61dSPeter Xu
163693c5c61dSPeter Xu /*
1637b2a72dffSJason Gunthorpe * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
1638b2a72dffSJason Gunthorpe * the caller. This function may drop the mmap_lock. If it does so, then it will
1639b2a72dffSJason Gunthorpe * set (*locked = 0).
1640b2a72dffSJason Gunthorpe *
1641b2a72dffSJason Gunthorpe * (*locked == 0) means that the caller expects this function to acquire and
1642b2a72dffSJason Gunthorpe * drop the mmap_lock. Therefore, the value of *locked will still be zero when
1643b2a72dffSJason Gunthorpe * the function returns, even though it may have changed temporarily during
1644b2a72dffSJason Gunthorpe * function execution.
1645b2a72dffSJason Gunthorpe *
1646b2a72dffSJason Gunthorpe * Please note that this function, unlike __get_user_pages(), will not return 0
1647b2a72dffSJason Gunthorpe * for nr_pages > 0, unless FOLL_NOWAIT is used.
16482d3a36a4SMichal Hocko */
__get_user_pages_locked(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,struct page ** pages,int * locked,unsigned int flags)164964019a2eSPeter Xu static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
1650f0818f47SAndrea Arcangeli unsigned long start,
1651f0818f47SAndrea Arcangeli unsigned long nr_pages,
1652f0818f47SAndrea Arcangeli struct page **pages,
1653e716712fSAl Viro int *locked,
16540fd71a56SAndrea Arcangeli unsigned int flags)
1655f0818f47SAndrea Arcangeli {
1656f0818f47SAndrea Arcangeli long ret, pages_done;
1657b2a72dffSJason Gunthorpe bool must_unlock = false;
1658f0818f47SAndrea Arcangeli
16599c4b2142SLorenzo Stoakes if (!nr_pages)
16609c4b2142SLorenzo Stoakes return 0;
16619c4b2142SLorenzo Stoakes
1662b2a72dffSJason Gunthorpe /*
1663b2a72dffSJason Gunthorpe * The internal caller expects GUP to manage the lock internally and the
1664b2a72dffSJason Gunthorpe * lock must be released when this returns.
1665b2a72dffSJason Gunthorpe */
16669a863a6aSJason Gunthorpe if (!*locked) {
1667b2a72dffSJason Gunthorpe if (mmap_read_lock_killable(mm))
1668b2a72dffSJason Gunthorpe return -EAGAIN;
1669b2a72dffSJason Gunthorpe must_unlock = true;
1670b2a72dffSJason Gunthorpe *locked = 1;
1671f0818f47SAndrea Arcangeli }
1672961ba472SJason Gunthorpe else
1673961ba472SJason Gunthorpe mmap_assert_locked(mm);
1674f0818f47SAndrea Arcangeli
1675a458b76aSAndrea Arcangeli if (flags & FOLL_PIN)
167612e423baSLorenzo Stoakes mm_set_has_pinned_flag(mm);
1677008cfe44SPeter Xu
1678eddb1c22SJohn Hubbard /*
1679eddb1c22SJohn Hubbard * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
1680eddb1c22SJohn Hubbard * is to set FOLL_GET if the caller wants pages[] filled in (but has
1681eddb1c22SJohn Hubbard * carelessly failed to specify FOLL_GET), so keep doing that, but only
1682eddb1c22SJohn Hubbard * for FOLL_GET, not for the newer FOLL_PIN.
1683eddb1c22SJohn Hubbard *
1684eddb1c22SJohn Hubbard * FOLL_PIN always expects pages to be non-null, but no need to assert
1685eddb1c22SJohn Hubbard * that here, as any failures will be obvious enough.
1686eddb1c22SJohn Hubbard */
1687eddb1c22SJohn Hubbard if (pages && !(flags & FOLL_PIN))
1688f0818f47SAndrea Arcangeli flags |= FOLL_GET;
1689f0818f47SAndrea Arcangeli
1690f0818f47SAndrea Arcangeli pages_done = 0;
1691f0818f47SAndrea Arcangeli for (;;) {
169264019a2eSPeter Xu ret = __get_user_pages(mm, start, nr_pages, flags, pages,
1693b2cac248SLorenzo Stoakes locked);
1694f04740f5SJason Gunthorpe if (!(flags & FOLL_UNLOCKABLE)) {
1695f0818f47SAndrea Arcangeli /* VM_FAULT_RETRY couldn't trigger, bypass */
1696f04740f5SJason Gunthorpe pages_done = ret;
1697f04740f5SJason Gunthorpe break;
1698f04740f5SJason Gunthorpe }
1699f0818f47SAndrea Arcangeli
1700d9272525SPeter Xu /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
1701792b429dSDavid Hildenbrand VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages));
1702f0818f47SAndrea Arcangeli
1703f0818f47SAndrea Arcangeli if (ret > 0) {
1704f0818f47SAndrea Arcangeli nr_pages -= ret;
1705f0818f47SAndrea Arcangeli pages_done += ret;
1706f0818f47SAndrea Arcangeli if (!nr_pages)
1707f0818f47SAndrea Arcangeli break;
1708f0818f47SAndrea Arcangeli }
1709f0818f47SAndrea Arcangeli if (*locked) {
171096312e61SAndrea Arcangeli /*
171196312e61SAndrea Arcangeli * VM_FAULT_RETRY didn't trigger or it was a
171296312e61SAndrea Arcangeli * FOLL_NOWAIT.
171396312e61SAndrea Arcangeli */
1714f0818f47SAndrea Arcangeli if (!pages_done)
1715f0818f47SAndrea Arcangeli pages_done = ret;
1716f0818f47SAndrea Arcangeli break;
1717f0818f47SAndrea Arcangeli }
1718df17277bSMike Rapoport /*
1719df17277bSMike Rapoport * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1720df17277bSMike Rapoport * For the prefault case (!pages) we only update counts.
1721df17277bSMike Rapoport */
1722df17277bSMike Rapoport if (likely(pages))
1723f0818f47SAndrea Arcangeli pages += ret;
1724f0818f47SAndrea Arcangeli start += ret << PAGE_SHIFT;
1725b2a72dffSJason Gunthorpe
1726b2a72dffSJason Gunthorpe /* The lock was temporarily dropped, so we must unlock later */
1727b2a72dffSJason Gunthorpe must_unlock = true;
1728f0818f47SAndrea Arcangeli
17294426e945SPeter Xu retry:
1730f0818f47SAndrea Arcangeli /*
1731f0818f47SAndrea Arcangeli * Repeat on the address that fired VM_FAULT_RETRY
17324426e945SPeter Xu * with both FAULT_FLAG_ALLOW_RETRY and
17334426e945SPeter Xu * FAULT_FLAG_TRIED. Note that GUP can be interrupted
173493c5c61dSPeter Xu * by fatal signals of even common signals, depending on
173593c5c61dSPeter Xu * the caller's request. So we need to check it before we
17364426e945SPeter Xu * start trying again otherwise it can loop forever.
1737f0818f47SAndrea Arcangeli */
173893c5c61dSPeter Xu if (gup_signal_pending(flags)) {
1739ae46d2aaSHillf Danton if (!pages_done)
1740ae46d2aaSHillf Danton pages_done = -EINTR;
17414426e945SPeter Xu break;
1742ae46d2aaSHillf Danton }
17434426e945SPeter Xu
1744d8ed45c5SMichel Lespinasse ret = mmap_read_lock_killable(mm);
174571335f37SPeter Xu if (ret) {
174671335f37SPeter Xu if (!pages_done)
174771335f37SPeter Xu pages_done = ret;
174871335f37SPeter Xu break;
174971335f37SPeter Xu }
17504426e945SPeter Xu
1751c7b6a566SPeter Xu *locked = 1;
175264019a2eSPeter Xu ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
1753b2cac248SLorenzo Stoakes pages, locked);
17544426e945SPeter Xu if (!*locked) {
17554426e945SPeter Xu /* Continue to retry until we succeeded */
1756792b429dSDavid Hildenbrand VM_WARN_ON_ONCE(ret != 0);
17574426e945SPeter Xu goto retry;
17584426e945SPeter Xu }
1759f0818f47SAndrea Arcangeli if (ret != 1) {
1760792b429dSDavid Hildenbrand VM_WARN_ON_ONCE(ret > 1);
1761f0818f47SAndrea Arcangeli if (!pages_done)
1762f0818f47SAndrea Arcangeli pages_done = ret;
1763f0818f47SAndrea Arcangeli break;
1764f0818f47SAndrea Arcangeli }
1765f0818f47SAndrea Arcangeli nr_pages--;
1766f0818f47SAndrea Arcangeli pages_done++;
1767f0818f47SAndrea Arcangeli if (!nr_pages)
1768f0818f47SAndrea Arcangeli break;
1769df17277bSMike Rapoport if (likely(pages))
1770f0818f47SAndrea Arcangeli pages++;
1771f0818f47SAndrea Arcangeli start += PAGE_SIZE;
1772f0818f47SAndrea Arcangeli }
1773b2a72dffSJason Gunthorpe if (must_unlock && *locked) {
1774f0818f47SAndrea Arcangeli /*
1775b2a72dffSJason Gunthorpe * We either temporarily dropped the lock, or the caller
1776b2a72dffSJason Gunthorpe * requested that we both acquire and drop the lock. Either way,
1777b2a72dffSJason Gunthorpe * we must now unlock, and notify the caller of that state.
1778f0818f47SAndrea Arcangeli */
1779d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
1780f0818f47SAndrea Arcangeli *locked = 0;
1781f0818f47SAndrea Arcangeli }
17829c4b2142SLorenzo Stoakes
17839c4b2142SLorenzo Stoakes /*
17849c4b2142SLorenzo Stoakes * Failing to pin anything implies something has gone wrong (except when
17859c4b2142SLorenzo Stoakes * FOLL_NOWAIT is specified).
17869c4b2142SLorenzo Stoakes */
17879c4b2142SLorenzo Stoakes if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT)))
17889c4b2142SLorenzo Stoakes return -EFAULT;
17899c4b2142SLorenzo Stoakes
1790f0818f47SAndrea Arcangeli return pages_done;
1791f0818f47SAndrea Arcangeli }
1792f0818f47SAndrea Arcangeli
1793d3649f68SChristoph Hellwig /**
1794d3649f68SChristoph Hellwig * populate_vma_page_range() - populate a range of pages in the vma.
1795d3649f68SChristoph Hellwig * @vma: target vma
1796d3649f68SChristoph Hellwig * @start: start address
1797d3649f68SChristoph Hellwig * @end: end address
1798c1e8d7c6SMichel Lespinasse * @locked: whether the mmap_lock is still held
1799d3649f68SChristoph Hellwig *
1800d3649f68SChristoph Hellwig * This takes care of mlocking the pages too if VM_LOCKED is set.
1801d3649f68SChristoph Hellwig *
18020a36f7f8STang Yizhou * Return either number of pages pinned in the vma, or a negative error
18030a36f7f8STang Yizhou * code on error.
1804d3649f68SChristoph Hellwig *
1805c1e8d7c6SMichel Lespinasse * vma->vm_mm->mmap_lock must be held.
1806d3649f68SChristoph Hellwig *
18074f6da934SPeter Xu * If @locked is NULL, it may be held for read or write and will
1808d3649f68SChristoph Hellwig * be unperturbed.
1809d3649f68SChristoph Hellwig *
18104f6da934SPeter Xu * If @locked is non-NULL, it must held for read only and may be
18114f6da934SPeter Xu * released. If it's released, *@locked will be set to 0.
1812d3649f68SChristoph Hellwig */
populate_vma_page_range(struct vm_area_struct * vma,unsigned long start,unsigned long end,int * locked)1813d3649f68SChristoph Hellwig long populate_vma_page_range(struct vm_area_struct *vma,
18144f6da934SPeter Xu unsigned long start, unsigned long end, int *locked)
1815d3649f68SChristoph Hellwig {
1816d3649f68SChristoph Hellwig struct mm_struct *mm = vma->vm_mm;
1817d3649f68SChristoph Hellwig unsigned long nr_pages = (end - start) / PAGE_SIZE;
18189a863a6aSJason Gunthorpe int local_locked = 1;
1819d3649f68SChristoph Hellwig int gup_flags;
1820ece369c7SHugh Dickins long ret;
1821d3649f68SChristoph Hellwig
1822792b429dSDavid Hildenbrand VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
1823792b429dSDavid Hildenbrand VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
1824792b429dSDavid Hildenbrand VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma);
1825792b429dSDavid Hildenbrand VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma);
182642fc5414SMichel Lespinasse mmap_assert_locked(mm);
1827d3649f68SChristoph Hellwig
1828b67bf49cSHugh Dickins /*
1829b67bf49cSHugh Dickins * Rightly or wrongly, the VM_LOCKONFAULT case has never used
1830b67bf49cSHugh Dickins * faultin_page() to break COW, so it has no work to do here.
1831b67bf49cSHugh Dickins */
1832d3649f68SChristoph Hellwig if (vma->vm_flags & VM_LOCKONFAULT)
1833b67bf49cSHugh Dickins return nr_pages;
1834b67bf49cSHugh Dickins
18351096bc93SLinus Torvalds /* ... similarly, we've never faulted in PROT_NONE pages */
18361096bc93SLinus Torvalds if (!vma_is_accessible(vma))
18371096bc93SLinus Torvalds return -EFAULT;
18381096bc93SLinus Torvalds
1839b67bf49cSHugh Dickins gup_flags = FOLL_TOUCH;
1840d3649f68SChristoph Hellwig /*
1841d3649f68SChristoph Hellwig * We want to touch writable mappings with a write fault in order
1842d3649f68SChristoph Hellwig * to break COW, except for shared mappings because these don't COW
1843d3649f68SChristoph Hellwig * and we would not want to dirty them for nothing.
18441096bc93SLinus Torvalds *
18451096bc93SLinus Torvalds * Otherwise, do a read fault, and use FOLL_FORCE in case it's not
18461096bc93SLinus Torvalds * readable (ie write-only or executable).
1847d3649f68SChristoph Hellwig */
1848d3649f68SChristoph Hellwig if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1849d3649f68SChristoph Hellwig gup_flags |= FOLL_WRITE;
18501096bc93SLinus Torvalds else
1851d3649f68SChristoph Hellwig gup_flags |= FOLL_FORCE;
1852d3649f68SChristoph Hellwig
1853f04740f5SJason Gunthorpe if (locked)
1854f04740f5SJason Gunthorpe gup_flags |= FOLL_UNLOCKABLE;
1855f04740f5SJason Gunthorpe
1856d3649f68SChristoph Hellwig /*
1857d3649f68SChristoph Hellwig * We made sure addr is within a VMA, so the following will
1858d3649f68SChristoph Hellwig * not result in a stack expansion that recurses back here.
1859d3649f68SChristoph Hellwig */
1860ece369c7SHugh Dickins ret = __get_user_pages(mm, start, nr_pages, gup_flags,
1861b2cac248SLorenzo Stoakes NULL, locked ? locked : &local_locked);
1862ece369c7SHugh Dickins lru_add_drain();
1863ece369c7SHugh Dickins return ret;
1864d3649f68SChristoph Hellwig }
1865d3649f68SChristoph Hellwig
1866d3649f68SChristoph Hellwig /*
1867631426baSDavid Hildenbrand * faultin_page_range() - populate (prefault) page tables inside the
1868631426baSDavid Hildenbrand * given range readable/writable
18694ca9b385SDavid Hildenbrand *
18704ca9b385SDavid Hildenbrand * This takes care of mlocking the pages, too, if VM_LOCKED is set.
18714ca9b385SDavid Hildenbrand *
1872631426baSDavid Hildenbrand * @mm: the mm to populate page tables in
18734ca9b385SDavid Hildenbrand * @start: start address
18744ca9b385SDavid Hildenbrand * @end: end address
18754ca9b385SDavid Hildenbrand * @write: whether to prefault readable or writable
18764ca9b385SDavid Hildenbrand * @locked: whether the mmap_lock is still held
18774ca9b385SDavid Hildenbrand *
1878631426baSDavid Hildenbrand * Returns either number of processed pages in the MM, or a negative error
1879631426baSDavid Hildenbrand * code on error (see __get_user_pages()). Note that this function reports
1880631426baSDavid Hildenbrand * errors related to VMAs, such as incompatible mappings, as expected by
1881631426baSDavid Hildenbrand * MADV_POPULATE_(READ|WRITE).
18824ca9b385SDavid Hildenbrand *
1883631426baSDavid Hildenbrand * The range must be page-aligned.
1884631426baSDavid Hildenbrand *
1885631426baSDavid Hildenbrand * mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
18864ca9b385SDavid Hildenbrand */
faultin_page_range(struct mm_struct * mm,unsigned long start,unsigned long end,bool write,int * locked)1887631426baSDavid Hildenbrand long faultin_page_range(struct mm_struct *mm, unsigned long start,
18884ca9b385SDavid Hildenbrand unsigned long end, bool write, int *locked)
18894ca9b385SDavid Hildenbrand {
18904ca9b385SDavid Hildenbrand unsigned long nr_pages = (end - start) / PAGE_SIZE;
18914ca9b385SDavid Hildenbrand int gup_flags;
1892ece369c7SHugh Dickins long ret;
18934ca9b385SDavid Hildenbrand
1894792b429dSDavid Hildenbrand VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
1895792b429dSDavid Hildenbrand VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
18964ca9b385SDavid Hildenbrand mmap_assert_locked(mm);
18974ca9b385SDavid Hildenbrand
18984ca9b385SDavid Hildenbrand /*
18994ca9b385SDavid Hildenbrand * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
19004ca9b385SDavid Hildenbrand * the page dirty with FOLL_WRITE -- which doesn't make a
19014ca9b385SDavid Hildenbrand * difference with !FOLL_FORCE, because the page is writable
19024ca9b385SDavid Hildenbrand * in the page table.
19034ca9b385SDavid Hildenbrand * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
19044ca9b385SDavid Hildenbrand * a poisoned page.
19054ca9b385SDavid Hildenbrand * !FOLL_FORCE: Require proper access permissions.
19064ca9b385SDavid Hildenbrand */
1907631426baSDavid Hildenbrand gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
1908631426baSDavid Hildenbrand FOLL_MADV_POPULATE;
19094ca9b385SDavid Hildenbrand if (write)
19104ca9b385SDavid Hildenbrand gup_flags |= FOLL_WRITE;
19114ca9b385SDavid Hildenbrand
1912631426baSDavid Hildenbrand ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
1913631426baSDavid Hildenbrand gup_flags);
1914ece369c7SHugh Dickins lru_add_drain();
1915ece369c7SHugh Dickins return ret;
19164ca9b385SDavid Hildenbrand }
19174ca9b385SDavid Hildenbrand
19184ca9b385SDavid Hildenbrand /*
1919d3649f68SChristoph Hellwig * __mm_populate - populate and/or mlock pages within a range of address space.
1920d3649f68SChristoph Hellwig *
1921d3649f68SChristoph Hellwig * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1922d3649f68SChristoph Hellwig * flags. VMAs must be already marked with the desired vm_flags, and
1923c1e8d7c6SMichel Lespinasse * mmap_lock must not be held.
1924d3649f68SChristoph Hellwig */
__mm_populate(unsigned long start,unsigned long len,int ignore_errors)1925d3649f68SChristoph Hellwig int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1926d3649f68SChristoph Hellwig {
1927d3649f68SChristoph Hellwig struct mm_struct *mm = current->mm;
1928d3649f68SChristoph Hellwig unsigned long end, nstart, nend;
1929d3649f68SChristoph Hellwig struct vm_area_struct *vma = NULL;
1930d3649f68SChristoph Hellwig int locked = 0;
1931d3649f68SChristoph Hellwig long ret = 0;
1932d3649f68SChristoph Hellwig
1933d3649f68SChristoph Hellwig end = start + len;
1934d3649f68SChristoph Hellwig
1935d3649f68SChristoph Hellwig for (nstart = start; nstart < end; nstart = nend) {
1936d3649f68SChristoph Hellwig /*
1937d3649f68SChristoph Hellwig * We want to fault in pages for [nstart; end) address range.
1938d3649f68SChristoph Hellwig * Find first corresponding VMA.
1939d3649f68SChristoph Hellwig */
1940d3649f68SChristoph Hellwig if (!locked) {
1941d3649f68SChristoph Hellwig locked = 1;
1942d8ed45c5SMichel Lespinasse mmap_read_lock(mm);
1943c4d1a92dSLiam R. Howlett vma = find_vma_intersection(mm, nstart, end);
1944d3649f68SChristoph Hellwig } else if (nstart >= vma->vm_end)
1945c4d1a92dSLiam R. Howlett vma = find_vma_intersection(mm, vma->vm_end, end);
1946c4d1a92dSLiam R. Howlett
1947c4d1a92dSLiam R. Howlett if (!vma)
1948d3649f68SChristoph Hellwig break;
1949d3649f68SChristoph Hellwig /*
1950d3649f68SChristoph Hellwig * Set [nstart; nend) to intersection of desired address
1951d3649f68SChristoph Hellwig * range with the first VMA. Also, skip undesirable VMA types.
1952d3649f68SChristoph Hellwig */
1953d3649f68SChristoph Hellwig nend = min(end, vma->vm_end);
1954d3649f68SChristoph Hellwig if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1955d3649f68SChristoph Hellwig continue;
1956d3649f68SChristoph Hellwig if (nstart < vma->vm_start)
1957d3649f68SChristoph Hellwig nstart = vma->vm_start;
1958d3649f68SChristoph Hellwig /*
1959d3649f68SChristoph Hellwig * Now fault in a range of pages. populate_vma_page_range()
1960d3649f68SChristoph Hellwig * double checks the vma flags, so that it won't mlock pages
1961d3649f68SChristoph Hellwig * if the vma was already munlocked.
1962d3649f68SChristoph Hellwig */
1963d3649f68SChristoph Hellwig ret = populate_vma_page_range(vma, nstart, nend, &locked);
1964d3649f68SChristoph Hellwig if (ret < 0) {
1965d3649f68SChristoph Hellwig if (ignore_errors) {
1966d3649f68SChristoph Hellwig ret = 0;
1967d3649f68SChristoph Hellwig continue; /* continue at next VMA */
1968d3649f68SChristoph Hellwig }
1969d3649f68SChristoph Hellwig break;
1970d3649f68SChristoph Hellwig }
1971d3649f68SChristoph Hellwig nend = nstart + ret * PAGE_SIZE;
1972d3649f68SChristoph Hellwig ret = 0;
1973d3649f68SChristoph Hellwig }
1974d3649f68SChristoph Hellwig if (locked)
1975d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
1976d3649f68SChristoph Hellwig return ret; /* 0 or negative error code */
1977d3649f68SChristoph Hellwig }
1978050a9adcSChristoph Hellwig #else /* CONFIG_MMU */
__get_user_pages_locked(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,struct page ** pages,int * locked,unsigned int foll_flags)197964019a2eSPeter Xu static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
1980050a9adcSChristoph Hellwig unsigned long nr_pages, struct page **pages,
1981b2cac248SLorenzo Stoakes int *locked, unsigned int foll_flags)
1982050a9adcSChristoph Hellwig {
1983050a9adcSChristoph Hellwig struct vm_area_struct *vma;
1984b2a72dffSJason Gunthorpe bool must_unlock = false;
1985bfbe7110SLorenzo Stoakes vm_flags_t vm_flags;
198624dc20c7SPavel Tatashin long i;
1987050a9adcSChristoph Hellwig
1988b2a72dffSJason Gunthorpe if (!nr_pages)
1989b2a72dffSJason Gunthorpe return 0;
1990b2a72dffSJason Gunthorpe
1991b2a72dffSJason Gunthorpe /*
1992b2a72dffSJason Gunthorpe * The internal caller expects GUP to manage the lock internally and the
1993b2a72dffSJason Gunthorpe * lock must be released when this returns.
1994b2a72dffSJason Gunthorpe */
19959a863a6aSJason Gunthorpe if (!*locked) {
1996b2a72dffSJason Gunthorpe if (mmap_read_lock_killable(mm))
1997b2a72dffSJason Gunthorpe return -EAGAIN;
1998b2a72dffSJason Gunthorpe must_unlock = true;
1999b2a72dffSJason Gunthorpe *locked = 1;
2000b2a72dffSJason Gunthorpe }
2001b2a72dffSJason Gunthorpe
2002050a9adcSChristoph Hellwig /* calculate required read or write permissions.
2003050a9adcSChristoph Hellwig * If FOLL_FORCE is set, we only require the "MAY" flags.
2004050a9adcSChristoph Hellwig */
2005050a9adcSChristoph Hellwig vm_flags = (foll_flags & FOLL_WRITE) ?
2006050a9adcSChristoph Hellwig (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
2007050a9adcSChristoph Hellwig vm_flags &= (foll_flags & FOLL_FORCE) ?
2008050a9adcSChristoph Hellwig (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
2009050a9adcSChristoph Hellwig
2010050a9adcSChristoph Hellwig for (i = 0; i < nr_pages; i++) {
2011050a9adcSChristoph Hellwig vma = find_vma(mm, start);
2012050a9adcSChristoph Hellwig if (!vma)
2013b2a72dffSJason Gunthorpe break;
2014050a9adcSChristoph Hellwig
2015050a9adcSChristoph Hellwig /* protect what we can, including chardevs */
2016050a9adcSChristoph Hellwig if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
2017050a9adcSChristoph Hellwig !(vm_flags & vma->vm_flags))
2018b2a72dffSJason Gunthorpe break;
2019050a9adcSChristoph Hellwig
2020050a9adcSChristoph Hellwig if (pages) {
2021396a400bSLinus Walleij pages[i] = virt_to_page((void *)start);
2022050a9adcSChristoph Hellwig if (pages[i])
2023050a9adcSChristoph Hellwig get_page(pages[i]);
2024050a9adcSChristoph Hellwig }
2025b2cac248SLorenzo Stoakes
2026050a9adcSChristoph Hellwig start = (start + PAGE_SIZE) & PAGE_MASK;
2027050a9adcSChristoph Hellwig }
2028050a9adcSChristoph Hellwig
2029b2a72dffSJason Gunthorpe if (must_unlock && *locked) {
2030b2a72dffSJason Gunthorpe mmap_read_unlock(mm);
2031b2a72dffSJason Gunthorpe *locked = 0;
2032b2a72dffSJason Gunthorpe }
2033050a9adcSChristoph Hellwig
2034050a9adcSChristoph Hellwig return i ? : -EFAULT;
2035050a9adcSChristoph Hellwig }
2036050a9adcSChristoph Hellwig #endif /* !CONFIG_MMU */
2037d3649f68SChristoph Hellwig
20388f942eeaSJann Horn /**
2039bb523b40SAndreas Gruenbacher * fault_in_writeable - fault in userspace address range for writing
2040bb523b40SAndreas Gruenbacher * @uaddr: start of address range
2041bb523b40SAndreas Gruenbacher * @size: size of address range
2042bb523b40SAndreas Gruenbacher *
2043bb523b40SAndreas Gruenbacher * Returns the number of bytes not faulted in (like copy_to_user() and
2044bb523b40SAndreas Gruenbacher * copy_from_user()).
2045bb523b40SAndreas Gruenbacher */
fault_in_writeable(char __user * uaddr,size_t size)2046bb523b40SAndreas Gruenbacher size_t fault_in_writeable(char __user *uaddr, size_t size)
2047bb523b40SAndreas Gruenbacher {
2048a7797e74SBaoquan He const unsigned long start = (unsigned long)uaddr;
2049a7797e74SBaoquan He const unsigned long end = start + size;
2050a7797e74SBaoquan He unsigned long cur;
2051bb523b40SAndreas Gruenbacher
2052bb523b40SAndreas Gruenbacher if (unlikely(size == 0))
2053bb523b40SAndreas Gruenbacher return 0;
2054677b2a8cSChristophe Leroy if (!user_write_access_begin(uaddr, size))
2055bb523b40SAndreas Gruenbacher return size;
2056bb523b40SAndreas Gruenbacher
2057a7797e74SBaoquan He /* Stop once we overflow to 0. */
2058a7797e74SBaoquan He for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
2059a7797e74SBaoquan He unsafe_put_user(0, (char __user *)cur, out);
2060bb523b40SAndreas Gruenbacher out:
2061677b2a8cSChristophe Leroy user_write_access_end();
2062a7797e74SBaoquan He if (size > cur - start)
2063a7797e74SBaoquan He return size - (cur - start);
2064bb523b40SAndreas Gruenbacher return 0;
2065bb523b40SAndreas Gruenbacher }
2066bb523b40SAndreas Gruenbacher EXPORT_SYMBOL(fault_in_writeable);
2067bb523b40SAndreas Gruenbacher
2068da32b581SCatalin Marinas /**
2069da32b581SCatalin Marinas * fault_in_subpage_writeable - fault in an address range for writing
2070da32b581SCatalin Marinas * @uaddr: start of address range
2071da32b581SCatalin Marinas * @size: size of address range
2072da32b581SCatalin Marinas *
2073da32b581SCatalin Marinas * Fault in a user address range for writing while checking for permissions at
2074da32b581SCatalin Marinas * sub-page granularity (e.g. arm64 MTE). This function should be used when
2075da32b581SCatalin Marinas * the caller cannot guarantee forward progress of a copy_to_user() loop.
2076da32b581SCatalin Marinas *
2077da32b581SCatalin Marinas * Returns the number of bytes not faulted in (like copy_to_user() and
2078da32b581SCatalin Marinas * copy_from_user()).
2079da32b581SCatalin Marinas */
fault_in_subpage_writeable(char __user * uaddr,size_t size)2080da32b581SCatalin Marinas size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
2081da32b581SCatalin Marinas {
2082da32b581SCatalin Marinas size_t faulted_in;
2083da32b581SCatalin Marinas
2084da32b581SCatalin Marinas /*
2085da32b581SCatalin Marinas * Attempt faulting in at page granularity first for page table
2086da32b581SCatalin Marinas * permission checking. The arch-specific probe_subpage_writeable()
2087da32b581SCatalin Marinas * functions may not check for this.
2088da32b581SCatalin Marinas */
2089da32b581SCatalin Marinas faulted_in = size - fault_in_writeable(uaddr, size);
2090da32b581SCatalin Marinas if (faulted_in)
2091da32b581SCatalin Marinas faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
2092da32b581SCatalin Marinas
2093da32b581SCatalin Marinas return size - faulted_in;
2094da32b581SCatalin Marinas }
2095da32b581SCatalin Marinas EXPORT_SYMBOL(fault_in_subpage_writeable);
2096da32b581SCatalin Marinas
2097cdd591fcSAndreas Gruenbacher /*
2098cdd591fcSAndreas Gruenbacher * fault_in_safe_writeable - fault in an address range for writing
2099cdd591fcSAndreas Gruenbacher * @uaddr: start of address range
2100cdd591fcSAndreas Gruenbacher * @size: length of address range
2101cdd591fcSAndreas Gruenbacher *
2102fe673d3fSLinus Torvalds * Faults in an address range for writing. This is primarily useful when we
2103fe673d3fSLinus Torvalds * already know that some or all of the pages in the address range aren't in
2104fe673d3fSLinus Torvalds * memory.
2105cdd591fcSAndreas Gruenbacher *
2106fe673d3fSLinus Torvalds * Unlike fault_in_writeable(), this function is non-destructive.
2107cdd591fcSAndreas Gruenbacher *
2108cdd591fcSAndreas Gruenbacher * Note that we don't pin or otherwise hold the pages referenced that we fault
2109cdd591fcSAndreas Gruenbacher * in. There's no guarantee that they'll stay in memory for any duration of
2110cdd591fcSAndreas Gruenbacher * time.
2111cdd591fcSAndreas Gruenbacher *
2112cdd591fcSAndreas Gruenbacher * Returns the number of bytes not faulted in, like copy_to_user() and
2113cdd591fcSAndreas Gruenbacher * copy_from_user().
2114cdd591fcSAndreas Gruenbacher */
fault_in_safe_writeable(const char __user * uaddr,size_t size)2115cdd591fcSAndreas Gruenbacher size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
2116cdd591fcSAndreas Gruenbacher {
2117a7797e74SBaoquan He const unsigned long start = (unsigned long)uaddr;
2118a7797e74SBaoquan He const unsigned long end = start + size;
2119a7797e74SBaoquan He unsigned long cur;
2120cdd591fcSAndreas Gruenbacher struct mm_struct *mm = current->mm;
2121fe673d3fSLinus Torvalds bool unlocked = false;
2122cdd591fcSAndreas Gruenbacher
2123fe673d3fSLinus Torvalds if (unlikely(size == 0))
2124cdd591fcSAndreas Gruenbacher return 0;
2125fe673d3fSLinus Torvalds
2126fe673d3fSLinus Torvalds mmap_read_lock(mm);
2127a7797e74SBaoquan He /* Stop once we overflow to 0. */
2128a7797e74SBaoquan He for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
2129a7797e74SBaoquan He if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked))
2130fe673d3fSLinus Torvalds break;
2131fe673d3fSLinus Torvalds mmap_read_unlock(mm);
2132fe673d3fSLinus Torvalds
2133a7797e74SBaoquan He if (size > cur - start)
2134a7797e74SBaoquan He return size - (cur - start);
2135fe673d3fSLinus Torvalds return 0;
2136cdd591fcSAndreas Gruenbacher }
2137cdd591fcSAndreas Gruenbacher EXPORT_SYMBOL(fault_in_safe_writeable);
2138cdd591fcSAndreas Gruenbacher
2139bb523b40SAndreas Gruenbacher /**
2140bb523b40SAndreas Gruenbacher * fault_in_readable - fault in userspace address range for reading
2141bb523b40SAndreas Gruenbacher * @uaddr: start of user address range
2142bb523b40SAndreas Gruenbacher * @size: size of user address range
2143bb523b40SAndreas Gruenbacher *
2144bb523b40SAndreas Gruenbacher * Returns the number of bytes not faulted in (like copy_to_user() and
2145bb523b40SAndreas Gruenbacher * copy_from_user()).
2146bb523b40SAndreas Gruenbacher */
fault_in_readable(const char __user * uaddr,size_t size)2147bb523b40SAndreas Gruenbacher size_t fault_in_readable(const char __user *uaddr, size_t size)
2148bb523b40SAndreas Gruenbacher {
2149a7797e74SBaoquan He const unsigned long start = (unsigned long)uaddr;
2150a7797e74SBaoquan He const unsigned long end = start + size;
2151a7797e74SBaoquan He unsigned long cur;
2152bb523b40SAndreas Gruenbacher volatile char c;
2153bb523b40SAndreas Gruenbacher
2154bb523b40SAndreas Gruenbacher if (unlikely(size == 0))
2155bb523b40SAndreas Gruenbacher return 0;
2156677b2a8cSChristophe Leroy if (!user_read_access_begin(uaddr, size))
2157bb523b40SAndreas Gruenbacher return size;
2158bb523b40SAndreas Gruenbacher
2159a7797e74SBaoquan He /* Stop once we overflow to 0. */
2160a7797e74SBaoquan He for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
2161a7797e74SBaoquan He unsafe_get_user(c, (const char __user *)cur, out);
2162bb523b40SAndreas Gruenbacher out:
2163677b2a8cSChristophe Leroy user_read_access_end();
2164bb523b40SAndreas Gruenbacher (void)c;
2165a7797e74SBaoquan He if (size > cur - start)
2166a7797e74SBaoquan He return size - (cur - start);
2167bb523b40SAndreas Gruenbacher return 0;
2168bb523b40SAndreas Gruenbacher }
2169bb523b40SAndreas Gruenbacher EXPORT_SYMBOL(fault_in_readable);
2170bb523b40SAndreas Gruenbacher
/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 * @locked: a pointer to an int denoting whether the mmap sem is held
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save disk space.
 *
 * Locking is delegated to __get_user_pages_locked(), which receives @locked
 * unchanged.
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr, int *locked)
{
	struct page *page;
	int nr;

	nr = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
				     FOLL_FORCE | FOLL_DUMP | FOLL_GET);
	/* Anything other than exactly one page is treated as a hole. */
	if (nr != 1)
		return NULL;

	return page;
}
#endif /* CONFIG_ELF_CORE */
21978f942eeaSJann Horn
2198d1e153feSPavel Tatashin #ifdef CONFIG_MIGRATION
219994efde1dSJohn Hubbard
220094efde1dSJohn Hubbard /*
220194efde1dSJohn Hubbard * An array of either pages or folios ("pofs"). Although it may seem tempting to
220294efde1dSJohn Hubbard * avoid this complication, by simply interpreting a list of folios as a list of
220394efde1dSJohn Hubbard * pages, that approach won't work in the longer term, because eventually the
220494efde1dSJohn Hubbard * layouts of struct page and struct folio will become completely different.
220594efde1dSJohn Hubbard * Furthermore, this pof approach avoids excessive page_folio() calls.
220694efde1dSJohn Hubbard */
220794efde1dSJohn Hubbard struct pages_or_folios {
220894efde1dSJohn Hubbard union {
220994efde1dSJohn Hubbard struct page **pages;
221094efde1dSJohn Hubbard struct folio **folios;
221194efde1dSJohn Hubbard void **entries;
221294efde1dSJohn Hubbard };
221394efde1dSJohn Hubbard bool has_folios;
221494efde1dSJohn Hubbard long nr_entries;
221594efde1dSJohn Hubbard };
221694efde1dSJohn Hubbard
pofs_get_folio(struct pages_or_folios * pofs,long i)221794efde1dSJohn Hubbard static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
221894efde1dSJohn Hubbard {
221994efde1dSJohn Hubbard if (pofs->has_folios)
222094efde1dSJohn Hubbard return pofs->folios[i];
222194efde1dSJohn Hubbard return page_folio(pofs->pages[i]);
222294efde1dSJohn Hubbard }
222394efde1dSJohn Hubbard
pofs_clear_entry(struct pages_or_folios * pofs,long i)222494efde1dSJohn Hubbard static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
222594efde1dSJohn Hubbard {
222694efde1dSJohn Hubbard pofs->entries[i] = NULL;
222794efde1dSJohn Hubbard }
222894efde1dSJohn Hubbard
pofs_unpin(struct pages_or_folios * pofs)222994efde1dSJohn Hubbard static void pofs_unpin(struct pages_or_folios *pofs)
223094efde1dSJohn Hubbard {
223194efde1dSJohn Hubbard if (pofs->has_folios)
223294efde1dSJohn Hubbard unpin_folios(pofs->folios, pofs->nr_entries);
223394efde1dSJohn Hubbard else
223494efde1dSJohn Hubbard unpin_user_pages(pofs->pages, pofs->nr_entries);
223594efde1dSJohn Hubbard }
223694efde1dSJohn Hubbard
pofs_next_folio(struct folio * folio,struct pages_or_folios * pofs,long * index_ptr)2237a03db236SLi Zhe static struct folio *pofs_next_folio(struct folio *folio,
2238a03db236SLi Zhe struct pages_or_folios *pofs, long *index_ptr)
2239a03db236SLi Zhe {
2240a03db236SLi Zhe long i = *index_ptr + 1;
2241a03db236SLi Zhe
2242a03db236SLi Zhe if (!pofs->has_folios && folio_test_large(folio)) {
2243a03db236SLi Zhe const unsigned long start_pfn = folio_pfn(folio);
2244a03db236SLi Zhe const unsigned long end_pfn = start_pfn + folio_nr_pages(folio);
2245a03db236SLi Zhe
2246a03db236SLi Zhe for (; i < pofs->nr_entries; i++) {
2247a03db236SLi Zhe unsigned long pfn = page_to_pfn(pofs->pages[i]);
2248a03db236SLi Zhe
2249a03db236SLi Zhe /* Is this page part of this folio? */
2250a03db236SLi Zhe if (pfn < start_pfn || pfn >= end_pfn)
2251a03db236SLi Zhe break;
2252a03db236SLi Zhe }
2253a03db236SLi Zhe }
2254a03db236SLi Zhe
2255a03db236SLi Zhe if (unlikely(i == pofs->nr_entries))
2256a03db236SLi Zhe return NULL;
2257a03db236SLi Zhe *index_ptr = i;
2258a03db236SLi Zhe
2259a03db236SLi Zhe return pofs_get_folio(pofs, i);
2260a03db236SLi Zhe }
2261a03db236SLi Zhe
2262f68749ecSPavel Tatashin /*
226353ba78deSVivek Kasireddy * Returns the number of collected folios. Return value is always >= 0.
2264f68749ecSPavel Tatashin */
collect_longterm_unpinnable_folios(struct list_head * movable_folio_list,struct pages_or_folios * pofs)2265517f496eSDavid Hildenbrand static unsigned long collect_longterm_unpinnable_folios(
226653ba78deSVivek Kasireddy struct list_head *movable_folio_list,
226794efde1dSJohn Hubbard struct pages_or_folios *pofs)
22689a4e9f3bSAneesh Kumar K.V {
2269a03db236SLi Zhe unsigned long collected = 0;
2270a03db236SLi Zhe struct folio *folio;
2271a09a8a1fSHugh Dickins int drained = 0;
2272a03db236SLi Zhe long i = 0;
22739a4e9f3bSAneesh Kumar K.V
2274a03db236SLi Zhe for (folio = pofs_get_folio(pofs, i); folio;
2275a03db236SLi Zhe folio = pofs_next_folio(folio, pofs, &i)) {
2276f9f38f78SChristoph Hellwig
22776077c943SAlex Sierra if (folio_is_longterm_pinnable(folio))
2278f9f38f78SChristoph Hellwig continue;
227967e139b0SAlistair Popple
2280517f496eSDavid Hildenbrand collected++;
2281517f496eSDavid Hildenbrand
228267e139b0SAlistair Popple if (folio_is_device_coherent(folio))
228367e139b0SAlistair Popple continue;
228467e139b0SAlistair Popple
22851b7f7e58SMatthew Wilcox (Oracle) if (folio_test_hugetlb(folio)) {
22864c640f12SDavid Hildenbrand folio_isolate_hugetlb(folio, movable_folio_list);
2287f9f38f78SChristoph Hellwig continue;
2288f9f38f78SChristoph Hellwig }
2289f9f38f78SChristoph Hellwig
22902da6de30SHugh Dickins if (drained == 0 && folio_may_be_lru_cached(folio) &&
2291a09a8a1fSHugh Dickins folio_ref_count(folio) !=
2292a09a8a1fSHugh Dickins folio_expected_ref_count(folio) + 1) {
2293a09a8a1fSHugh Dickins lru_add_drain();
2294a09a8a1fSHugh Dickins drained = 1;
2295a09a8a1fSHugh Dickins }
22962da6de30SHugh Dickins if (drained == 1 && folio_may_be_lru_cached(folio) &&
2297a09a8a1fSHugh Dickins folio_ref_count(folio) !=
229898c6d259SHugh Dickins folio_expected_ref_count(folio) + 1) {
22999a4e9f3bSAneesh Kumar K.V lru_add_drain_all();
2300a09a8a1fSHugh Dickins drained = 2;
23019a4e9f3bSAneesh Kumar K.V }
23029a4e9f3bSAneesh Kumar K.V
2303be2d5756SBaolin Wang if (!folio_isolate_lru(folio))
23046e7f34ebSPavel Tatashin continue;
230567e139b0SAlistair Popple
230653ba78deSVivek Kasireddy list_add_tail(&folio->lru, movable_folio_list);
23071b7f7e58SMatthew Wilcox (Oracle) node_stat_mod_folio(folio,
23081b7f7e58SMatthew Wilcox (Oracle) NR_ISOLATED_ANON + folio_is_file_lru(folio),
23091b7f7e58SMatthew Wilcox (Oracle) folio_nr_pages(folio));
23109a4e9f3bSAneesh Kumar K.V }
2311517f496eSDavid Hildenbrand
2312517f496eSDavid Hildenbrand return collected;
2313f68749ecSPavel Tatashin }
2314f9f38f78SChristoph Hellwig
231567e139b0SAlistair Popple /*
231653ba78deSVivek Kasireddy * Unpins all folios and migrates device coherent folios and movable_folio_list.
231753ba78deSVivek Kasireddy * Returns -EAGAIN if all folios were successfully migrated or -errno for
231853ba78deSVivek Kasireddy * failure (or partial success).
231967e139b0SAlistair Popple */
232094efde1dSJohn Hubbard static int
migrate_longterm_unpinnable_folios(struct list_head * movable_folio_list,struct pages_or_folios * pofs)232194efde1dSJohn Hubbard migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list,
232294efde1dSJohn Hubbard struct pages_or_folios *pofs)
232367e139b0SAlistair Popple {
232467e139b0SAlistair Popple int ret;
232567e139b0SAlistair Popple unsigned long i;
232667e139b0SAlistair Popple
232794efde1dSJohn Hubbard for (i = 0; i < pofs->nr_entries; i++) {
232894efde1dSJohn Hubbard struct folio *folio = pofs_get_folio(pofs, i);
232967e139b0SAlistair Popple
233067e139b0SAlistair Popple if (folio_is_device_coherent(folio)) {
233167e139b0SAlistair Popple /*
233253ba78deSVivek Kasireddy * Migration will fail if the folio is pinned, so
233353ba78deSVivek Kasireddy * convert the pin on the source folio to a normal
233453ba78deSVivek Kasireddy * reference.
233567e139b0SAlistair Popple */
233694efde1dSJohn Hubbard pofs_clear_entry(pofs, i);
233767e139b0SAlistair Popple folio_get(folio);
233867e139b0SAlistair Popple gup_put_folio(folio, 1, FOLL_PIN);
233967e139b0SAlistair Popple
23405c8525a3SKefeng Wang if (migrate_device_coherent_folio(folio)) {
234167e139b0SAlistair Popple ret = -EBUSY;
234267e139b0SAlistair Popple goto err;
234367e139b0SAlistair Popple }
234467e139b0SAlistair Popple
234567e139b0SAlistair Popple continue;
234667e139b0SAlistair Popple }
234767e139b0SAlistair Popple
234867e139b0SAlistair Popple /*
234953ba78deSVivek Kasireddy * We can't migrate folios with unexpected references, so drop
235067e139b0SAlistair Popple * the reference obtained by __get_user_pages_locked().
235153ba78deSVivek Kasireddy * Migrating folios have been added to movable_folio_list after
235267e139b0SAlistair Popple * calling folio_isolate_lru() which takes a reference so the
235353ba78deSVivek Kasireddy * folio won't be freed if it's migrating.
235467e139b0SAlistair Popple */
235594efde1dSJohn Hubbard unpin_folio(folio);
235694efde1dSJohn Hubbard pofs_clear_entry(pofs, i);
235767e139b0SAlistair Popple }
235867e139b0SAlistair Popple
235953ba78deSVivek Kasireddy if (!list_empty(movable_folio_list)) {
2360f9f38f78SChristoph Hellwig struct migration_target_control mtc = {
2361f9f38f78SChristoph Hellwig .nid = NUMA_NO_NODE,
2362f9f38f78SChristoph Hellwig .gfp_mask = GFP_USER | __GFP_NOWARN,
2363e42dfe4eSBaolin Wang .reason = MR_LONGTERM_PIN,
2364f9f38f78SChristoph Hellwig };
2365f9f38f78SChristoph Hellwig
236653ba78deSVivek Kasireddy if (migrate_pages(movable_folio_list, alloc_migration_target,
2367f0f44638SPavel Tatashin NULL, (unsigned long)&mtc, MIGRATE_SYNC,
236867e139b0SAlistair Popple MR_LONGTERM_PIN, NULL)) {
2369f9f38f78SChristoph Hellwig ret = -ENOMEM;
237067e139b0SAlistair Popple goto err;
237167e139b0SAlistair Popple }
2372f68749ecSPavel Tatashin }
2373f68749ecSPavel Tatashin
237453ba78deSVivek Kasireddy putback_movable_pages(movable_folio_list);
237524a95998SAlistair Popple
237667e139b0SAlistair Popple return -EAGAIN;
237767e139b0SAlistair Popple
237867e139b0SAlistair Popple err:
237994efde1dSJohn Hubbard pofs_unpin(pofs);
238053ba78deSVivek Kasireddy putback_movable_pages(movable_folio_list);
238167e139b0SAlistair Popple
238267e139b0SAlistair Popple return ret;
238367e139b0SAlistair Popple }
238467e139b0SAlistair Popple
238594efde1dSJohn Hubbard static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios * pofs)238694efde1dSJohn Hubbard check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
238794efde1dSJohn Hubbard {
238894efde1dSJohn Hubbard LIST_HEAD(movable_folio_list);
2389517f496eSDavid Hildenbrand unsigned long collected;
239094efde1dSJohn Hubbard
2391517f496eSDavid Hildenbrand collected = collect_longterm_unpinnable_folios(&movable_folio_list,
2392517f496eSDavid Hildenbrand pofs);
2393517f496eSDavid Hildenbrand if (!collected)
239494efde1dSJohn Hubbard return 0;
239594efde1dSJohn Hubbard
239694efde1dSJohn Hubbard return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
239794efde1dSJohn Hubbard }
239894efde1dSJohn Hubbard
239967e139b0SAlistair Popple /*
240053ba78deSVivek Kasireddy * Check whether all folios are *allowed* to be pinned indefinitely (long term).
240153ba78deSVivek Kasireddy * Rather confusingly, all folios in the range are required to be pinned via
240253ba78deSVivek Kasireddy * FOLL_PIN, before calling this routine.
240367e139b0SAlistair Popple *
2404aa6f8b25SJohn Hubbard * Return values:
240567e139b0SAlistair Popple *
2406aa6f8b25SJohn Hubbard * 0: if everything is OK and all folios in the range are allowed to be pinned,
240753ba78deSVivek Kasireddy * then this routine leaves all folios pinned and returns zero for success.
2408aa6f8b25SJohn Hubbard *
2409aa6f8b25SJohn Hubbard * -EAGAIN: if any folios in the range are not allowed to be pinned, then this
2410aa6f8b25SJohn Hubbard * routine will migrate those folios away, unpin all the folios in the range. If
2411aa6f8b25SJohn Hubbard * migration of the entire set of folios succeeds, then -EAGAIN is returned. The
2412aa6f8b25SJohn Hubbard * caller should re-pin the entire range with FOLL_PIN and then call this
2413aa6f8b25SJohn Hubbard * routine again.
2414aa6f8b25SJohn Hubbard *
2415aa6f8b25SJohn Hubbard * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this
2416aa6f8b25SJohn Hubbard * indicates a migration failure. The caller should give up, and propagate the
2417aa6f8b25SJohn Hubbard * error back up the call stack. The caller does not need to unpin any folios in
2418aa6f8b25SJohn Hubbard * that case, because this routine will do the unpinning.
241953ba78deSVivek Kasireddy */
check_and_migrate_movable_folios(unsigned long nr_folios,struct folio ** folios)242053ba78deSVivek Kasireddy static long check_and_migrate_movable_folios(unsigned long nr_folios,
242153ba78deSVivek Kasireddy struct folio **folios)
242253ba78deSVivek Kasireddy {
242394efde1dSJohn Hubbard struct pages_or_folios pofs = {
242494efde1dSJohn Hubbard .folios = folios,
242594efde1dSJohn Hubbard .has_folios = true,
242694efde1dSJohn Hubbard .nr_entries = nr_folios,
242794efde1dSJohn Hubbard };
242853ba78deSVivek Kasireddy
242994efde1dSJohn Hubbard return check_and_migrate_movable_pages_or_folios(&pofs);
243053ba78deSVivek Kasireddy }
243153ba78deSVivek Kasireddy
243253ba78deSVivek Kasireddy /*
2433aa6f8b25SJohn Hubbard * Return values and behavior are the same as those for
2434aa6f8b25SJohn Hubbard * check_and_migrate_movable_folios().
243567e139b0SAlistair Popple */
check_and_migrate_movable_pages(unsigned long nr_pages,struct page ** pages)243667e139b0SAlistair Popple static long check_and_migrate_movable_pages(unsigned long nr_pages,
243767e139b0SAlistair Popple struct page **pages)
243867e139b0SAlistair Popple {
243994efde1dSJohn Hubbard struct pages_or_folios pofs = {
244094efde1dSJohn Hubbard .pages = pages,
244194efde1dSJohn Hubbard .has_folios = false,
244294efde1dSJohn Hubbard .nr_entries = nr_pages,
244394efde1dSJohn Hubbard };
244467e139b0SAlistair Popple
244594efde1dSJohn Hubbard return check_and_migrate_movable_pages_or_folios(&pofs);
24469a4e9f3bSAneesh Kumar K.V }
24479a4e9f3bSAneesh Kumar K.V #else
/*
 * Stub for builds without CONFIG_MIGRATION: nothing is checked or migrated
 * and success (0) is always reported.
 */
static long check_and_migrate_movable_pages(unsigned long nr_pages,
					    struct page **pages)
{
	return 0;
}
245353ba78deSVivek Kasireddy
/*
 * Stub for builds without CONFIG_MIGRATION: nothing is checked or migrated
 * and success (0) is always reported.
 */
static long check_and_migrate_movable_folios(unsigned long nr_folios,
					     struct folio **folios)
{
	return 0;
}
2459d1e153feSPavel Tatashin #endif /* CONFIG_MIGRATION */
24609a4e9f3bSAneesh Kumar K.V
24612bb6d283SDan Williams /*
2462932f4a63SIra Weiny * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
2463932f4a63SIra Weiny * allows us to process the FOLL_LONGTERM flag.
24642bb6d283SDan Williams */
__gup_longterm_locked(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,struct page ** pages,int * locked,unsigned int gup_flags)246564019a2eSPeter Xu static long __gup_longterm_locked(struct mm_struct *mm,
2466932f4a63SIra Weiny unsigned long start,
2467932f4a63SIra Weiny unsigned long nr_pages,
2468932f4a63SIra Weiny struct page **pages,
246953b2d09bSJason Gunthorpe int *locked,
2470932f4a63SIra Weiny unsigned int gup_flags)
24712bb6d283SDan Williams {
2472f68749ecSPavel Tatashin unsigned int flags;
247324a95998SAlistair Popple long rc, nr_pinned_pages;
24742bb6d283SDan Williams
2475f68749ecSPavel Tatashin if (!(gup_flags & FOLL_LONGTERM))
2476b2cac248SLorenzo Stoakes return __get_user_pages_locked(mm, start, nr_pages, pages,
247753b2d09bSJason Gunthorpe locked, gup_flags);
247867e139b0SAlistair Popple
24791a08ae36SPavel Tatashin flags = memalloc_pin_save();
2480f68749ecSPavel Tatashin do {
248124a95998SAlistair Popple nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
2482b2cac248SLorenzo Stoakes pages, locked,
248324a95998SAlistair Popple gup_flags);
248424a95998SAlistair Popple if (nr_pinned_pages <= 0) {
248524a95998SAlistair Popple rc = nr_pinned_pages;
2486f68749ecSPavel Tatashin break;
248724a95998SAlistair Popple }
2488d64e2dbcSJason Gunthorpe
2489d64e2dbcSJason Gunthorpe /* FOLL_LONGTERM implies FOLL_PIN */
2490f6d299ecSAlistair Popple rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
249124a95998SAlistair Popple } while (rc == -EAGAIN);
24921a08ae36SPavel Tatashin memalloc_pin_restore(flags);
249324a95998SAlistair Popple return rc ? rc : nr_pinned_pages;
24942bb6d283SDan Williams }
2495932f4a63SIra Weiny
2496d64e2dbcSJason Gunthorpe /*
2497d64e2dbcSJason Gunthorpe * Check that the given flags are valid for the exported gup/pup interface, and
2498d64e2dbcSJason Gunthorpe * update them with the required flags that the caller must have set.
2499d64e2dbcSJason Gunthorpe */
is_valid_gup_args(struct page ** pages,int * locked,unsigned int * gup_flags_p,unsigned int to_set)2500b2cac248SLorenzo Stoakes static bool is_valid_gup_args(struct page **pages, int *locked,
2501b2cac248SLorenzo Stoakes unsigned int *gup_flags_p, unsigned int to_set)
2502447f3e45SBarry Song {
2503d64e2dbcSJason Gunthorpe unsigned int gup_flags = *gup_flags_p;
2504d64e2dbcSJason Gunthorpe
2505447f3e45SBarry Song /*
2506d64e2dbcSJason Gunthorpe * These flags not allowed to be specified externally to the gup
2507d64e2dbcSJason Gunthorpe * interfaces:
25080f20bba1SLorenzo Stoakes * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
25097290840dSDavid Hildenbrand * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote()
2510f04740f5SJason Gunthorpe * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
2511447f3e45SBarry Song */
25120f20bba1SLorenzo Stoakes if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS))
2513447f3e45SBarry Song return false;
2514447f3e45SBarry Song
2515d64e2dbcSJason Gunthorpe gup_flags |= to_set;
2516f04740f5SJason Gunthorpe if (locked) {
2517f04740f5SJason Gunthorpe /* At the external interface locked must be set */
2518f04740f5SJason Gunthorpe if (WARN_ON_ONCE(*locked != 1))
2519f04740f5SJason Gunthorpe return false;
2520f04740f5SJason Gunthorpe
2521f04740f5SJason Gunthorpe gup_flags |= FOLL_UNLOCKABLE;
2522f04740f5SJason Gunthorpe }
2523d64e2dbcSJason Gunthorpe
2524d64e2dbcSJason Gunthorpe /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2525d64e2dbcSJason Gunthorpe if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
2526d64e2dbcSJason Gunthorpe (FOLL_PIN | FOLL_GET)))
2527d64e2dbcSJason Gunthorpe return false;
2528d64e2dbcSJason Gunthorpe
2529d64e2dbcSJason Gunthorpe /* LONGTERM can only be specified when pinning */
2530d64e2dbcSJason Gunthorpe if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM)))
2531d64e2dbcSJason Gunthorpe return false;
2532d64e2dbcSJason Gunthorpe
2533d64e2dbcSJason Gunthorpe /* Pages input must be given if using GET/PIN */
2534d64e2dbcSJason Gunthorpe if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages))
2535d64e2dbcSJason Gunthorpe return false;
2536d64e2dbcSJason Gunthorpe
2537d64e2dbcSJason Gunthorpe /* We want to allow the pgmap to be hot-unplugged at all times */
2538d64e2dbcSJason Gunthorpe if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) &&
2539d64e2dbcSJason Gunthorpe (gup_flags & FOLL_PCI_P2PDMA)))
2540d64e2dbcSJason Gunthorpe return false;
2541d64e2dbcSJason Gunthorpe
2542d64e2dbcSJason Gunthorpe *gup_flags_p = gup_flags;
2543447f3e45SBarry Song return true;
2544447f3e45SBarry Song }
2545447f3e45SBarry Song
254622bf29b6SJohn Hubbard #ifdef CONFIG_MMU
2547adc8cb40SSouptick Joarder /**
2548c4237f8bSJohn Hubbard * get_user_pages_remote() - pin user pages in memory
2549c4237f8bSJohn Hubbard * @mm: mm_struct of target mm
2550c4237f8bSJohn Hubbard * @start: starting user address
2551c4237f8bSJohn Hubbard * @nr_pages: number of pages from start to pin
2552c4237f8bSJohn Hubbard * @gup_flags: flags modifying lookup behaviour
2553c4237f8bSJohn Hubbard * @pages: array that receives pointers to the pages pinned.
2554c4237f8bSJohn Hubbard * Should be at least nr_pages long. Or NULL, if caller
2555c4237f8bSJohn Hubbard * only intends to ensure the pages are faulted in.
2556c4237f8bSJohn Hubbard * @locked: pointer to lock flag indicating whether lock is held and
2557c4237f8bSJohn Hubbard * subsequently whether VM_FAULT_RETRY functionality can be
2558c4237f8bSJohn Hubbard * utilised. Lock must initially be held.
2559c4237f8bSJohn Hubbard *
2560c4237f8bSJohn Hubbard * Returns either number of pages pinned (which may be less than the
2561c4237f8bSJohn Hubbard * number requested), or an error. Details about the return value:
2562c4237f8bSJohn Hubbard *
2563c4237f8bSJohn Hubbard * -- If nr_pages is 0, returns 0.
2564c4237f8bSJohn Hubbard * -- If nr_pages is >0, but no pages were pinned, returns -errno.
2565c4237f8bSJohn Hubbard * -- If nr_pages is >0, and some pages were pinned, returns the number of
2566c4237f8bSJohn Hubbard * pages pinned. Again, this may be less than nr_pages.
2567c4237f8bSJohn Hubbard *
2568c4237f8bSJohn Hubbard * The caller is responsible for releasing returned @pages, via put_page().
2569c4237f8bSJohn Hubbard *
2570c1e8d7c6SMichel Lespinasse * Must be called with mmap_lock held for read or write.
2571c4237f8bSJohn Hubbard *
2572adc8cb40SSouptick Joarder * get_user_pages_remote walks a process's page tables and takes a reference
2573adc8cb40SSouptick Joarder * to each struct page that each user address corresponds to at a given
2574c4237f8bSJohn Hubbard * instant. That is, it takes the page that would be accessed if a user
2575c4237f8bSJohn Hubbard * thread accesses the given user virtual address at that instant.
2576c4237f8bSJohn Hubbard *
2577c4237f8bSJohn Hubbard * This does not guarantee that the page exists in the user mappings when
2578adc8cb40SSouptick Joarder * get_user_pages_remote returns, and there may even be a completely different
2579c4237f8bSJohn Hubbard * page there in some cases (eg. if mmapped pagecache has been invalidated
25805da1a868SJingyu Wang * and subsequently re-faulted). However it does guarantee that the page
2581c4237f8bSJohn Hubbard * won't be freed completely. And mostly callers simply care that the page
2582c4237f8bSJohn Hubbard * contains data that was valid *at some point in time*. Typically, an IO
2583c4237f8bSJohn Hubbard * or similar operation cannot guarantee anything stronger anyway because
2584c4237f8bSJohn Hubbard * locks can't be held over the syscall boundary.
2585c4237f8bSJohn Hubbard *
2586c4237f8bSJohn Hubbard * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
2587c4237f8bSJohn Hubbard * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
2588c4237f8bSJohn Hubbard * be called after the page is finished with, and before put_page is called.
2589c4237f8bSJohn Hubbard *
2590adc8cb40SSouptick Joarder * get_user_pages_remote is typically used for fewer-copy IO operations,
2591adc8cb40SSouptick Joarder * to get a handle on the memory by some means other than accesses
2592adc8cb40SSouptick Joarder * via the user virtual addresses. The pages may be submitted for
2593adc8cb40SSouptick Joarder * DMA to devices or accessed via their kernel linear mapping (via the
2594adc8cb40SSouptick Joarder * kmap APIs). Care should be taken to use the correct cache flushing APIs.
2595c4237f8bSJohn Hubbard *
2596c4237f8bSJohn Hubbard * See also get_user_pages_fast, for performance critical applications.
2597c4237f8bSJohn Hubbard *
2598adc8cb40SSouptick Joarder * get_user_pages_remote should be phased out in favor of
2599c4237f8bSJohn Hubbard * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
2600adc8cb40SSouptick Joarder * should use get_user_pages_remote because it cannot pass
2601c4237f8bSJohn Hubbard * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
2602c4237f8bSJohn Hubbard */
get_user_pages_remote(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages,int * locked)260364019a2eSPeter Xu long get_user_pages_remote(struct mm_struct *mm,
2604c4237f8bSJohn Hubbard unsigned long start, unsigned long nr_pages,
2605c4237f8bSJohn Hubbard unsigned int gup_flags, struct page **pages,
2606ca5e8632SLorenzo Stoakes int *locked)
2607c4237f8bSJohn Hubbard {
26089a863a6aSJason Gunthorpe int local_locked = 1;
26099a863a6aSJason Gunthorpe
2610b2cac248SLorenzo Stoakes if (!is_valid_gup_args(pages, locked, &gup_flags,
2611d64e2dbcSJason Gunthorpe FOLL_TOUCH | FOLL_REMOTE))
2612eddb1c22SJohn Hubbard return -EINVAL;
2613eddb1c22SJohn Hubbard
2614b2cac248SLorenzo Stoakes return __get_user_pages_locked(mm, start, nr_pages, pages,
26159a863a6aSJason Gunthorpe locked ? locked : &local_locked,
2616d64e2dbcSJason Gunthorpe gup_flags);
2617c4237f8bSJohn Hubbard }
2618c4237f8bSJohn Hubbard EXPORT_SYMBOL(get_user_pages_remote);
2619c4237f8bSJohn Hubbard
2620eddb1c22SJohn Hubbard #else /* CONFIG_MMU */
/* Variant for builds without CONFIG_MMU: always returns 0. */
long get_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	return 0;
}
2628eddb1c22SJohn Hubbard #endif /* !CONFIG_MMU */
2629eddb1c22SJohn Hubbard
2630adc8cb40SSouptick Joarder /**
2631adc8cb40SSouptick Joarder * get_user_pages() - pin user pages in memory
2632adc8cb40SSouptick Joarder * @start: starting user address
2633adc8cb40SSouptick Joarder * @nr_pages: number of pages from start to pin
2634adc8cb40SSouptick Joarder * @gup_flags: flags modifying lookup behaviour
2635adc8cb40SSouptick Joarder * @pages: array that receives pointers to the pages pinned.
2636adc8cb40SSouptick Joarder * Should be at least nr_pages long. Or NULL, if caller
2637adc8cb40SSouptick Joarder * only intends to ensure the pages are faulted in.
2638adc8cb40SSouptick Joarder *
263964019a2eSPeter Xu * This is the same as get_user_pages_remote(), just with a less-flexible
264064019a2eSPeter Xu * calling convention where we assume that the mm being operated on belongs to
264164019a2eSPeter Xu * the current task, and doesn't allow passing of a locked parameter. We also
264264019a2eSPeter Xu * obviously don't pass FOLL_REMOTE in here.
2643932f4a63SIra Weiny */
get_user_pages(unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages)2644932f4a63SIra Weiny long get_user_pages(unsigned long start, unsigned long nr_pages,
264554d02069SLorenzo Stoakes unsigned int gup_flags, struct page **pages)
2646932f4a63SIra Weiny {
26479a863a6aSJason Gunthorpe int locked = 1;
26489a863a6aSJason Gunthorpe
2649b2cac248SLorenzo Stoakes if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
2650eddb1c22SJohn Hubbard return -EINVAL;
2651eddb1c22SJohn Hubbard
2652afa3c33eSJason Gunthorpe return __get_user_pages_locked(current->mm, start, nr_pages, pages,
2653b2cac248SLorenzo Stoakes &locked, gup_flags);
2654932f4a63SIra Weiny }
2655932f4a63SIra Weiny EXPORT_SYMBOL(get_user_pages);
26562bb6d283SDan Williams
2657acc3c8d1SKirill A. Shutemov /*
2658d3649f68SChristoph Hellwig * get_user_pages_unlocked() is suitable to replace the form:
2659acc3c8d1SKirill A. Shutemov *
26603e4e28c5SMichel Lespinasse * mmap_read_lock(mm);
266164019a2eSPeter Xu * get_user_pages(mm, ..., pages, NULL);
26623e4e28c5SMichel Lespinasse * mmap_read_unlock(mm);
2663d3649f68SChristoph Hellwig *
2664d3649f68SChristoph Hellwig * with:
2665d3649f68SChristoph Hellwig *
266664019a2eSPeter Xu * get_user_pages_unlocked(mm, ..., pages);
2667d3649f68SChristoph Hellwig *
2668d3649f68SChristoph Hellwig * It is functionally equivalent to get_user_pages_fast so
2669d3649f68SChristoph Hellwig * get_user_pages_fast should be used instead if specific gup_flags
2670d3649f68SChristoph Hellwig * (e.g. FOLL_FORCE) are not required.
2671acc3c8d1SKirill A. Shutemov */
get_user_pages_unlocked(unsigned long start,unsigned long nr_pages,struct page ** pages,unsigned int gup_flags)2672d3649f68SChristoph Hellwig long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2673d3649f68SChristoph Hellwig struct page **pages, unsigned int gup_flags)
2674acc3c8d1SKirill A. Shutemov {
2675b2a72dffSJason Gunthorpe int locked = 0;
2676acc3c8d1SKirill A. Shutemov
2677b2cac248SLorenzo Stoakes if (!is_valid_gup_args(pages, NULL, &gup_flags,
2678f04740f5SJason Gunthorpe FOLL_TOUCH | FOLL_UNLOCKABLE))
2679d64e2dbcSJason Gunthorpe return -EINVAL;
2680d64e2dbcSJason Gunthorpe
2681afa3c33eSJason Gunthorpe return __get_user_pages_locked(current->mm, start, nr_pages, pages,
2682b2cac248SLorenzo Stoakes &locked, gup_flags);
2683acc3c8d1SKirill A. Shutemov }
2684d3649f68SChristoph Hellwig EXPORT_SYMBOL(get_user_pages_unlocked);
26852667f50eSSteve Capper
26862667f50eSSteve Capper /*
268723babe19SDavid Hildenbrand * GUP-fast
26882667f50eSSteve Capper *
26892667f50eSSteve Capper * get_user_pages_fast attempts to pin user pages by walking the page
26902667f50eSSteve Capper * tables directly and avoids taking locks. Thus the walker needs to be
26912667f50eSSteve Capper * protected from page table pages being freed from under it, and should
26922667f50eSSteve Capper * block any THP splits.
26932667f50eSSteve Capper *
26942667f50eSSteve Capper * One way to achieve this is to have the walker disable interrupts, and
26952667f50eSSteve Capper * rely on IPIs from the TLB flushing code blocking before the page table
26962667f50eSSteve Capper * pages are freed. This is unsuitable for architectures that do not need
26972667f50eSSteve Capper * to broadcast an IPI when invalidating TLBs.
26982667f50eSSteve Capper *
26992667f50eSSteve Capper * Another way to achieve this is to batch up page table containing pages
27002667f50eSSteve Capper * belonging to more than one mm_user, then rcu_sched a callback to free those
270123babe19SDavid Hildenbrand * pages. Disabling interrupts will allow the gup_fast() walker to both block
27022667f50eSSteve Capper * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
27032667f50eSSteve Capper * (which is a relatively rare event). The code below adopts this strategy.
27042667f50eSSteve Capper *
27052667f50eSSteve Capper * Before activating this code, please be aware that the following assumptions
27062667f50eSSteve Capper * are currently made:
27072667f50eSSteve Capper *
2708ff2e6d72SPeter Zijlstra * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
2709e585513bSKirill A. Shutemov * free pages containing page tables or TLB flushing requires IPI broadcast.
27102667f50eSSteve Capper *
27112667f50eSSteve Capper * *) ptes can be read atomically by the architecture.
27122667f50eSSteve Capper *
2713b6c46600Sjianyun.gao * *) valid user addresses are below TASK_MAX_SIZE
27142667f50eSSteve Capper *
27152667f50eSSteve Capper * The last two assumptions can be relaxed by the addition of helper functions.
27162667f50eSSteve Capper *
27172667f50eSSteve Capper * This code is based heavily on the PowerPC implementation by Nick Piggin.
27182667f50eSSteve Capper */
271925176ad0SDavid Hildenbrand #ifdef CONFIG_HAVE_GUP_FAST
2720a6e79df9SLorenzo Stoakes /*
2721f002882cSDavid Hildenbrand * Used in the GUP-fast path to determine whether GUP is permitted to work on
2722f002882cSDavid Hildenbrand * a specific folio.
2723a6e79df9SLorenzo Stoakes *
2724a6e79df9SLorenzo Stoakes * This call assumes the caller has pinned the folio, that the lowest page table
2725a6e79df9SLorenzo Stoakes * level still points to this folio, and that interrupts have been disabled.
2726a6e79df9SLorenzo Stoakes *
2727f002882cSDavid Hildenbrand * GUP-fast must reject all secretmem folios.
2728f002882cSDavid Hildenbrand *
2729a6e79df9SLorenzo Stoakes * Writing to pinned file-backed dirty tracked folios is inherently problematic
2730a6e79df9SLorenzo Stoakes * (see comment describing the writable_file_mapping_allowed() function). We
2731a6e79df9SLorenzo Stoakes * therefore try to avoid the most egregious case of a long-term mapping doing
2732a6e79df9SLorenzo Stoakes * so.
2733a6e79df9SLorenzo Stoakes *
2734a6e79df9SLorenzo Stoakes * This function cannot be as thorough as that one as the VMA is not available
2735a6e79df9SLorenzo Stoakes * in the fast path, so instead we whitelist known good cases and if in doubt,
2736a6e79df9SLorenzo Stoakes * fall back to the slow path.
2737a6e79df9SLorenzo Stoakes */
gup_fast_folio_allowed(struct folio * folio,unsigned int flags)2738f002882cSDavid Hildenbrand static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
2739a6e79df9SLorenzo Stoakes {
2740f002882cSDavid Hildenbrand bool reject_file_backed = false;
2741a6e79df9SLorenzo Stoakes struct address_space *mapping;
2742f002882cSDavid Hildenbrand bool check_secretmem = false;
2743a6e79df9SLorenzo Stoakes unsigned long mapping_flags;
2744a6e79df9SLorenzo Stoakes
2745a6e79df9SLorenzo Stoakes /*
2746a6e79df9SLorenzo Stoakes * If we aren't pinning then no problematic write can occur. A long term
2747a6e79df9SLorenzo Stoakes * pin is the most egregious case so this is the one we disallow.
2748a6e79df9SLorenzo Stoakes */
2749f002882cSDavid Hildenbrand if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) ==
2750a6e79df9SLorenzo Stoakes (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
2751f002882cSDavid Hildenbrand reject_file_backed = true;
2752a6e79df9SLorenzo Stoakes
2753f002882cSDavid Hildenbrand /* We hold a folio reference, so we can safely access folio fields. */
2754f002882cSDavid Hildenbrand
2755f002882cSDavid Hildenbrand /* secretmem folios are always order-0 folios. */
2756f002882cSDavid Hildenbrand if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
2757f002882cSDavid Hildenbrand check_secretmem = true;
2758f002882cSDavid Hildenbrand
2759f002882cSDavid Hildenbrand if (!reject_file_backed && !check_secretmem)
2760f002882cSDavid Hildenbrand return true;
2761a6e79df9SLorenzo Stoakes
2762a6e79df9SLorenzo Stoakes if (WARN_ON_ONCE(folio_test_slab(folio)))
2763a6e79df9SLorenzo Stoakes return false;
2764a6e79df9SLorenzo Stoakes
2765f002882cSDavid Hildenbrand /* hugetlb neither requires dirty-tracking nor can be secretmem. */
2766a6e79df9SLorenzo Stoakes if (folio_test_hugetlb(folio))
2767a6e79df9SLorenzo Stoakes return true;
2768a6e79df9SLorenzo Stoakes
2769a6e79df9SLorenzo Stoakes /*
2770a6e79df9SLorenzo Stoakes * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods
2771a6e79df9SLorenzo Stoakes * cannot proceed, which means no actions performed under RCU can
2772a6e79df9SLorenzo Stoakes * proceed either.
2773a6e79df9SLorenzo Stoakes *
2774a6e79df9SLorenzo Stoakes * inodes and thus their mappings are freed under RCU, which means the
2775a6e79df9SLorenzo Stoakes * mapping cannot be freed beneath us and thus we can safely dereference
2776a6e79df9SLorenzo Stoakes * it.
2777a6e79df9SLorenzo Stoakes */
2778a6e79df9SLorenzo Stoakes lockdep_assert_irqs_disabled();
2779a6e79df9SLorenzo Stoakes
2780a6e79df9SLorenzo Stoakes /*
2781a6e79df9SLorenzo Stoakes * However, there may be operations which _alter_ the mapping, so ensure
2782a6e79df9SLorenzo Stoakes * we read it once and only once.
2783a6e79df9SLorenzo Stoakes */
2784a6e79df9SLorenzo Stoakes mapping = READ_ONCE(folio->mapping);
2785a6e79df9SLorenzo Stoakes
2786a6e79df9SLorenzo Stoakes /*
2787a6e79df9SLorenzo Stoakes * The mapping may have been truncated, in any case we cannot determine
2788a6e79df9SLorenzo Stoakes * if this mapping is safe - fall back to slow path to determine how to
2789a6e79df9SLorenzo Stoakes * proceed.
2790a6e79df9SLorenzo Stoakes */
2791a6e79df9SLorenzo Stoakes if (!mapping)
2792a6e79df9SLorenzo Stoakes return false;
2793a6e79df9SLorenzo Stoakes
2794a6e79df9SLorenzo Stoakes /* Anonymous folios pose no problem. */
2795df25569dSDavid Hildenbrand mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS;
2796a6e79df9SLorenzo Stoakes if (mapping_flags)
2797df25569dSDavid Hildenbrand return mapping_flags & FOLIO_MAPPING_ANON;
2798a6e79df9SLorenzo Stoakes
2799a6e79df9SLorenzo Stoakes /*
2800a6e79df9SLorenzo Stoakes * At this point, we know the mapping is non-null and points to an
2801f002882cSDavid Hildenbrand * address_space object.
2802a6e79df9SLorenzo Stoakes */
2803f002882cSDavid Hildenbrand if (check_secretmem && secretmem_mapping(mapping))
2804f002882cSDavid Hildenbrand return false;
2805f002882cSDavid Hildenbrand /* The only remaining allowed file system is shmem. */
2806f002882cSDavid Hildenbrand return !reject_file_backed || shmem_mapping(mapping);
2807a6e79df9SLorenzo Stoakes }
2808a6e79df9SLorenzo Stoakes
28093010a5eaSLaurent Dufour #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
281070cbc3ccSYang Shi /*
281123babe19SDavid Hildenbrand * GUP-fast relies on pte change detection to avoid concurrent pgtable
281270cbc3ccSYang Shi * operations.
281370cbc3ccSYang Shi *
281423babe19SDavid Hildenbrand * To pin the page, GUP-fast needs to do below in order:
281570cbc3ccSYang Shi * (1) pin the page (by prefetching pte), then (2) check pte not changed.
281670cbc3ccSYang Shi *
281770cbc3ccSYang Shi * For the rest of pgtable operations where pgtable updates can be racy
281823babe19SDavid Hildenbrand * with GUP-fast, we need to do (1) clear pte, then (2) check whether page
281970cbc3ccSYang Shi * is pinned.
282070cbc3ccSYang Shi *
282170cbc3ccSYang Shi * Above will work for all pte-level operations, including THP split.
282270cbc3ccSYang Shi *
282323babe19SDavid Hildenbrand * For THP collapse, it's a bit more complicated because GUP-fast may be
282470cbc3ccSYang Shi * walking a pgtable page that is being freed (pte is still valid but pmd
282570cbc3ccSYang Shi * can be cleared already). To avoid race in such condition, we need to
282670cbc3ccSYang Shi * also check pmd here to make sure pmd doesn't change (corresponds to
282770cbc3ccSYang Shi * pmdp_collapse_flush() in the THP collapse code path).
282870cbc3ccSYang Shi */
gup_fast_pte_range(pmd_t pmd,pmd_t * pmdp,unsigned long addr,unsigned long end,unsigned int flags,struct page ** pages,int * nr)282923babe19SDavid Hildenbrand static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
283023babe19SDavid Hildenbrand unsigned long end, unsigned int flags, struct page **pages,
283123babe19SDavid Hildenbrand int *nr)
28322667f50eSSteve Capper {
2833fd2825b0SAlistair Popple int ret = 0;
28342667f50eSSteve Capper pte_t *ptep, *ptem;
28352667f50eSSteve Capper
28362667f50eSSteve Capper ptem = ptep = pte_offset_map(&pmd, addr);
283704dee9e8SHugh Dickins if (!ptep)
283804dee9e8SHugh Dickins return 0;
28392667f50eSSteve Capper do {
28402a4a06daSPeter Zijlstra pte_t pte = ptep_get_lockless(ptep);
2841b0496fe4SMatthew Wilcox (Oracle) struct page *page;
2842b0496fe4SMatthew Wilcox (Oracle) struct folio *folio;
28432667f50eSSteve Capper
2844d74943a2SDavid Hildenbrand /*
2845d74943a2SDavid Hildenbrand * Always fallback to ordinary GUP on PROT_NONE-mapped pages:
2846d74943a2SDavid Hildenbrand * pte_access_permitted() better should reject these pages
2847d74943a2SDavid Hildenbrand * either way: otherwise, GUP-fast might succeed in
2848d74943a2SDavid Hildenbrand * cases where ordinary GUP would fail due to VMA access
2849d74943a2SDavid Hildenbrand * permissions.
2850d74943a2SDavid Hildenbrand */
2851d74943a2SDavid Hildenbrand if (pte_protnone(pte))
2852e7884f8eSKirill A. Shutemov goto pte_unmap;
2853e7884f8eSKirill A. Shutemov
2854b798bec4SIra Weiny if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2855e7884f8eSKirill A. Shutemov goto pte_unmap;
2856e7884f8eSKirill A. Shutemov
2857fd2825b0SAlistair Popple if (pte_special(pte))
28582667f50eSSteve Capper goto pte_unmap;
28592667f50eSSteve Capper
2860792b429dSDavid Hildenbrand /* If it's not marked as special it must have a valid memmap. */
2861792b429dSDavid Hildenbrand VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));
28622667f50eSSteve Capper page = pte_page(pte);
28632667f50eSSteve Capper
2864f442fa61SYang Shi folio = try_grab_folio_fast(page, 1, flags);
2865b0496fe4SMatthew Wilcox (Oracle) if (!folio)
28662667f50eSSteve Capper goto pte_unmap;
28672667f50eSSteve Capper
286870cbc3ccSYang Shi if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
2869c33c7948SRyan Roberts unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
2870b0496fe4SMatthew Wilcox (Oracle) gup_put_folio(folio, 1, flags);
28712667f50eSSteve Capper goto pte_unmap;
28722667f50eSSteve Capper }
28732667f50eSSteve Capper
2874f002882cSDavid Hildenbrand if (!gup_fast_folio_allowed(folio, flags)) {
28752667f50eSSteve Capper gup_put_folio(folio, 1, flags);
28762667f50eSSteve Capper goto pte_unmap;
28772667f50eSSteve Capper }
28782667f50eSSteve Capper
287984209e87SDavid Hildenbrand if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
2880a7f22660SDavid Hildenbrand gup_put_folio(folio, 1, flags);
2881a7f22660SDavid Hildenbrand goto pte_unmap;
2882a7f22660SDavid Hildenbrand }
2883a7f22660SDavid Hildenbrand
2884f28d4363SClaudio Imbrenda /*
2885f28d4363SClaudio Imbrenda * We need to make the page accessible if and only if we are
2886f28d4363SClaudio Imbrenda * going to access its content (the FOLL_PIN case). Please
2887f28d4363SClaudio Imbrenda * see Documentation/core-api/pin_user_pages.rst for
2888f28d4363SClaudio Imbrenda * details.
2889f28d4363SClaudio Imbrenda */
28907cad96aeSDavid Hildenbrand if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) {
2891b0496fe4SMatthew Wilcox (Oracle) gup_put_folio(folio, 1, flags);
2892f28d4363SClaudio Imbrenda goto pte_unmap;
2893f28d4363SClaudio Imbrenda }
2894b0496fe4SMatthew Wilcox (Oracle) folio_set_referenced(folio);
28952667f50eSSteve Capper pages[*nr] = page;
28962667f50eSSteve Capper (*nr)++;
28972667f50eSSteve Capper } while (ptep++, addr += PAGE_SIZE, addr != end);
28982667f50eSSteve Capper
28992667f50eSSteve Capper ret = 1;
29002667f50eSSteve Capper
29012667f50eSSteve Capper pte_unmap:
29022667f50eSSteve Capper pte_unmap(ptem);
29032667f50eSSteve Capper return ret;
29042667f50eSSteve Capper }
29052667f50eSSteve Capper #else
29062667f50eSSteve Capper
29072667f50eSSteve Capper /*
29082667f50eSSteve Capper * If we can't determine whether or not a pte is special, then fail immediately
29092667f50eSSteve Capper * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
29102667f50eSSteve Capper * to be special.
29112667f50eSSteve Capper *
29122667f50eSSteve Capper * For a futex to be placed on a THP tail page, get_futex_key requires a
2913dadbb612SSouptick Joarder * get_user_pages_fast_only implementation that can pin pages. Thus it's still
291423babe19SDavid Hildenbrand * useful to have gup_fast_pmd_leaf even if we can't operate on ptes.
29152667f50eSSteve Capper */
gup_fast_pte_range(pmd_t pmd,pmd_t * pmdp,unsigned long addr,unsigned long end,unsigned int flags,struct page ** pages,int * nr)291623babe19SDavid Hildenbrand static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
291723babe19SDavid Hildenbrand unsigned long end, unsigned int flags, struct page **pages,
291823babe19SDavid Hildenbrand int *nr)
29192667f50eSSteve Capper {
29202667f50eSSteve Capper return 0;
29212667f50eSSteve Capper }
29223010a5eaSLaurent Dufour #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
29232667f50eSSteve Capper
/*
 * Grab the pages of [addr, end) that are mapped by the leaf pmd @orig.
 *
 * One reference per page in the range is taken on the folio in a single
 * try_grab_folio_fast() call.  Afterwards *pmdp is compared against @orig;
 * if it differs, or if any later check fails, all references are dropped
 * again and nothing is recorded.
 *
 * Returns 1 on success, with the pages appended to @pages starting at index
 * *nr and *nr advanced by the number of pages; returns 0 otherwise.
 */
static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	struct page *page;
	struct folio *folio;
	int refs;

	if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	if (pmd_special(orig))
		return 0;

	/* Number of base pages requested, and the page that backs @addr. */
	refs = (end - addr) >> PAGE_SHIFT;
	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);

	folio = try_grab_folio_fast(page, refs, flags);
	if (!folio)
		return 0;

	/* The pmd changed after we read it: back out. */
	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!gup_fast_folio_allowed(folio, flags)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}
	if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	/* Record the pages; *nr is bumped first, @pages walks the new slots. */
	pages += *nr;
	*nr += refs;
	for (; refs; refs--)
		*(pages++) = page++;
	folio_set_referenced(folio);
	return 1;
}
29662667f50eSSteve Capper
/*
 * Grab the pages of [addr, end) that are mapped by the leaf pud @orig.
 *
 * This mirrors gup_fast_pmd_leaf() one level up: one reference per page is
 * taken on the folio, *pudp is then compared against @orig, and all
 * references are dropped again if that or any later check fails.
 *
 * Returns 1 on success, with the pages appended to @pages starting at index
 * *nr and *nr advanced by the number of pages; returns 0 otherwise.
 */
static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	struct page *page;
	struct folio *folio;
	int refs;

	if (!pud_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	if (pud_special(orig))
		return 0;

	/* Number of base pages requested, and the page that backs @addr. */
	refs = (end - addr) >> PAGE_SHIFT;
	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);

	folio = try_grab_folio_fast(page, refs, flags);
	if (!folio)
		return 0;

	/* The pud changed after we read it: back out. */
	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!gup_fast_folio_allowed(folio, flags)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	/* Record the pages; *nr is bumped first, @pages walks the new slots. */
	pages += *nr;
	*nr += refs;
	for (; refs; refs--)
		*(pages++) = page++;
	folio_set_referenced(folio);
	return 1;
}
30102667f50eSSteve Capper
/*
 * Walk the pmd entries below @pud that cover [addr, end).
 *
 * Leaf pmds are handed to gup_fast_pmd_leaf(), everything else to
 * gup_fast_pte_range().  Returns 1 if the whole range was handled and 0 as
 * soon as one entry is not present or cannot be handled.
 */
static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset_lockless(pudp, pud, addr);
	do {
		pmd_t pmd = pmdp_get_lockless(pmdp);

		next = pmd_addr_end(addr, end);
		if (!pmd_present(pmd))
			return 0;

		if (unlikely(pmd_leaf(pmd))) {
			/* See gup_fast_pte_range() */
			if (pmd_protnone(pmd))
				return 0;

			if (!gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags,
					       pages, nr))
				return 0;

		} else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags,
					       pages, nr))
			return 0;
	} while (pmdp++, addr = next, addr != end);

	return 1;
}
30422667f50eSSteve Capper
/*
 * Walk the pud entries below @p4d that cover [addr, end).
 *
 * Leaf puds are handed to gup_fast_pud_leaf(), everything else to
 * gup_fast_pmd_range().  Returns 1 if the whole range was handled and 0 as
 * soon as one entry is not present or cannot be handled.
 */
static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset_lockless(p4dp, p4d, addr);
	do {
		pud_t pud = pudp_get(pudp);

		next = pud_addr_end(addr, end);
		if (unlikely(!pud_present(pud)))
			return 0;
		if (unlikely(pud_leaf(pud))) {
			if (!gup_fast_pud_leaf(pud, pudp, addr, next, flags,
					       pages, nr))
				return 0;
		} else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags,
					       pages, nr))
			return 0;
	} while (pudp++, addr = next, addr != end);

	return 1;
}
30682667f50eSSteve Capper
/*
 * Walk the p4d entries below @pgd that cover [addr, end).
 *
 * Leaf entries are ruled out at build time at this level, so every present
 * entry is passed on to gup_fast_pud_range().  Returns 1 if the whole range
 * was handled and 0 as soon as one entry is not present or cannot be handled.
 */
static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	p4d_t *p4dp;

	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
	do {
		p4d_t p4d = p4dp_get(p4dp);

		next = p4d_addr_end(addr, end);
		if (!p4d_present(p4d))
			return 0;
		BUILD_BUG_ON(p4d_leaf(p4d));
		if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
					pages, nr))
			return 0;
	} while (p4dp++, addr = next, addr != end);

	return 1;
}
3091c2febafcSKirill A. Shutemov
gup_fast_pgd_range(unsigned long addr,unsigned long end,unsigned int flags,struct page ** pages,int * nr)309223babe19SDavid Hildenbrand static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
3093b798bec4SIra Weiny unsigned int flags, struct page **pages, int *nr)
30945b65c467SKirill A. Shutemov {
30955b65c467SKirill A. Shutemov unsigned long next;
30965b65c467SKirill A. Shutemov pgd_t *pgdp;
30975b65c467SKirill A. Shutemov
30985b65c467SKirill A. Shutemov pgdp = pgd_offset(current->mm, addr);
30995b65c467SKirill A. Shutemov do {
3100c0efdb37SAnshuman Khandual pgd_t pgd = pgdp_get(pgdp);
31015b65c467SKirill A. Shutemov
31025b65c467SKirill A. Shutemov next = pgd_addr_end(addr, end);
31035b65c467SKirill A. Shutemov if (pgd_none(pgd))
31045b65c467SKirill A. Shutemov return;
3105339122abSBaoquan He BUILD_BUG_ON(pgd_leaf(pgd));
3106339122abSBaoquan He if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
310723babe19SDavid Hildenbrand pages, nr))
31085b65c467SKirill A. Shutemov return;
31095b65c467SKirill A. Shutemov } while (pgdp++, addr = next, addr != end);
31105b65c467SKirill A. Shutemov }
3111050a9adcSChristoph Hellwig #else
/* No-op variant: @pages and *nr are left untouched. */
static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
}
311625176ad0SDavid Hildenbrand #endif /* CONFIG_HAVE_GUP_FAST */
31175b65c467SKirill A. Shutemov
#ifndef gup_fast_permitted
/*
 * Check if it's allowed to use get_user_pages_fast_only() for the range, or
 * we need to fall back to the slow version.
 *
 * This default is used only when gup_fast_permitted is not already defined
 * as a macro; it allows every range.
 */
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
	return true;
}
#endif
31285b65c467SKirill A. Shutemov
gup_fast(unsigned long start,unsigned long end,unsigned int gup_flags,struct page ** pages)312923babe19SDavid Hildenbrand static unsigned long gup_fast(unsigned long start, unsigned long end,
313023babe19SDavid Hildenbrand unsigned int gup_flags, struct page **pages)
31312667f50eSSteve Capper {
3132376a34efSJohn Hubbard unsigned long flags;
3133c28b1fc7SJason Gunthorpe int nr_pinned = 0;
313457efa1feSJason Gunthorpe unsigned seq;
3135c28b1fc7SJason Gunthorpe
313625176ad0SDavid Hildenbrand if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) ||
3137c28b1fc7SJason Gunthorpe !gup_fast_permitted(start, end))
3138c28b1fc7SJason Gunthorpe return 0;
3139c28b1fc7SJason Gunthorpe
314057efa1feSJason Gunthorpe if (gup_flags & FOLL_PIN) {
314175285852SPeter Zijlstra if (!raw_seqcount_try_begin(¤t->mm->write_protect_seq, seq))
314257efa1feSJason Gunthorpe return 0;
314357efa1feSJason Gunthorpe }
314457efa1feSJason Gunthorpe
3145c28b1fc7SJason Gunthorpe /*
3146c28b1fc7SJason Gunthorpe * Disable interrupts. The nested form is used, in order to allow full,
3147c28b1fc7SJason Gunthorpe * general purpose use of this routine.
3148c28b1fc7SJason Gunthorpe *
3149c28b1fc7SJason Gunthorpe * With interrupts disabled, we block page table pages from being freed
3150c28b1fc7SJason Gunthorpe * from under us. See struct mmu_table_batch comments in
3151c28b1fc7SJason Gunthorpe * include/asm-generic/tlb.h for more details.
3152c28b1fc7SJason Gunthorpe *
3153c28b1fc7SJason Gunthorpe * We do not adopt an rcu_read_lock() here as we also want to block IPIs
315452084f25SJann Horn * that come from callers of tlb_remove_table_sync_one().
3155c28b1fc7SJason Gunthorpe */
3156c28b1fc7SJason Gunthorpe local_irq_save(flags);
315723babe19SDavid Hildenbrand gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
3158c28b1fc7SJason Gunthorpe local_irq_restore(flags);
315957efa1feSJason Gunthorpe
316057efa1feSJason Gunthorpe /*
316157efa1feSJason Gunthorpe * When pinning pages for DMA there could be a concurrent write protect
316223babe19SDavid Hildenbrand * from fork() via copy_page_range(), in this case always fail GUP-fast.
316357efa1feSJason Gunthorpe */
316457efa1feSJason Gunthorpe if (gup_flags & FOLL_PIN) {
316557efa1feSJason Gunthorpe if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) {
316623babe19SDavid Hildenbrand gup_fast_unpin_user_pages(pages, nr_pinned);
316757efa1feSJason Gunthorpe return 0;
3168b6a2619cSDavid Hildenbrand } else {
3169b6a2619cSDavid Hildenbrand sanity_check_pinned_pages(pages, nr_pinned);
317057efa1feSJason Gunthorpe }
317157efa1feSJason Gunthorpe }
3172c28b1fc7SJason Gunthorpe return nr_pinned;
3173c28b1fc7SJason Gunthorpe }
3174c28b1fc7SJason Gunthorpe
gup_fast_fallback(unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages)317523babe19SDavid Hildenbrand static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
317623babe19SDavid Hildenbrand unsigned int gup_flags, struct page **pages)
3177c28b1fc7SJason Gunthorpe {
3178c28b1fc7SJason Gunthorpe unsigned long len, end;
3179c28b1fc7SJason Gunthorpe unsigned long nr_pinned;
3180b2a72dffSJason Gunthorpe int locked = 0;
3181c28b1fc7SJason Gunthorpe int ret;
31822667f50eSSteve Capper
3183f4000fdfSJohn Hubbard if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
3184376a34efSJohn Hubbard FOLL_FORCE | FOLL_PIN | FOLL_GET |
31854003f107SLogan Gunthorpe FOLL_FAST_ONLY | FOLL_NOFAULT |
3186d74943a2SDavid Hildenbrand FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
3187817be129SChristoph Hellwig return -EINVAL;
3188817be129SChristoph Hellwig
3189a458b76aSAndrea Arcangeli if (gup_flags & FOLL_PIN)
319012e423baSLorenzo Stoakes mm_set_has_pinned_flag(current->mm);
3191008cfe44SPeter Xu
3192f81cd178SJohn Hubbard if (!(gup_flags & FOLL_FAST_ONLY))
3193da1c55f1SMichel Lespinasse might_lock_read(¤t->mm->mmap_lock);
3194f81cd178SJohn Hubbard
3195f455c854SChristoph Hellwig start = untagged_addr(start) & PAGE_MASK;
3196c28b1fc7SJason Gunthorpe len = nr_pages << PAGE_SHIFT;
3197c28b1fc7SJason Gunthorpe if (check_add_overflow(start, len, &end))
31989883c7f8SJason Gunthorpe return -EOVERFLOW;
31996014bc27SLinus Torvalds if (end > TASK_SIZE_MAX)
32006014bc27SLinus Torvalds return -EFAULT;
320173e10a61SKirill A. Shutemov
320223babe19SDavid Hildenbrand nr_pinned = gup_fast(start, end, gup_flags, pages);
3203c28b1fc7SJason Gunthorpe if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
3204c28b1fc7SJason Gunthorpe return nr_pinned;
3205376a34efSJohn Hubbard
3206c28b1fc7SJason Gunthorpe /* Slow path: try to get the remaining pages with get_user_pages */
32074628b063SPingfan Liu start += nr_pinned << PAGE_SHIFT;
32084628b063SPingfan Liu pages += nr_pinned;
3209b2a72dffSJason Gunthorpe ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
3210b2cac248SLorenzo Stoakes pages, &locked,
3211f04740f5SJason Gunthorpe gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
3212c28b1fc7SJason Gunthorpe if (ret < 0) {
3213c28b1fc7SJason Gunthorpe /*
3214c28b1fc7SJason Gunthorpe * The caller has to unpin the pages we already pinned so
3215c28b1fc7SJason Gunthorpe * returning -errno is not an option
3216c28b1fc7SJason Gunthorpe */
3217c28b1fc7SJason Gunthorpe if (nr_pinned)
3218c28b1fc7SJason Gunthorpe return nr_pinned;
32192667f50eSSteve Capper return ret;
32202667f50eSSteve Capper }
3221c28b1fc7SJason Gunthorpe return ret + nr_pinned;
3222c28b1fc7SJason Gunthorpe }
3223c28b1fc7SJason Gunthorpe
3224dadbb612SSouptick Joarder /**
3225dadbb612SSouptick Joarder * get_user_pages_fast_only() - pin user pages in memory
3226dadbb612SSouptick Joarder * @start: starting user address
3227dadbb612SSouptick Joarder * @nr_pages: number of pages from start to pin
3228dadbb612SSouptick Joarder * @gup_flags: flags modifying pin behaviour
3229dadbb612SSouptick Joarder * @pages: array that receives pointers to the pages pinned.
3230dadbb612SSouptick Joarder * Should be at least nr_pages long.
3231dadbb612SSouptick Joarder *
32329e1f0580SJohn Hubbard * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
32339e1f0580SJohn Hubbard * the regular GUP.
32349e1f0580SJohn Hubbard *
32359e1f0580SJohn Hubbard * If the architecture does not support this function, simply return with no
32369e1f0580SJohn Hubbard * pages pinned.
32379e1f0580SJohn Hubbard *
32389e1f0580SJohn Hubbard * Careful, careful! COW breaking can go either way, so a non-write
32399e1f0580SJohn Hubbard * access can get ambiguous page results. If you call this function without
32409e1f0580SJohn Hubbard * 'write' set, you'd better be sure that you're ok with that ambiguity.
32419e1f0580SJohn Hubbard */
get_user_pages_fast_only(unsigned long start,int nr_pages,unsigned int gup_flags,struct page ** pages)3242dadbb612SSouptick Joarder int get_user_pages_fast_only(unsigned long start, int nr_pages,
3243dadbb612SSouptick Joarder unsigned int gup_flags, struct page **pages)
32449e1f0580SJohn Hubbard {
32459e1f0580SJohn Hubbard /*
32469e1f0580SJohn Hubbard * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
32479e1f0580SJohn Hubbard * because gup fast is always a "pin with a +1 page refcount" request.
3248376a34efSJohn Hubbard *
3249376a34efSJohn Hubbard * FOLL_FAST_ONLY is required in order to match the API description of
3250376a34efSJohn Hubbard * this routine: no fall back to regular ("slow") GUP.
32519e1f0580SJohn Hubbard */
3252b2cac248SLorenzo Stoakes if (!is_valid_gup_args(pages, NULL, &gup_flags,
3253d64e2dbcSJason Gunthorpe FOLL_GET | FOLL_FAST_ONLY))
3254d64e2dbcSJason Gunthorpe return -EINVAL;
32559e1f0580SJohn Hubbard
325623babe19SDavid Hildenbrand return gup_fast_fallback(start, nr_pages, gup_flags, pages);
32579e1f0580SJohn Hubbard }
3258dadbb612SSouptick Joarder EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
32599e1f0580SJohn Hubbard
3260eddb1c22SJohn Hubbard /**
3261eddb1c22SJohn Hubbard * get_user_pages_fast() - pin user pages in memory
3262eddb1c22SJohn Hubbard * @start: starting user address
3263eddb1c22SJohn Hubbard * @nr_pages: number of pages from start to pin
3264eddb1c22SJohn Hubbard * @gup_flags: flags modifying pin behaviour
3265eddb1c22SJohn Hubbard * @pages: array that receives pointers to the pages pinned.
3266eddb1c22SJohn Hubbard * Should be at least nr_pages long.
3267eddb1c22SJohn Hubbard *
3268c1e8d7c6SMichel Lespinasse * Attempt to pin user pages in memory without taking mm->mmap_lock.
3269eddb1c22SJohn Hubbard * If not successful, it will fall back to taking the lock and
3270eddb1c22SJohn Hubbard * calling get_user_pages().
3271eddb1c22SJohn Hubbard *
3272eddb1c22SJohn Hubbard * Returns number of pages pinned. This may be fewer than the number requested.
3273eddb1c22SJohn Hubbard * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
3274eddb1c22SJohn Hubbard * -errno.
3275eddb1c22SJohn Hubbard */
get_user_pages_fast(unsigned long start,int nr_pages,unsigned int gup_flags,struct page ** pages)3276eddb1c22SJohn Hubbard int get_user_pages_fast(unsigned long start, int nr_pages,
3277eddb1c22SJohn Hubbard unsigned int gup_flags, struct page **pages)
3278eddb1c22SJohn Hubbard {
327994202f12SJohn Hubbard /*
328094202f12SJohn Hubbard * The caller may or may not have explicitly set FOLL_GET; either way is
328194202f12SJohn Hubbard * OK. However, internally (within mm/gup.c), gup fast variants must set
328294202f12SJohn Hubbard * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
328394202f12SJohn Hubbard * request.
328494202f12SJohn Hubbard */
3285b2cac248SLorenzo Stoakes if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
3286d64e2dbcSJason Gunthorpe return -EINVAL;
328723babe19SDavid Hildenbrand return gup_fast_fallback(start, nr_pages, gup_flags, pages);
3288eddb1c22SJohn Hubbard }
3289050a9adcSChristoph Hellwig EXPORT_SYMBOL_GPL(get_user_pages_fast);
3290eddb1c22SJohn Hubbard
3291eddb1c22SJohn Hubbard /**
3292eddb1c22SJohn Hubbard * pin_user_pages_fast() - pin user pages in memory without taking locks
3293eddb1c22SJohn Hubbard *
32943faa52c0SJohn Hubbard * @start: starting user address
32953faa52c0SJohn Hubbard * @nr_pages: number of pages from start to pin
32963faa52c0SJohn Hubbard * @gup_flags: flags modifying pin behaviour
32973faa52c0SJohn Hubbard * @pages: array that receives pointers to the pages pinned.
32983faa52c0SJohn Hubbard * Should be at least nr_pages long.
32993faa52c0SJohn Hubbard *
33003faa52c0SJohn Hubbard * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
33013faa52c0SJohn Hubbard * get_user_pages_fast() for documentation on the function arguments, because
33023faa52c0SJohn Hubbard * the arguments here are identical.
33033faa52c0SJohn Hubbard *
33043faa52c0SJohn Hubbard * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
330572ef5e52SMauro Carvalho Chehab * see Documentation/core-api/pin_user_pages.rst for further details.
3306c8070b78SDavid Howells *
3307c8070b78SDavid Howells * Note that if a zero_page is amongst the returned pages, it will not have
3308c8070b78SDavid Howells * pins in it and unpin_user_page() will not remove pins from it.
3309eddb1c22SJohn Hubbard */
pin_user_pages_fast(unsigned long start,int nr_pages,unsigned int gup_flags,struct page ** pages)3310eddb1c22SJohn Hubbard int pin_user_pages_fast(unsigned long start, int nr_pages,
3311eddb1c22SJohn Hubbard unsigned int gup_flags, struct page **pages)
3312eddb1c22SJohn Hubbard {
3313b2cac248SLorenzo Stoakes if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
33143faa52c0SJohn Hubbard return -EINVAL;
331523babe19SDavid Hildenbrand return gup_fast_fallback(start, nr_pages, gup_flags, pages);
3316eddb1c22SJohn Hubbard }
3317eddb1c22SJohn Hubbard EXPORT_SYMBOL_GPL(pin_user_pages_fast);
3318eddb1c22SJohn Hubbard
3319eddb1c22SJohn Hubbard /**
332064019a2eSPeter Xu * pin_user_pages_remote() - pin pages of a remote process
3321eddb1c22SJohn Hubbard *
33223faa52c0SJohn Hubbard * @mm: mm_struct of target mm
33233faa52c0SJohn Hubbard * @start: starting user address
33243faa52c0SJohn Hubbard * @nr_pages: number of pages from start to pin
33253faa52c0SJohn Hubbard * @gup_flags: flags modifying lookup behaviour
33263faa52c0SJohn Hubbard * @pages: array that receives pointers to the pages pinned.
33270768c8deSYury Norov * Should be at least nr_pages long.
33283faa52c0SJohn Hubbard * @locked: pointer to lock flag indicating whether lock is held and
33293faa52c0SJohn Hubbard * subsequently whether VM_FAULT_RETRY functionality can be
33303faa52c0SJohn Hubbard * utilised. Lock must initially be held.
33313faa52c0SJohn Hubbard *
33323faa52c0SJohn Hubbard * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
33333faa52c0SJohn Hubbard * get_user_pages_remote() for documentation on the function arguments, because
33343faa52c0SJohn Hubbard * the arguments here are identical.
33353faa52c0SJohn Hubbard *
33363faa52c0SJohn Hubbard * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
333772ef5e52SMauro Carvalho Chehab * see Documentation/core-api/pin_user_pages.rst for details.
3338c8070b78SDavid Howells *
3339c8070b78SDavid Howells * Note that if a zero_page is amongst the returned pages, it will not have
3340c8070b78SDavid Howells * pins in it and unpin_user_page*() will not remove pins from it.
3341eddb1c22SJohn Hubbard */
pin_user_pages_remote(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages,int * locked)334264019a2eSPeter Xu long pin_user_pages_remote(struct mm_struct *mm,
3343eddb1c22SJohn Hubbard unsigned long start, unsigned long nr_pages,
3344eddb1c22SJohn Hubbard unsigned int gup_flags, struct page **pages,
33450b295316SLorenzo Stoakes int *locked)
3346eddb1c22SJohn Hubbard {
33479a863a6aSJason Gunthorpe int local_locked = 1;
33489a863a6aSJason Gunthorpe
3349b2cac248SLorenzo Stoakes if (!is_valid_gup_args(pages, locked, &gup_flags,
3350d64e2dbcSJason Gunthorpe FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
3351d64e2dbcSJason Gunthorpe return 0;
3352b2cac248SLorenzo Stoakes return __gup_longterm_locked(mm, start, nr_pages, pages,
33539a863a6aSJason Gunthorpe locked ? locked : &local_locked,
3354d64e2dbcSJason Gunthorpe gup_flags);
3355eddb1c22SJohn Hubbard }
3356eddb1c22SJohn Hubbard EXPORT_SYMBOL(pin_user_pages_remote);
3357eddb1c22SJohn Hubbard
3358eddb1c22SJohn Hubbard /**
3359eddb1c22SJohn Hubbard * pin_user_pages() - pin user pages in memory for use by other devices
3360eddb1c22SJohn Hubbard *
33613faa52c0SJohn Hubbard * @start: starting user address
33623faa52c0SJohn Hubbard * @nr_pages: number of pages from start to pin
33633faa52c0SJohn Hubbard * @gup_flags: flags modifying lookup behaviour
33643faa52c0SJohn Hubbard * @pages: array that receives pointers to the pages pinned.
33650768c8deSYury Norov * Should be at least nr_pages long.
33663faa52c0SJohn Hubbard *
33673faa52c0SJohn Hubbard * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
33683faa52c0SJohn Hubbard * FOLL_PIN is set.
33693faa52c0SJohn Hubbard *
33703faa52c0SJohn Hubbard * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
337172ef5e52SMauro Carvalho Chehab * see Documentation/core-api/pin_user_pages.rst for details.
3372c8070b78SDavid Howells *
3373c8070b78SDavid Howells * Note that if a zero_page is amongst the returned pages, it will not have
3374c8070b78SDavid Howells * pins in it and unpin_user_page*() will not remove pins from it.
3375eddb1c22SJohn Hubbard */
pin_user_pages(unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages)3376eddb1c22SJohn Hubbard long pin_user_pages(unsigned long start, unsigned long nr_pages,
33774c630f30SLorenzo Stoakes unsigned int gup_flags, struct page **pages)
3378eddb1c22SJohn Hubbard {
33799a863a6aSJason Gunthorpe int locked = 1;
33809a863a6aSJason Gunthorpe
3381b2cac248SLorenzo Stoakes if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
3382d64e2dbcSJason Gunthorpe return 0;
338364019a2eSPeter Xu return __gup_longterm_locked(current->mm, start, nr_pages,
3384b2cac248SLorenzo Stoakes pages, &locked, gup_flags);
3385eddb1c22SJohn Hubbard }
3386eddb1c22SJohn Hubbard EXPORT_SYMBOL(pin_user_pages);
338791429023SJohn Hubbard
338891429023SJohn Hubbard /*
338991429023SJohn Hubbard * pin_user_pages_unlocked() is the FOLL_PIN variant of
339091429023SJohn Hubbard * get_user_pages_unlocked(). Behavior is the same, except that this one sets
339191429023SJohn Hubbard * FOLL_PIN and rejects FOLL_GET.
3392c8070b78SDavid Howells *
3393c8070b78SDavid Howells * Note that if a zero_page is amongst the returned pages, it will not have
3394c8070b78SDavid Howells * pins in it and unpin_user_page*() will not remove pins from it.
339591429023SJohn Hubbard */
pin_user_pages_unlocked(unsigned long start,unsigned long nr_pages,struct page ** pages,unsigned int gup_flags)339691429023SJohn Hubbard long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
339791429023SJohn Hubbard struct page **pages, unsigned int gup_flags)
339891429023SJohn Hubbard {
3399b2a72dffSJason Gunthorpe int locked = 0;
340091429023SJohn Hubbard
3401b2cac248SLorenzo Stoakes if (!is_valid_gup_args(pages, NULL, &gup_flags,
3402f04740f5SJason Gunthorpe FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
3403d64e2dbcSJason Gunthorpe return 0;
34040768c8deSYury Norov
3405b2cac248SLorenzo Stoakes return __gup_longterm_locked(current->mm, start, nr_pages, pages,
3406b2a72dffSJason Gunthorpe &locked, gup_flags);
340791429023SJohn Hubbard }
340891429023SJohn Hubbard EXPORT_SYMBOL(pin_user_pages_unlocked);
340989c1905dSVivek Kasireddy
341089c1905dSVivek Kasireddy /**
341189c1905dSVivek Kasireddy * memfd_pin_folios() - pin folios associated with a memfd
341289c1905dSVivek Kasireddy * @memfd: the memfd whose folios are to be pinned
341389c1905dSVivek Kasireddy * @start: the first memfd offset
341489c1905dSVivek Kasireddy * @end: the last memfd offset (inclusive)
341589c1905dSVivek Kasireddy * @folios: array that receives pointers to the folios pinned
341689c1905dSVivek Kasireddy * @max_folios: maximum number of entries in @folios
341789c1905dSVivek Kasireddy * @offset: the offset into the first folio
341889c1905dSVivek Kasireddy *
341989c1905dSVivek Kasireddy * Attempt to pin folios associated with a memfd in the contiguous range
342089c1905dSVivek Kasireddy * [start, end]. Given that a memfd is either backed by shmem or hugetlb,
342189c1905dSVivek Kasireddy * the folios can either be found in the page cache or need to be allocated
342289c1905dSVivek Kasireddy * if necessary. Once the folios are located, they are all pinned via
342389c1905dSVivek Kasireddy * FOLL_PIN and @offset is populatedwith the offset into the first folio.
342489c1905dSVivek Kasireddy * And, eventually, these pinned folios must be released either using
342589c1905dSVivek Kasireddy * unpin_folios() or unpin_folio().
342689c1905dSVivek Kasireddy *
342789c1905dSVivek Kasireddy * It must be noted that the folios may be pinned for an indefinite amount
342889c1905dSVivek Kasireddy * of time. And, in most cases, the duration of time they may stay pinned
342989c1905dSVivek Kasireddy * would be controlled by the userspace. This behavior is effectively the
343089c1905dSVivek Kasireddy * same as using FOLL_LONGTERM with other GUP APIs.
343189c1905dSVivek Kasireddy *
343289c1905dSVivek Kasireddy * Returns number of folios pinned, which could be less than @max_folios
343389c1905dSVivek Kasireddy * as it depends on the folio sizes that cover the range [start, end].
343489c1905dSVivek Kasireddy * If no folios were pinned, it returns -errno.
343589c1905dSVivek Kasireddy */
memfd_pin_folios(struct file * memfd,loff_t start,loff_t end,struct folio ** folios,unsigned int max_folios,pgoff_t * offset)343689c1905dSVivek Kasireddy long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
343789c1905dSVivek Kasireddy struct folio **folios, unsigned int max_folios,
343889c1905dSVivek Kasireddy pgoff_t *offset)
343989c1905dSVivek Kasireddy {
344089c1905dSVivek Kasireddy unsigned int flags, nr_folios, nr_found;
344189c1905dSVivek Kasireddy unsigned int i, pgshift = PAGE_SHIFT;
344230f62b92SVishal Moola (Oracle) pgoff_t start_idx, end_idx;
344389c1905dSVivek Kasireddy struct folio *folio = NULL;
344489c1905dSVivek Kasireddy struct folio_batch fbatch;
3445dc677b5fSSteve Sistare struct hstate *h;
344689c1905dSVivek Kasireddy long ret = -EINVAL;
344789c1905dSVivek Kasireddy
344889c1905dSVivek Kasireddy if (start < 0 || start > end || !max_folios)
344989c1905dSVivek Kasireddy return -EINVAL;
345089c1905dSVivek Kasireddy
345189c1905dSVivek Kasireddy if (!memfd)
345289c1905dSVivek Kasireddy return -EINVAL;
345389c1905dSVivek Kasireddy
345489c1905dSVivek Kasireddy if (!shmem_file(memfd) && !is_file_hugepages(memfd))
345589c1905dSVivek Kasireddy return -EINVAL;
345689c1905dSVivek Kasireddy
345789c1905dSVivek Kasireddy if (end >= i_size_read(file_inode(memfd)))
345889c1905dSVivek Kasireddy return -EINVAL;
345989c1905dSVivek Kasireddy
346089c1905dSVivek Kasireddy if (is_file_hugepages(memfd)) {
346189c1905dSVivek Kasireddy h = hstate_file(memfd);
346289c1905dSVivek Kasireddy pgshift = huge_page_shift(h);
346389c1905dSVivek Kasireddy }
346489c1905dSVivek Kasireddy
346589c1905dSVivek Kasireddy flags = memalloc_pin_save();
346689c1905dSVivek Kasireddy do {
346789c1905dSVivek Kasireddy nr_folios = 0;
346889c1905dSVivek Kasireddy start_idx = start >> pgshift;
346989c1905dSVivek Kasireddy end_idx = end >> pgshift;
347089c1905dSVivek Kasireddy if (is_file_hugepages(memfd)) {
347189c1905dSVivek Kasireddy start_idx <<= huge_page_order(h);
347289c1905dSVivek Kasireddy end_idx <<= huge_page_order(h);
347389c1905dSVivek Kasireddy }
347489c1905dSVivek Kasireddy
347589c1905dSVivek Kasireddy folio_batch_init(&fbatch);
347689c1905dSVivek Kasireddy while (start_idx <= end_idx && nr_folios < max_folios) {
347789c1905dSVivek Kasireddy /*
347889c1905dSVivek Kasireddy * In most cases, we should be able to find the folios
347989c1905dSVivek Kasireddy * in the page cache. If we cannot find them for some
348089c1905dSVivek Kasireddy * reason, we try to allocate them and add them to the
348189c1905dSVivek Kasireddy * page cache.
348289c1905dSVivek Kasireddy */
348389c1905dSVivek Kasireddy nr_found = filemap_get_folios_contig(memfd->f_mapping,
348489c1905dSVivek Kasireddy &start_idx,
348589c1905dSVivek Kasireddy end_idx,
348689c1905dSVivek Kasireddy &fbatch);
348789c1905dSVivek Kasireddy if (folio) {
348889c1905dSVivek Kasireddy folio_put(folio);
348989c1905dSVivek Kasireddy folio = NULL;
349089c1905dSVivek Kasireddy }
349189c1905dSVivek Kasireddy
349289c1905dSVivek Kasireddy for (i = 0; i < nr_found; i++) {
3493fe488d34SVishal Moola (Oracle) folio = fbatch.folios[i];
349489c1905dSVivek Kasireddy
349589c1905dSVivek Kasireddy if (try_grab_folio(folio, 1, FOLL_PIN)) {
349689c1905dSVivek Kasireddy folio_batch_release(&fbatch);
349789c1905dSVivek Kasireddy ret = -EINVAL;
349889c1905dSVivek Kasireddy goto err;
349989c1905dSVivek Kasireddy }
350089c1905dSVivek Kasireddy
350189c1905dSVivek Kasireddy if (nr_folios == 0)
350289c1905dSVivek Kasireddy *offset = offset_in_folio(folio, start);
350389c1905dSVivek Kasireddy
350489c1905dSVivek Kasireddy folios[nr_folios] = folio;
350589c1905dSVivek Kasireddy if (++nr_folios == max_folios)
350689c1905dSVivek Kasireddy break;
350789c1905dSVivek Kasireddy }
350889c1905dSVivek Kasireddy
350989c1905dSVivek Kasireddy folio = NULL;
351089c1905dSVivek Kasireddy folio_batch_release(&fbatch);
351189c1905dSVivek Kasireddy if (!nr_found) {
351289c1905dSVivek Kasireddy folio = memfd_alloc_folio(memfd, start_idx);
351389c1905dSVivek Kasireddy if (IS_ERR(folio)) {
351489c1905dSVivek Kasireddy ret = PTR_ERR(folio);
351589c1905dSVivek Kasireddy if (ret != -EEXIST)
351689c1905dSVivek Kasireddy goto err;
3517ce645b9fSSteve Sistare folio = NULL;
351889c1905dSVivek Kasireddy }
351989c1905dSVivek Kasireddy }
352089c1905dSVivek Kasireddy }
352189c1905dSVivek Kasireddy
352289c1905dSVivek Kasireddy ret = check_and_migrate_movable_folios(nr_folios, folios);
352389c1905dSVivek Kasireddy } while (ret == -EAGAIN);
352489c1905dSVivek Kasireddy
352589c1905dSVivek Kasireddy memalloc_pin_restore(flags);
352689c1905dSVivek Kasireddy return ret ? ret : nr_folios;
352789c1905dSVivek Kasireddy err:
352889c1905dSVivek Kasireddy memalloc_pin_restore(flags);
352989c1905dSVivek Kasireddy unpin_folios(folios, nr_folios);
353089c1905dSVivek Kasireddy
353189c1905dSVivek Kasireddy return ret;
353289c1905dSVivek Kasireddy }
353389c1905dSVivek Kasireddy EXPORT_SYMBOL_GPL(memfd_pin_folios);
3534a2ad1b81SSteve Sistare
3535a2ad1b81SSteve Sistare /**
3536a2ad1b81SSteve Sistare * folio_add_pins() - add pins to an already-pinned folio
3537a2ad1b81SSteve Sistare * @folio: the folio to add more pins to
3538a2ad1b81SSteve Sistare * @pins: number of pins to add
3539a2ad1b81SSteve Sistare *
3540a2ad1b81SSteve Sistare * Try to add more pins to an already-pinned folio. The semantics
3541a2ad1b81SSteve Sistare * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot
3542a2ad1b81SSteve Sistare * be changed.
3543a2ad1b81SSteve Sistare *
3544a2ad1b81SSteve Sistare * This function is helpful when having obtained a pin on a large folio
3545a2ad1b81SSteve Sistare * using memfd_pin_folios(), but wanting to logically unpin parts
3546a2ad1b81SSteve Sistare * (e.g., individual pages) of the folio later, for example, using
3547a2ad1b81SSteve Sistare * unpin_user_page_range_dirty_lock().
3548a2ad1b81SSteve Sistare *
3549a2ad1b81SSteve Sistare * This is not the right interface to initially pin a folio.
3550a2ad1b81SSteve Sistare */
folio_add_pins(struct folio * folio,unsigned int pins)3551a2ad1b81SSteve Sistare int folio_add_pins(struct folio *folio, unsigned int pins)
3552a2ad1b81SSteve Sistare {
3553a2ad1b81SSteve Sistare VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));
3554a2ad1b81SSteve Sistare
3555a2ad1b81SSteve Sistare return try_grab_folio(folio, pins, FOLL_PIN);
3556a2ad1b81SSteve Sistare }
3557a2ad1b81SSteve Sistare EXPORT_SYMBOL_GPL(folio_add_pins);
3558