xref: /linux/mm/gup.c (revision 0fc8f6200d2313278fbf4539bbab74677c685531)
1457c8996SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
24bbd4c77SKirill A. Shutemov #include <linux/kernel.h>
34bbd4c77SKirill A. Shutemov #include <linux/errno.h>
44bbd4c77SKirill A. Shutemov #include <linux/err.h>
54bbd4c77SKirill A. Shutemov #include <linux/spinlock.h>
64bbd4c77SKirill A. Shutemov 
74bbd4c77SKirill A. Shutemov #include <linux/mm.h>
889c1905dSVivek Kasireddy #include <linux/memfd.h>
93565fce3SDan Williams #include <linux/memremap.h>
104bbd4c77SKirill A. Shutemov #include <linux/pagemap.h>
114bbd4c77SKirill A. Shutemov #include <linux/rmap.h>
124bbd4c77SKirill A. Shutemov #include <linux/swap.h>
134bbd4c77SKirill A. Shutemov #include <linux/swapops.h>
141507f512SMike Rapoport #include <linux/secretmem.h>
154bbd4c77SKirill A. Shutemov 
16174cd4b1SIngo Molnar #include <linux/sched/signal.h>
172667f50eSSteve Capper #include <linux/rwsem.h>
18f30c59e9SAneesh Kumar K.V #include <linux/hugetlb.h>
199a4e9f3bSAneesh Kumar K.V #include <linux/migrate.h>
209a4e9f3bSAneesh Kumar K.V #include <linux/mm_inline.h>
21*4e1d77a8STal Zussman #include <linux/folio_batch.h>
229a4e9f3bSAneesh Kumar K.V #include <linux/sched/mm.h>
23a6e79df9SLorenzo Stoakes #include <linux/shmem_fs.h>
241027e443SKirill A. Shutemov 
2533a709b2SDave Hansen #include <asm/mmu_context.h>
261027e443SKirill A. Shutemov #include <asm/tlbflush.h>
272667f50eSSteve Capper 
284bbd4c77SKirill A. Shutemov #include "internal.h"
297d0f0f06SKairui Song #include "swap.h"
304bbd4c77SKirill A. Shutemov 
sanity_check_pinned_pages(struct page ** pages,unsigned long npages)31b6a2619cSDavid Hildenbrand static inline void sanity_check_pinned_pages(struct page **pages,
32b6a2619cSDavid Hildenbrand 					     unsigned long npages)
33b6a2619cSDavid Hildenbrand {
34b6a2619cSDavid Hildenbrand 	if (!IS_ENABLED(CONFIG_DEBUG_VM))
35b6a2619cSDavid Hildenbrand 		return;
36b6a2619cSDavid Hildenbrand 
37b6a2619cSDavid Hildenbrand 	/*
38b6a2619cSDavid Hildenbrand 	 * We only pin anonymous pages if they are exclusive. Once pinned, we
39b6a2619cSDavid Hildenbrand 	 * can no longer turn them possibly shared and PageAnonExclusive() will
40b6a2619cSDavid Hildenbrand 	 * stick around until the page is freed.
41b6a2619cSDavid Hildenbrand 	 *
42b6a2619cSDavid Hildenbrand 	 * We'd like to verify that our pinned anonymous pages are still mapped
43b6a2619cSDavid Hildenbrand 	 * exclusively. The issue with anon THP is that we don't know how
44b6a2619cSDavid Hildenbrand 	 * they are/were mapped when pinning them. However, for anon
45b6a2619cSDavid Hildenbrand 	 * THP we can assume that either the given page (PTE-mapped THP) or
46b6a2619cSDavid Hildenbrand 	 * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
47b6a2619cSDavid Hildenbrand 	 * neither is the case, there is certainly something wrong.
48b6a2619cSDavid Hildenbrand 	 */
49b6a2619cSDavid Hildenbrand 	for (; npages; npages--, pages++) {
50b6a2619cSDavid Hildenbrand 		struct page *page = *pages;
51a1268be2SJohn Hubbard 		struct folio *folio;
52a1268be2SJohn Hubbard 
53a1268be2SJohn Hubbard 		if (!page)
54a1268be2SJohn Hubbard 			continue;
55a1268be2SJohn Hubbard 
56a1268be2SJohn Hubbard 		folio = page_folio(page);
57b6a2619cSDavid Hildenbrand 
58c8070b78SDavid Howells 		if (is_zero_page(page) ||
59c8070b78SDavid Howells 		    !folio_test_anon(folio))
60b6a2619cSDavid Hildenbrand 			continue;
61b6a2619cSDavid Hildenbrand 		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
62792b429dSDavid Hildenbrand 			VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio);
63b6a2619cSDavid Hildenbrand 		else
64b6a2619cSDavid Hildenbrand 			/* Either a PTE-mapped or a PMD-mapped THP. */
65792b429dSDavid Hildenbrand 			VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) &&
66b6a2619cSDavid Hildenbrand 					     !PageAnonExclusive(page), page);
67b6a2619cSDavid Hildenbrand 	}
68b6a2619cSDavid Hildenbrand }
69b6a2619cSDavid Hildenbrand 
70c24d3732SJann Horn /*
71ece1ed7bSMatthew Wilcox (Oracle)  * Return the folio with ref appropriately incremented,
72cd1adf1bSLinus Torvalds  * or NULL if that failed.
73a707cdd5SJohn Hubbard  */
try_get_folio(struct page * page,int refs)74ece1ed7bSMatthew Wilcox (Oracle) static inline struct folio *try_get_folio(struct page *page, int refs)
75a707cdd5SJohn Hubbard {
76ece1ed7bSMatthew Wilcox (Oracle) 	struct folio *folio;
77a707cdd5SJohn Hubbard 
7859409373SMatthew Wilcox (Oracle) retry:
79ece1ed7bSMatthew Wilcox (Oracle) 	folio = page_folio(page);
80ece1ed7bSMatthew Wilcox (Oracle) 	if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
81a707cdd5SJohn Hubbard 		return NULL;
82fa2690afSYang Shi 	if (unlikely(!folio_ref_try_add(folio, refs)))
83a707cdd5SJohn Hubbard 		return NULL;
84c24d3732SJann Horn 
85c24d3732SJann Horn 	/*
86ece1ed7bSMatthew Wilcox (Oracle) 	 * At this point we have a stable reference to the folio; but it
87ece1ed7bSMatthew Wilcox (Oracle) 	 * could be that between calling page_folio() and the refcount
88ece1ed7bSMatthew Wilcox (Oracle) 	 * increment, the folio was split, in which case we'd end up
89ece1ed7bSMatthew Wilcox (Oracle) 	 * holding a reference on a folio that has nothing to do with the page
90c24d3732SJann Horn 	 * we were given anymore.
91ece1ed7bSMatthew Wilcox (Oracle) 	 * So now that the folio is stable, recheck that the page still
92ece1ed7bSMatthew Wilcox (Oracle) 	 * belongs to this folio.
93c24d3732SJann Horn 	 */
94ece1ed7bSMatthew Wilcox (Oracle) 	if (unlikely(page_folio(page) != folio)) {
95ece1ed7bSMatthew Wilcox (Oracle) 		folio_put_refs(folio, refs);
9659409373SMatthew Wilcox (Oracle) 		goto retry;
97c24d3732SJann Horn 	}
98c24d3732SJann Horn 
99ece1ed7bSMatthew Wilcox (Oracle) 	return folio;
100a707cdd5SJohn Hubbard }
101a707cdd5SJohn Hubbard 
gup_put_folio(struct folio * folio,int refs,unsigned int flags)102d8ddc099SMatthew Wilcox (Oracle) static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
1034509b42cSJason Gunthorpe {
1044509b42cSJason Gunthorpe 	if (flags & FOLL_PIN) {
105c8070b78SDavid Howells 		if (is_zero_folio(folio))
106c8070b78SDavid Howells 			return;
107d8ddc099SMatthew Wilcox (Oracle) 		node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
10831a31da8SDavid Hildenbrand 		if (folio_has_pincount(folio))
10994688e8eSMatthew Wilcox (Oracle) 			atomic_sub(refs, &folio->_pincount);
1104509b42cSJason Gunthorpe 		else
1114509b42cSJason Gunthorpe 			refs *= GUP_PIN_COUNTING_BIAS;
1124509b42cSJason Gunthorpe 	}
1134509b42cSJason Gunthorpe 
114d8ddc099SMatthew Wilcox (Oracle) 	folio_put_refs(folio, refs);
1154509b42cSJason Gunthorpe }
1164509b42cSJason Gunthorpe 
1173faa52c0SJohn Hubbard /**
118f442fa61SYang Shi  * try_grab_folio() - add a folio's refcount by a flag-dependent amount
119f442fa61SYang Shi  * @folio:    pointer to folio to be grabbed
120f442fa61SYang Shi  * @refs:     the value to (effectively) add to the folio's refcount
121f442fa61SYang Shi  * @flags:    gup flags: these are the FOLL_* flag values
1223faa52c0SJohn Hubbard  *
1233faa52c0SJohn Hubbard  * This might not do anything at all, depending on the flags argument.
1243faa52c0SJohn Hubbard  *
1253faa52c0SJohn Hubbard  * "grab" names in this file mean, "look at flags to decide whether to use
126f442fa61SYang Shi  * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
1273faa52c0SJohn Hubbard  *
1283faa52c0SJohn Hubbard  * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
129f442fa61SYang Shi  * time.
1303faa52c0SJohn Hubbard  *
1310f089235SLogan Gunthorpe  * Return: 0 for success, or if no action was required (if neither FOLL_PIN
1320f089235SLogan Gunthorpe  * nor FOLL_GET was set, nothing is done). A negative error code for failure:
1330f089235SLogan Gunthorpe  *
134f442fa61SYang Shi  *   -ENOMEM		FOLL_GET or FOLL_PIN was set, but the folio could not
1350f089235SLogan Gunthorpe  *			be grabbed.
136f442fa61SYang Shi  *
137f442fa61SYang Shi  * It is called when we have a stable reference for the folio, typically in
138f442fa61SYang Shi  * GUP slow path.
1393faa52c0SJohn Hubbard  */
try_grab_folio(struct folio * folio,int refs,unsigned int flags)140f442fa61SYang Shi int __must_check try_grab_folio(struct folio *folio, int refs,
141f442fa61SYang Shi 				unsigned int flags)
1423faa52c0SJohn Hubbard {
1435fec0719SMatthew Wilcox (Oracle) 	if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
1440f089235SLogan Gunthorpe 		return -ENOMEM;
145c36c04c2SJohn Hubbard 
14688df6ab2SMatthew Wilcox (Oracle) 	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio)))
1474003f107SLogan Gunthorpe 		return -EREMOTEIO;
148c36c04c2SJohn Hubbard 
1495fec0719SMatthew Wilcox (Oracle) 	if (flags & FOLL_GET)
150f442fa61SYang Shi 		folio_ref_add(folio, refs);
1515fec0719SMatthew Wilcox (Oracle) 	else if (flags & FOLL_PIN) {
152c36c04c2SJohn Hubbard 		/*
153c8070b78SDavid Howells 		 * Don't take a pin on the zero page - it's not going anywhere
154c8070b78SDavid Howells 		 * and it is used in a *lot* of places.
155c8070b78SDavid Howells 		 */
156f442fa61SYang Shi 		if (is_zero_folio(folio))
157c8070b78SDavid Howells 			return 0;
158c8070b78SDavid Howells 
159c8070b78SDavid Howells 		/*
160f442fa61SYang Shi 		 * Increment the normal page refcount field at least once,
16178d9d6ceSMatthew Wilcox (Oracle) 		 * so that the page really is pinned.
162c36c04c2SJohn Hubbard 		 */
16331a31da8SDavid Hildenbrand 		if (folio_has_pincount(folio)) {
164f442fa61SYang Shi 			folio_ref_add(folio, refs);
165f442fa61SYang Shi 			atomic_add(refs, &folio->_pincount);
1668ea2979cSMatthew Wilcox (Oracle) 		} else {
167f442fa61SYang Shi 			folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS);
1688ea2979cSMatthew Wilcox (Oracle) 		}
169c36c04c2SJohn Hubbard 
170f442fa61SYang Shi 		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
171c36c04c2SJohn Hubbard 	}
172c36c04c2SJohn Hubbard 
1730f089235SLogan Gunthorpe 	return 0;
1743faa52c0SJohn Hubbard }
1753faa52c0SJohn Hubbard 
1763faa52c0SJohn Hubbard /**
1773faa52c0SJohn Hubbard  * unpin_user_page() - release a dma-pinned page
1783faa52c0SJohn Hubbard  * @page:            pointer to page to be released
1793faa52c0SJohn Hubbard  *
1803faa52c0SJohn Hubbard  * Pages that were pinned via pin_user_pages*() must be released via either
1813faa52c0SJohn Hubbard  * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
1823faa52c0SJohn Hubbard  * that such pages can be separately tracked and uniquely handled. In
1833faa52c0SJohn Hubbard  * particular, interactions with RDMA and filesystems need special handling.
1843faa52c0SJohn Hubbard  */
unpin_user_page(struct page * page)1853faa52c0SJohn Hubbard void unpin_user_page(struct page *page)
1863faa52c0SJohn Hubbard {
187b6a2619cSDavid Hildenbrand 	sanity_check_pinned_pages(&page, 1);
188d8ddc099SMatthew Wilcox (Oracle) 	gup_put_folio(page_folio(page), 1, FOLL_PIN);
1893faa52c0SJohn Hubbard }
1903faa52c0SJohn Hubbard EXPORT_SYMBOL(unpin_user_page);
1913faa52c0SJohn Hubbard 
1921101fb8fSDavid Howells /**
1936cc04054SVivek Kasireddy  * unpin_folio() - release a dma-pinned folio
1946cc04054SVivek Kasireddy  * @folio:         pointer to folio to be released
1956cc04054SVivek Kasireddy  *
1966cc04054SVivek Kasireddy  * Folios that were pinned via memfd_pin_folios() or other similar routines
1976cc04054SVivek Kasireddy  * must be released either using unpin_folio() or unpin_folios().
1986cc04054SVivek Kasireddy  */
unpin_folio(struct folio * folio)1996cc04054SVivek Kasireddy void unpin_folio(struct folio *folio)
2006cc04054SVivek Kasireddy {
2016cc04054SVivek Kasireddy 	gup_put_folio(folio, 1, FOLL_PIN);
2026cc04054SVivek Kasireddy }
2036cc04054SVivek Kasireddy EXPORT_SYMBOL_GPL(unpin_folio);
2046cc04054SVivek Kasireddy 
2056cc04054SVivek Kasireddy /**
2061101fb8fSDavid Howells  * folio_add_pin - Try to get an additional pin on a pinned folio
2071101fb8fSDavid Howells  * @folio: The folio to be pinned
2081101fb8fSDavid Howells  *
2091101fb8fSDavid Howells  * Get an additional pin on a folio we already have a pin on.  Makes no change
2101101fb8fSDavid Howells  * if the folio is a zero_page.
2111101fb8fSDavid Howells  */
folio_add_pin(struct folio * folio)2121101fb8fSDavid Howells void folio_add_pin(struct folio *folio)
2131101fb8fSDavid Howells {
2141101fb8fSDavid Howells 	if (is_zero_folio(folio))
2151101fb8fSDavid Howells 		return;
2161101fb8fSDavid Howells 
2171101fb8fSDavid Howells 	/*
2181101fb8fSDavid Howells 	 * Similar to try_grab_folio(): be sure to *also* increment the normal
2191101fb8fSDavid Howells 	 * page refcount field at least once, so that the page really is
2201101fb8fSDavid Howells 	 * pinned.
2211101fb8fSDavid Howells 	 */
22231a31da8SDavid Hildenbrand 	if (folio_has_pincount(folio)) {
2231101fb8fSDavid Howells 		WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
2241101fb8fSDavid Howells 		folio_ref_inc(folio);
2251101fb8fSDavid Howells 		atomic_inc(&folio->_pincount);
2261101fb8fSDavid Howells 	} else {
2271101fb8fSDavid Howells 		WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
2281101fb8fSDavid Howells 		folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
2291101fb8fSDavid Howells 	}
2301101fb8fSDavid Howells }
2311101fb8fSDavid Howells 
gup_folio_range_next(struct page * start,unsigned long npages,unsigned long i,unsigned int * ntails)232659508f9SMatthew Wilcox (Oracle) static inline struct folio *gup_folio_range_next(struct page *start,
2338f39f5fcSMatthew Wilcox (Oracle) 		unsigned long npages, unsigned long i, unsigned int *ntails)
234458a4f78SJoao Martins {
235b5ba761aSDavid Hildenbrand 	struct page *next = start + i;
236659508f9SMatthew Wilcox (Oracle) 	struct folio *folio = page_folio(next);
237458a4f78SJoao Martins 	unsigned int nr = 1;
238458a4f78SJoao Martins 
239659508f9SMatthew Wilcox (Oracle) 	if (folio_test_large(folio))
2404c654229SMatthew Wilcox (Oracle) 		nr = min_t(unsigned int, npages - i,
241659508f9SMatthew Wilcox (Oracle) 			   folio_nr_pages(folio) - folio_page_idx(folio, next));
242458a4f78SJoao Martins 
243458a4f78SJoao Martins 	*ntails = nr;
244659508f9SMatthew Wilcox (Oracle) 	return folio;
245458a4f78SJoao Martins }
246458a4f78SJoao Martins 
/*
 * Find the run of consecutive entries in @list, starting at index @i, that
 * all belong to the same folio as @list[i].
 *
 * @list:   array of page pointers
 * @npages: number of entries in @list
 * @i:      index of the first entry of the run; @list[i] must not be NULL
 * @ntails: set to the length of the run (at least 1)
 *
 * Returns the folio containing @list[i].
 *
 * The run ends at the first following entry that is NULL or that belongs to
 * a different folio. Callers such as unpin_user_pages() accept arrays with
 * NULL entries, so a NULL must never be handed to page_folio() while
 * scanning ahead.
 */
static inline struct folio *gup_folio_next(struct page **list,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct folio *folio = page_folio(list[i]);
	/* Same width as @i and @npages so that the index cannot truncate. */
	unsigned long next;

	for (next = i + 1; next < npages; next++) {
		if (!list[next] || page_folio(list[next]) != folio)
			break;
	}

	*ntails = next - i;
	return folio;
}
2618745d7f6SJoao Martins 
262fc1d8e7cSJohn Hubbard /**
263f1f6a7ddSJohn Hubbard  * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
2642d15eb31Sakpm@linux-foundation.org  * @pages:  array of pages to be maybe marked dirty, and definitely released.
265fc1d8e7cSJohn Hubbard  * @npages: number of pages in the @pages array.
2662d15eb31Sakpm@linux-foundation.org  * @make_dirty: whether to mark the pages dirty
267fc1d8e7cSJohn Hubbard  *
268fc1d8e7cSJohn Hubbard  * "gup-pinned page" refers to a page that has had one of the get_user_pages()
269fc1d8e7cSJohn Hubbard  * variants called on that page.
270fc1d8e7cSJohn Hubbard  *
271fc1d8e7cSJohn Hubbard  * For each page in the @pages array, make that page (or its head page, if a
2722d15eb31Sakpm@linux-foundation.org  * compound page) dirty, if @make_dirty is true, and if the page was previously
273f1f6a7ddSJohn Hubbard  * listed as clean. In any case, releases all pages using unpin_user_page(),
274f1f6a7ddSJohn Hubbard  * possibly via unpin_user_pages(), for the non-dirty case.
275fc1d8e7cSJohn Hubbard  *
276f1f6a7ddSJohn Hubbard  * Please see the unpin_user_page() documentation for details.
277fc1d8e7cSJohn Hubbard  *
2782d15eb31Sakpm@linux-foundation.org  * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
2792d15eb31Sakpm@linux-foundation.org  * required, then the caller should a) verify that this is really correct,
2802d15eb31Sakpm@linux-foundation.org  * because _lock() is usually required, and b) hand code it:
281f1f6a7ddSJohn Hubbard  * set_page_dirty_lock(), unpin_user_page().
282fc1d8e7cSJohn Hubbard  *
283fc1d8e7cSJohn Hubbard  */
unpin_user_pages_dirty_lock(struct page ** pages,unsigned long npages,bool make_dirty)284f1f6a7ddSJohn Hubbard void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
2852d15eb31Sakpm@linux-foundation.org 				 bool make_dirty)
286fc1d8e7cSJohn Hubbard {
28712521c76SMatthew Wilcox (Oracle) 	unsigned long i;
28812521c76SMatthew Wilcox (Oracle) 	struct folio *folio;
28912521c76SMatthew Wilcox (Oracle) 	unsigned int nr;
2902d15eb31Sakpm@linux-foundation.org 
2912d15eb31Sakpm@linux-foundation.org 	if (!make_dirty) {
292f1f6a7ddSJohn Hubbard 		unpin_user_pages(pages, npages);
2932d15eb31Sakpm@linux-foundation.org 		return;
2942d15eb31Sakpm@linux-foundation.org 	}
2952d15eb31Sakpm@linux-foundation.org 
296b6a2619cSDavid Hildenbrand 	sanity_check_pinned_pages(pages, npages);
29712521c76SMatthew Wilcox (Oracle) 	for (i = 0; i < npages; i += nr) {
29812521c76SMatthew Wilcox (Oracle) 		folio = gup_folio_next(pages, npages, i, &nr);
2992d15eb31Sakpm@linux-foundation.org 		/*
3002d15eb31Sakpm@linux-foundation.org 		 * Checking PageDirty at this point may race with
3012d15eb31Sakpm@linux-foundation.org 		 * clear_page_dirty_for_io(), but that's OK. Two key
3022d15eb31Sakpm@linux-foundation.org 		 * cases:
3032d15eb31Sakpm@linux-foundation.org 		 *
3042d15eb31Sakpm@linux-foundation.org 		 * 1) This code sees the page as already dirty, so it
3052d15eb31Sakpm@linux-foundation.org 		 * skips the call to set_page_dirty(). That could happen
3062d15eb31Sakpm@linux-foundation.org 		 * because clear_page_dirty_for_io() called
307a929e0d1SKefeng Wang 		 * folio_mkclean(), followed by set_page_dirty().
3082d15eb31Sakpm@linux-foundation.org 		 * However, now the page is going to get written back,
3092d15eb31Sakpm@linux-foundation.org 		 * which meets the original intention of setting it
3102d15eb31Sakpm@linux-foundation.org 		 * dirty, so all is well: clear_page_dirty_for_io() goes
3112d15eb31Sakpm@linux-foundation.org 		 * on to call TestClearPageDirty(), and write the page
3122d15eb31Sakpm@linux-foundation.org 		 * back.
3132d15eb31Sakpm@linux-foundation.org 		 *
3142d15eb31Sakpm@linux-foundation.org 		 * 2) This code sees the page as clean, so it calls
3152d15eb31Sakpm@linux-foundation.org 		 * set_page_dirty(). The page stays dirty, despite being
3162d15eb31Sakpm@linux-foundation.org 		 * written back, so it gets written back again in the
3172d15eb31Sakpm@linux-foundation.org 		 * next writeback cycle. This is harmless.
3182d15eb31Sakpm@linux-foundation.org 		 */
31912521c76SMatthew Wilcox (Oracle) 		if (!folio_test_dirty(folio)) {
32012521c76SMatthew Wilcox (Oracle) 			folio_lock(folio);
32112521c76SMatthew Wilcox (Oracle) 			folio_mark_dirty(folio);
32212521c76SMatthew Wilcox (Oracle) 			folio_unlock(folio);
32312521c76SMatthew Wilcox (Oracle) 		}
32412521c76SMatthew Wilcox (Oracle) 		gup_put_folio(folio, nr, FOLL_PIN);
3252d15eb31Sakpm@linux-foundation.org 	}
326fc1d8e7cSJohn Hubbard }
327f1f6a7ddSJohn Hubbard EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
328fc1d8e7cSJohn Hubbard 
329fc1d8e7cSJohn Hubbard /**
330458a4f78SJoao Martins  * unpin_user_page_range_dirty_lock() - release and optionally dirty
331458a4f78SJoao Martins  * gup-pinned page range
332458a4f78SJoao Martins  *
333458a4f78SJoao Martins  * @page:  the starting page of a range maybe marked dirty, and definitely released.
334458a4f78SJoao Martins  * @npages: number of consecutive pages to release.
335458a4f78SJoao Martins  * @make_dirty: whether to mark the pages dirty
336458a4f78SJoao Martins  *
337458a4f78SJoao Martins  * "gup-pinned page range" refers to a range of pages that has had one of the
338458a4f78SJoao Martins  * pin_user_pages() variants called on that page.
339458a4f78SJoao Martins  *
340b5ba761aSDavid Hildenbrand  * The page range must be truly physically contiguous: the page range
341b5ba761aSDavid Hildenbrand  * corresponds to a contiguous PFN range and all pages can be iterated
342b5ba761aSDavid Hildenbrand  * naturally.
343b5ba761aSDavid Hildenbrand  *
344458a4f78SJoao Martins  * For the page ranges defined by [page .. page+npages], make that range (or
345458a4f78SJoao Martins  * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
346458a4f78SJoao Martins  * page range was previously listed as clean.
347458a4f78SJoao Martins  *
348458a4f78SJoao Martins  * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
349458a4f78SJoao Martins  * required, then the caller should a) verify that this is really correct,
350458a4f78SJoao Martins  * because _lock() is usually required, and b) hand code it:
351458a4f78SJoao Martins  * set_page_dirty_lock(), unpin_user_page().
352458a4f78SJoao Martins  *
353458a4f78SJoao Martins  */
unpin_user_page_range_dirty_lock(struct page * page,unsigned long npages,bool make_dirty)354458a4f78SJoao Martins void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
355458a4f78SJoao Martins 				      bool make_dirty)
356458a4f78SJoao Martins {
357659508f9SMatthew Wilcox (Oracle) 	unsigned long i;
358659508f9SMatthew Wilcox (Oracle) 	struct folio *folio;
359659508f9SMatthew Wilcox (Oracle) 	unsigned int nr;
360458a4f78SJoao Martins 
361b5ba761aSDavid Hildenbrand 	VM_WARN_ON_ONCE(!page_range_contiguous(page, npages));
362b5ba761aSDavid Hildenbrand 
363659508f9SMatthew Wilcox (Oracle) 	for (i = 0; i < npages; i += nr) {
364659508f9SMatthew Wilcox (Oracle) 		folio = gup_folio_range_next(page, npages, i, &nr);
365659508f9SMatthew Wilcox (Oracle) 		if (make_dirty && !folio_test_dirty(folio)) {
366659508f9SMatthew Wilcox (Oracle) 			folio_lock(folio);
367659508f9SMatthew Wilcox (Oracle) 			folio_mark_dirty(folio);
368659508f9SMatthew Wilcox (Oracle) 			folio_unlock(folio);
369659508f9SMatthew Wilcox (Oracle) 		}
370659508f9SMatthew Wilcox (Oracle) 		gup_put_folio(folio, nr, FOLL_PIN);
371458a4f78SJoao Martins 	}
372458a4f78SJoao Martins }
373458a4f78SJoao Martins EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
374458a4f78SJoao Martins 
gup_fast_unpin_user_pages(struct page ** pages,unsigned long npages)37523babe19SDavid Hildenbrand static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages)
376b6a2619cSDavid Hildenbrand {
377b6a2619cSDavid Hildenbrand 	unsigned long i;
378b6a2619cSDavid Hildenbrand 	struct folio *folio;
379b6a2619cSDavid Hildenbrand 	unsigned int nr;
380b6a2619cSDavid Hildenbrand 
381b6a2619cSDavid Hildenbrand 	/*
382b6a2619cSDavid Hildenbrand 	 * Don't perform any sanity checks because we might have raced with
383b6a2619cSDavid Hildenbrand 	 * fork() and some anonymous pages might now actually be shared --
384b6a2619cSDavid Hildenbrand 	 * which is why we're unpinning after all.
385b6a2619cSDavid Hildenbrand 	 */
386b6a2619cSDavid Hildenbrand 	for (i = 0; i < npages; i += nr) {
387b6a2619cSDavid Hildenbrand 		folio = gup_folio_next(pages, npages, i, &nr);
388b6a2619cSDavid Hildenbrand 		gup_put_folio(folio, nr, FOLL_PIN);
389b6a2619cSDavid Hildenbrand 	}
390b6a2619cSDavid Hildenbrand }
391b6a2619cSDavid Hildenbrand 
392458a4f78SJoao Martins /**
393f1f6a7ddSJohn Hubbard  * unpin_user_pages() - release an array of gup-pinned pages.
394fc1d8e7cSJohn Hubbard  * @pages:  array of pages to be marked dirty and released.
395fc1d8e7cSJohn Hubbard  * @npages: number of pages in the @pages array.
396fc1d8e7cSJohn Hubbard  *
397f1f6a7ddSJohn Hubbard  * For each page in the @pages array, release the page using unpin_user_page().
398fc1d8e7cSJohn Hubbard  *
399f1f6a7ddSJohn Hubbard  * Please see the unpin_user_page() documentation for details.
400fc1d8e7cSJohn Hubbard  */
unpin_user_pages(struct page ** pages,unsigned long npages)401f1f6a7ddSJohn Hubbard void unpin_user_pages(struct page **pages, unsigned long npages)
402fc1d8e7cSJohn Hubbard {
40312521c76SMatthew Wilcox (Oracle) 	unsigned long i;
40412521c76SMatthew Wilcox (Oracle) 	struct folio *folio;
40512521c76SMatthew Wilcox (Oracle) 	unsigned int nr;
406fc1d8e7cSJohn Hubbard 
407fc1d8e7cSJohn Hubbard 	/*
408146608bbSJohn Hubbard 	 * If this WARN_ON() fires, then the system *might* be leaking pages (by
409146608bbSJohn Hubbard 	 * leaving them pinned), but probably not. More likely, gup/pup returned
410146608bbSJohn Hubbard 	 * a hard -ERRNO error to the caller, who erroneously passed it here.
411146608bbSJohn Hubbard 	 */
412146608bbSJohn Hubbard 	if (WARN_ON(IS_ERR_VALUE(npages)))
413146608bbSJohn Hubbard 		return;
41431b912deSJoao Martins 
415b6a2619cSDavid Hildenbrand 	sanity_check_pinned_pages(pages, npages);
41612521c76SMatthew Wilcox (Oracle) 	for (i = 0; i < npages; i += nr) {
417a1268be2SJohn Hubbard 		if (!pages[i]) {
418a1268be2SJohn Hubbard 			nr = 1;
419a1268be2SJohn Hubbard 			continue;
420a1268be2SJohn Hubbard 		}
42112521c76SMatthew Wilcox (Oracle) 		folio = gup_folio_next(pages, npages, i, &nr);
42212521c76SMatthew Wilcox (Oracle) 		gup_put_folio(folio, nr, FOLL_PIN);
423fc1d8e7cSJohn Hubbard 	}
424fc1d8e7cSJohn Hubbard }
425f1f6a7ddSJohn Hubbard EXPORT_SYMBOL(unpin_user_pages);
426fc1d8e7cSJohn Hubbard 
4276cc04054SVivek Kasireddy /**
428d3bfbfb1SKundan Kumar  * unpin_user_folio() - release pages of a folio
429d3bfbfb1SKundan Kumar  * @folio:  pointer to folio to be released
430d3bfbfb1SKundan Kumar  * @npages: number of pages of same folio
431d3bfbfb1SKundan Kumar  *
432d3bfbfb1SKundan Kumar  * Release npages of the folio
433d3bfbfb1SKundan Kumar  */
unpin_user_folio(struct folio * folio,unsigned long npages)434d3bfbfb1SKundan Kumar void unpin_user_folio(struct folio *folio, unsigned long npages)
435d3bfbfb1SKundan Kumar {
436d3bfbfb1SKundan Kumar 	gup_put_folio(folio, npages, FOLL_PIN);
437d3bfbfb1SKundan Kumar }
438d3bfbfb1SKundan Kumar EXPORT_SYMBOL(unpin_user_folio);
439d3bfbfb1SKundan Kumar 
440d3bfbfb1SKundan Kumar /**
4416cc04054SVivek Kasireddy  * unpin_folios() - release an array of gup-pinned folios.
4426cc04054SVivek Kasireddy  * @folios:  array of folios to be marked dirty and released.
4436cc04054SVivek Kasireddy  * @nfolios: number of folios in the @folios array.
4446cc04054SVivek Kasireddy  *
4456cc04054SVivek Kasireddy  * For each folio in the @folios array, release the folio using gup_put_folio.
4466cc04054SVivek Kasireddy  *
4476cc04054SVivek Kasireddy  * Please see the unpin_folio() documentation for details.
4486cc04054SVivek Kasireddy  */
unpin_folios(struct folio ** folios,unsigned long nfolios)4496cc04054SVivek Kasireddy void unpin_folios(struct folio **folios, unsigned long nfolios)
4506cc04054SVivek Kasireddy {
4516cc04054SVivek Kasireddy 	unsigned long i = 0, j;
4526cc04054SVivek Kasireddy 
4536cc04054SVivek Kasireddy 	/*
4546cc04054SVivek Kasireddy 	 * If this WARN_ON() fires, then the system *might* be leaking folios
4556cc04054SVivek Kasireddy 	 * (by leaving them pinned), but probably not. More likely, gup/pup
4566cc04054SVivek Kasireddy 	 * returned a hard -ERRNO error to the caller, who erroneously passed
4576cc04054SVivek Kasireddy 	 * it here.
4586cc04054SVivek Kasireddy 	 */
4596cc04054SVivek Kasireddy 	if (WARN_ON(IS_ERR_VALUE(nfolios)))
4606cc04054SVivek Kasireddy 		return;
4616cc04054SVivek Kasireddy 
4626cc04054SVivek Kasireddy 	while (i < nfolios) {
4636cc04054SVivek Kasireddy 		for (j = i + 1; j < nfolios; j++)
4646cc04054SVivek Kasireddy 			if (folios[i] != folios[j])
4656cc04054SVivek Kasireddy 				break;
4666cc04054SVivek Kasireddy 
4676cc04054SVivek Kasireddy 		if (folios[i])
4686cc04054SVivek Kasireddy 			gup_put_folio(folios[i], j - i, FOLL_PIN);
4696cc04054SVivek Kasireddy 		i = j;
4706cc04054SVivek Kasireddy 	}
4716cc04054SVivek Kasireddy }
4726cc04054SVivek Kasireddy EXPORT_SYMBOL_GPL(unpin_folios);
4736cc04054SVivek Kasireddy 
474a458b76aSAndrea Arcangeli /*
475a458b76aSAndrea Arcangeli  * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
476a458b76aSAndrea Arcangeli  * lifecycle.  Avoid setting the bit unless necessary, or it might cause write
477a458b76aSAndrea Arcangeli  * cache bouncing on large SMP machines for concurrent pinned gups.
478a458b76aSAndrea Arcangeli  */
mm_set_has_pinned_flag(struct mm_struct * mm)47912e423baSLorenzo Stoakes static inline void mm_set_has_pinned_flag(struct mm_struct *mm)
480a458b76aSAndrea Arcangeli {
48112e423baSLorenzo Stoakes 	if (!mm_flags_test(MMF_HAS_PINNED, mm))
48212e423baSLorenzo Stoakes 		mm_flags_set(MMF_HAS_PINNED, mm);
483a458b76aSAndrea Arcangeli }
484a458b76aSAndrea Arcangeli 
485050a9adcSChristoph Hellwig #ifdef CONFIG_MMU
486a12083d7SPeter Xu 
4878268614bSChristophe Leroy #ifdef CONFIG_HAVE_GUP_FAST
488f442fa61SYang Shi /**
489f442fa61SYang Shi  * try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
490f442fa61SYang Shi  * @page:  pointer to page to be grabbed
491f442fa61SYang Shi  * @refs:  the value to (effectively) add to the folio's refcount
492f442fa61SYang Shi  * @flags: gup flags: these are the FOLL_* flag values.
493f442fa61SYang Shi  *
494f442fa61SYang Shi  * "grab" names in this file mean, "look at flags to decide whether to use
495f442fa61SYang Shi  * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
496f442fa61SYang Shi  *
497f442fa61SYang Shi  * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
498f442fa61SYang Shi  * same time. (That's true throughout the get_user_pages*() and
499f442fa61SYang Shi  * pin_user_pages*() APIs.) Cases:
500f442fa61SYang Shi  *
501f442fa61SYang Shi  *    FOLL_GET: folio's refcount will be incremented by @refs.
502f442fa61SYang Shi  *
503f442fa61SYang Shi  *    FOLL_PIN on large folios: folio's refcount will be incremented by
504f442fa61SYang Shi  *    @refs, and its pincount will be incremented by @refs.
505f442fa61SYang Shi  *
506f442fa61SYang Shi  *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
507f442fa61SYang Shi  *    @refs * GUP_PIN_COUNTING_BIAS.
508f442fa61SYang Shi  *
509f442fa61SYang Shi  * Return: The folio containing @page (with refcount appropriately
510f442fa61SYang Shi  * incremented) for success, or NULL upon failure. If neither FOLL_GET
511f442fa61SYang Shi  * nor FOLL_PIN was set, that's considered failure, and furthermore,
512f442fa61SYang Shi  * a likely bug in the caller, so a warning is also emitted.
513f442fa61SYang Shi  *
514f442fa61SYang Shi  * It uses add ref unless zero to elevate the folio refcount and must be called
515f442fa61SYang Shi  * in fast path only.
516f442fa61SYang Shi  */
try_grab_folio_fast(struct page * page,int refs,unsigned int flags)517f442fa61SYang Shi static struct folio *try_grab_folio_fast(struct page *page, int refs,
518f442fa61SYang Shi 					 unsigned int flags)
519f442fa61SYang Shi {
520f442fa61SYang Shi 	struct folio *folio;
521f442fa61SYang Shi 
522f442fa61SYang Shi 	/* Raise warn if it is not called in fast GUP */
523f442fa61SYang Shi 	VM_WARN_ON_ONCE(!irqs_disabled());
524f442fa61SYang Shi 
525f442fa61SYang Shi 	if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
526f442fa61SYang Shi 		return NULL;
527f442fa61SYang Shi 
528f442fa61SYang Shi 	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
529f442fa61SYang Shi 		return NULL;
530f442fa61SYang Shi 
531f442fa61SYang Shi 	if (flags & FOLL_GET)
532f442fa61SYang Shi 		return try_get_folio(page, refs);
533f442fa61SYang Shi 
534f442fa61SYang Shi 	/* FOLL_PIN is set */
535f442fa61SYang Shi 
536f442fa61SYang Shi 	/*
537f442fa61SYang Shi 	 * Don't take a pin on the zero page - it's not going anywhere
538f442fa61SYang Shi 	 * and it is used in a *lot* of places.
539f442fa61SYang Shi 	 */
540f442fa61SYang Shi 	if (is_zero_page(page))
541f442fa61SYang Shi 		return page_folio(page);
542f442fa61SYang Shi 
543f442fa61SYang Shi 	folio = try_get_folio(page, refs);
544f442fa61SYang Shi 	if (!folio)
545f442fa61SYang Shi 		return NULL;
546f442fa61SYang Shi 
547f442fa61SYang Shi 	/*
548f442fa61SYang Shi 	 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
549f442fa61SYang Shi 	 * right zone, so fail and let the caller fall back to the slow
550f442fa61SYang Shi 	 * path.
551f442fa61SYang Shi 	 */
552f442fa61SYang Shi 	if (unlikely((flags & FOLL_LONGTERM) &&
553f442fa61SYang Shi 		     !folio_is_longterm_pinnable(folio))) {
554f442fa61SYang Shi 		folio_put_refs(folio, refs);
555f442fa61SYang Shi 		return NULL;
556f442fa61SYang Shi 	}
557f442fa61SYang Shi 
558f442fa61SYang Shi 	/*
559f442fa61SYang Shi 	 * When pinning a large folio, use an exact count to track it.
560f442fa61SYang Shi 	 *
561f442fa61SYang Shi 	 * However, be sure to *also* increment the normal folio
562f442fa61SYang Shi 	 * refcount field at least once, so that the folio really
563f442fa61SYang Shi 	 * is pinned.  That's why the refcount from the earlier
564f442fa61SYang Shi 	 * try_get_folio() is left intact.
565f442fa61SYang Shi 	 */
56631a31da8SDavid Hildenbrand 	if (folio_has_pincount(folio))
567f442fa61SYang Shi 		atomic_add(refs, &folio->_pincount);
568f442fa61SYang Shi 	else
569f442fa61SYang Shi 		folio_ref_add(folio,
570f442fa61SYang Shi 				refs * (GUP_PIN_COUNTING_BIAS - 1));
571f442fa61SYang Shi 	/*
572f442fa61SYang Shi 	 * Adjust the pincount before re-checking the PTE for changes.
573f442fa61SYang Shi 	 * This is essentially a smp_mb() and is paired with a memory
574f442fa61SYang Shi 	 * barrier in folio_try_share_anon_rmap_*().
575f442fa61SYang Shi 	 */
576f442fa61SYang Shi 	smp_mb__after_atomic();
577f442fa61SYang Shi 
578f442fa61SYang Shi 	node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
579f442fa61SYang Shi 
580f442fa61SYang Shi 	return folio;
581f442fa61SYang Shi }
5828268614bSChristophe Leroy #endif	/* CONFIG_HAVE_GUP_FAST */
583a12083d7SPeter Xu 
584052ccfbcSGuillaume Morin /* Common code for can_follow_write_* */
can_follow_write_common(struct page * page,struct vm_area_struct * vma,unsigned int flags)585052ccfbcSGuillaume Morin static inline bool can_follow_write_common(struct page *page,
586052ccfbcSGuillaume Morin 		struct vm_area_struct *vma, unsigned int flags)
587052ccfbcSGuillaume Morin {
588052ccfbcSGuillaume Morin 	/* Maybe FOLL_FORCE is set to override it? */
589052ccfbcSGuillaume Morin 	if (!(flags & FOLL_FORCE))
590052ccfbcSGuillaume Morin 		return false;
591052ccfbcSGuillaume Morin 
592052ccfbcSGuillaume Morin 	/* But FOLL_FORCE has no effect on shared mappings */
593052ccfbcSGuillaume Morin 	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
594052ccfbcSGuillaume Morin 		return false;
595052ccfbcSGuillaume Morin 
596052ccfbcSGuillaume Morin 	/* ... or read-only private ones */
597052ccfbcSGuillaume Morin 	if (!(vma->vm_flags & VM_MAYWRITE))
598052ccfbcSGuillaume Morin 		return false;
599052ccfbcSGuillaume Morin 
600052ccfbcSGuillaume Morin 	/* ... or already writable ones that just need to take a write fault */
601052ccfbcSGuillaume Morin 	if (vma->vm_flags & VM_WRITE)
602052ccfbcSGuillaume Morin 		return false;
603052ccfbcSGuillaume Morin 
604052ccfbcSGuillaume Morin 	/*
605052ccfbcSGuillaume Morin 	 * See can_change_pte_writable(): we broke COW and could map the page
606052ccfbcSGuillaume Morin 	 * writable if we have an exclusive anonymous page ...
607052ccfbcSGuillaume Morin 	 */
608052ccfbcSGuillaume Morin 	return page && PageAnon(page) && PageAnonExclusive(page);
609052ccfbcSGuillaume Morin }
610052ccfbcSGuillaume Morin 
no_page_table(struct vm_area_struct * vma,unsigned int flags,unsigned long address)61169e68b4fSKirill A. Shutemov static struct page *no_page_table(struct vm_area_struct *vma,
612878b0c45SPeter Xu 				  unsigned int flags, unsigned long address)
6134bbd4c77SKirill A. Shutemov {
614878b0c45SPeter Xu 	if (!(flags & FOLL_DUMP))
615878b0c45SPeter Xu 		return NULL;
616878b0c45SPeter Xu 
6174bbd4c77SKirill A. Shutemov 	/*
618878b0c45SPeter Xu 	 * When core dumping, we don't want to allocate unnecessary pages or
61969e68b4fSKirill A. Shutemov 	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
62069e68b4fSKirill A. Shutemov 	 * then get_dump_page() will return NULL to leave a hole in the dump.
62169e68b4fSKirill A. Shutemov 	 * But we can only make this optimization where a hole would surely
62269e68b4fSKirill A. Shutemov 	 * be zero-filled if handle_mm_fault() actually did handle it.
6234bbd4c77SKirill A. Shutemov 	 */
624878b0c45SPeter Xu 	if (is_vm_hugetlb_page(vma)) {
625878b0c45SPeter Xu 		struct hstate *h = hstate_vma(vma);
626878b0c45SPeter Xu 
627878b0c45SPeter Xu 		if (!hugetlbfs_pagecache_present(h, vma, address))
62869e68b4fSKirill A. Shutemov 			return ERR_PTR(-EFAULT);
629878b0c45SPeter Xu 	} else if ((vma_is_anonymous(vma) || !vma->vm_ops->fault)) {
630878b0c45SPeter Xu 		return ERR_PTR(-EFAULT);
631878b0c45SPeter Xu 	}
632878b0c45SPeter Xu 
63369e68b4fSKirill A. Shutemov 	return NULL;
6344bbd4c77SKirill A. Shutemov }
63569e68b4fSKirill A. Shutemov 
6361b167618SPeter Xu #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
637052ccfbcSGuillaume Morin /* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */
can_follow_write_pud(pud_t pud,struct page * page,struct vm_area_struct * vma,unsigned int flags)638052ccfbcSGuillaume Morin static inline bool can_follow_write_pud(pud_t pud, struct page *page,
639052ccfbcSGuillaume Morin 					struct vm_area_struct *vma,
640052ccfbcSGuillaume Morin 					unsigned int flags)
641052ccfbcSGuillaume Morin {
642052ccfbcSGuillaume Morin 	/* If the pud is writable, we can write to the page. */
643052ccfbcSGuillaume Morin 	if (pud_write(pud))
644052ccfbcSGuillaume Morin 		return true;
645052ccfbcSGuillaume Morin 
646052ccfbcSGuillaume Morin 	return can_follow_write_common(page, vma, flags);
647052ccfbcSGuillaume Morin }
648052ccfbcSGuillaume Morin 
/*
 * Resolve @addr through a leaf PUD and grab the backing page.  The PUD lock
 * must be held by the caller (asserted below).
 *
 * Returns the page for @addr with *@page_mask set to HPAGE_PUD_NR - 1, NULL
 * if the entry is not present or may not be followed for write,
 * ERR_PTR(-EMLINK) if unsharing is required first, or the ERR_PTR() of a
 * try_grab_folio() failure.
 */
static struct page *follow_huge_pud(struct vm_area_struct *vma,
				    unsigned long addr, pud_t *pudp,
				    int flags, unsigned long *page_mask)
{
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud = *pudp;
	unsigned long pfn = pud_pfn(pud);
	struct page *page;
	int err;

	assert_spin_locked(pud_lockptr(mm, pudp));

	if (!pud_present(pud))
		return NULL;

	/* The write check is done against the first page of the mapping. */
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags))
		return NULL;

	/* Advance to the page that actually backs @addr. */
	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
	page = pfn_to_page(pfn);

	if (!pud_write(pud) && gup_must_unshare(vma, flags, page))
		return ERR_PTR(-EMLINK);

	err = try_grab_folio(page_folio(page), 1, flags);
	if (err)
		return ERR_PTR(err);

	*page_mask = HPAGE_PUD_NR - 1;

	return page;
}
6824418c522SPeter Xu 
6834418c522SPeter Xu /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
can_follow_write_pmd(pmd_t pmd,struct page * page,struct vm_area_struct * vma,unsigned int flags)6844418c522SPeter Xu static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
6854418c522SPeter Xu 					struct vm_area_struct *vma,
6864418c522SPeter Xu 					unsigned int flags)
6874418c522SPeter Xu {
6884418c522SPeter Xu 	/* If the pmd is writable, we can write to the page. */
6894418c522SPeter Xu 	if (pmd_write(pmd))
6904418c522SPeter Xu 		return true;
6914418c522SPeter Xu 
692052ccfbcSGuillaume Morin 	if (!can_follow_write_common(page, vma, flags))
6934418c522SPeter Xu 		return false;
6944418c522SPeter Xu 
6954418c522SPeter Xu 	/* ... and a write-fault isn't required for other reasons. */
696f38ee285SBarry Song 	if (pmd_needs_soft_dirty_wp(vma, pmd))
6974418c522SPeter Xu 		return false;
6984418c522SPeter Xu 	return !userfaultfd_huge_pmd_wp(vma, pmd);
6994418c522SPeter Xu }
7004418c522SPeter Xu 
/*
 * Resolve @addr through a leaf PMD and grab the backing page.  The PMD lock
 * must be held by the caller (asserted below); every check operates on the
 * single snapshot of the entry taken on entry.
 *
 * Returns the page for @addr with *@page_mask set to HPAGE_PMD_NR - 1, NULL
 * if the entry may not be followed with @flags (write not permitted, or a
 * protnone entry that may not be followed), ERR_PTR(-EFAULT) for the huge
 * zero page under FOLL_DUMP, ERR_PTR(-EMLINK) if unsharing is required
 * first, or the ERR_PTR() of a try_grab_folio() failure.
 */
static struct page *follow_huge_pmd(struct vm_area_struct *vma,
				    unsigned long addr, pmd_t *pmd,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t pmdval = *pmd;
	struct page *page;
	int ret;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	page = pmd_page(pmdval);
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pmd(pmdval, page, vma, flags))
		return NULL;

	/* Avoid dumping huge zero page */
	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval))
		return ERR_PTR(-EFAULT);

	/*
	 * Test the snapshot rather than re-reading *pmd, so that all checks
	 * in this function look at the same value.
	 */
	if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
		return NULL;

	if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page))
		return ERR_PTR(-EMLINK);

	VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
			     !PageAnonExclusive(page), page);

	ret = try_grab_folio(page_folio(page), 1, flags);
	if (ret)
		return ERR_PTR(ret);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH))
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
#endif	/* CONFIG_TRANSPARENT_HUGEPAGE */

	/* Advance from the page the PMD points at to the one backing @addr. */
	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	*page_mask = HPAGE_PMD_NR - 1;

	return page;
}
7454418c522SPeter Xu 
7461b167618SPeter Xu #else  /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
/*
 * Stub used when CONFIG_PGTABLE_HAS_HUGE_LEAVES is not set: always returns
 * NULL and leaves *@page_mask untouched.
 */
static struct page *follow_huge_pud(struct vm_area_struct *vma,
				    unsigned long addr, pud_t *pudp,
				    int flags, unsigned long *page_mask)
{
	return NULL;
}
7534418c522SPeter Xu 
/*
 * Stub used when CONFIG_PGTABLE_HAS_HUGE_LEAVES is not set: always returns
 * NULL and leaves *@page_mask untouched.
 */
static struct page *follow_huge_pmd(struct vm_area_struct *vma,
				    unsigned long addr, pmd_t *pmd,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	return NULL;
}
7611b167618SPeter Xu #endif	/* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
7621b167618SPeter Xu 
/*
 * Handle a PTE that maps something without a struct page.  With FOLL_TOUCH
 * the entry is marked young (and dirty for FOLL_WRITE) if it is not already.
 *
 * Always returns -EEXIST.
 */
static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	pte_t cur, updated;

	/* Proper page table entry exists, but no corresponding struct page */
	if (!(flags & FOLL_TOUCH))
		return -EEXIST;

	cur = ptep_get(pte);
	updated = cur;
	if (flags & FOLL_WRITE)
		updated = pte_mkdirty(updated);
	updated = pte_mkyoung(updated);

	/* Only write the entry back if marking it changed anything. */
	if (!pte_same(cur, updated)) {
		set_pte_at(vma->vm_mm, address, pte, updated);
		update_mmu_cache(vma, address, pte);
	}

	return -EEXIST;
}
7831027e443SKirill A. Shutemov 
7845535be30SDavid Hildenbrand /* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
can_follow_write_pte(pte_t pte,struct page * page,struct vm_area_struct * vma,unsigned int flags)7855535be30SDavid Hildenbrand static inline bool can_follow_write_pte(pte_t pte, struct page *page,
7865535be30SDavid Hildenbrand 					struct vm_area_struct *vma,
7875535be30SDavid Hildenbrand 					unsigned int flags)
78819be0eafSLinus Torvalds {
7895535be30SDavid Hildenbrand 	/* If the pte is writable, we can write to the page. */
7905535be30SDavid Hildenbrand 	if (pte_write(pte))
7915535be30SDavid Hildenbrand 		return true;
7925535be30SDavid Hildenbrand 
793052ccfbcSGuillaume Morin 	if (!can_follow_write_common(page, vma, flags))
7945535be30SDavid Hildenbrand 		return false;
7955535be30SDavid Hildenbrand 
7965535be30SDavid Hildenbrand 	/* ... and a write-fault isn't required for other reasons. */
797f38ee285SBarry Song 	if (pte_needs_soft_dirty_wp(vma, pte))
7985535be30SDavid Hildenbrand 		return false;
7995535be30SDavid Hildenbrand 	return !userfaultfd_pte_wp(vma, pte);
80019be0eafSLinus Torvalds }
80119be0eafSLinus Torvalds 
/*
 * Look up @address at PTE level below @pmd and, depending on @flags, grab
 * the page that is mapped there.  The PTE is mapped and locked for the
 * duration of the lookup and released on every exit path.
 *
 * Returns the page, NULL if nothing can be followed with @flags, or an
 * ERR_PTR(): -EFAULT for special pages under FOLL_DUMP, the result of
 * follow_pfn_pte() for mappings without a struct page, -EMLINK if unsharing
 * is required first, or the error of try_grab_folio() /
 * arch_make_folio_accessible().
 */
static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct folio *folio;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;
	int err;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!ptep)
		return no_page_table(vma, flags, address);

	pte = ptep_get(ptep);
	if (!pte_present(pte) ||
	    (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags)))
		goto no_page;

	page = vm_normal_page(vma, address, pte);

	/*
	 * We only care about anon pages in can_follow_write_pte().
	 */
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pte(pte, page, vma, flags)) {
		page = NULL;
		goto out;
	}

	if (unlikely(!page)) {
		/* Avoid special (like zero) pages in core dumps */
		if (flags & FOLL_DUMP) {
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		/* Anything but the zero page has no struct page to return. */
		if (!is_zero_pfn(pte_pfn(pte))) {
			err = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(err);
			goto out;
		}

		page = pte_page(pte);
	}
	folio = page_folio(page);

	if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
		page = ERR_PTR(-EMLINK);
		goto out;
	}

	VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
			     !PageAnonExclusive(page), page);

	/* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
	err = try_grab_folio(folio, 1, flags);
	if (unlikely(err)) {
		page = ERR_PTR(err);
		goto out;
	}

	/*
	 * We need to make the page accessible if and only if we are going
	 * to access its content (the FOLL_PIN case).  Please see
	 * Documentation/core-api/pin_user_pages.rst for details.
	 */
	if (flags & FOLL_PIN) {
		err = arch_make_folio_accessible(folio);
		if (err) {
			/* Undo the pin taken just above before failing. */
			unpin_user_page(page);
			page = ERR_PTR(err);
			goto out;
		}
	}

	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !folio_test_dirty(folio))
			folio_mark_dirty(folio);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * folio_mark_accessed().
		 */
		folio_mark_accessed(folio);
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	/* An entry that is not none but cannot be followed yields NULL. */
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags, address);
}
8974bbd4c77SKirill A. Shutemov 
/*
 * PMD level of the follow_page_mask() walk.  Non-leaf entries are handed to
 * follow_page_pte(); leaf entries are re-validated under the PMD lock and
 * then resolved by follow_huge_pmd(), or split first when FOLL_SPLIT_PMD is
 * set on a transparent huge PMD.
 */
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd, pmdval;
	struct page *page;
	spinlock_t *ptl;

	/* First look at the entry without taking the lock. */
	pmd = pmd_offset(pudp, address);
	pmdval = pmdp_get_lockless(pmd);
	if (pmd_none(pmdval) || !pmd_present(pmdval))
		return no_page_table(vma, flags, address);
	if (likely(!pmd_leaf(pmdval)))
		return follow_page_pte(vma, address, pmd, flags);

	if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
		return no_page_table(vma, flags, address);

	/* Leaf entry: take the lock and check again what is there now. */
	ptl = pmd_lock(mm, pmd);
	pmdval = *pmd;
	if (unlikely(!pmd_present(pmdval))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags, address);
	}
	if (unlikely(!pmd_leaf(pmdval))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags);
	}
	if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) {
		spin_unlock(ptl);
		split_huge_pmd(vma, pmd, address);
		/* If pmd was left empty, stuff a page table in there quickly */
		if (pte_alloc(mm, pmd))
			return ERR_PTR(-ENOMEM);
		return follow_page_pte(vma, address, pmd, flags);
	}

	page = follow_huge_pmd(vma, address, pmd, flags, page_mask);
	spin_unlock(ptl);

	return page;
}
94169e68b4fSKirill A. Shutemov 
/*
 * PUD level of the follow_page_mask() walk.  Leaf entries are resolved by
 * follow_huge_pud() under the PUD lock; everything else continues at PMD
 * level.
 */
static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	struct mm_struct *mm = vma->vm_mm;
	pud_t *pudp, pud;
	struct page *page;
	spinlock_t *ptl;

	pudp = pud_offset(p4dp, address);
	pud = pudp_get(pudp);
	if (!pud_present(pud))
		return no_page_table(vma, flags, address);

	if (pud_leaf(pud)) {
		ptl = pud_lock(mm, pudp);
		page = follow_huge_pud(vma, address, pudp, flags, page_mask);
		spin_unlock(ptl);

		/* A NULL result is reported like a missing page table. */
		if (!page)
			page = no_page_table(vma, flags, address);
		return page;
	}

	if (unlikely(pud_bad(pud)))
		return no_page_table(vma, flags, address);

	return follow_pmd_mask(vma, address, pudp, flags, page_mask);
}
969080dbb61SAneesh Kumar K.V 
/*
 * P4D level of the follow_page_mask() walk.  Leaf entries are not expected
 * at this level (enforced at build time); present, sane entries continue at
 * PUD level.
 */
static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	p4d_t *entryp, entry;

	entryp = p4d_offset(pgdp, address);
	entry = p4dp_get(entryp);
	BUILD_BUG_ON(p4d_leaf(entry));

	if (p4d_present(entry) && !p4d_bad(entry))
		return follow_pud_mask(vma, address, entryp, flags, page_mask);

	return no_page_table(vma, flags, address);
}
986080dbb61SAneesh Kumar K.V 
987080dbb61SAneesh Kumar K.V /**
988080dbb61SAneesh Kumar K.V  * follow_page_mask - look up a page descriptor from a user-virtual address
989080dbb61SAneesh Kumar K.V  * @vma: vm_area_struct mapping @address
990080dbb61SAneesh Kumar K.V  * @address: virtual address to look up
991080dbb61SAneesh Kumar K.V  * @flags: flags modifying lookup behaviour
992d3f7922bSAlistair Popple  * @page_mask: a pointer to output page_mask
993080dbb61SAneesh Kumar K.V  *
994080dbb61SAneesh Kumar K.V  * @flags can have FOLL_ flags set, defined in <linux/mm.h>
995080dbb61SAneesh Kumar K.V  *
996a7f22660SDavid Hildenbrand  * When getting an anonymous page and the caller has to trigger unsharing
997a7f22660SDavid Hildenbrand  * of a shared anonymous page first, -EMLINK is returned. The caller should
998a7f22660SDavid Hildenbrand  * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
999a7f22660SDavid Hildenbrand  * relevant with FOLL_PIN and !FOLL_WRITE.
1000a7f22660SDavid Hildenbrand  *
1001d3f7922bSAlistair Popple  * On output, @page_mask is set according to the size of the page.
100278179556SMike Rapoport  *
100378179556SMike Rapoport  * Return: the mapped (struct page *), %NULL if no mapping exists, or
1004080dbb61SAneesh Kumar K.V  * an error pointer if there is a mapping to something not represented
1005080dbb61SAneesh Kumar K.V  * by a page descriptor (see also vm_normal_page()).
1006080dbb61SAneesh Kumar K.V  */
follow_page_mask(struct vm_area_struct * vma,unsigned long address,unsigned int flags,unsigned long * page_mask)1007a7030aeaSBharath Vedartham static struct page *follow_page_mask(struct vm_area_struct *vma,
1008080dbb61SAneesh Kumar K.V 			      unsigned long address, unsigned int flags,
1009d3f7922bSAlistair Popple 			      unsigned long *page_mask)
1010080dbb61SAneesh Kumar K.V {
1011080dbb61SAneesh Kumar K.V 	pgd_t *pgd;
1012080dbb61SAneesh Kumar K.V 	struct mm_struct *mm = vma->vm_mm;
10139cb28da5SPeter Xu 	struct page *page;
10149cb28da5SPeter Xu 
10159cb28da5SPeter Xu 	vma_pgtable_walk_begin(vma);
1016080dbb61SAneesh Kumar K.V 
1017d3f7922bSAlistair Popple 	*page_mask = 0;
1018080dbb61SAneesh Kumar K.V 	pgd = pgd_offset(mm, address);
1019080dbb61SAneesh Kumar K.V 
10208268614bSChristophe Leroy 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1021a12083d7SPeter Xu 		page = no_page_table(vma, flags, address);
1022a12083d7SPeter Xu 	else
1023d3f7922bSAlistair Popple 		page = follow_p4d_mask(vma, address, pgd, flags, page_mask);
1024080dbb61SAneesh Kumar K.V 
10259cb28da5SPeter Xu 	vma_pgtable_walk_end(vma);
10269cb28da5SPeter Xu 
1027a12083d7SPeter Xu 	return page;
1028df06b37fSKeith Busch }
1029df06b37fSKeith Busch 
get_gate_page(struct mm_struct * mm,unsigned long address,unsigned int gup_flags,struct vm_area_struct ** vma,struct page ** page)1030f2b495caSKirill A. Shutemov static int get_gate_page(struct mm_struct *mm, unsigned long address,
1031f2b495caSKirill A. Shutemov 		unsigned int gup_flags, struct vm_area_struct **vma,
1032f2b495caSKirill A. Shutemov 		struct page **page)
1033f2b495caSKirill A. Shutemov {
1034f2b495caSKirill A. Shutemov 	pgd_t *pgd;
1035c2febafcSKirill A. Shutemov 	p4d_t *p4d;
1036f2b495caSKirill A. Shutemov 	pud_t *pud;
1037f2b495caSKirill A. Shutemov 	pmd_t *pmd;
1038f2b495caSKirill A. Shutemov 	pte_t *pte;
1039c33c7948SRyan Roberts 	pte_t entry;
1040f2b495caSKirill A. Shutemov 	int ret = -EFAULT;
1041f2b495caSKirill A. Shutemov 
1042f2b495caSKirill A. Shutemov 	/* user gate pages are read-only */
1043f2b495caSKirill A. Shutemov 	if (gup_flags & FOLL_WRITE)
1044f2b495caSKirill A. Shutemov 		return -EFAULT;
10450cad6736SFeng Lee 	pgd = pgd_offset(mm, address);
1046b5d1c39fSAndy Lutomirski 	if (pgd_none(*pgd))
1047b5d1c39fSAndy Lutomirski 		return -EFAULT;
1048c2febafcSKirill A. Shutemov 	p4d = p4d_offset(pgd, address);
1049b5d1c39fSAndy Lutomirski 	if (p4d_none(*p4d))
1050b5d1c39fSAndy Lutomirski 		return -EFAULT;
1051c2febafcSKirill A. Shutemov 	pud = pud_offset(p4d, address);
1052b5d1c39fSAndy Lutomirski 	if (pud_none(*pud))
1053b5d1c39fSAndy Lutomirski 		return -EFAULT;
1054f2b495caSKirill A. Shutemov 	pmd = pmd_offset(pud, address);
105584c3fc4eSZi Yan 	if (!pmd_present(*pmd))
1056f2b495caSKirill A. Shutemov 		return -EFAULT;
1057f2b495caSKirill A. Shutemov 	pte = pte_offset_map(pmd, address);
105804dee9e8SHugh Dickins 	if (!pte)
105904dee9e8SHugh Dickins 		return -EFAULT;
1060c33c7948SRyan Roberts 	entry = ptep_get(pte);
1061c33c7948SRyan Roberts 	if (pte_none(entry))
1062f2b495caSKirill A. Shutemov 		goto unmap;
1063f2b495caSKirill A. Shutemov 	*vma = get_gate_vma(mm);
1064f2b495caSKirill A. Shutemov 	if (!page)
1065f2b495caSKirill A. Shutemov 		goto out;
1066c33c7948SRyan Roberts 	*page = vm_normal_page(*vma, address, entry);
1067f2b495caSKirill A. Shutemov 	if (!*page) {
1068c33c7948SRyan Roberts 		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
1069f2b495caSKirill A. Shutemov 			goto unmap;
1070c33c7948SRyan Roberts 		*page = pte_page(entry);
1071f2b495caSKirill A. Shutemov 	}
1072f442fa61SYang Shi 	ret = try_grab_folio(page_folio(*page), 1, gup_flags);
10730f089235SLogan Gunthorpe 	if (unlikely(ret))
10748fde12caSLinus Torvalds 		goto unmap;
1075f2b495caSKirill A. Shutemov out:
1076f2b495caSKirill A. Shutemov 	ret = 0;
1077f2b495caSKirill A. Shutemov unmap:
1078f2b495caSKirill A. Shutemov 	pte_unmap(pte);
1079f2b495caSKirill A. Shutemov 	return ret;
1080f2b495caSKirill A. Shutemov }
1081f2b495caSKirill A. Shutemov 
10829a95f3cfSPaul Cassella /*
10839a863a6aSJason Gunthorpe  * mmap_lock must be held on entry.  If @flags has FOLL_UNLOCKABLE but not
10849a863a6aSJason Gunthorpe  * FOLL_NOWAIT, the mmap_lock may be released.  If it is, *@locked will be set
10859a863a6aSJason Gunthorpe  * to 0 and -EBUSY returned.
10869a95f3cfSPaul Cassella  */
faultin_page(struct vm_area_struct * vma,unsigned long address,unsigned int flags,bool unshare,int * locked)108764019a2eSPeter Xu static int faultin_page(struct vm_area_struct *vma,
108847872953SJosef Bacik 		unsigned long address, unsigned int flags, bool unshare,
1089a7f22660SDavid Hildenbrand 		int *locked)
109016744483SKirill A. Shutemov {
109116744483SKirill A. Shutemov 	unsigned int fault_flags = 0;
10922b740303SSouptick Joarder 	vm_fault_t ret;
109316744483SKirill A. Shutemov 
109447872953SJosef Bacik 	if (flags & FOLL_NOFAULT)
109555b8fe70SAndreas Gruenbacher 		return -EFAULT;
109647872953SJosef Bacik 	if (flags & FOLL_WRITE)
109716744483SKirill A. Shutemov 		fault_flags |= FAULT_FLAG_WRITE;
109847872953SJosef Bacik 	if (flags & FOLL_REMOTE)
10991b2ee126SDave Hansen 		fault_flags |= FAULT_FLAG_REMOTE;
110047872953SJosef Bacik 	if (flags & FOLL_UNLOCKABLE) {
110171335f37SPeter Xu 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
110293c5c61dSPeter Xu 		/*
110393c5c61dSPeter Xu 		 * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
110493c5c61dSPeter Xu 		 * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
110593c5c61dSPeter Xu 		 * That's because some callers may not be prepared to
110693c5c61dSPeter Xu 		 * handle early exits caused by non-fatal signals.
110793c5c61dSPeter Xu 		 */
110847872953SJosef Bacik 		if (flags & FOLL_INTERRUPTIBLE)
110993c5c61dSPeter Xu 			fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
111093c5c61dSPeter Xu 	}
111147872953SJosef Bacik 	if (flags & FOLL_NOWAIT)
111216744483SKirill A. Shutemov 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
111347872953SJosef Bacik 	if (flags & FOLL_TRIED) {
11144426e945SPeter Xu 		/*
11154426e945SPeter Xu 		 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
11164426e945SPeter Xu 		 * can co-exist
11174426e945SPeter Xu 		 */
1118234b239bSAndres Lagar-Cavilla 		fault_flags |= FAULT_FLAG_TRIED;
1119234b239bSAndres Lagar-Cavilla 	}
1120a7f22660SDavid Hildenbrand 	if (unshare) {
1121a7f22660SDavid Hildenbrand 		fault_flags |= FAULT_FLAG_UNSHARE;
1122a7f22660SDavid Hildenbrand 		/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
1123792b429dSDavid Hildenbrand 		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE);
1124a7f22660SDavid Hildenbrand 	}
112516744483SKirill A. Shutemov 
1126bce617edSPeter Xu 	ret = handle_mm_fault(vma, address, fault_flags, NULL);
1127d9272525SPeter Xu 
1128d9272525SPeter Xu 	if (ret & VM_FAULT_COMPLETED) {
1129d9272525SPeter Xu 		/*
1130d9272525SPeter Xu 		 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
1131d9272525SPeter Xu 		 * mmap lock in the page fault handler. Sanity check this.
1132d9272525SPeter Xu 		 */
1133d9272525SPeter Xu 		WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
1134d9272525SPeter Xu 		*locked = 0;
11359a863a6aSJason Gunthorpe 
1136d9272525SPeter Xu 		/*
1137d9272525SPeter Xu 		 * We should do the same as VM_FAULT_RETRY, but let's not
1138d9272525SPeter Xu 		 * return -EBUSY since that's not reflecting the reality of
1139d9272525SPeter Xu 		 * what has happened - we've just fully completed a page
1140d9272525SPeter Xu 		 * fault, with the mmap lock released.  Use -EAGAIN to show
1141d9272525SPeter Xu 		 * that we want to take the mmap lock _again_.
1142d9272525SPeter Xu 		 */
1143d9272525SPeter Xu 		return -EAGAIN;
1144d9272525SPeter Xu 	}
1145d9272525SPeter Xu 
114616744483SKirill A. Shutemov 	if (ret & VM_FAULT_ERROR) {
114747872953SJosef Bacik 		int err = vm_fault_to_errno(ret, flags);
11489a291a7cSJames Morse 
11499a291a7cSJames Morse 		if (err)
11509a291a7cSJames Morse 			return err;
115116744483SKirill A. Shutemov 		BUG();
115216744483SKirill A. Shutemov 	}
115316744483SKirill A. Shutemov 
115416744483SKirill A. Shutemov 	if (ret & VM_FAULT_RETRY) {
11559a863a6aSJason Gunthorpe 		if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
11564f6da934SPeter Xu 			*locked = 0;
115716744483SKirill A. Shutemov 		return -EBUSY;
115816744483SKirill A. Shutemov 	}
115916744483SKirill A. Shutemov 
116016744483SKirill A. Shutemov 	return 0;
116116744483SKirill A. Shutemov }
116216744483SKirill A. Shutemov 
11638ac26843SLorenzo Stoakes /*
11648ac26843SLorenzo Stoakes  * Writing to file-backed mappings which require folio dirty tracking using GUP
11658ac26843SLorenzo Stoakes  * is a fundamentally broken operation, as kernel write access to GUP mappings
11668ac26843SLorenzo Stoakes  * do not adhere to the semantics expected by a file system.
11678ac26843SLorenzo Stoakes  *
11688ac26843SLorenzo Stoakes  * Consider the following scenario:-
11698ac26843SLorenzo Stoakes  *
11708ac26843SLorenzo Stoakes  * 1. A folio is written to via GUP which write-faults the memory, notifying
11718ac26843SLorenzo Stoakes  *    the file system and dirtying the folio.
11728ac26843SLorenzo Stoakes  * 2. Later, writeback is triggered, resulting in the folio being cleaned and
11738ac26843SLorenzo Stoakes  *    the PTE being marked read-only.
11748ac26843SLorenzo Stoakes  * 3. The GUP caller writes to the folio, as it is mapped read/write via the
11758ac26843SLorenzo Stoakes  *    direct mapping.
11768ac26843SLorenzo Stoakes  * 4. The GUP caller, now done with the page, unpins it and sets it dirty
11778ac26843SLorenzo Stoakes  *    (though it does not have to).
11788ac26843SLorenzo Stoakes  *
11798ac26843SLorenzo Stoakes  * This results in both data being written to a folio without writenotify, and
11808ac26843SLorenzo Stoakes  * the folio being dirtied unexpectedly (if the caller decides to do so).
11818ac26843SLorenzo Stoakes  */
writable_file_mapping_allowed(struct vm_area_struct * vma,unsigned long gup_flags)11828ac26843SLorenzo Stoakes static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
11838ac26843SLorenzo Stoakes 					  unsigned long gup_flags)
11848ac26843SLorenzo Stoakes {
11858ac26843SLorenzo Stoakes 	/*
11868ac26843SLorenzo Stoakes 	 * If we aren't pinning then no problematic write can occur. A long term
11878ac26843SLorenzo Stoakes 	 * pin is the most egregious case so this is the case we disallow.
11888ac26843SLorenzo Stoakes 	 */
11898ac26843SLorenzo Stoakes 	if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
11908ac26843SLorenzo Stoakes 	    (FOLL_PIN | FOLL_LONGTERM))
11918ac26843SLorenzo Stoakes 		return true;
11928ac26843SLorenzo Stoakes 
11938ac26843SLorenzo Stoakes 	/*
11948ac26843SLorenzo Stoakes 	 * If the VMA does not require dirty tracking then no problematic write
11958ac26843SLorenzo Stoakes 	 * can occur either.
11968ac26843SLorenzo Stoakes 	 */
11978ac26843SLorenzo Stoakes 	return !vma_needs_dirty_tracking(vma);
11988ac26843SLorenzo Stoakes }
11998ac26843SLorenzo Stoakes 
check_vma_flags(struct vm_area_struct * vma,unsigned long gup_flags)1200fa5bb209SKirill A. Shutemov static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
1201fa5bb209SKirill A. Shutemov {
1202fa5bb209SKirill A. Shutemov 	vm_flags_t vm_flags = vma->vm_flags;
12031b2ee126SDave Hansen 	int write = (gup_flags & FOLL_WRITE);
12041b2ee126SDave Hansen 	int foreign = (gup_flags & FOLL_REMOTE);
12058ac26843SLorenzo Stoakes 	bool vma_anon = vma_is_anonymous(vma);
1206fa5bb209SKirill A. Shutemov 
1207fa5bb209SKirill A. Shutemov 	if (vm_flags & (VM_IO | VM_PFNMAP))
1208fa5bb209SKirill A. Shutemov 		return -EFAULT;
1209fa5bb209SKirill A. Shutemov 
12108ac26843SLorenzo Stoakes 	if ((gup_flags & FOLL_ANON) && !vma_anon)
12117f7ccc2cSWilly Tarreau 		return -EFAULT;
12127f7ccc2cSWilly Tarreau 
121352650c8bSJason Gunthorpe 	if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
121452650c8bSJason Gunthorpe 		return -EOPNOTSUPP;
121552650c8bSJason Gunthorpe 
12168977752cSDavid Hildenbrand 	if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
12178977752cSDavid Hildenbrand 		return -EOPNOTSUPP;
12188977752cSDavid Hildenbrand 
12191507f512SMike Rapoport 	if (vma_is_secretmem(vma))
12201507f512SMike Rapoport 		return -EFAULT;
12211507f512SMike Rapoport 
12221b2ee126SDave Hansen 	if (write) {
12238ac26843SLorenzo Stoakes 		if (!vma_anon &&
12248ac26843SLorenzo Stoakes 		    !writable_file_mapping_allowed(vma, gup_flags))
12258ac26843SLorenzo Stoakes 			return -EFAULT;
12268ac26843SLorenzo Stoakes 
12276beb9958SRick Edgecombe 		if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
1228fa5bb209SKirill A. Shutemov 			if (!(gup_flags & FOLL_FORCE))
1229fa5bb209SKirill A. Shutemov 				return -EFAULT;
1230fa5bb209SKirill A. Shutemov 			/*
1231fa5bb209SKirill A. Shutemov 			 * We used to let the write,force case do COW in a
1232fa5bb209SKirill A. Shutemov 			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
1233fa5bb209SKirill A. Shutemov 			 * set a breakpoint in a read-only mapping of an
1234fa5bb209SKirill A. Shutemov 			 * executable, without corrupting the file (yet only
1235fa5bb209SKirill A. Shutemov 			 * when that file had been opened for writing!).
1236fa5bb209SKirill A. Shutemov 			 * Anon pages in shared mappings are surprising: now
1237fa5bb209SKirill A. Shutemov 			 * just reject it.
1238fa5bb209SKirill A. Shutemov 			 */
123946435364SHugh Dickins 			if (!is_cow_mapping(vm_flags))
1240fa5bb209SKirill A. Shutemov 				return -EFAULT;
1241fa5bb209SKirill A. Shutemov 		}
1242fa5bb209SKirill A. Shutemov 	} else if (!(vm_flags & VM_READ)) {
1243fa5bb209SKirill A. Shutemov 		if (!(gup_flags & FOLL_FORCE))
1244fa5bb209SKirill A. Shutemov 			return -EFAULT;
1245fa5bb209SKirill A. Shutemov 		/*
1246fa5bb209SKirill A. Shutemov 		 * Is there actually any vma we can reach here which does not
1247fa5bb209SKirill A. Shutemov 		 * have VM_MAYREAD set?
1248fa5bb209SKirill A. Shutemov 		 */
1249fa5bb209SKirill A. Shutemov 		if (!(vm_flags & VM_MAYREAD))
1250fa5bb209SKirill A. Shutemov 			return -EFAULT;
1251fa5bb209SKirill A. Shutemov 	}
1252d61172b4SDave Hansen 	/*
1253d61172b4SDave Hansen 	 * gups are always data accesses, not instruction
1254d61172b4SDave Hansen 	 * fetches, so execute=false here
1255d61172b4SDave Hansen 	 */
1256d61172b4SDave Hansen 	if (!arch_vma_access_permitted(vma, write, false, foreign))
125733a709b2SDave Hansen 		return -EFAULT;
1258fa5bb209SKirill A. Shutemov 	return 0;
1259fa5bb209SKirill A. Shutemov }
1260fa5bb209SKirill A. Shutemov 
12616cd06ab1SLinus Torvalds /*
12626cd06ab1SLinus Torvalds  * This is "vma_lookup()", but with a warning if we would have
12636cd06ab1SLinus Torvalds  * historically expanded the stack in the GUP code.
12646cd06ab1SLinus Torvalds  */
gup_vma_lookup(struct mm_struct * mm,unsigned long addr)12656cd06ab1SLinus Torvalds static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm,
12666cd06ab1SLinus Torvalds 	 unsigned long addr)
12676cd06ab1SLinus Torvalds {
12686cd06ab1SLinus Torvalds #ifdef CONFIG_STACK_GROWSUP
12696cd06ab1SLinus Torvalds 	return vma_lookup(mm, addr);
12706cd06ab1SLinus Torvalds #else
12716cd06ab1SLinus Torvalds 	static volatile unsigned long next_warn;
12726cd06ab1SLinus Torvalds 	struct vm_area_struct *vma;
12736cd06ab1SLinus Torvalds 	unsigned long now, next;
12746cd06ab1SLinus Torvalds 
12756cd06ab1SLinus Torvalds 	vma = find_vma(mm, addr);
12766cd06ab1SLinus Torvalds 	if (!vma || (addr >= vma->vm_start))
12776cd06ab1SLinus Torvalds 		return vma;
12786cd06ab1SLinus Torvalds 
12796cd06ab1SLinus Torvalds 	/* Only warn for half-way relevant accesses */
12806cd06ab1SLinus Torvalds 	if (!(vma->vm_flags & VM_GROWSDOWN))
12816cd06ab1SLinus Torvalds 		return NULL;
12826cd06ab1SLinus Torvalds 	if (vma->vm_start - addr > 65536)
12836cd06ab1SLinus Torvalds 		return NULL;
12846cd06ab1SLinus Torvalds 
12856cd06ab1SLinus Torvalds 	/* Let's not warn more than once an hour.. */
12866cd06ab1SLinus Torvalds 	now = jiffies; next = next_warn;
12876cd06ab1SLinus Torvalds 	if (next && time_before(now, next))
12886cd06ab1SLinus Torvalds 		return NULL;
12896cd06ab1SLinus Torvalds 	next_warn = now + 60*60*HZ;
12906cd06ab1SLinus Torvalds 
12916cd06ab1SLinus Torvalds 	/* Let people know things may have changed. */
12926cd06ab1SLinus Torvalds 	pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n",
12936cd06ab1SLinus Torvalds 		current->comm, task_pid_nr(current),
12946cd06ab1SLinus Torvalds 		vma->vm_start, vma->vm_end, addr);
12956cd06ab1SLinus Torvalds 	dump_stack();
12966cd06ab1SLinus Torvalds 	return NULL;
12976cd06ab1SLinus Torvalds #endif
12986cd06ab1SLinus Torvalds }
12996cd06ab1SLinus Torvalds 
13004bbd4c77SKirill A. Shutemov /**
13014bbd4c77SKirill A. Shutemov  * __get_user_pages() - pin user pages in memory
13024bbd4c77SKirill A. Shutemov  * @mm:		mm_struct of target mm
13034bbd4c77SKirill A. Shutemov  * @start:	starting user address
13044bbd4c77SKirill A. Shutemov  * @nr_pages:	number of pages from start to pin
13054bbd4c77SKirill A. Shutemov  * @gup_flags:	flags modifying pin behaviour
13064bbd4c77SKirill A. Shutemov  * @pages:	array that receives pointers to the pages pinned.
13074bbd4c77SKirill A. Shutemov  *		Should be at least nr_pages long. Or NULL, if caller
13084bbd4c77SKirill A. Shutemov  *		only intends to ensure the pages are faulted in.
1309c1e8d7c6SMichel Lespinasse  * @locked:     whether we're still with the mmap_lock held
13104bbd4c77SKirill A. Shutemov  *
1311d2dfbe47SLiu Xiang  * Returns either number of pages pinned (which may be less than the
1312d2dfbe47SLiu Xiang  * number requested), or an error. Details about the return value:
1313d2dfbe47SLiu Xiang  *
1314d2dfbe47SLiu Xiang  * -- If nr_pages is 0, returns 0.
1315d2dfbe47SLiu Xiang  * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1316d2dfbe47SLiu Xiang  * -- If nr_pages is >0, and some pages were pinned, returns the number of
1317d2dfbe47SLiu Xiang  *    pages pinned. Again, this may be less than nr_pages.
13182d3a36a4SMichal Hocko  * -- 0 return value is possible when the fault would need to be retried.
1319d2dfbe47SLiu Xiang  *
1320d2dfbe47SLiu Xiang  * The caller is responsible for releasing returned @pages, via put_page().
1321d2dfbe47SLiu Xiang  *
1322c1e8d7c6SMichel Lespinasse  * Must be called with mmap_lock held.  It may be released.  See below.
13234bbd4c77SKirill A. Shutemov  *
13244bbd4c77SKirill A. Shutemov  * __get_user_pages walks a process's page tables and takes a reference to
13254bbd4c77SKirill A. Shutemov  * each struct page that each user address corresponds to at a given
13264bbd4c77SKirill A. Shutemov  * instant. That is, it takes the page that would be accessed if a user
13274bbd4c77SKirill A. Shutemov  * thread accesses the given user virtual address at that instant.
13284bbd4c77SKirill A. Shutemov  *
13294bbd4c77SKirill A. Shutemov  * This does not guarantee that the page exists in the user mappings when
13304bbd4c77SKirill A. Shutemov  * __get_user_pages returns, and there may even be a completely different
13314bbd4c77SKirill A. Shutemov  * page there in some cases (eg. if mmapped pagecache has been invalidated
1332c5acf1f6SJongwoo Han  * and subsequently re-faulted). However it does guarantee that the page
13334bbd4c77SKirill A. Shutemov  * won't be freed completely. And mostly callers simply care that the page
13344bbd4c77SKirill A. Shutemov  * contains data that was valid *at some point in time*. Typically, an IO
13354bbd4c77SKirill A. Shutemov  * or similar operation cannot guarantee anything stronger anyway because
13364bbd4c77SKirill A. Shutemov  * locks can't be held over the syscall boundary.
13374bbd4c77SKirill A. Shutemov  *
13384bbd4c77SKirill A. Shutemov  * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
13394bbd4c77SKirill A. Shutemov  * the page is written to, set_page_dirty (or set_page_dirty_lock, as
13404bbd4c77SKirill A. Shutemov  * appropriate) must be called after the page is finished with, and
13414bbd4c77SKirill A. Shutemov  * before put_page is called.
13424bbd4c77SKirill A. Shutemov  *
13439a863a6aSJason Gunthorpe  * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
13449a863a6aSJason Gunthorpe  * be released. If this happens *@locked will be set to 0 on return.
13459a95f3cfSPaul Cassella  *
13469a863a6aSJason Gunthorpe  * A caller using such a combination of @gup_flags must therefore hold the
13479a863a6aSJason Gunthorpe  * mmap_lock for reading only, and recognize when it's been released. Otherwise,
13489a863a6aSJason Gunthorpe  * it must be held for either reading or writing and will not be released.
13494bbd4c77SKirill A. Shutemov  *
13504bbd4c77SKirill A. Shutemov  * In most cases, get_user_pages or get_user_pages_fast should be used
13514bbd4c77SKirill A. Shutemov  * instead of __get_user_pages. __get_user_pages should be used only if
13524bbd4c77SKirill A. Shutemov  * you need some special @gup_flags.
13534bbd4c77SKirill A. Shutemov  */
__get_user_pages(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages,int * locked)135464019a2eSPeter Xu static long __get_user_pages(struct mm_struct *mm,
13554bbd4c77SKirill A. Shutemov 		unsigned long start, unsigned long nr_pages,
13564bbd4c77SKirill A. Shutemov 		unsigned int gup_flags, struct page **pages,
1357b2cac248SLorenzo Stoakes 		int *locked)
13584bbd4c77SKirill A. Shutemov {
1359df06b37fSKeith Busch 	long ret = 0, i = 0;
1360fa5bb209SKirill A. Shutemov 	struct vm_area_struct *vma = NULL;
1361d3f7922bSAlistair Popple 	unsigned long page_mask = 0;
13624bbd4c77SKirill A. Shutemov 
13634bbd4c77SKirill A. Shutemov 	if (!nr_pages)
13644bbd4c77SKirill A. Shutemov 		return 0;
13654bbd4c77SKirill A. Shutemov 
1366428e106aSKirill A. Shutemov 	start = untagged_addr_remote(mm, start);
1367f9652594SAndrey Konovalov 
1368ede27b7eSBaoquan He 	VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
1369ede27b7eSBaoquan He 
1370ede27b7eSBaoquan He 	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
1371ede27b7eSBaoquan He 	VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
1372ede27b7eSBaoquan He 			(FOLL_PIN | FOLL_GET));
13734bbd4c77SKirill A. Shutemov 
13744bbd4c77SKirill A. Shutemov 	do {
13754bbd4c77SKirill A. Shutemov 		struct page *page;
13764bbd4c77SKirill A. Shutemov 		unsigned int page_increm;
13774bbd4c77SKirill A. Shutemov 
1378fa5bb209SKirill A. Shutemov 		/* first iteration or cross vma bound */
1379fa5bb209SKirill A. Shutemov 		if (!vma || start >= vma->vm_end) {
1380631426baSDavid Hildenbrand 			/*
1381631426baSDavid Hildenbrand 			 * MADV_POPULATE_(READ|WRITE) wants to handle VMA
1382631426baSDavid Hildenbrand 			 * lookups+error reporting differently.
1383631426baSDavid Hildenbrand 			 */
1384631426baSDavid Hildenbrand 			if (gup_flags & FOLL_MADV_POPULATE) {
1385631426baSDavid Hildenbrand 				vma = vma_lookup(mm, start);
1386631426baSDavid Hildenbrand 				if (!vma) {
1387631426baSDavid Hildenbrand 					ret = -ENOMEM;
1388631426baSDavid Hildenbrand 					goto out;
1389631426baSDavid Hildenbrand 				}
1390631426baSDavid Hildenbrand 				if (check_vma_flags(vma, gup_flags)) {
1391631426baSDavid Hildenbrand 					ret = -EINVAL;
1392631426baSDavid Hildenbrand 					goto out;
1393631426baSDavid Hildenbrand 				}
1394631426baSDavid Hildenbrand 				goto retry;
1395631426baSDavid Hildenbrand 			}
13966cd06ab1SLinus Torvalds 			vma = gup_vma_lookup(mm, start);
1397fa5bb209SKirill A. Shutemov 			if (!vma && in_gate_area(mm, start)) {
1398fa5bb209SKirill A. Shutemov 				ret = get_gate_page(mm, start & PAGE_MASK,
1399fa5bb209SKirill A. Shutemov 						gup_flags, &vma,
1400ffe1e786SPeter Xu 						pages ? &page : NULL);
1401fa5bb209SKirill A. Shutemov 				if (ret)
140208be37b7SJohn Hubbard 					goto out;
1403d3f7922bSAlistair Popple 				page_mask = 0;
1404fa5bb209SKirill A. Shutemov 				goto next_page;
1405fa5bb209SKirill A. Shutemov 			}
1406fa5bb209SKirill A. Shutemov 
140752650c8bSJason Gunthorpe 			if (!vma) {
1408df06b37fSKeith Busch 				ret = -EFAULT;
1409df06b37fSKeith Busch 				goto out;
1410df06b37fSKeith Busch 			}
141152650c8bSJason Gunthorpe 			ret = check_vma_flags(vma, gup_flags);
141252650c8bSJason Gunthorpe 			if (ret)
141352650c8bSJason Gunthorpe 				goto out;
1414fa5bb209SKirill A. Shutemov 		}
1415fa5bb209SKirill A. Shutemov retry:
14164bbd4c77SKirill A. Shutemov 		/*
1417fa5bb209SKirill A. Shutemov 		 * If we have a pending SIGKILL, don't keep faulting pages and
1418fa5bb209SKirill A. Shutemov 		 * potentially allocating memory.
14194bbd4c77SKirill A. Shutemov 		 */
1420fa45f116SDavidlohr Bueso 		if (fatal_signal_pending(current)) {
1421d180870dSMichal Hocko 			ret = -EINTR;
1422df06b37fSKeith Busch 			goto out;
1423df06b37fSKeith Busch 		}
14244bbd4c77SKirill A. Shutemov 		cond_resched();
1425df06b37fSKeith Busch 
1426d3f7922bSAlistair Popple 		page = follow_page_mask(vma, start, gup_flags, &page_mask);
1427a7f22660SDavid Hildenbrand 		if (!page || PTR_ERR(page) == -EMLINK) {
1428dc21e700SJosef Bacik 			ret = faultin_page(vma, start, gup_flags,
1429a7f22660SDavid Hildenbrand 					   PTR_ERR(page) == -EMLINK, locked);
143016744483SKirill A. Shutemov 			switch (ret) {
143116744483SKirill A. Shutemov 			case 0:
1432fa5bb209SKirill A. Shutemov 				goto retry;
1433df06b37fSKeith Busch 			case -EBUSY:
1434d9272525SPeter Xu 			case -EAGAIN:
1435df06b37fSKeith Busch 				ret = 0;
1436e4a9bc58SJoe Perches 				fallthrough;
143716744483SKirill A. Shutemov 			case -EFAULT:
143816744483SKirill A. Shutemov 			case -ENOMEM:
143916744483SKirill A. Shutemov 			case -EHWPOISON:
1440df06b37fSKeith Busch 				goto out;
14414bbd4c77SKirill A. Shutemov 			}
1442fa5bb209SKirill A. Shutemov 			BUG();
14431027e443SKirill A. Shutemov 		} else if (PTR_ERR(page) == -EEXIST) {
14441027e443SKirill A. Shutemov 			/*
14451027e443SKirill A. Shutemov 			 * Proper page table entry exists, but no corresponding
144665462462SJohn Hubbard 			 * struct page. If the caller expects **pages to be
144765462462SJohn Hubbard 			 * filled in, bail out now, because that can't be done
144865462462SJohn Hubbard 			 * for this page.
14491027e443SKirill A. Shutemov 			 */
145065462462SJohn Hubbard 			if (pages) {
145165462462SJohn Hubbard 				ret = PTR_ERR(page);
145265462462SJohn Hubbard 				goto out;
145365462462SJohn Hubbard 			}
14541027e443SKirill A. Shutemov 		} else if (IS_ERR(page)) {
1455df06b37fSKeith Busch 			ret = PTR_ERR(page);
1456df06b37fSKeith Busch 			goto out;
14571027e443SKirill A. Shutemov 		}
1458ffe1e786SPeter Xu next_page:
1459d3f7922bSAlistair Popple 		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
14604bbd4c77SKirill A. Shutemov 		if (page_increm > nr_pages)
14614bbd4c77SKirill A. Shutemov 			page_increm = nr_pages;
146257edfcfdSPeter Xu 
146357edfcfdSPeter Xu 		if (pages) {
146457edfcfdSPeter Xu 			struct page *subpage;
146557edfcfdSPeter Xu 			unsigned int j;
146657edfcfdSPeter Xu 
146757edfcfdSPeter Xu 			/*
146857edfcfdSPeter Xu 			 * This must be a large folio (and doesn't need to
146957edfcfdSPeter Xu 			 * be the whole folio; it can be part of it), do
147057edfcfdSPeter Xu 			 * the refcount work for all the subpages too.
147157edfcfdSPeter Xu 			 *
147257edfcfdSPeter Xu 			 * NOTE: here the page may not be the head page
147357edfcfdSPeter Xu 			 * e.g. when start addr is not thp-size aligned.
147457edfcfdSPeter Xu 			 * try_grab_folio() should have taken care of tail
147557edfcfdSPeter Xu 			 * pages.
147657edfcfdSPeter Xu 			 */
147757edfcfdSPeter Xu 			if (page_increm > 1) {
1478f442fa61SYang Shi 				struct folio *folio = page_folio(page);
147957edfcfdSPeter Xu 
148057edfcfdSPeter Xu 				/*
148157edfcfdSPeter Xu 				 * Since we already hold refcount on the
148257edfcfdSPeter Xu 				 * large folio, this should never fail.
148357edfcfdSPeter Xu 				 */
1484f442fa61SYang Shi 				if (try_grab_folio(folio, page_increm - 1,
1485dc21e700SJosef Bacik 						   gup_flags)) {
148657edfcfdSPeter Xu 					/*
148757edfcfdSPeter Xu 					 * Release the 1st page ref if the
148857edfcfdSPeter Xu 					 * folio is problematic, fail hard.
148957edfcfdSPeter Xu 					 */
1490dc21e700SJosef Bacik 					gup_put_folio(folio, 1, gup_flags);
149157edfcfdSPeter Xu 					ret = -EFAULT;
149257edfcfdSPeter Xu 					goto out;
149357edfcfdSPeter Xu 				}
149457edfcfdSPeter Xu 			}
149557edfcfdSPeter Xu 
149657edfcfdSPeter Xu 			for (j = 0; j < page_increm; j++) {
1497541541dbSDavid Hildenbrand 				subpage = page + j;
149857edfcfdSPeter Xu 				pages[i + j] = subpage;
149957edfcfdSPeter Xu 				flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
150057edfcfdSPeter Xu 				flush_dcache_page(subpage);
150157edfcfdSPeter Xu 			}
150257edfcfdSPeter Xu 		}
150357edfcfdSPeter Xu 
15044bbd4c77SKirill A. Shutemov 		i += page_increm;
15054bbd4c77SKirill A. Shutemov 		start += page_increm * PAGE_SIZE;
15064bbd4c77SKirill A. Shutemov 		nr_pages -= page_increm;
15074bbd4c77SKirill A. Shutemov 	} while (nr_pages);
1508df06b37fSKeith Busch out:
1509df06b37fSKeith Busch 	return i ? i : ret;
15104bbd4c77SKirill A. Shutemov }
15114bbd4c77SKirill A. Shutemov 
vma_permits_fault(struct vm_area_struct * vma,unsigned int fault_flags)1512771ab430STobias Klauser static bool vma_permits_fault(struct vm_area_struct *vma,
1513771ab430STobias Klauser 			      unsigned int fault_flags)
1514d4925e00SDave Hansen {
151533a709b2SDave Hansen 	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
15161b2ee126SDave Hansen 	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
151733a709b2SDave Hansen 	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
1518d4925e00SDave Hansen 
1519d4925e00SDave Hansen 	if (!(vm_flags & vma->vm_flags))
1520d4925e00SDave Hansen 		return false;
1521d4925e00SDave Hansen 
152233a709b2SDave Hansen 	/*
152333a709b2SDave Hansen 	 * The architecture might have a hardware protection
15241b2ee126SDave Hansen 	 * mechanism other than read/write that can deny access.
1525d61172b4SDave Hansen 	 *
1526d61172b4SDave Hansen 	 * gup always represents data access, not instruction
1527d61172b4SDave Hansen 	 * fetches, so execute=false here:
152833a709b2SDave Hansen 	 */
1529d61172b4SDave Hansen 	if (!arch_vma_access_permitted(vma, write, false, foreign))
153033a709b2SDave Hansen 		return false;
153133a709b2SDave Hansen 
1532d4925e00SDave Hansen 	return true;
1533d4925e00SDave Hansen }
1534d4925e00SDave Hansen 
1535adc8cb40SSouptick Joarder /**
15364bbd4c77SKirill A. Shutemov  * fixup_user_fault() - manually resolve a user page fault
15374bbd4c77SKirill A. Shutemov  * @mm:		mm_struct of target mm
15384bbd4c77SKirill A. Shutemov  * @address:	user address
15394bbd4c77SKirill A. Shutemov  * @fault_flags:flags to pass down to handle_mm_fault()
1540c1e8d7c6SMichel Lespinasse  * @unlocked:	did we unlock the mmap_lock while retrying, maybe NULL if caller
1541548b6a1eSMiles Chen  *		does not allow retry. If NULL, the caller must guarantee
1542548b6a1eSMiles Chen  *		that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
15434bbd4c77SKirill A. Shutemov  *
15444bbd4c77SKirill A. Shutemov  * This is meant to be called in the specific scenario where for locking reasons
15454bbd4c77SKirill A. Shutemov  * we try to access user memory in atomic context (within a pagefault_disable()
15464bbd4c77SKirill A. Shutemov  * section), this returns -EFAULT, and we want to resolve the user fault before
15474bbd4c77SKirill A. Shutemov  * trying again.
15484bbd4c77SKirill A. Shutemov  *
15494bbd4c77SKirill A. Shutemov  * Typically this is meant to be used by the futex code.
15504bbd4c77SKirill A. Shutemov  *
15514bbd4c77SKirill A. Shutemov  * The main difference with get_user_pages() is that this function will
15524bbd4c77SKirill A. Shutemov  * unconditionally call handle_mm_fault() which will in turn perform all the
15534bbd4c77SKirill A. Shutemov  * necessary SW fixup of the dirty and young bits in the PTE, while
15544a9e1cdaSDominik Dingel  * get_user_pages() only guarantees to update these in the struct page.
15554bbd4c77SKirill A. Shutemov  *
15564bbd4c77SKirill A. Shutemov  * This is important for some architectures where those bits also gate the
15574bbd4c77SKirill A. Shutemov  * access permission to the page because they are maintained in software.  On
15584bbd4c77SKirill A. Shutemov  * such architectures, gup() will not be enough to make a subsequent access
15594bbd4c77SKirill A. Shutemov  * succeed.
15604bbd4c77SKirill A. Shutemov  *
1561c1e8d7c6SMichel Lespinasse  * This function will not return with an unlocked mmap_lock. So it has not the
1562c1e8d7c6SMichel Lespinasse  * same semantics wrt the @mm->mmap_lock as does filemap_fault().
15634bbd4c77SKirill A. Shutemov  */
fixup_user_fault(struct mm_struct * mm,unsigned long address,unsigned int fault_flags,bool * unlocked)156464019a2eSPeter Xu int fixup_user_fault(struct mm_struct *mm,
15654a9e1cdaSDominik Dingel 		     unsigned long address, unsigned int fault_flags,
15664a9e1cdaSDominik Dingel 		     bool *unlocked)
15674bbd4c77SKirill A. Shutemov {
15684bbd4c77SKirill A. Shutemov 	struct vm_area_struct *vma;
15698fed2f3cSMiaohe Lin 	vm_fault_t ret;
15704bbd4c77SKirill A. Shutemov 
1571428e106aSKirill A. Shutemov 	address = untagged_addr_remote(mm, address);
1572f9652594SAndrey Konovalov 
15734a9e1cdaSDominik Dingel 	if (unlocked)
157471335f37SPeter Xu 		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
15754a9e1cdaSDominik Dingel 
15764a9e1cdaSDominik Dingel retry:
15776cd06ab1SLinus Torvalds 	vma = gup_vma_lookup(mm, address);
15788d7071afSLinus Torvalds 	if (!vma)
15794bbd4c77SKirill A. Shutemov 		return -EFAULT;
15804bbd4c77SKirill A. Shutemov 
1581d4925e00SDave Hansen 	if (!vma_permits_fault(vma, fault_flags))
15824bbd4c77SKirill A. Shutemov 		return -EFAULT;
15834bbd4c77SKirill A. Shutemov 
1584475f4dfcSPeter Xu 	if ((fault_flags & FAULT_FLAG_KILLABLE) &&
1585475f4dfcSPeter Xu 	    fatal_signal_pending(current))
1586475f4dfcSPeter Xu 		return -EINTR;
1587475f4dfcSPeter Xu 
1588bce617edSPeter Xu 	ret = handle_mm_fault(vma, address, fault_flags, NULL);
1589d9272525SPeter Xu 
1590d9272525SPeter Xu 	if (ret & VM_FAULT_COMPLETED) {
1591d9272525SPeter Xu 		/*
1592d9272525SPeter Xu 		 * NOTE: it's a pity that we need to retake the lock here
1593d9272525SPeter Xu 		 * to pair with the unlock() in the callers. Ideally we
1594d9272525SPeter Xu 		 * could tell the callers so they do not need to unlock.
1595d9272525SPeter Xu 		 */
1596d9272525SPeter Xu 		mmap_read_lock(mm);
1597d9272525SPeter Xu 		*unlocked = true;
1598d9272525SPeter Xu 		return 0;
1599d9272525SPeter Xu 	}
1600d9272525SPeter Xu 
16014bbd4c77SKirill A. Shutemov 	if (ret & VM_FAULT_ERROR) {
16029a291a7cSJames Morse 		int err = vm_fault_to_errno(ret, 0);
16039a291a7cSJames Morse 
16049a291a7cSJames Morse 		if (err)
16059a291a7cSJames Morse 			return err;
16064bbd4c77SKirill A. Shutemov 		BUG();
16074bbd4c77SKirill A. Shutemov 	}
16084a9e1cdaSDominik Dingel 
16094a9e1cdaSDominik Dingel 	if (ret & VM_FAULT_RETRY) {
1610d8ed45c5SMichel Lespinasse 		mmap_read_lock(mm);
16114a9e1cdaSDominik Dingel 		*unlocked = true;
16124a9e1cdaSDominik Dingel 		fault_flags |= FAULT_FLAG_TRIED;
16134a9e1cdaSDominik Dingel 		goto retry;
16144a9e1cdaSDominik Dingel 	}
16154a9e1cdaSDominik Dingel 
16164bbd4c77SKirill A. Shutemov 	return 0;
16174bbd4c77SKirill A. Shutemov }
1618add6a0cdSPaolo Bonzini EXPORT_SYMBOL_GPL(fixup_user_fault);
16194bbd4c77SKirill A. Shutemov 
16202d3a36a4SMichal Hocko /*
162193c5c61dSPeter Xu  * GUP always responds to fatal signals.  When FOLL_INTERRUPTIBLE is
162293c5c61dSPeter Xu  * specified, it'll also respond to generic signals.  The caller of GUP
162393c5c61dSPeter Xu  * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
162493c5c61dSPeter Xu  */
gup_signal_pending(unsigned int flags)162593c5c61dSPeter Xu static bool gup_signal_pending(unsigned int flags)
162693c5c61dSPeter Xu {
162793c5c61dSPeter Xu 	if (fatal_signal_pending(current))
162893c5c61dSPeter Xu 		return true;
162993c5c61dSPeter Xu 
163093c5c61dSPeter Xu 	if (!(flags & FOLL_INTERRUPTIBLE))
163193c5c61dSPeter Xu 		return false;
163293c5c61dSPeter Xu 
163393c5c61dSPeter Xu 	return signal_pending(current);
163493c5c61dSPeter Xu }
163593c5c61dSPeter Xu 
163693c5c61dSPeter Xu /*
1637b2a72dffSJason Gunthorpe  * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
1638b2a72dffSJason Gunthorpe  * the caller. This function may drop the mmap_lock. If it does so, then it will
1639b2a72dffSJason Gunthorpe  * set (*locked = 0).
1640b2a72dffSJason Gunthorpe  *
1641b2a72dffSJason Gunthorpe  * (*locked == 0) means that the caller expects this function to acquire and
1642b2a72dffSJason Gunthorpe  * drop the mmap_lock. Therefore, the value of *locked will still be zero when
1643b2a72dffSJason Gunthorpe  * the function returns, even though it may have changed temporarily during
1644b2a72dffSJason Gunthorpe  * function execution.
1645b2a72dffSJason Gunthorpe  *
1646b2a72dffSJason Gunthorpe  * Please note that this function, unlike __get_user_pages(), will not return 0
1647b2a72dffSJason Gunthorpe  * for nr_pages > 0, unless FOLL_NOWAIT is used.
16482d3a36a4SMichal Hocko  */
__get_user_pages_locked(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,struct page ** pages,int * locked,unsigned int flags)164964019a2eSPeter Xu static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
1650f0818f47SAndrea Arcangeli 						unsigned long start,
1651f0818f47SAndrea Arcangeli 						unsigned long nr_pages,
1652f0818f47SAndrea Arcangeli 						struct page **pages,
1653e716712fSAl Viro 						int *locked,
16540fd71a56SAndrea Arcangeli 						unsigned int flags)
1655f0818f47SAndrea Arcangeli {
1656f0818f47SAndrea Arcangeli 	long ret, pages_done;
1657b2a72dffSJason Gunthorpe 	bool must_unlock = false;
1658f0818f47SAndrea Arcangeli 
16599c4b2142SLorenzo Stoakes 	if (!nr_pages)
16609c4b2142SLorenzo Stoakes 		return 0;
16619c4b2142SLorenzo Stoakes 
1662b2a72dffSJason Gunthorpe 	/*
1663b2a72dffSJason Gunthorpe 	 * The internal caller expects GUP to manage the lock internally and the
1664b2a72dffSJason Gunthorpe 	 * lock must be released when this returns.
1665b2a72dffSJason Gunthorpe 	 */
16669a863a6aSJason Gunthorpe 	if (!*locked) {
1667b2a72dffSJason Gunthorpe 		if (mmap_read_lock_killable(mm))
1668b2a72dffSJason Gunthorpe 			return -EAGAIN;
1669b2a72dffSJason Gunthorpe 		must_unlock = true;
1670b2a72dffSJason Gunthorpe 		*locked = 1;
1671f0818f47SAndrea Arcangeli 	}
1672961ba472SJason Gunthorpe 	else
1673961ba472SJason Gunthorpe 		mmap_assert_locked(mm);
1674f0818f47SAndrea Arcangeli 
1675a458b76aSAndrea Arcangeli 	if (flags & FOLL_PIN)
167612e423baSLorenzo Stoakes 		mm_set_has_pinned_flag(mm);
1677008cfe44SPeter Xu 
1678eddb1c22SJohn Hubbard 	/*
1679eddb1c22SJohn Hubbard 	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
1680eddb1c22SJohn Hubbard 	 * is to set FOLL_GET if the caller wants pages[] filled in (but has
1681eddb1c22SJohn Hubbard 	 * carelessly failed to specify FOLL_GET), so keep doing that, but only
1682eddb1c22SJohn Hubbard 	 * for FOLL_GET, not for the newer FOLL_PIN.
1683eddb1c22SJohn Hubbard 	 *
1684eddb1c22SJohn Hubbard 	 * FOLL_PIN always expects pages to be non-null, but no need to assert
1685eddb1c22SJohn Hubbard 	 * that here, as any failures will be obvious enough.
1686eddb1c22SJohn Hubbard 	 */
1687eddb1c22SJohn Hubbard 	if (pages && !(flags & FOLL_PIN))
1688f0818f47SAndrea Arcangeli 		flags |= FOLL_GET;
1689f0818f47SAndrea Arcangeli 
1690f0818f47SAndrea Arcangeli 	pages_done = 0;
1691f0818f47SAndrea Arcangeli 	for (;;) {
169264019a2eSPeter Xu 		ret = __get_user_pages(mm, start, nr_pages, flags, pages,
1693b2cac248SLorenzo Stoakes 				       locked);
1694f04740f5SJason Gunthorpe 		if (!(flags & FOLL_UNLOCKABLE)) {
1695f0818f47SAndrea Arcangeli 			/* VM_FAULT_RETRY couldn't trigger, bypass */
1696f04740f5SJason Gunthorpe 			pages_done = ret;
1697f04740f5SJason Gunthorpe 			break;
1698f04740f5SJason Gunthorpe 		}
1699f0818f47SAndrea Arcangeli 
1700d9272525SPeter Xu 		/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
1701792b429dSDavid Hildenbrand 		VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages));
1702f0818f47SAndrea Arcangeli 
1703f0818f47SAndrea Arcangeli 		if (ret > 0) {
1704f0818f47SAndrea Arcangeli 			nr_pages -= ret;
1705f0818f47SAndrea Arcangeli 			pages_done += ret;
1706f0818f47SAndrea Arcangeli 			if (!nr_pages)
1707f0818f47SAndrea Arcangeli 				break;
1708f0818f47SAndrea Arcangeli 		}
1709f0818f47SAndrea Arcangeli 		if (*locked) {
171096312e61SAndrea Arcangeli 			/*
171196312e61SAndrea Arcangeli 			 * VM_FAULT_RETRY didn't trigger or it was a
171296312e61SAndrea Arcangeli 			 * FOLL_NOWAIT.
171396312e61SAndrea Arcangeli 			 */
1714f0818f47SAndrea Arcangeli 			if (!pages_done)
1715f0818f47SAndrea Arcangeli 				pages_done = ret;
1716f0818f47SAndrea Arcangeli 			break;
1717f0818f47SAndrea Arcangeli 		}
1718df17277bSMike Rapoport 		/*
1719df17277bSMike Rapoport 		 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1720df17277bSMike Rapoport 		 * For the prefault case (!pages) we only update counts.
1721df17277bSMike Rapoport 		 */
1722df17277bSMike Rapoport 		if (likely(pages))
1723f0818f47SAndrea Arcangeli 			pages += ret;
1724f0818f47SAndrea Arcangeli 		start += ret << PAGE_SHIFT;
1725b2a72dffSJason Gunthorpe 
1726b2a72dffSJason Gunthorpe 		/* The lock was temporarily dropped, so we must unlock later */
1727b2a72dffSJason Gunthorpe 		must_unlock = true;
1728f0818f47SAndrea Arcangeli 
17294426e945SPeter Xu retry:
1730f0818f47SAndrea Arcangeli 		/*
1731f0818f47SAndrea Arcangeli 		 * Repeat on the address that fired VM_FAULT_RETRY
17324426e945SPeter Xu 		 * with both FAULT_FLAG_ALLOW_RETRY and
17334426e945SPeter Xu 		 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
173493c5c61dSPeter Xu 		 * by fatal signals of even common signals, depending on
173593c5c61dSPeter Xu 		 * the caller's request. So we need to check it before we
17364426e945SPeter Xu 		 * start trying again otherwise it can loop forever.
1737f0818f47SAndrea Arcangeli 		 */
173893c5c61dSPeter Xu 		if (gup_signal_pending(flags)) {
1739ae46d2aaSHillf Danton 			if (!pages_done)
1740ae46d2aaSHillf Danton 				pages_done = -EINTR;
17414426e945SPeter Xu 			break;
1742ae46d2aaSHillf Danton 		}
17434426e945SPeter Xu 
1744d8ed45c5SMichel Lespinasse 		ret = mmap_read_lock_killable(mm);
174571335f37SPeter Xu 		if (ret) {
174671335f37SPeter Xu 			if (!pages_done)
174771335f37SPeter Xu 				pages_done = ret;
174871335f37SPeter Xu 			break;
174971335f37SPeter Xu 		}
17504426e945SPeter Xu 
1751c7b6a566SPeter Xu 		*locked = 1;
175264019a2eSPeter Xu 		ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
1753b2cac248SLorenzo Stoakes 				       pages, locked);
17544426e945SPeter Xu 		if (!*locked) {
17554426e945SPeter Xu 			/* Continue to retry until we succeeded */
1756792b429dSDavid Hildenbrand 			VM_WARN_ON_ONCE(ret != 0);
17574426e945SPeter Xu 			goto retry;
17584426e945SPeter Xu 		}
1759f0818f47SAndrea Arcangeli 		if (ret != 1) {
1760792b429dSDavid Hildenbrand 			VM_WARN_ON_ONCE(ret > 1);
1761f0818f47SAndrea Arcangeli 			if (!pages_done)
1762f0818f47SAndrea Arcangeli 				pages_done = ret;
1763f0818f47SAndrea Arcangeli 			break;
1764f0818f47SAndrea Arcangeli 		}
1765f0818f47SAndrea Arcangeli 		nr_pages--;
1766f0818f47SAndrea Arcangeli 		pages_done++;
1767f0818f47SAndrea Arcangeli 		if (!nr_pages)
1768f0818f47SAndrea Arcangeli 			break;
1769df17277bSMike Rapoport 		if (likely(pages))
1770f0818f47SAndrea Arcangeli 			pages++;
1771f0818f47SAndrea Arcangeli 		start += PAGE_SIZE;
1772f0818f47SAndrea Arcangeli 	}
1773b2a72dffSJason Gunthorpe 	if (must_unlock && *locked) {
1774f0818f47SAndrea Arcangeli 		/*
1775b2a72dffSJason Gunthorpe 		 * We either temporarily dropped the lock, or the caller
1776b2a72dffSJason Gunthorpe 		 * requested that we both acquire and drop the lock. Either way,
1777b2a72dffSJason Gunthorpe 		 * we must now unlock, and notify the caller of that state.
1778f0818f47SAndrea Arcangeli 		 */
1779d8ed45c5SMichel Lespinasse 		mmap_read_unlock(mm);
1780f0818f47SAndrea Arcangeli 		*locked = 0;
1781f0818f47SAndrea Arcangeli 	}
17829c4b2142SLorenzo Stoakes 
17839c4b2142SLorenzo Stoakes 	/*
17849c4b2142SLorenzo Stoakes 	 * Failing to pin anything implies something has gone wrong (except when
17859c4b2142SLorenzo Stoakes 	 * FOLL_NOWAIT is specified).
17869c4b2142SLorenzo Stoakes 	 */
17879c4b2142SLorenzo Stoakes 	if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT)))
17889c4b2142SLorenzo Stoakes 		return -EFAULT;
17899c4b2142SLorenzo Stoakes 
1790f0818f47SAndrea Arcangeli 	return pages_done;
1791f0818f47SAndrea Arcangeli }
1792f0818f47SAndrea Arcangeli 
1793d3649f68SChristoph Hellwig /**
1794d3649f68SChristoph Hellwig  * populate_vma_page_range() -  populate a range of pages in the vma.
1795d3649f68SChristoph Hellwig  * @vma:   target vma
1796d3649f68SChristoph Hellwig  * @start: start address
1797d3649f68SChristoph Hellwig  * @end:   end address
1798c1e8d7c6SMichel Lespinasse  * @locked: whether the mmap_lock is still held
1799d3649f68SChristoph Hellwig  *
1800d3649f68SChristoph Hellwig  * This takes care of mlocking the pages too if VM_LOCKED is set.
1801d3649f68SChristoph Hellwig  *
18020a36f7f8STang Yizhou  * Return either number of pages pinned in the vma, or a negative error
18030a36f7f8STang Yizhou  * code on error.
1804d3649f68SChristoph Hellwig  *
1805c1e8d7c6SMichel Lespinasse  * vma->vm_mm->mmap_lock must be held.
1806d3649f68SChristoph Hellwig  *
18074f6da934SPeter Xu  * If @locked is NULL, it may be held for read or write and will
1808d3649f68SChristoph Hellwig  * be unperturbed.
1809d3649f68SChristoph Hellwig  *
18104f6da934SPeter Xu  * If @locked is non-NULL, it must held for read only and may be
18114f6da934SPeter Xu  * released.  If it's released, *@locked will be set to 0.
1812d3649f68SChristoph Hellwig  */
populate_vma_page_range(struct vm_area_struct * vma,unsigned long start,unsigned long end,int * locked)1813d3649f68SChristoph Hellwig long populate_vma_page_range(struct vm_area_struct *vma,
18144f6da934SPeter Xu 		unsigned long start, unsigned long end, int *locked)
1815d3649f68SChristoph Hellwig {
1816d3649f68SChristoph Hellwig 	struct mm_struct *mm = vma->vm_mm;
1817d3649f68SChristoph Hellwig 	unsigned long nr_pages = (end - start) / PAGE_SIZE;
18189a863a6aSJason Gunthorpe 	int local_locked = 1;
1819d3649f68SChristoph Hellwig 	int gup_flags;
1820ece369c7SHugh Dickins 	long ret;
1821d3649f68SChristoph Hellwig 
1822792b429dSDavid Hildenbrand 	VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
1823792b429dSDavid Hildenbrand 	VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
1824792b429dSDavid Hildenbrand 	VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma);
1825792b429dSDavid Hildenbrand 	VM_WARN_ON_ONCE_VMA(end   > vma->vm_end, vma);
182642fc5414SMichel Lespinasse 	mmap_assert_locked(mm);
1827d3649f68SChristoph Hellwig 
1828b67bf49cSHugh Dickins 	/*
1829b67bf49cSHugh Dickins 	 * Rightly or wrongly, the VM_LOCKONFAULT case has never used
1830b67bf49cSHugh Dickins 	 * faultin_page() to break COW, so it has no work to do here.
1831b67bf49cSHugh Dickins 	 */
1832d3649f68SChristoph Hellwig 	if (vma->vm_flags & VM_LOCKONFAULT)
1833b67bf49cSHugh Dickins 		return nr_pages;
1834b67bf49cSHugh Dickins 
18351096bc93SLinus Torvalds 	/* ... similarly, we've never faulted in PROT_NONE pages */
18361096bc93SLinus Torvalds 	if (!vma_is_accessible(vma))
18371096bc93SLinus Torvalds 		return -EFAULT;
18381096bc93SLinus Torvalds 
1839b67bf49cSHugh Dickins 	gup_flags = FOLL_TOUCH;
1840d3649f68SChristoph Hellwig 	/*
1841d3649f68SChristoph Hellwig 	 * We want to touch writable mappings with a write fault in order
1842d3649f68SChristoph Hellwig 	 * to break COW, except for shared mappings because these don't COW
1843d3649f68SChristoph Hellwig 	 * and we would not want to dirty them for nothing.
18441096bc93SLinus Torvalds 	 *
18451096bc93SLinus Torvalds 	 * Otherwise, do a read fault, and use FOLL_FORCE in case it's not
18461096bc93SLinus Torvalds 	 * readable (ie write-only or executable).
1847d3649f68SChristoph Hellwig 	 */
1848d3649f68SChristoph Hellwig 	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1849d3649f68SChristoph Hellwig 		gup_flags |= FOLL_WRITE;
18501096bc93SLinus Torvalds 	else
1851d3649f68SChristoph Hellwig 		gup_flags |= FOLL_FORCE;
1852d3649f68SChristoph Hellwig 
1853f04740f5SJason Gunthorpe 	if (locked)
1854f04740f5SJason Gunthorpe 		gup_flags |= FOLL_UNLOCKABLE;
1855f04740f5SJason Gunthorpe 
1856d3649f68SChristoph Hellwig 	/*
1857d3649f68SChristoph Hellwig 	 * We made sure addr is within a VMA, so the following will
1858d3649f68SChristoph Hellwig 	 * not result in a stack expansion that recurses back here.
1859d3649f68SChristoph Hellwig 	 */
1860ece369c7SHugh Dickins 	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
1861b2cac248SLorenzo Stoakes 			       NULL, locked ? locked : &local_locked);
1862ece369c7SHugh Dickins 	lru_add_drain();
1863ece369c7SHugh Dickins 	return ret;
1864d3649f68SChristoph Hellwig }
1865d3649f68SChristoph Hellwig 
1866d3649f68SChristoph Hellwig /*
1867631426baSDavid Hildenbrand  * faultin_page_range() - populate (prefault) page tables inside the
1868631426baSDavid Hildenbrand  *			  given range readable/writable
18694ca9b385SDavid Hildenbrand  *
18704ca9b385SDavid Hildenbrand  * This takes care of mlocking the pages, too, if VM_LOCKED is set.
18714ca9b385SDavid Hildenbrand  *
1872631426baSDavid Hildenbrand  * @mm: the mm to populate page tables in
18734ca9b385SDavid Hildenbrand  * @start: start address
18744ca9b385SDavid Hildenbrand  * @end: end address
18754ca9b385SDavid Hildenbrand  * @write: whether to prefault readable or writable
18764ca9b385SDavid Hildenbrand  * @locked: whether the mmap_lock is still held
18774ca9b385SDavid Hildenbrand  *
1878631426baSDavid Hildenbrand  * Returns either number of processed pages in the MM, or a negative error
1879631426baSDavid Hildenbrand  * code on error (see __get_user_pages()). Note that this function reports
1880631426baSDavid Hildenbrand  * errors related to VMAs, such as incompatible mappings, as expected by
1881631426baSDavid Hildenbrand  * MADV_POPULATE_(READ|WRITE).
18824ca9b385SDavid Hildenbrand  *
1883631426baSDavid Hildenbrand  * The range must be page-aligned.
1884631426baSDavid Hildenbrand  *
1885631426baSDavid Hildenbrand  * mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
18864ca9b385SDavid Hildenbrand  */
faultin_page_range(struct mm_struct * mm,unsigned long start,unsigned long end,bool write,int * locked)1887631426baSDavid Hildenbrand long faultin_page_range(struct mm_struct *mm, unsigned long start,
18884ca9b385SDavid Hildenbrand 			unsigned long end, bool write, int *locked)
18894ca9b385SDavid Hildenbrand {
18904ca9b385SDavid Hildenbrand 	unsigned long nr_pages = (end - start) / PAGE_SIZE;
18914ca9b385SDavid Hildenbrand 	int gup_flags;
1892ece369c7SHugh Dickins 	long ret;
18934ca9b385SDavid Hildenbrand 
1894792b429dSDavid Hildenbrand 	VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
1895792b429dSDavid Hildenbrand 	VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
18964ca9b385SDavid Hildenbrand 	mmap_assert_locked(mm);
18974ca9b385SDavid Hildenbrand 
18984ca9b385SDavid Hildenbrand 	/*
18994ca9b385SDavid Hildenbrand 	 * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
19004ca9b385SDavid Hildenbrand 	 *	       the page dirty with FOLL_WRITE -- which doesn't make a
19014ca9b385SDavid Hildenbrand 	 *	       difference with !FOLL_FORCE, because the page is writable
19024ca9b385SDavid Hildenbrand 	 *	       in the page table.
19034ca9b385SDavid Hildenbrand 	 * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
19044ca9b385SDavid Hildenbrand 	 *		  a poisoned page.
19054ca9b385SDavid Hildenbrand 	 * !FOLL_FORCE: Require proper access permissions.
19064ca9b385SDavid Hildenbrand 	 */
1907631426baSDavid Hildenbrand 	gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
1908631426baSDavid Hildenbrand 		    FOLL_MADV_POPULATE;
19094ca9b385SDavid Hildenbrand 	if (write)
19104ca9b385SDavid Hildenbrand 		gup_flags |= FOLL_WRITE;
19114ca9b385SDavid Hildenbrand 
1912631426baSDavid Hildenbrand 	ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
1913631426baSDavid Hildenbrand 				      gup_flags);
1914ece369c7SHugh Dickins 	lru_add_drain();
1915ece369c7SHugh Dickins 	return ret;
19164ca9b385SDavid Hildenbrand }
19174ca9b385SDavid Hildenbrand 
19184ca9b385SDavid Hildenbrand /*
1919d3649f68SChristoph Hellwig  * __mm_populate - populate and/or mlock pages within a range of address space.
1920d3649f68SChristoph Hellwig  *
1921d3649f68SChristoph Hellwig  * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1922d3649f68SChristoph Hellwig  * flags. VMAs must be already marked with the desired vm_flags, and
1923c1e8d7c6SMichel Lespinasse  * mmap_lock must not be held.
1924d3649f68SChristoph Hellwig  */
__mm_populate(unsigned long start,unsigned long len,int ignore_errors)1925d3649f68SChristoph Hellwig int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1926d3649f68SChristoph Hellwig {
1927d3649f68SChristoph Hellwig 	struct mm_struct *mm = current->mm;
1928d3649f68SChristoph Hellwig 	unsigned long end, nstart, nend;
1929d3649f68SChristoph Hellwig 	struct vm_area_struct *vma = NULL;
1930d3649f68SChristoph Hellwig 	int locked = 0;
1931d3649f68SChristoph Hellwig 	long ret = 0;
1932d3649f68SChristoph Hellwig 
1933d3649f68SChristoph Hellwig 	end = start + len;
1934d3649f68SChristoph Hellwig 
1935d3649f68SChristoph Hellwig 	for (nstart = start; nstart < end; nstart = nend) {
1936d3649f68SChristoph Hellwig 		/*
1937d3649f68SChristoph Hellwig 		 * We want to fault in pages for [nstart; end) address range.
1938d3649f68SChristoph Hellwig 		 * Find first corresponding VMA.
1939d3649f68SChristoph Hellwig 		 */
1940d3649f68SChristoph Hellwig 		if (!locked) {
1941d3649f68SChristoph Hellwig 			locked = 1;
1942d8ed45c5SMichel Lespinasse 			mmap_read_lock(mm);
1943c4d1a92dSLiam R. Howlett 			vma = find_vma_intersection(mm, nstart, end);
1944d3649f68SChristoph Hellwig 		} else if (nstart >= vma->vm_end)
1945c4d1a92dSLiam R. Howlett 			vma = find_vma_intersection(mm, vma->vm_end, end);
1946c4d1a92dSLiam R. Howlett 
1947c4d1a92dSLiam R. Howlett 		if (!vma)
1948d3649f68SChristoph Hellwig 			break;
1949d3649f68SChristoph Hellwig 		/*
1950d3649f68SChristoph Hellwig 		 * Set [nstart; nend) to intersection of desired address
1951d3649f68SChristoph Hellwig 		 * range with the first VMA. Also, skip undesirable VMA types.
1952d3649f68SChristoph Hellwig 		 */
1953d3649f68SChristoph Hellwig 		nend = min(end, vma->vm_end);
1954d3649f68SChristoph Hellwig 		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1955d3649f68SChristoph Hellwig 			continue;
1956d3649f68SChristoph Hellwig 		if (nstart < vma->vm_start)
1957d3649f68SChristoph Hellwig 			nstart = vma->vm_start;
1958d3649f68SChristoph Hellwig 		/*
1959d3649f68SChristoph Hellwig 		 * Now fault in a range of pages. populate_vma_page_range()
1960d3649f68SChristoph Hellwig 		 * double checks the vma flags, so that it won't mlock pages
1961d3649f68SChristoph Hellwig 		 * if the vma was already munlocked.
1962d3649f68SChristoph Hellwig 		 */
1963d3649f68SChristoph Hellwig 		ret = populate_vma_page_range(vma, nstart, nend, &locked);
1964d3649f68SChristoph Hellwig 		if (ret < 0) {
1965d3649f68SChristoph Hellwig 			if (ignore_errors) {
1966d3649f68SChristoph Hellwig 				ret = 0;
1967d3649f68SChristoph Hellwig 				continue;	/* continue at next VMA */
1968d3649f68SChristoph Hellwig 			}
1969d3649f68SChristoph Hellwig 			break;
1970d3649f68SChristoph Hellwig 		}
1971d3649f68SChristoph Hellwig 		nend = nstart + ret * PAGE_SIZE;
1972d3649f68SChristoph Hellwig 		ret = 0;
1973d3649f68SChristoph Hellwig 	}
1974d3649f68SChristoph Hellwig 	if (locked)
1975d8ed45c5SMichel Lespinasse 		mmap_read_unlock(mm);
1976d3649f68SChristoph Hellwig 	return ret;	/* 0 or negative error code */
1977d3649f68SChristoph Hellwig }
1978050a9adcSChristoph Hellwig #else /* CONFIG_MMU */
__get_user_pages_locked(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,struct page ** pages,int * locked,unsigned int foll_flags)197964019a2eSPeter Xu static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
1980050a9adcSChristoph Hellwig 		unsigned long nr_pages, struct page **pages,
1981b2cac248SLorenzo Stoakes 		int *locked, unsigned int foll_flags)
1982050a9adcSChristoph Hellwig {
1983050a9adcSChristoph Hellwig 	struct vm_area_struct *vma;
1984b2a72dffSJason Gunthorpe 	bool must_unlock = false;
1985bfbe7110SLorenzo Stoakes 	vm_flags_t vm_flags;
198624dc20c7SPavel Tatashin 	long i;
1987050a9adcSChristoph Hellwig 
1988b2a72dffSJason Gunthorpe 	if (!nr_pages)
1989b2a72dffSJason Gunthorpe 		return 0;
1990b2a72dffSJason Gunthorpe 
1991b2a72dffSJason Gunthorpe 	/*
1992b2a72dffSJason Gunthorpe 	 * The internal caller expects GUP to manage the lock internally and the
1993b2a72dffSJason Gunthorpe 	 * lock must be released when this returns.
1994b2a72dffSJason Gunthorpe 	 */
19959a863a6aSJason Gunthorpe 	if (!*locked) {
1996b2a72dffSJason Gunthorpe 		if (mmap_read_lock_killable(mm))
1997b2a72dffSJason Gunthorpe 			return -EAGAIN;
1998b2a72dffSJason Gunthorpe 		must_unlock = true;
1999b2a72dffSJason Gunthorpe 		*locked = 1;
2000b2a72dffSJason Gunthorpe 	}
2001b2a72dffSJason Gunthorpe 
2002050a9adcSChristoph Hellwig 	/* calculate required read or write permissions.
2003050a9adcSChristoph Hellwig 	 * If FOLL_FORCE is set, we only require the "MAY" flags.
2004050a9adcSChristoph Hellwig 	 */
2005050a9adcSChristoph Hellwig 	vm_flags  = (foll_flags & FOLL_WRITE) ?
2006050a9adcSChristoph Hellwig 			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
2007050a9adcSChristoph Hellwig 	vm_flags &= (foll_flags & FOLL_FORCE) ?
2008050a9adcSChristoph Hellwig 			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
2009050a9adcSChristoph Hellwig 
2010050a9adcSChristoph Hellwig 	for (i = 0; i < nr_pages; i++) {
2011050a9adcSChristoph Hellwig 		vma = find_vma(mm, start);
2012050a9adcSChristoph Hellwig 		if (!vma)
2013b2a72dffSJason Gunthorpe 			break;
2014050a9adcSChristoph Hellwig 
2015050a9adcSChristoph Hellwig 		/* protect what we can, including chardevs */
2016050a9adcSChristoph Hellwig 		if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
2017050a9adcSChristoph Hellwig 		    !(vm_flags & vma->vm_flags))
2018b2a72dffSJason Gunthorpe 			break;
2019050a9adcSChristoph Hellwig 
2020050a9adcSChristoph Hellwig 		if (pages) {
2021396a400bSLinus Walleij 			pages[i] = virt_to_page((void *)start);
2022050a9adcSChristoph Hellwig 			if (pages[i])
2023050a9adcSChristoph Hellwig 				get_page(pages[i]);
2024050a9adcSChristoph Hellwig 		}
2025b2cac248SLorenzo Stoakes 
2026050a9adcSChristoph Hellwig 		start = (start + PAGE_SIZE) & PAGE_MASK;
2027050a9adcSChristoph Hellwig 	}
2028050a9adcSChristoph Hellwig 
2029b2a72dffSJason Gunthorpe 	if (must_unlock && *locked) {
2030b2a72dffSJason Gunthorpe 		mmap_read_unlock(mm);
2031b2a72dffSJason Gunthorpe 		*locked = 0;
2032b2a72dffSJason Gunthorpe 	}
2033050a9adcSChristoph Hellwig 
2034050a9adcSChristoph Hellwig 	return i ? : -EFAULT;
2035050a9adcSChristoph Hellwig }
2036050a9adcSChristoph Hellwig #endif /* !CONFIG_MMU */
2037d3649f68SChristoph Hellwig 
20388f942eeaSJann Horn /**
2039bb523b40SAndreas Gruenbacher  * fault_in_writeable - fault in userspace address range for writing
2040bb523b40SAndreas Gruenbacher  * @uaddr: start of address range
2041bb523b40SAndreas Gruenbacher  * @size: size of address range
2042bb523b40SAndreas Gruenbacher  *
2043bb523b40SAndreas Gruenbacher  * Returns the number of bytes not faulted in (like copy_to_user() and
2044bb523b40SAndreas Gruenbacher  * copy_from_user()).
2045bb523b40SAndreas Gruenbacher  */
fault_in_writeable(char __user * uaddr,size_t size)2046bb523b40SAndreas Gruenbacher size_t fault_in_writeable(char __user *uaddr, size_t size)
2047bb523b40SAndreas Gruenbacher {
2048a7797e74SBaoquan He 	const unsigned long start = (unsigned long)uaddr;
2049a7797e74SBaoquan He 	const unsigned long end = start + size;
2050a7797e74SBaoquan He 	unsigned long cur;
2051bb523b40SAndreas Gruenbacher 
2052bb523b40SAndreas Gruenbacher 	if (unlikely(size == 0))
2053bb523b40SAndreas Gruenbacher 		return 0;
2054677b2a8cSChristophe Leroy 	if (!user_write_access_begin(uaddr, size))
2055bb523b40SAndreas Gruenbacher 		return size;
2056bb523b40SAndreas Gruenbacher 
2057a7797e74SBaoquan He 	/* Stop once we overflow to 0. */
2058a7797e74SBaoquan He 	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
2059a7797e74SBaoquan He 		unsafe_put_user(0, (char __user *)cur, out);
2060bb523b40SAndreas Gruenbacher out:
2061677b2a8cSChristophe Leroy 	user_write_access_end();
2062a7797e74SBaoquan He 	if (size > cur - start)
2063a7797e74SBaoquan He 		return size - (cur - start);
2064bb523b40SAndreas Gruenbacher 	return 0;
2065bb523b40SAndreas Gruenbacher }
2066bb523b40SAndreas Gruenbacher EXPORT_SYMBOL(fault_in_writeable);
2067bb523b40SAndreas Gruenbacher 
2068da32b581SCatalin Marinas /**
2069da32b581SCatalin Marinas  * fault_in_subpage_writeable - fault in an address range for writing
2070da32b581SCatalin Marinas  * @uaddr: start of address range
2071da32b581SCatalin Marinas  * @size: size of address range
2072da32b581SCatalin Marinas  *
2073da32b581SCatalin Marinas  * Fault in a user address range for writing while checking for permissions at
2074da32b581SCatalin Marinas  * sub-page granularity (e.g. arm64 MTE). This function should be used when
2075da32b581SCatalin Marinas  * the caller cannot guarantee forward progress of a copy_to_user() loop.
2076da32b581SCatalin Marinas  *
2077da32b581SCatalin Marinas  * Returns the number of bytes not faulted in (like copy_to_user() and
2078da32b581SCatalin Marinas  * copy_from_user()).
2079da32b581SCatalin Marinas  */
fault_in_subpage_writeable(char __user * uaddr,size_t size)2080da32b581SCatalin Marinas size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
2081da32b581SCatalin Marinas {
2082da32b581SCatalin Marinas 	size_t faulted_in;
2083da32b581SCatalin Marinas 
2084da32b581SCatalin Marinas 	/*
2085da32b581SCatalin Marinas 	 * Attempt faulting in at page granularity first for page table
2086da32b581SCatalin Marinas 	 * permission checking. The arch-specific probe_subpage_writeable()
2087da32b581SCatalin Marinas 	 * functions may not check for this.
2088da32b581SCatalin Marinas 	 */
2089da32b581SCatalin Marinas 	faulted_in = size - fault_in_writeable(uaddr, size);
2090da32b581SCatalin Marinas 	if (faulted_in)
2091da32b581SCatalin Marinas 		faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
2092da32b581SCatalin Marinas 
2093da32b581SCatalin Marinas 	return size - faulted_in;
2094da32b581SCatalin Marinas }
2095da32b581SCatalin Marinas EXPORT_SYMBOL(fault_in_subpage_writeable);
2096da32b581SCatalin Marinas 
2097cdd591fcSAndreas Gruenbacher /*
2098cdd591fcSAndreas Gruenbacher  * fault_in_safe_writeable - fault in an address range for writing
2099cdd591fcSAndreas Gruenbacher  * @uaddr: start of address range
2100cdd591fcSAndreas Gruenbacher  * @size: length of address range
2101cdd591fcSAndreas Gruenbacher  *
2102fe673d3fSLinus Torvalds  * Faults in an address range for writing.  This is primarily useful when we
2103fe673d3fSLinus Torvalds  * already know that some or all of the pages in the address range aren't in
2104fe673d3fSLinus Torvalds  * memory.
2105cdd591fcSAndreas Gruenbacher  *
2106fe673d3fSLinus Torvalds  * Unlike fault_in_writeable(), this function is non-destructive.
2107cdd591fcSAndreas Gruenbacher  *
2108cdd591fcSAndreas Gruenbacher  * Note that we don't pin or otherwise hold the pages referenced that we fault
2109cdd591fcSAndreas Gruenbacher  * in.  There's no guarantee that they'll stay in memory for any duration of
2110cdd591fcSAndreas Gruenbacher  * time.
2111cdd591fcSAndreas Gruenbacher  *
2112cdd591fcSAndreas Gruenbacher  * Returns the number of bytes not faulted in, like copy_to_user() and
2113cdd591fcSAndreas Gruenbacher  * copy_from_user().
2114cdd591fcSAndreas Gruenbacher  */
fault_in_safe_writeable(const char __user * uaddr,size_t size)2115cdd591fcSAndreas Gruenbacher size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
2116cdd591fcSAndreas Gruenbacher {
2117a7797e74SBaoquan He 	const unsigned long start = (unsigned long)uaddr;
2118a7797e74SBaoquan He 	const unsigned long end = start + size;
2119a7797e74SBaoquan He 	unsigned long cur;
2120cdd591fcSAndreas Gruenbacher 	struct mm_struct *mm = current->mm;
2121fe673d3fSLinus Torvalds 	bool unlocked = false;
2122cdd591fcSAndreas Gruenbacher 
2123fe673d3fSLinus Torvalds 	if (unlikely(size == 0))
2124cdd591fcSAndreas Gruenbacher 		return 0;
2125fe673d3fSLinus Torvalds 
2126fe673d3fSLinus Torvalds 	mmap_read_lock(mm);
2127a7797e74SBaoquan He 	/* Stop once we overflow to 0. */
2128a7797e74SBaoquan He 	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
2129a7797e74SBaoquan He 		if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked))
2130fe673d3fSLinus Torvalds 			break;
2131fe673d3fSLinus Torvalds 	mmap_read_unlock(mm);
2132fe673d3fSLinus Torvalds 
2133a7797e74SBaoquan He 	if (size > cur - start)
2134a7797e74SBaoquan He 		return size - (cur - start);
2135fe673d3fSLinus Torvalds 	return 0;
2136cdd591fcSAndreas Gruenbacher }
2137cdd591fcSAndreas Gruenbacher EXPORT_SYMBOL(fault_in_safe_writeable);
2138cdd591fcSAndreas Gruenbacher 
2139bb523b40SAndreas Gruenbacher /**
2140bb523b40SAndreas Gruenbacher  * fault_in_readable - fault in userspace address range for reading
2141bb523b40SAndreas Gruenbacher  * @uaddr: start of user address range
2142bb523b40SAndreas Gruenbacher  * @size: size of user address range
2143bb523b40SAndreas Gruenbacher  *
2144bb523b40SAndreas Gruenbacher  * Returns the number of bytes not faulted in (like copy_to_user() and
2145bb523b40SAndreas Gruenbacher  * copy_from_user()).
2146bb523b40SAndreas Gruenbacher  */
fault_in_readable(const char __user * uaddr,size_t size)2147bb523b40SAndreas Gruenbacher size_t fault_in_readable(const char __user *uaddr, size_t size)
2148bb523b40SAndreas Gruenbacher {
2149a7797e74SBaoquan He 	const unsigned long start = (unsigned long)uaddr;
2150a7797e74SBaoquan He 	const unsigned long end = start + size;
2151a7797e74SBaoquan He 	unsigned long cur;
2152bb523b40SAndreas Gruenbacher 	volatile char c;
2153bb523b40SAndreas Gruenbacher 
2154bb523b40SAndreas Gruenbacher 	if (unlikely(size == 0))
2155bb523b40SAndreas Gruenbacher 		return 0;
2156677b2a8cSChristophe Leroy 	if (!user_read_access_begin(uaddr, size))
2157bb523b40SAndreas Gruenbacher 		return size;
2158bb523b40SAndreas Gruenbacher 
2159a7797e74SBaoquan He 	/* Stop once we overflow to 0. */
2160a7797e74SBaoquan He 	for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))
2161a7797e74SBaoquan He 		unsafe_get_user(c, (const char __user *)cur, out);
2162bb523b40SAndreas Gruenbacher out:
2163677b2a8cSChristophe Leroy 	user_read_access_end();
2164bb523b40SAndreas Gruenbacher 	(void)c;
2165a7797e74SBaoquan He 	if (size > cur - start)
2166a7797e74SBaoquan He 		return size - (cur - start);
2167bb523b40SAndreas Gruenbacher 	return 0;
2168bb523b40SAndreas Gruenbacher }
2169bb523b40SAndreas Gruenbacher EXPORT_SYMBOL(fault_in_readable);
2170bb523b40SAndreas Gruenbacher 
/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 * @locked: a pointer to an int denoting whether the mmap sem is held
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save disk space.
 *
 * Locking follows __get_user_pages_locked(): with *@locked == 0 the
 * mmap_lock is taken and released internally; with *@locked == 1 the caller
 * already holds it.
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr, int *locked)
{
	struct page *page;
	long ret;

	/* Exactly one page is requested, so anything but 1 is a failure. */
	ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
				      FOLL_FORCE | FOLL_DUMP | FOLL_GET);
	return (ret == 1) ? page : NULL;
}
#endif /* CONFIG_ELF_CORE */
21978f942eeaSJann Horn 
2198d1e153feSPavel Tatashin #ifdef CONFIG_MIGRATION
219994efde1dSJohn Hubbard 
220094efde1dSJohn Hubbard /*
220194efde1dSJohn Hubbard  * An array of either pages or folios ("pofs"). Although it may seem tempting to
220294efde1dSJohn Hubbard  * avoid this complication, by simply interpreting a list of folios as a list of
220394efde1dSJohn Hubbard  * pages, that approach won't work in the longer term, because eventually the
220494efde1dSJohn Hubbard  * layouts of struct page and struct folio will become completely different.
220594efde1dSJohn Hubbard  * Furthermore, this pof approach avoids excessive page_folio() calls.
220694efde1dSJohn Hubbard  */
220794efde1dSJohn Hubbard struct pages_or_folios {
220894efde1dSJohn Hubbard 	union {
220994efde1dSJohn Hubbard 		struct page **pages;
221094efde1dSJohn Hubbard 		struct folio **folios;
221194efde1dSJohn Hubbard 		void **entries;
221294efde1dSJohn Hubbard 	};
221394efde1dSJohn Hubbard 	bool has_folios;
221494efde1dSJohn Hubbard 	long nr_entries;
221594efde1dSJohn Hubbard };
221694efde1dSJohn Hubbard 
pofs_get_folio(struct pages_or_folios * pofs,long i)221794efde1dSJohn Hubbard static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
221894efde1dSJohn Hubbard {
221994efde1dSJohn Hubbard 	if (pofs->has_folios)
222094efde1dSJohn Hubbard 		return pofs->folios[i];
222194efde1dSJohn Hubbard 	return page_folio(pofs->pages[i]);
222294efde1dSJohn Hubbard }
222394efde1dSJohn Hubbard 
pofs_clear_entry(struct pages_or_folios * pofs,long i)222494efde1dSJohn Hubbard static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
222594efde1dSJohn Hubbard {
222694efde1dSJohn Hubbard 	pofs->entries[i] = NULL;
222794efde1dSJohn Hubbard }
222894efde1dSJohn Hubbard 
pofs_unpin(struct pages_or_folios * pofs)222994efde1dSJohn Hubbard static void pofs_unpin(struct pages_or_folios *pofs)
223094efde1dSJohn Hubbard {
223194efde1dSJohn Hubbard 	if (pofs->has_folios)
223294efde1dSJohn Hubbard 		unpin_folios(pofs->folios, pofs->nr_entries);
223394efde1dSJohn Hubbard 	else
223494efde1dSJohn Hubbard 		unpin_user_pages(pofs->pages, pofs->nr_entries);
223594efde1dSJohn Hubbard }
223694efde1dSJohn Hubbard 
pofs_next_folio(struct folio * folio,struct pages_or_folios * pofs,long * index_ptr)2237a03db236SLi Zhe static struct folio *pofs_next_folio(struct folio *folio,
2238a03db236SLi Zhe 		struct pages_or_folios *pofs, long *index_ptr)
2239a03db236SLi Zhe {
2240a03db236SLi Zhe 	long i = *index_ptr + 1;
2241a03db236SLi Zhe 
2242a03db236SLi Zhe 	if (!pofs->has_folios && folio_test_large(folio)) {
2243a03db236SLi Zhe 		const unsigned long start_pfn = folio_pfn(folio);
2244a03db236SLi Zhe 		const unsigned long end_pfn = start_pfn + folio_nr_pages(folio);
2245a03db236SLi Zhe 
2246a03db236SLi Zhe 		for (; i < pofs->nr_entries; i++) {
2247a03db236SLi Zhe 			unsigned long pfn = page_to_pfn(pofs->pages[i]);
2248a03db236SLi Zhe 
2249a03db236SLi Zhe 			/* Is this page part of this folio? */
2250a03db236SLi Zhe 			if (pfn < start_pfn || pfn >= end_pfn)
2251a03db236SLi Zhe 				break;
2252a03db236SLi Zhe 		}
2253a03db236SLi Zhe 	}
2254a03db236SLi Zhe 
2255a03db236SLi Zhe 	if (unlikely(i == pofs->nr_entries))
2256a03db236SLi Zhe 		return NULL;
2257a03db236SLi Zhe 	*index_ptr = i;
2258a03db236SLi Zhe 
2259a03db236SLi Zhe 	return pofs_get_folio(pofs, i);
2260a03db236SLi Zhe }
2261a03db236SLi Zhe 
2262f68749ecSPavel Tatashin /*
226353ba78deSVivek Kasireddy  * Returns the number of collected folios. Return value is always >= 0.
2264f68749ecSPavel Tatashin  */
collect_longterm_unpinnable_folios(struct list_head * movable_folio_list,struct pages_or_folios * pofs)2265517f496eSDavid Hildenbrand static unsigned long collect_longterm_unpinnable_folios(
226653ba78deSVivek Kasireddy 		struct list_head *movable_folio_list,
226794efde1dSJohn Hubbard 		struct pages_or_folios *pofs)
22689a4e9f3bSAneesh Kumar K.V {
2269a03db236SLi Zhe 	unsigned long collected = 0;
2270a03db236SLi Zhe 	struct folio *folio;
2271a09a8a1fSHugh Dickins 	int drained = 0;
2272a03db236SLi Zhe 	long i = 0;
22739a4e9f3bSAneesh Kumar K.V 
2274a03db236SLi Zhe 	for (folio = pofs_get_folio(pofs, i); folio;
2275a03db236SLi Zhe 	     folio = pofs_next_folio(folio, pofs, &i)) {
2276f9f38f78SChristoph Hellwig 
22776077c943SAlex Sierra 		if (folio_is_longterm_pinnable(folio))
2278f9f38f78SChristoph Hellwig 			continue;
227967e139b0SAlistair Popple 
2280517f496eSDavid Hildenbrand 		collected++;
2281517f496eSDavid Hildenbrand 
228267e139b0SAlistair Popple 		if (folio_is_device_coherent(folio))
228367e139b0SAlistair Popple 			continue;
228467e139b0SAlistair Popple 
22851b7f7e58SMatthew Wilcox (Oracle) 		if (folio_test_hugetlb(folio)) {
22864c640f12SDavid Hildenbrand 			folio_isolate_hugetlb(folio, movable_folio_list);
2287f9f38f78SChristoph Hellwig 			continue;
2288f9f38f78SChristoph Hellwig 		}
2289f9f38f78SChristoph Hellwig 
22902da6de30SHugh Dickins 		if (drained == 0 && folio_may_be_lru_cached(folio) &&
2291a09a8a1fSHugh Dickins 				folio_ref_count(folio) !=
2292a09a8a1fSHugh Dickins 				folio_expected_ref_count(folio) + 1) {
2293a09a8a1fSHugh Dickins 			lru_add_drain();
2294a09a8a1fSHugh Dickins 			drained = 1;
2295a09a8a1fSHugh Dickins 		}
22962da6de30SHugh Dickins 		if (drained == 1 && folio_may_be_lru_cached(folio) &&
2297a09a8a1fSHugh Dickins 				folio_ref_count(folio) !=
229898c6d259SHugh Dickins 				folio_expected_ref_count(folio) + 1) {
22999a4e9f3bSAneesh Kumar K.V 			lru_add_drain_all();
2300a09a8a1fSHugh Dickins 			drained = 2;
23019a4e9f3bSAneesh Kumar K.V 		}
23029a4e9f3bSAneesh Kumar K.V 
2303be2d5756SBaolin Wang 		if (!folio_isolate_lru(folio))
23046e7f34ebSPavel Tatashin 			continue;
230567e139b0SAlistair Popple 
230653ba78deSVivek Kasireddy 		list_add_tail(&folio->lru, movable_folio_list);
23071b7f7e58SMatthew Wilcox (Oracle) 		node_stat_mod_folio(folio,
23081b7f7e58SMatthew Wilcox (Oracle) 				    NR_ISOLATED_ANON + folio_is_file_lru(folio),
23091b7f7e58SMatthew Wilcox (Oracle) 				    folio_nr_pages(folio));
23109a4e9f3bSAneesh Kumar K.V 	}
2311517f496eSDavid Hildenbrand 
2312517f496eSDavid Hildenbrand 	return collected;
2313f68749ecSPavel Tatashin }
2314f9f38f78SChristoph Hellwig 
231567e139b0SAlistair Popple /*
231653ba78deSVivek Kasireddy  * Unpins all folios and migrates device coherent folios and movable_folio_list.
231753ba78deSVivek Kasireddy  * Returns -EAGAIN if all folios were successfully migrated or -errno for
231853ba78deSVivek Kasireddy  * failure (or partial success).
231967e139b0SAlistair Popple  */
232094efde1dSJohn Hubbard static int
migrate_longterm_unpinnable_folios(struct list_head * movable_folio_list,struct pages_or_folios * pofs)232194efde1dSJohn Hubbard migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list,
232294efde1dSJohn Hubbard 				   struct pages_or_folios *pofs)
232367e139b0SAlistair Popple {
232467e139b0SAlistair Popple 	int ret;
232567e139b0SAlistair Popple 	unsigned long i;
232667e139b0SAlistair Popple 
232794efde1dSJohn Hubbard 	for (i = 0; i < pofs->nr_entries; i++) {
232894efde1dSJohn Hubbard 		struct folio *folio = pofs_get_folio(pofs, i);
232967e139b0SAlistair Popple 
233067e139b0SAlistair Popple 		if (folio_is_device_coherent(folio)) {
233167e139b0SAlistair Popple 			/*
233253ba78deSVivek Kasireddy 			 * Migration will fail if the folio is pinned, so
233353ba78deSVivek Kasireddy 			 * convert the pin on the source folio to a normal
233453ba78deSVivek Kasireddy 			 * reference.
233567e139b0SAlistair Popple 			 */
233694efde1dSJohn Hubbard 			pofs_clear_entry(pofs, i);
233767e139b0SAlistair Popple 			folio_get(folio);
233867e139b0SAlistair Popple 			gup_put_folio(folio, 1, FOLL_PIN);
233967e139b0SAlistair Popple 
23405c8525a3SKefeng Wang 			if (migrate_device_coherent_folio(folio)) {
234167e139b0SAlistair Popple 				ret = -EBUSY;
234267e139b0SAlistair Popple 				goto err;
234367e139b0SAlistair Popple 			}
234467e139b0SAlistair Popple 
234567e139b0SAlistair Popple 			continue;
234667e139b0SAlistair Popple 		}
234767e139b0SAlistair Popple 
234867e139b0SAlistair Popple 		/*
234953ba78deSVivek Kasireddy 		 * We can't migrate folios with unexpected references, so drop
235067e139b0SAlistair Popple 		 * the reference obtained by __get_user_pages_locked().
235153ba78deSVivek Kasireddy 		 * Migrating folios have been added to movable_folio_list after
235267e139b0SAlistair Popple 		 * calling folio_isolate_lru() which takes a reference so the
235353ba78deSVivek Kasireddy 		 * folio won't be freed if it's migrating.
235467e139b0SAlistair Popple 		 */
235594efde1dSJohn Hubbard 		unpin_folio(folio);
235694efde1dSJohn Hubbard 		pofs_clear_entry(pofs, i);
235767e139b0SAlistair Popple 	}
235867e139b0SAlistair Popple 
235953ba78deSVivek Kasireddy 	if (!list_empty(movable_folio_list)) {
2360f9f38f78SChristoph Hellwig 		struct migration_target_control mtc = {
2361f9f38f78SChristoph Hellwig 			.nid = NUMA_NO_NODE,
2362f9f38f78SChristoph Hellwig 			.gfp_mask = GFP_USER | __GFP_NOWARN,
2363e42dfe4eSBaolin Wang 			.reason = MR_LONGTERM_PIN,
2364f9f38f78SChristoph Hellwig 		};
2365f9f38f78SChristoph Hellwig 
236653ba78deSVivek Kasireddy 		if (migrate_pages(movable_folio_list, alloc_migration_target,
2367f0f44638SPavel Tatashin 				  NULL, (unsigned long)&mtc, MIGRATE_SYNC,
236867e139b0SAlistair Popple 				  MR_LONGTERM_PIN, NULL)) {
2369f9f38f78SChristoph Hellwig 			ret = -ENOMEM;
237067e139b0SAlistair Popple 			goto err;
237167e139b0SAlistair Popple 		}
2372f68749ecSPavel Tatashin 	}
2373f68749ecSPavel Tatashin 
237453ba78deSVivek Kasireddy 	putback_movable_pages(movable_folio_list);
237524a95998SAlistair Popple 
237667e139b0SAlistair Popple 	return -EAGAIN;
237767e139b0SAlistair Popple 
237867e139b0SAlistair Popple err:
237994efde1dSJohn Hubbard 	pofs_unpin(pofs);
238053ba78deSVivek Kasireddy 	putback_movable_pages(movable_folio_list);
238167e139b0SAlistair Popple 
238267e139b0SAlistair Popple 	return ret;
238367e139b0SAlistair Popple }
238467e139b0SAlistair Popple 
238594efde1dSJohn Hubbard static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios * pofs)238694efde1dSJohn Hubbard check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
238794efde1dSJohn Hubbard {
238894efde1dSJohn Hubbard 	LIST_HEAD(movable_folio_list);
2389517f496eSDavid Hildenbrand 	unsigned long collected;
239094efde1dSJohn Hubbard 
2391517f496eSDavid Hildenbrand 	collected = collect_longterm_unpinnable_folios(&movable_folio_list,
2392517f496eSDavid Hildenbrand 						       pofs);
2393517f496eSDavid Hildenbrand 	if (!collected)
239494efde1dSJohn Hubbard 		return 0;
239594efde1dSJohn Hubbard 
239694efde1dSJohn Hubbard 	return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
239794efde1dSJohn Hubbard }
239894efde1dSJohn Hubbard 
239967e139b0SAlistair Popple /*
240053ba78deSVivek Kasireddy  * Check whether all folios are *allowed* to be pinned indefinitely (long term).
240153ba78deSVivek Kasireddy  * Rather confusingly, all folios in the range are required to be pinned via
240253ba78deSVivek Kasireddy  * FOLL_PIN, before calling this routine.
240367e139b0SAlistair Popple  *
2404aa6f8b25SJohn Hubbard  * Return values:
240567e139b0SAlistair Popple  *
2406aa6f8b25SJohn Hubbard  * 0: if everything is OK and all folios in the range are allowed to be pinned,
240753ba78deSVivek Kasireddy  * then this routine leaves all folios pinned and returns zero for success.
2408aa6f8b25SJohn Hubbard  *
2409aa6f8b25SJohn Hubbard  * -EAGAIN: if any folios in the range are not allowed to be pinned, then this
2410aa6f8b25SJohn Hubbard  * routine will migrate those folios away, unpin all the folios in the range. If
2411aa6f8b25SJohn Hubbard  * migration of the entire set of folios succeeds, then -EAGAIN is returned. The
2412aa6f8b25SJohn Hubbard  * caller should re-pin the entire range with FOLL_PIN and then call this
2413aa6f8b25SJohn Hubbard  * routine again.
2414aa6f8b25SJohn Hubbard  *
2415aa6f8b25SJohn Hubbard  * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this
2416aa6f8b25SJohn Hubbard  * indicates a migration failure. The caller should give up, and propagate the
2417aa6f8b25SJohn Hubbard  * error back up the call stack. The caller does not need to unpin any folios in
2418aa6f8b25SJohn Hubbard  * that case, because this routine will do the unpinning.
241953ba78deSVivek Kasireddy  */
check_and_migrate_movable_folios(unsigned long nr_folios,struct folio ** folios)242053ba78deSVivek Kasireddy static long check_and_migrate_movable_folios(unsigned long nr_folios,
242153ba78deSVivek Kasireddy 					     struct folio **folios)
242253ba78deSVivek Kasireddy {
242394efde1dSJohn Hubbard 	struct pages_or_folios pofs = {
242494efde1dSJohn Hubbard 		.folios = folios,
242594efde1dSJohn Hubbard 		.has_folios = true,
242694efde1dSJohn Hubbard 		.nr_entries = nr_folios,
242794efde1dSJohn Hubbard 	};
242853ba78deSVivek Kasireddy 
242994efde1dSJohn Hubbard 	return check_and_migrate_movable_pages_or_folios(&pofs);
243053ba78deSVivek Kasireddy }
243153ba78deSVivek Kasireddy 
243253ba78deSVivek Kasireddy /*
2433aa6f8b25SJohn Hubbard  * Return values and behavior are the same as those for
2434aa6f8b25SJohn Hubbard  * check_and_migrate_movable_folios().
243567e139b0SAlistair Popple  */
check_and_migrate_movable_pages(unsigned long nr_pages,struct page ** pages)243667e139b0SAlistair Popple static long check_and_migrate_movable_pages(unsigned long nr_pages,
243767e139b0SAlistair Popple 					    struct page **pages)
243867e139b0SAlistair Popple {
243994efde1dSJohn Hubbard 	struct pages_or_folios pofs = {
244094efde1dSJohn Hubbard 		.pages = pages,
244194efde1dSJohn Hubbard 		.has_folios = false,
244294efde1dSJohn Hubbard 		.nr_entries = nr_pages,
244394efde1dSJohn Hubbard 	};
244467e139b0SAlistair Popple 
244594efde1dSJohn Hubbard 	return check_and_migrate_movable_pages_or_folios(&pofs);
24469a4e9f3bSAneesh Kumar K.V }
24479a4e9f3bSAneesh Kumar K.V #else
/* !CONFIG_MIGRATION: nothing is checked or migrated, always returns 0. */
static long check_and_migrate_movable_pages(unsigned long nr_pages,
					    struct page **pages)
{
	return 0;
}
245353ba78deSVivek Kasireddy 
/* !CONFIG_MIGRATION: nothing is checked or migrated, always returns 0. */
static long check_and_migrate_movable_folios(unsigned long nr_folios,
					     struct folio **folios)
{
	return 0;
}
2459d1e153feSPavel Tatashin #endif /* CONFIG_MIGRATION */
24609a4e9f3bSAneesh Kumar K.V 
24612bb6d283SDan Williams /*
2462932f4a63SIra Weiny  * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
2463932f4a63SIra Weiny  * allows us to process the FOLL_LONGTERM flag.
24642bb6d283SDan Williams  */
__gup_longterm_locked(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,struct page ** pages,int * locked,unsigned int gup_flags)246564019a2eSPeter Xu static long __gup_longterm_locked(struct mm_struct *mm,
2466932f4a63SIra Weiny 				  unsigned long start,
2467932f4a63SIra Weiny 				  unsigned long nr_pages,
2468932f4a63SIra Weiny 				  struct page **pages,
246953b2d09bSJason Gunthorpe 				  int *locked,
2470932f4a63SIra Weiny 				  unsigned int gup_flags)
24712bb6d283SDan Williams {
2472f68749ecSPavel Tatashin 	unsigned int flags;
247324a95998SAlistair Popple 	long rc, nr_pinned_pages;
24742bb6d283SDan Williams 
2475f68749ecSPavel Tatashin 	if (!(gup_flags & FOLL_LONGTERM))
2476b2cac248SLorenzo Stoakes 		return __get_user_pages_locked(mm, start, nr_pages, pages,
247753b2d09bSJason Gunthorpe 					       locked, gup_flags);
247867e139b0SAlistair Popple 
24791a08ae36SPavel Tatashin 	flags = memalloc_pin_save();
2480f68749ecSPavel Tatashin 	do {
248124a95998SAlistair Popple 		nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
2482b2cac248SLorenzo Stoakes 							  pages, locked,
248324a95998SAlistair Popple 							  gup_flags);
248424a95998SAlistair Popple 		if (nr_pinned_pages <= 0) {
248524a95998SAlistair Popple 			rc = nr_pinned_pages;
2486f68749ecSPavel Tatashin 			break;
248724a95998SAlistair Popple 		}
2488d64e2dbcSJason Gunthorpe 
2489d64e2dbcSJason Gunthorpe 		/* FOLL_LONGTERM implies FOLL_PIN */
2490f6d299ecSAlistair Popple 		rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
249124a95998SAlistair Popple 	} while (rc == -EAGAIN);
24921a08ae36SPavel Tatashin 	memalloc_pin_restore(flags);
249324a95998SAlistair Popple 	return rc ? rc : nr_pinned_pages;
24942bb6d283SDan Williams }
2495932f4a63SIra Weiny 
2496d64e2dbcSJason Gunthorpe /*
2497d64e2dbcSJason Gunthorpe  * Check that the given flags are valid for the exported gup/pup interface, and
2498d64e2dbcSJason Gunthorpe  * update them with the required flags that the caller must have set.
2499d64e2dbcSJason Gunthorpe  */
is_valid_gup_args(struct page ** pages,int * locked,unsigned int * gup_flags_p,unsigned int to_set)2500b2cac248SLorenzo Stoakes static bool is_valid_gup_args(struct page **pages, int *locked,
2501b2cac248SLorenzo Stoakes 			      unsigned int *gup_flags_p, unsigned int to_set)
2502447f3e45SBarry Song {
2503d64e2dbcSJason Gunthorpe 	unsigned int gup_flags = *gup_flags_p;
2504d64e2dbcSJason Gunthorpe 
2505447f3e45SBarry Song 	/*
2506d64e2dbcSJason Gunthorpe 	 * These flags not allowed to be specified externally to the gup
2507d64e2dbcSJason Gunthorpe 	 * interfaces:
25080f20bba1SLorenzo Stoakes 	 * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
25097290840dSDavid Hildenbrand 	 * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote()
2510f04740f5SJason Gunthorpe 	 * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
2511447f3e45SBarry Song 	 */
25120f20bba1SLorenzo Stoakes 	if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS))
2513447f3e45SBarry Song 		return false;
2514447f3e45SBarry Song 
2515d64e2dbcSJason Gunthorpe 	gup_flags |= to_set;
2516f04740f5SJason Gunthorpe 	if (locked) {
2517f04740f5SJason Gunthorpe 		/* At the external interface locked must be set */
2518f04740f5SJason Gunthorpe 		if (WARN_ON_ONCE(*locked != 1))
2519f04740f5SJason Gunthorpe 			return false;
2520f04740f5SJason Gunthorpe 
2521f04740f5SJason Gunthorpe 		gup_flags |= FOLL_UNLOCKABLE;
2522f04740f5SJason Gunthorpe 	}
2523d64e2dbcSJason Gunthorpe 
2524d64e2dbcSJason Gunthorpe 	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
2525d64e2dbcSJason Gunthorpe 	if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
2526d64e2dbcSJason Gunthorpe 			 (FOLL_PIN | FOLL_GET)))
2527d64e2dbcSJason Gunthorpe 		return false;
2528d64e2dbcSJason Gunthorpe 
2529d64e2dbcSJason Gunthorpe 	/* LONGTERM can only be specified when pinning */
2530d64e2dbcSJason Gunthorpe 	if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM)))
2531d64e2dbcSJason Gunthorpe 		return false;
2532d64e2dbcSJason Gunthorpe 
2533d64e2dbcSJason Gunthorpe 	/* Pages input must be given if using GET/PIN */
2534d64e2dbcSJason Gunthorpe 	if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages))
2535d64e2dbcSJason Gunthorpe 		return false;
2536d64e2dbcSJason Gunthorpe 
2537d64e2dbcSJason Gunthorpe 	/* We want to allow the pgmap to be hot-unplugged at all times */
2538d64e2dbcSJason Gunthorpe 	if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) &&
2539d64e2dbcSJason Gunthorpe 			 (gup_flags & FOLL_PCI_P2PDMA)))
2540d64e2dbcSJason Gunthorpe 		return false;
2541d64e2dbcSJason Gunthorpe 
2542d64e2dbcSJason Gunthorpe 	*gup_flags_p = gup_flags;
2543447f3e45SBarry Song 	return true;
2544447f3e45SBarry Song }
2545447f3e45SBarry Song 
254622bf29b6SJohn Hubbard #ifdef CONFIG_MMU
2547adc8cb40SSouptick Joarder /**
2548c4237f8bSJohn Hubbard  * get_user_pages_remote() - pin user pages in memory
2549c4237f8bSJohn Hubbard  * @mm:		mm_struct of target mm
2550c4237f8bSJohn Hubbard  * @start:	starting user address
2551c4237f8bSJohn Hubbard  * @nr_pages:	number of pages from start to pin
2552c4237f8bSJohn Hubbard  * @gup_flags:	flags modifying lookup behaviour
2553c4237f8bSJohn Hubbard  * @pages:	array that receives pointers to the pages pinned.
2554c4237f8bSJohn Hubbard  *		Should be at least nr_pages long. Or NULL, if caller
2555c4237f8bSJohn Hubbard  *		only intends to ensure the pages are faulted in.
2556c4237f8bSJohn Hubbard  * @locked:	pointer to lock flag indicating whether lock is held and
2557c4237f8bSJohn Hubbard  *		subsequently whether VM_FAULT_RETRY functionality can be
2558c4237f8bSJohn Hubbard  *		utilised. Lock must initially be held.
2559c4237f8bSJohn Hubbard  *
2560c4237f8bSJohn Hubbard  * Returns either number of pages pinned (which may be less than the
2561c4237f8bSJohn Hubbard  * number requested), or an error. Details about the return value:
2562c4237f8bSJohn Hubbard  *
2563c4237f8bSJohn Hubbard  * -- If nr_pages is 0, returns 0.
2564c4237f8bSJohn Hubbard  * -- If nr_pages is >0, but no pages were pinned, returns -errno.
2565c4237f8bSJohn Hubbard  * -- If nr_pages is >0, and some pages were pinned, returns the number of
2566c4237f8bSJohn Hubbard  *    pages pinned. Again, this may be less than nr_pages.
2567c4237f8bSJohn Hubbard  *
2568c4237f8bSJohn Hubbard  * The caller is responsible for releasing returned @pages, via put_page().
2569c4237f8bSJohn Hubbard  *
2570c1e8d7c6SMichel Lespinasse  * Must be called with mmap_lock held for read or write.
2571c4237f8bSJohn Hubbard  *
2572adc8cb40SSouptick Joarder  * get_user_pages_remote walks a process's page tables and takes a reference
2573adc8cb40SSouptick Joarder  * to each struct page that each user address corresponds to at a given
2574c4237f8bSJohn Hubbard  * instant. That is, it takes the page that would be accessed if a user
2575c4237f8bSJohn Hubbard  * thread accesses the given user virtual address at that instant.
2576c4237f8bSJohn Hubbard  *
2577c4237f8bSJohn Hubbard  * This does not guarantee that the page exists in the user mappings when
2578adc8cb40SSouptick Joarder  * get_user_pages_remote returns, and there may even be a completely different
2579c4237f8bSJohn Hubbard  * page there in some cases (eg. if mmapped pagecache has been invalidated
25805da1a868SJingyu Wang  * and subsequently re-faulted). However it does guarantee that the page
2581c4237f8bSJohn Hubbard  * won't be freed completely. And mostly callers simply care that the page
2582c4237f8bSJohn Hubbard  * contains data that was valid *at some point in time*. Typically, an IO
2583c4237f8bSJohn Hubbard  * or similar operation cannot guarantee anything stronger anyway because
2584c4237f8bSJohn Hubbard  * locks can't be held over the syscall boundary.
2585c4237f8bSJohn Hubbard  *
2586c4237f8bSJohn Hubbard  * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
2587c4237f8bSJohn Hubbard  * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
2588c4237f8bSJohn Hubbard  * be called after the page is finished with, and before put_page is called.
2589c4237f8bSJohn Hubbard  *
2590adc8cb40SSouptick Joarder  * get_user_pages_remote is typically used for fewer-copy IO operations,
2591adc8cb40SSouptick Joarder  * to get a handle on the memory by some means other than accesses
2592adc8cb40SSouptick Joarder  * via the user virtual addresses. The pages may be submitted for
2593adc8cb40SSouptick Joarder  * DMA to devices or accessed via their kernel linear mapping (via the
2594adc8cb40SSouptick Joarder  * kmap APIs). Care should be taken to use the correct cache flushing APIs.
2595c4237f8bSJohn Hubbard  *
2596c4237f8bSJohn Hubbard  * See also get_user_pages_fast, for performance critical applications.
2597c4237f8bSJohn Hubbard  *
2598adc8cb40SSouptick Joarder  * get_user_pages_remote should be phased out in favor of
2599c4237f8bSJohn Hubbard  * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
2600adc8cb40SSouptick Joarder  * should use get_user_pages_remote because it cannot pass
2601c4237f8bSJohn Hubbard  * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
2602c4237f8bSJohn Hubbard  */
get_user_pages_remote(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages,int * locked)260364019a2eSPeter Xu long get_user_pages_remote(struct mm_struct *mm,
2604c4237f8bSJohn Hubbard 		unsigned long start, unsigned long nr_pages,
2605c4237f8bSJohn Hubbard 		unsigned int gup_flags, struct page **pages,
2606ca5e8632SLorenzo Stoakes 		int *locked)
2607c4237f8bSJohn Hubbard {
26089a863a6aSJason Gunthorpe 	int local_locked = 1;
26099a863a6aSJason Gunthorpe 
2610b2cac248SLorenzo Stoakes 	if (!is_valid_gup_args(pages, locked, &gup_flags,
2611d64e2dbcSJason Gunthorpe 			       FOLL_TOUCH | FOLL_REMOTE))
2612eddb1c22SJohn Hubbard 		return -EINVAL;
2613eddb1c22SJohn Hubbard 
2614b2cac248SLorenzo Stoakes 	return __get_user_pages_locked(mm, start, nr_pages, pages,
26159a863a6aSJason Gunthorpe 				       locked ? locked : &local_locked,
2616d64e2dbcSJason Gunthorpe 				       gup_flags);
2617c4237f8bSJohn Hubbard }
2618c4237f8bSJohn Hubbard EXPORT_SYMBOL(get_user_pages_remote);
2619c4237f8bSJohn Hubbard 
2620eddb1c22SJohn Hubbard #else /* CONFIG_MMU */
/* !CONFIG_MMU variant: performs no lookup and always returns 0. */
long get_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	return 0;
}
2628eddb1c22SJohn Hubbard #endif /* !CONFIG_MMU */
2629eddb1c22SJohn Hubbard 
2630adc8cb40SSouptick Joarder /**
2631adc8cb40SSouptick Joarder  * get_user_pages() - pin user pages in memory
2632adc8cb40SSouptick Joarder  * @start:      starting user address
2633adc8cb40SSouptick Joarder  * @nr_pages:   number of pages from start to pin
2634adc8cb40SSouptick Joarder  * @gup_flags:  flags modifying lookup behaviour
2635adc8cb40SSouptick Joarder  * @pages:      array that receives pointers to the pages pinned.
2636adc8cb40SSouptick Joarder  *              Should be at least nr_pages long. Or NULL, if caller
2637adc8cb40SSouptick Joarder  *              only intends to ensure the pages are faulted in.
2638adc8cb40SSouptick Joarder  *
263964019a2eSPeter Xu  * This is the same as get_user_pages_remote(), just with a less-flexible
264064019a2eSPeter Xu  * calling convention where we assume that the mm being operated on belongs to
264164019a2eSPeter Xu  * the current task, and doesn't allow passing of a locked parameter.  We also
264264019a2eSPeter Xu  * obviously don't pass FOLL_REMOTE in here.
2643932f4a63SIra Weiny  */
get_user_pages(unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages)2644932f4a63SIra Weiny long get_user_pages(unsigned long start, unsigned long nr_pages,
264554d02069SLorenzo Stoakes 		    unsigned int gup_flags, struct page **pages)
2646932f4a63SIra Weiny {
26479a863a6aSJason Gunthorpe 	int locked = 1;
26489a863a6aSJason Gunthorpe 
2649b2cac248SLorenzo Stoakes 	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
2650eddb1c22SJohn Hubbard 		return -EINVAL;
2651eddb1c22SJohn Hubbard 
2652afa3c33eSJason Gunthorpe 	return __get_user_pages_locked(current->mm, start, nr_pages, pages,
2653b2cac248SLorenzo Stoakes 				       &locked, gup_flags);
2654932f4a63SIra Weiny }
2655932f4a63SIra Weiny EXPORT_SYMBOL(get_user_pages);
26562bb6d283SDan Williams 
/*
 * get_user_pages_unlocked() is suitable to replace the form:
 *
 *      mmap_read_lock(current->mm);
 *      get_user_pages(start, nr_pages, gup_flags, pages);
 *      mmap_read_unlock(current->mm);
 *
 *  with:
 *
 *      get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
 *
 * It is functionally equivalent to get_user_pages_fast so
 * get_user_pages_fast should be used instead if specific gup_flags
 * (e.g. FOLL_FORCE) are not required.
 */
get_user_pages_unlocked(unsigned long start,unsigned long nr_pages,struct page ** pages,unsigned int gup_flags)2672d3649f68SChristoph Hellwig long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2673d3649f68SChristoph Hellwig 			     struct page **pages, unsigned int gup_flags)
2674acc3c8d1SKirill A. Shutemov {
2675b2a72dffSJason Gunthorpe 	int locked = 0;
2676acc3c8d1SKirill A. Shutemov 
2677b2cac248SLorenzo Stoakes 	if (!is_valid_gup_args(pages, NULL, &gup_flags,
2678f04740f5SJason Gunthorpe 			       FOLL_TOUCH | FOLL_UNLOCKABLE))
2679d64e2dbcSJason Gunthorpe 		return -EINVAL;
2680d64e2dbcSJason Gunthorpe 
2681afa3c33eSJason Gunthorpe 	return __get_user_pages_locked(current->mm, start, nr_pages, pages,
2682b2cac248SLorenzo Stoakes 				       &locked, gup_flags);
2683acc3c8d1SKirill A. Shutemov }
2684d3649f68SChristoph Hellwig EXPORT_SYMBOL(get_user_pages_unlocked);
26852667f50eSSteve Capper 
26862667f50eSSteve Capper /*
268723babe19SDavid Hildenbrand  * GUP-fast
26882667f50eSSteve Capper  *
26892667f50eSSteve Capper  * get_user_pages_fast attempts to pin user pages by walking the page
26902667f50eSSteve Capper  * tables directly and avoids taking locks. Thus the walker needs to be
26912667f50eSSteve Capper  * protected from page table pages being freed from under it, and should
26922667f50eSSteve Capper  * block any THP splits.
26932667f50eSSteve Capper  *
26942667f50eSSteve Capper  * One way to achieve this is to have the walker disable interrupts, and
26952667f50eSSteve Capper  * rely on IPIs from the TLB flushing code blocking before the page table
26962667f50eSSteve Capper  * pages are freed. This is unsuitable for architectures that do not need
26972667f50eSSteve Capper  * to broadcast an IPI when invalidating TLBs.
26982667f50eSSteve Capper  *
26992667f50eSSteve Capper  * Another way to achieve this is to batch up page table containing pages
27002667f50eSSteve Capper  * belonging to more than one mm_user, then rcu_sched a callback to free those
270123babe19SDavid Hildenbrand  * pages. Disabling interrupts will allow the gup_fast() walker to both block
27022667f50eSSteve Capper  * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
27032667f50eSSteve Capper  * (which is a relatively rare event). The code below adopts this strategy.
27042667f50eSSteve Capper  *
27052667f50eSSteve Capper  * Before activating this code, please be aware that the following assumptions
27062667f50eSSteve Capper  * are currently made:
27072667f50eSSteve Capper  *
2708ff2e6d72SPeter Zijlstra  *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
2709e585513bSKirill A. Shutemov  *  free pages containing page tables or TLB flushing requires IPI broadcast.
27102667f50eSSteve Capper  *
27112667f50eSSteve Capper  *  *) ptes can be read atomically by the architecture.
27122667f50eSSteve Capper  *
2713b6c46600Sjianyun.gao  *  *) valid user addresses are below TASK_MAX_SIZE
27142667f50eSSteve Capper  *
27152667f50eSSteve Capper  * The last two assumptions can be relaxed by the addition of helper functions.
27162667f50eSSteve Capper  *
27172667f50eSSteve Capper  * This code is based heavily on the PowerPC implementation by Nick Piggin.
27182667f50eSSteve Capper  */
271925176ad0SDavid Hildenbrand #ifdef CONFIG_HAVE_GUP_FAST
2720a6e79df9SLorenzo Stoakes /*
2721f002882cSDavid Hildenbrand  * Used in the GUP-fast path to determine whether GUP is permitted to work on
2722f002882cSDavid Hildenbrand  * a specific folio.
2723a6e79df9SLorenzo Stoakes  *
2724a6e79df9SLorenzo Stoakes  * This call assumes the caller has pinned the folio, that the lowest page table
2725a6e79df9SLorenzo Stoakes  * level still points to this folio, and that interrupts have been disabled.
2726a6e79df9SLorenzo Stoakes  *
2727f002882cSDavid Hildenbrand  * GUP-fast must reject all secretmem folios.
2728f002882cSDavid Hildenbrand  *
2729a6e79df9SLorenzo Stoakes  * Writing to pinned file-backed dirty tracked folios is inherently problematic
2730a6e79df9SLorenzo Stoakes  * (see comment describing the writable_file_mapping_allowed() function). We
2731a6e79df9SLorenzo Stoakes  * therefore try to avoid the most egregious case of a long-term mapping doing
2732a6e79df9SLorenzo Stoakes  * so.
2733a6e79df9SLorenzo Stoakes  *
2734a6e79df9SLorenzo Stoakes  * This function cannot be as thorough as that one as the VMA is not available
2735a6e79df9SLorenzo Stoakes  * in the fast path, so instead we whitelist known good cases and if in doubt,
2736a6e79df9SLorenzo Stoakes  * fall back to the slow path.
2737a6e79df9SLorenzo Stoakes  */
gup_fast_folio_allowed(struct folio * folio,unsigned int flags)2738f002882cSDavid Hildenbrand static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
2739a6e79df9SLorenzo Stoakes {
2740f002882cSDavid Hildenbrand 	bool reject_file_backed = false;
2741a6e79df9SLorenzo Stoakes 	struct address_space *mapping;
2742f002882cSDavid Hildenbrand 	bool check_secretmem = false;
2743a6e79df9SLorenzo Stoakes 	unsigned long mapping_flags;
2744a6e79df9SLorenzo Stoakes 
2745a6e79df9SLorenzo Stoakes 	/*
2746a6e79df9SLorenzo Stoakes 	 * If we aren't pinning then no problematic write can occur. A long term
2747a6e79df9SLorenzo Stoakes 	 * pin is the most egregious case so this is the one we disallow.
2748a6e79df9SLorenzo Stoakes 	 */
2749f002882cSDavid Hildenbrand 	if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) ==
2750a6e79df9SLorenzo Stoakes 	    (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
2751f002882cSDavid Hildenbrand 		reject_file_backed = true;
2752a6e79df9SLorenzo Stoakes 
2753f002882cSDavid Hildenbrand 	/* We hold a folio reference, so we can safely access folio fields. */
2754f002882cSDavid Hildenbrand 
2755f002882cSDavid Hildenbrand 	/* secretmem folios are always order-0 folios. */
2756f002882cSDavid Hildenbrand 	if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
2757f002882cSDavid Hildenbrand 		check_secretmem = true;
2758f002882cSDavid Hildenbrand 
2759f002882cSDavid Hildenbrand 	if (!reject_file_backed && !check_secretmem)
2760f002882cSDavid Hildenbrand 		return true;
2761a6e79df9SLorenzo Stoakes 
2762a6e79df9SLorenzo Stoakes 	if (WARN_ON_ONCE(folio_test_slab(folio)))
2763a6e79df9SLorenzo Stoakes 		return false;
2764a6e79df9SLorenzo Stoakes 
2765f002882cSDavid Hildenbrand 	/* hugetlb neither requires dirty-tracking nor can be secretmem. */
2766a6e79df9SLorenzo Stoakes 	if (folio_test_hugetlb(folio))
2767a6e79df9SLorenzo Stoakes 		return true;
2768a6e79df9SLorenzo Stoakes 
2769a6e79df9SLorenzo Stoakes 	/*
2770a6e79df9SLorenzo Stoakes 	 * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods
2771a6e79df9SLorenzo Stoakes 	 * cannot proceed, which means no actions performed under RCU can
2772a6e79df9SLorenzo Stoakes 	 * proceed either.
2773a6e79df9SLorenzo Stoakes 	 *
2774a6e79df9SLorenzo Stoakes 	 * inodes and thus their mappings are freed under RCU, which means the
2775a6e79df9SLorenzo Stoakes 	 * mapping cannot be freed beneath us and thus we can safely dereference
2776a6e79df9SLorenzo Stoakes 	 * it.
2777a6e79df9SLorenzo Stoakes 	 */
2778a6e79df9SLorenzo Stoakes 	lockdep_assert_irqs_disabled();
2779a6e79df9SLorenzo Stoakes 
2780a6e79df9SLorenzo Stoakes 	/*
2781a6e79df9SLorenzo Stoakes 	 * However, there may be operations which _alter_ the mapping, so ensure
2782a6e79df9SLorenzo Stoakes 	 * we read it once and only once.
2783a6e79df9SLorenzo Stoakes 	 */
2784a6e79df9SLorenzo Stoakes 	mapping = READ_ONCE(folio->mapping);
2785a6e79df9SLorenzo Stoakes 
2786a6e79df9SLorenzo Stoakes 	/*
2787a6e79df9SLorenzo Stoakes 	 * The mapping may have been truncated, in any case we cannot determine
2788a6e79df9SLorenzo Stoakes 	 * if this mapping is safe - fall back to slow path to determine how to
2789a6e79df9SLorenzo Stoakes 	 * proceed.
2790a6e79df9SLorenzo Stoakes 	 */
2791a6e79df9SLorenzo Stoakes 	if (!mapping)
2792a6e79df9SLorenzo Stoakes 		return false;
2793a6e79df9SLorenzo Stoakes 
2794a6e79df9SLorenzo Stoakes 	/* Anonymous folios pose no problem. */
2795df25569dSDavid Hildenbrand 	mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS;
2796a6e79df9SLorenzo Stoakes 	if (mapping_flags)
2797df25569dSDavid Hildenbrand 		return mapping_flags & FOLIO_MAPPING_ANON;
2798a6e79df9SLorenzo Stoakes 
2799a6e79df9SLorenzo Stoakes 	/*
2800a6e79df9SLorenzo Stoakes 	 * At this point, we know the mapping is non-null and points to an
2801f002882cSDavid Hildenbrand 	 * address_space object.
2802a6e79df9SLorenzo Stoakes 	 */
2803f002882cSDavid Hildenbrand 	if (check_secretmem && secretmem_mapping(mapping))
2804f002882cSDavid Hildenbrand 		return false;
2805f002882cSDavid Hildenbrand 	/* The only remaining allowed file system is shmem. */
2806f002882cSDavid Hildenbrand 	return !reject_file_backed || shmem_mapping(mapping);
2807a6e79df9SLorenzo Stoakes }
2808a6e79df9SLorenzo Stoakes 
28093010a5eaSLaurent Dufour #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
281070cbc3ccSYang Shi /*
281123babe19SDavid Hildenbrand  * GUP-fast relies on pte change detection to avoid concurrent pgtable
281270cbc3ccSYang Shi  * operations.
281370cbc3ccSYang Shi  *
281423babe19SDavid Hildenbrand  * To pin the page, GUP-fast needs to do below in order:
281570cbc3ccSYang Shi  * (1) pin the page (by prefetching pte), then (2) check pte not changed.
281670cbc3ccSYang Shi  *
281770cbc3ccSYang Shi  * For the rest of pgtable operations where pgtable updates can be racy
281823babe19SDavid Hildenbrand  * with GUP-fast, we need to do (1) clear pte, then (2) check whether page
281970cbc3ccSYang Shi  * is pinned.
282070cbc3ccSYang Shi  *
282170cbc3ccSYang Shi  * Above will work for all pte-level operations, including THP split.
282270cbc3ccSYang Shi  *
282323babe19SDavid Hildenbrand  * For THP collapse, it's a bit more complicated because GUP-fast may be
282470cbc3ccSYang Shi  * walking a pgtable page that is being freed (pte is still valid but pmd
282570cbc3ccSYang Shi  * can be cleared already).  To avoid race in such condition, we need to
282670cbc3ccSYang Shi  * also check pmd here to make sure pmd doesn't change (corresponds to
282770cbc3ccSYang Shi  * pmdp_collapse_flush() in the THP collapse code path).
282870cbc3ccSYang Shi  */
gup_fast_pte_range(pmd_t pmd,pmd_t * pmdp,unsigned long addr,unsigned long end,unsigned int flags,struct page ** pages,int * nr)282923babe19SDavid Hildenbrand static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
283023babe19SDavid Hildenbrand 		unsigned long end, unsigned int flags, struct page **pages,
283123babe19SDavid Hildenbrand 		int *nr)
28322667f50eSSteve Capper {
2833fd2825b0SAlistair Popple 	int ret = 0;
28342667f50eSSteve Capper 	pte_t *ptep, *ptem;
28352667f50eSSteve Capper 
28362667f50eSSteve Capper 	ptem = ptep = pte_offset_map(&pmd, addr);
283704dee9e8SHugh Dickins 	if (!ptep)
283804dee9e8SHugh Dickins 		return 0;
28392667f50eSSteve Capper 	do {
28402a4a06daSPeter Zijlstra 		pte_t pte = ptep_get_lockless(ptep);
2841b0496fe4SMatthew Wilcox (Oracle) 		struct page *page;
2842b0496fe4SMatthew Wilcox (Oracle) 		struct folio *folio;
28432667f50eSSteve Capper 
2844d74943a2SDavid Hildenbrand 		/*
2845d74943a2SDavid Hildenbrand 		 * Always fallback to ordinary GUP on PROT_NONE-mapped pages:
2846d74943a2SDavid Hildenbrand 		 * pte_access_permitted() better should reject these pages
2847d74943a2SDavid Hildenbrand 		 * either way: otherwise, GUP-fast might succeed in
2848d74943a2SDavid Hildenbrand 		 * cases where ordinary GUP would fail due to VMA access
2849d74943a2SDavid Hildenbrand 		 * permissions.
2850d74943a2SDavid Hildenbrand 		 */
2851d74943a2SDavid Hildenbrand 		if (pte_protnone(pte))
2852e7884f8eSKirill A. Shutemov 			goto pte_unmap;
2853e7884f8eSKirill A. Shutemov 
2854b798bec4SIra Weiny 		if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2855e7884f8eSKirill A. Shutemov 			goto pte_unmap;
2856e7884f8eSKirill A. Shutemov 
2857fd2825b0SAlistair Popple 		if (pte_special(pte))
28582667f50eSSteve Capper 			goto pte_unmap;
28592667f50eSSteve Capper 
2860792b429dSDavid Hildenbrand 		/* If it's not marked as special it must have a valid memmap. */
2861792b429dSDavid Hildenbrand 		VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));
28622667f50eSSteve Capper 		page = pte_page(pte);
28632667f50eSSteve Capper 
2864f442fa61SYang Shi 		folio = try_grab_folio_fast(page, 1, flags);
2865b0496fe4SMatthew Wilcox (Oracle) 		if (!folio)
28662667f50eSSteve Capper 			goto pte_unmap;
28672667f50eSSteve Capper 
286870cbc3ccSYang Shi 		if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
2869c33c7948SRyan Roberts 		    unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
2870b0496fe4SMatthew Wilcox (Oracle) 			gup_put_folio(folio, 1, flags);
28712667f50eSSteve Capper 			goto pte_unmap;
28722667f50eSSteve Capper 		}
28732667f50eSSteve Capper 
2874f002882cSDavid Hildenbrand 		if (!gup_fast_folio_allowed(folio, flags)) {
28752667f50eSSteve Capper 			gup_put_folio(folio, 1, flags);
28762667f50eSSteve Capper 			goto pte_unmap;
28772667f50eSSteve Capper 		}
28782667f50eSSteve Capper 
287984209e87SDavid Hildenbrand 		if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
2880a7f22660SDavid Hildenbrand 			gup_put_folio(folio, 1, flags);
2881a7f22660SDavid Hildenbrand 			goto pte_unmap;
2882a7f22660SDavid Hildenbrand 		}
2883a7f22660SDavid Hildenbrand 
2884f28d4363SClaudio Imbrenda 		/*
2885f28d4363SClaudio Imbrenda 		 * We need to make the page accessible if and only if we are
2886f28d4363SClaudio Imbrenda 		 * going to access its content (the FOLL_PIN case).  Please
2887f28d4363SClaudio Imbrenda 		 * see Documentation/core-api/pin_user_pages.rst for
2888f28d4363SClaudio Imbrenda 		 * details.
2889f28d4363SClaudio Imbrenda 		 */
28907cad96aeSDavid Hildenbrand 		if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) {
2891b0496fe4SMatthew Wilcox (Oracle) 			gup_put_folio(folio, 1, flags);
2892f28d4363SClaudio Imbrenda 			goto pte_unmap;
2893f28d4363SClaudio Imbrenda 		}
2894b0496fe4SMatthew Wilcox (Oracle) 		folio_set_referenced(folio);
28952667f50eSSteve Capper 		pages[*nr] = page;
28962667f50eSSteve Capper 		(*nr)++;
28972667f50eSSteve Capper 	} while (ptep++, addr += PAGE_SIZE, addr != end);
28982667f50eSSteve Capper 
28992667f50eSSteve Capper 	ret = 1;
29002667f50eSSteve Capper 
29012667f50eSSteve Capper pte_unmap:
29022667f50eSSteve Capper 	pte_unmap(ptem);
29032667f50eSSteve Capper 	return ret;
29042667f50eSSteve Capper }
29052667f50eSSteve Capper #else
29062667f50eSSteve Capper 
29072667f50eSSteve Capper /*
29082667f50eSSteve Capper  * If we can't determine whether or not a pte is special, then fail immediately
29092667f50eSSteve Capper  * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
29102667f50eSSteve Capper  * to be special.
29112667f50eSSteve Capper  *
29122667f50eSSteve Capper  * For a futex to be placed on a THP tail page, get_futex_key requires a
2913dadbb612SSouptick Joarder  * get_user_pages_fast_only implementation that can pin pages. Thus it's still
291423babe19SDavid Hildenbrand  * useful to have gup_fast_pmd_leaf even if we can't operate on ptes.
29152667f50eSSteve Capper  */
gup_fast_pte_range(pmd_t pmd,pmd_t * pmdp,unsigned long addr,unsigned long end,unsigned int flags,struct page ** pages,int * nr)291623babe19SDavid Hildenbrand static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
291723babe19SDavid Hildenbrand 		unsigned long end, unsigned int flags, struct page **pages,
291823babe19SDavid Hildenbrand 		int *nr)
29192667f50eSSteve Capper {
29202667f50eSSteve Capper 	return 0;
29212667f50eSSteve Capper }
29223010a5eaSLaurent Dufour #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
29232667f50eSSteve Capper 
/*
 * Pin the part of a pmd-level leaf mapping that covers [addr, end).
 *
 * @orig is the pmd value sampled by the caller and @pmdp the entry it was
 * read from. One reference per page in the range is taken on the folio, and
 * the pmd is then re-read to make sure it did not change in the meantime.
 *
 * On success the pages are appended to @pages starting at index *@nr, and
 * *@nr is advanced by the number of pages.
 *
 * Return: 1 on success, 0 if the range must be left to the slow path (all
 * references taken here have been dropped again in that case).
 */
static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	struct page **slot;
	struct folio *folio;
	struct page *page;
	int refs;

	if (!pmd_access_permitted(orig, flags & FOLL_WRITE) ||
	    pmd_special(orig))
		return 0;

	/* Number of pages in the range, and the first page within the leaf. */
	refs = (end - addr) >> PAGE_SHIFT;
	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);

	folio = try_grab_folio_fast(page, refs, flags);
	if (!folio)
		return 0;

	/* The entry changed under us: what was grabbed may be stale. */
	if (unlikely(pmd_val(orig) != pmd_val(*pmdp)))
		goto drop_refs;

	if (!gup_fast_folio_allowed(folio, flags))
		goto drop_refs;

	if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page))
		goto drop_refs;

	slot = pages + *nr;
	*nr += refs;
	while (refs) {
		*slot++ = page++;
		refs--;
	}
	folio_set_referenced(folio);
	return 1;

drop_refs:
	gup_put_folio(folio, refs, flags);
	return 0;
}
29662667f50eSSteve Capper 
/*
 * Pin the part of a pud-level leaf mapping that covers [addr, end).
 *
 * @orig is the pud value sampled by the caller and @pudp the entry it was
 * read from. One reference per page in the range is taken on the folio, and
 * the pud is then re-read to make sure it did not change in the meantime.
 *
 * On success the pages are appended to @pages starting at index *@nr, and
 * *@nr is advanced by the number of pages.
 *
 * Return: 1 on success, 0 if the range must be left to the slow path (all
 * references taken here have been dropped again in that case).
 */
static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	struct page **slot;
	struct folio *folio;
	struct page *page;
	int refs;

	if (!pud_access_permitted(orig, flags & FOLL_WRITE) ||
	    pud_special(orig))
		return 0;

	/* Number of pages in the range, and the first page within the leaf. */
	refs = (end - addr) >> PAGE_SHIFT;
	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);

	folio = try_grab_folio_fast(page, refs, flags);
	if (!folio)
		return 0;

	/* The entry changed under us: what was grabbed may be stale. */
	if (unlikely(pud_val(orig) != pud_val(*pudp)))
		goto drop_refs;

	if (!gup_fast_folio_allowed(folio, flags))
		goto drop_refs;

	if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page))
		goto drop_refs;

	slot = pages + *nr;
	*nr += refs;
	while (refs) {
		*slot++ = page++;
		refs--;
	}
	folio_set_referenced(folio);
	return 1;

drop_refs:
	gup_put_folio(folio, refs, flags);
	return 0;
}
30102667f50eSSteve Capper 
/*
 * Walk the pmd entries below @pud that cover [addr, end).
 *
 * Leaf pmds are handed to gup_fast_pmd_leaf(), everything else to
 * gup_fast_pte_range(). The walk stops at the first entry that is not present
 * or that cannot be handled.
 *
 * Return: 1 if the whole range was handled, 0 otherwise.
 */
static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset_lockless(pudp, pud, addr);
	do {
		pmd_t pmd = pmdp_get_lockless(pmdp);
		int handled;

		next = pmd_addr_end(addr, end);
		if (!pmd_present(pmd))
			return 0;

		if (unlikely(pmd_leaf(pmd))) {
			/* PROT_NONE: see gup_fast_pte_range() */
			if (pmd_protnone(pmd))
				return 0;

			handled = gup_fast_pmd_leaf(pmd, pmdp, addr, next,
						    flags, pages, nr);
		} else {
			handled = gup_fast_pte_range(pmd, pmdp, addr, next,
						     flags, pages, nr);
		}

		if (!handled)
			return 0;
	} while (pmdp++, addr = next, addr != end);

	return 1;
}
30422667f50eSSteve Capper 
/*
 * Walk the pud entries below @p4d that cover [addr, end).
 *
 * Leaf puds are handed to gup_fast_pud_leaf(), everything else descends into
 * gup_fast_pmd_range(). The walk stops at the first entry that is not present
 * or that cannot be handled.
 *
 * Return: 1 if the whole range was handled, 0 otherwise.
 */
static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset_lockless(p4dp, p4d, addr);
	do {
		pud_t pud = pudp_get(pudp);
		int handled;

		next = pud_addr_end(addr, end);
		if (unlikely(!pud_present(pud)))
			return 0;

		if (unlikely(pud_leaf(pud)))
			handled = gup_fast_pud_leaf(pud, pudp, addr, next,
						    flags, pages, nr);
		else
			handled = gup_fast_pmd_range(pudp, pud, addr, next,
						     flags, pages, nr);

		if (!handled)
			return 0;
	} while (pudp++, addr = next, addr != end);

	return 1;
}
30682667f50eSSteve Capper 
/*
 * Walk the p4d entries below @pgd that cover [addr, end), descending into
 * gup_fast_pud_range() for each one. Leaf entries are not expected at this
 * level (enforced at build time).
 *
 * Return: 1 if the whole range was handled, 0 as soon as an entry is not
 * present or a lower level gives up.
 */
static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages,
		int *nr)
{
	unsigned long next;
	p4d_t *p4dp;

	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
	for (;;) {
		p4d_t p4d = p4dp_get(p4dp);

		next = p4d_addr_end(addr, end);
		if (!p4d_present(p4d))
			return 0;
		BUILD_BUG_ON(p4d_leaf(p4d));

		if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
					pages, nr))
			return 0;

		/* This entry reached the end of the range: all done. */
		if (next == end)
			return 1;

		p4dp++;
		addr = next;
	}
}
3091c2febafcSKirill A. Shutemov 
gup_fast_pgd_range(unsigned long addr,unsigned long end,unsigned int flags,struct page ** pages,int * nr)309223babe19SDavid Hildenbrand static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
3093b798bec4SIra Weiny 		unsigned int flags, struct page **pages, int *nr)
30945b65c467SKirill A. Shutemov {
30955b65c467SKirill A. Shutemov 	unsigned long next;
30965b65c467SKirill A. Shutemov 	pgd_t *pgdp;
30975b65c467SKirill A. Shutemov 
30985b65c467SKirill A. Shutemov 	pgdp = pgd_offset(current->mm, addr);
30995b65c467SKirill A. Shutemov 	do {
3100c0efdb37SAnshuman Khandual 		pgd_t pgd = pgdp_get(pgdp);
31015b65c467SKirill A. Shutemov 
31025b65c467SKirill A. Shutemov 		next = pgd_addr_end(addr, end);
31035b65c467SKirill A. Shutemov 		if (pgd_none(pgd))
31045b65c467SKirill A. Shutemov 			return;
3105339122abSBaoquan He 		BUILD_BUG_ON(pgd_leaf(pgd));
3106339122abSBaoquan He 		if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
310723babe19SDavid Hildenbrand 					pages, nr))
31085b65c467SKirill A. Shutemov 			return;
31095b65c467SKirill A. Shutemov 	} while (pgdp++, addr = next, addr != end);
31105b65c467SKirill A. Shutemov }
3111050a9adcSChristoph Hellwig #else
/*
 * Stub used when CONFIG_HAVE_GUP_FAST is not set: there is no lockless walk,
 * so nothing is pinned and *@nr is left untouched.
 */
static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
	/* Intentionally empty. */
}
311625176ad0SDavid Hildenbrand #endif /* CONFIG_HAVE_GUP_FAST */
31175b65c467SKirill A. Shutemov 
#ifndef gup_fast_permitted
/*
 * Default range check for the fast path, used when gup_fast_permitted is not
 * already defined: tells whether get_user_pages_fast_only() may be used for
 * [start, end) or whether the slow version is needed.
 *
 * Return: always true, i.e. the fast path is permitted for any range.
 */
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
	return true;
}
#endif
31285b65c467SKirill A. Shutemov 
gup_fast(unsigned long start,unsigned long end,unsigned int gup_flags,struct page ** pages)312923babe19SDavid Hildenbrand static unsigned long gup_fast(unsigned long start, unsigned long end,
313023babe19SDavid Hildenbrand 		unsigned int gup_flags, struct page **pages)
31312667f50eSSteve Capper {
3132376a34efSJohn Hubbard 	unsigned long flags;
3133c28b1fc7SJason Gunthorpe 	int nr_pinned = 0;
313457efa1feSJason Gunthorpe 	unsigned seq;
3135c28b1fc7SJason Gunthorpe 
313625176ad0SDavid Hildenbrand 	if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) ||
3137c28b1fc7SJason Gunthorpe 	    !gup_fast_permitted(start, end))
3138c28b1fc7SJason Gunthorpe 		return 0;
3139c28b1fc7SJason Gunthorpe 
314057efa1feSJason Gunthorpe 	if (gup_flags & FOLL_PIN) {
314175285852SPeter Zijlstra 		if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq))
314257efa1feSJason Gunthorpe 			return 0;
314357efa1feSJason Gunthorpe 	}
314457efa1feSJason Gunthorpe 
3145c28b1fc7SJason Gunthorpe 	/*
3146c28b1fc7SJason Gunthorpe 	 * Disable interrupts. The nested form is used, in order to allow full,
3147c28b1fc7SJason Gunthorpe 	 * general purpose use of this routine.
3148c28b1fc7SJason Gunthorpe 	 *
3149c28b1fc7SJason Gunthorpe 	 * With interrupts disabled, we block page table pages from being freed
3150c28b1fc7SJason Gunthorpe 	 * from under us. See struct mmu_table_batch comments in
3151c28b1fc7SJason Gunthorpe 	 * include/asm-generic/tlb.h for more details.
3152c28b1fc7SJason Gunthorpe 	 *
3153c28b1fc7SJason Gunthorpe 	 * We do not adopt an rcu_read_lock() here as we also want to block IPIs
315452084f25SJann Horn 	 * that come from callers of tlb_remove_table_sync_one().
3155c28b1fc7SJason Gunthorpe 	 */
3156c28b1fc7SJason Gunthorpe 	local_irq_save(flags);
315723babe19SDavid Hildenbrand 	gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
3158c28b1fc7SJason Gunthorpe 	local_irq_restore(flags);
315957efa1feSJason Gunthorpe 
316057efa1feSJason Gunthorpe 	/*
316157efa1feSJason Gunthorpe 	 * When pinning pages for DMA there could be a concurrent write protect
316223babe19SDavid Hildenbrand 	 * from fork() via copy_page_range(), in this case always fail GUP-fast.
316357efa1feSJason Gunthorpe 	 */
316457efa1feSJason Gunthorpe 	if (gup_flags & FOLL_PIN) {
316557efa1feSJason Gunthorpe 		if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
316623babe19SDavid Hildenbrand 			gup_fast_unpin_user_pages(pages, nr_pinned);
316757efa1feSJason Gunthorpe 			return 0;
3168b6a2619cSDavid Hildenbrand 		} else {
3169b6a2619cSDavid Hildenbrand 			sanity_check_pinned_pages(pages, nr_pinned);
317057efa1feSJason Gunthorpe 		}
317157efa1feSJason Gunthorpe 	}
3172c28b1fc7SJason Gunthorpe 	return nr_pinned;
3173c28b1fc7SJason Gunthorpe }
3174c28b1fc7SJason Gunthorpe 
gup_fast_fallback(unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages)317523babe19SDavid Hildenbrand static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
317623babe19SDavid Hildenbrand 		unsigned int gup_flags, struct page **pages)
3177c28b1fc7SJason Gunthorpe {
3178c28b1fc7SJason Gunthorpe 	unsigned long len, end;
3179c28b1fc7SJason Gunthorpe 	unsigned long nr_pinned;
3180b2a72dffSJason Gunthorpe 	int locked = 0;
3181c28b1fc7SJason Gunthorpe 	int ret;
31822667f50eSSteve Capper 
3183f4000fdfSJohn Hubbard 	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
3184376a34efSJohn Hubbard 				       FOLL_FORCE | FOLL_PIN | FOLL_GET |
31854003f107SLogan Gunthorpe 				       FOLL_FAST_ONLY | FOLL_NOFAULT |
3186d74943a2SDavid Hildenbrand 				       FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
3187817be129SChristoph Hellwig 		return -EINVAL;
3188817be129SChristoph Hellwig 
3189a458b76aSAndrea Arcangeli 	if (gup_flags & FOLL_PIN)
319012e423baSLorenzo Stoakes 		mm_set_has_pinned_flag(current->mm);
3191008cfe44SPeter Xu 
3192f81cd178SJohn Hubbard 	if (!(gup_flags & FOLL_FAST_ONLY))
3193da1c55f1SMichel Lespinasse 		might_lock_read(&current->mm->mmap_lock);
3194f81cd178SJohn Hubbard 
3195f455c854SChristoph Hellwig 	start = untagged_addr(start) & PAGE_MASK;
3196c28b1fc7SJason Gunthorpe 	len = nr_pages << PAGE_SHIFT;
3197c28b1fc7SJason Gunthorpe 	if (check_add_overflow(start, len, &end))
31989883c7f8SJason Gunthorpe 		return -EOVERFLOW;
31996014bc27SLinus Torvalds 	if (end > TASK_SIZE_MAX)
32006014bc27SLinus Torvalds 		return -EFAULT;
320173e10a61SKirill A. Shutemov 
320223babe19SDavid Hildenbrand 	nr_pinned = gup_fast(start, end, gup_flags, pages);
3203c28b1fc7SJason Gunthorpe 	if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
3204c28b1fc7SJason Gunthorpe 		return nr_pinned;
3205376a34efSJohn Hubbard 
3206c28b1fc7SJason Gunthorpe 	/* Slow path: try to get the remaining pages with get_user_pages */
32074628b063SPingfan Liu 	start += nr_pinned << PAGE_SHIFT;
32084628b063SPingfan Liu 	pages += nr_pinned;
3209b2a72dffSJason Gunthorpe 	ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
3210b2cac248SLorenzo Stoakes 				    pages, &locked,
3211f04740f5SJason Gunthorpe 				    gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
3212c28b1fc7SJason Gunthorpe 	if (ret < 0) {
3213c28b1fc7SJason Gunthorpe 		/*
3214c28b1fc7SJason Gunthorpe 		 * The caller has to unpin the pages we already pinned so
3215c28b1fc7SJason Gunthorpe 		 * returning -errno is not an option
3216c28b1fc7SJason Gunthorpe 		 */
3217c28b1fc7SJason Gunthorpe 		if (nr_pinned)
3218c28b1fc7SJason Gunthorpe 			return nr_pinned;
32192667f50eSSteve Capper 		return ret;
32202667f50eSSteve Capper 	}
3221c28b1fc7SJason Gunthorpe 	return ret + nr_pinned;
3222c28b1fc7SJason Gunthorpe }
3223c28b1fc7SJason Gunthorpe 
3224dadbb612SSouptick Joarder /**
3225dadbb612SSouptick Joarder  * get_user_pages_fast_only() - pin user pages in memory
3226dadbb612SSouptick Joarder  * @start:      starting user address
3227dadbb612SSouptick Joarder  * @nr_pages:   number of pages from start to pin
3228dadbb612SSouptick Joarder  * @gup_flags:  flags modifying pin behaviour
3229dadbb612SSouptick Joarder  * @pages:      array that receives pointers to the pages pinned.
3230dadbb612SSouptick Joarder  *              Should be at least nr_pages long.
3231dadbb612SSouptick Joarder  *
32329e1f0580SJohn Hubbard  * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
32339e1f0580SJohn Hubbard  * the regular GUP.
32349e1f0580SJohn Hubbard  *
32359e1f0580SJohn Hubbard  * If the architecture does not support this function, simply return with no
32369e1f0580SJohn Hubbard  * pages pinned.
32379e1f0580SJohn Hubbard  *
32389e1f0580SJohn Hubbard  * Careful, careful! COW breaking can go either way, so a non-write
32399e1f0580SJohn Hubbard  * access can get ambiguous page results. If you call this function without
32409e1f0580SJohn Hubbard  * 'write' set, you'd better be sure that you're ok with that ambiguity.
32419e1f0580SJohn Hubbard  */
get_user_pages_fast_only(unsigned long start,int nr_pages,unsigned int gup_flags,struct page ** pages)3242dadbb612SSouptick Joarder int get_user_pages_fast_only(unsigned long start, int nr_pages,
3243dadbb612SSouptick Joarder 			     unsigned int gup_flags, struct page **pages)
32449e1f0580SJohn Hubbard {
32459e1f0580SJohn Hubbard 	/*
32469e1f0580SJohn Hubbard 	 * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
32479e1f0580SJohn Hubbard 	 * because gup fast is always a "pin with a +1 page refcount" request.
3248376a34efSJohn Hubbard 	 *
3249376a34efSJohn Hubbard 	 * FOLL_FAST_ONLY is required in order to match the API description of
3250376a34efSJohn Hubbard 	 * this routine: no fall back to regular ("slow") GUP.
32519e1f0580SJohn Hubbard 	 */
3252b2cac248SLorenzo Stoakes 	if (!is_valid_gup_args(pages, NULL, &gup_flags,
3253d64e2dbcSJason Gunthorpe 			       FOLL_GET | FOLL_FAST_ONLY))
3254d64e2dbcSJason Gunthorpe 		return -EINVAL;
32559e1f0580SJohn Hubbard 
325623babe19SDavid Hildenbrand 	return gup_fast_fallback(start, nr_pages, gup_flags, pages);
32579e1f0580SJohn Hubbard }
3258dadbb612SSouptick Joarder EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
32599e1f0580SJohn Hubbard 
3260eddb1c22SJohn Hubbard /**
3261eddb1c22SJohn Hubbard  * get_user_pages_fast() - pin user pages in memory
3262eddb1c22SJohn Hubbard  * @start:      starting user address
3263eddb1c22SJohn Hubbard  * @nr_pages:   number of pages from start to pin
3264eddb1c22SJohn Hubbard  * @gup_flags:  flags modifying pin behaviour
3265eddb1c22SJohn Hubbard  * @pages:      array that receives pointers to the pages pinned.
3266eddb1c22SJohn Hubbard  *              Should be at least nr_pages long.
3267eddb1c22SJohn Hubbard  *
3268c1e8d7c6SMichel Lespinasse  * Attempt to pin user pages in memory without taking mm->mmap_lock.
3269eddb1c22SJohn Hubbard  * If not successful, it will fall back to taking the lock and
3270eddb1c22SJohn Hubbard  * calling get_user_pages().
3271eddb1c22SJohn Hubbard  *
3272eddb1c22SJohn Hubbard  * Returns number of pages pinned. This may be fewer than the number requested.
3273eddb1c22SJohn Hubbard  * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
3274eddb1c22SJohn Hubbard  * -errno.
3275eddb1c22SJohn Hubbard  */
get_user_pages_fast(unsigned long start,int nr_pages,unsigned int gup_flags,struct page ** pages)3276eddb1c22SJohn Hubbard int get_user_pages_fast(unsigned long start, int nr_pages,
3277eddb1c22SJohn Hubbard 			unsigned int gup_flags, struct page **pages)
3278eddb1c22SJohn Hubbard {
327994202f12SJohn Hubbard 	/*
328094202f12SJohn Hubbard 	 * The caller may or may not have explicitly set FOLL_GET; either way is
328194202f12SJohn Hubbard 	 * OK. However, internally (within mm/gup.c), gup fast variants must set
328294202f12SJohn Hubbard 	 * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
328394202f12SJohn Hubbard 	 * request.
328494202f12SJohn Hubbard 	 */
3285b2cac248SLorenzo Stoakes 	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
3286d64e2dbcSJason Gunthorpe 		return -EINVAL;
328723babe19SDavid Hildenbrand 	return gup_fast_fallback(start, nr_pages, gup_flags, pages);
3288eddb1c22SJohn Hubbard }
3289050a9adcSChristoph Hellwig EXPORT_SYMBOL_GPL(get_user_pages_fast);
3290eddb1c22SJohn Hubbard 
3291eddb1c22SJohn Hubbard /**
3292eddb1c22SJohn Hubbard  * pin_user_pages_fast() - pin user pages in memory without taking locks
3293eddb1c22SJohn Hubbard  *
32943faa52c0SJohn Hubbard  * @start:      starting user address
32953faa52c0SJohn Hubbard  * @nr_pages:   number of pages from start to pin
32963faa52c0SJohn Hubbard  * @gup_flags:  flags modifying pin behaviour
32973faa52c0SJohn Hubbard  * @pages:      array that receives pointers to the pages pinned.
32983faa52c0SJohn Hubbard  *              Should be at least nr_pages long.
32993faa52c0SJohn Hubbard  *
33003faa52c0SJohn Hubbard  * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
33013faa52c0SJohn Hubbard  * get_user_pages_fast() for documentation on the function arguments, because
33023faa52c0SJohn Hubbard  * the arguments here are identical.
33033faa52c0SJohn Hubbard  *
33043faa52c0SJohn Hubbard  * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
330572ef5e52SMauro Carvalho Chehab  * see Documentation/core-api/pin_user_pages.rst for further details.
3306c8070b78SDavid Howells  *
3307c8070b78SDavid Howells  * Note that if a zero_page is amongst the returned pages, it will not have
3308c8070b78SDavid Howells  * pins in it and unpin_user_page() will not remove pins from it.
3309eddb1c22SJohn Hubbard  */
pin_user_pages_fast(unsigned long start,int nr_pages,unsigned int gup_flags,struct page ** pages)3310eddb1c22SJohn Hubbard int pin_user_pages_fast(unsigned long start, int nr_pages,
3311eddb1c22SJohn Hubbard 			unsigned int gup_flags, struct page **pages)
3312eddb1c22SJohn Hubbard {
3313b2cac248SLorenzo Stoakes 	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
33143faa52c0SJohn Hubbard 		return -EINVAL;
331523babe19SDavid Hildenbrand 	return gup_fast_fallback(start, nr_pages, gup_flags, pages);
3316eddb1c22SJohn Hubbard }
3317eddb1c22SJohn Hubbard EXPORT_SYMBOL_GPL(pin_user_pages_fast);
3318eddb1c22SJohn Hubbard 
3319eddb1c22SJohn Hubbard /**
332064019a2eSPeter Xu  * pin_user_pages_remote() - pin pages of a remote process
3321eddb1c22SJohn Hubbard  *
33223faa52c0SJohn Hubbard  * @mm:		mm_struct of target mm
33233faa52c0SJohn Hubbard  * @start:	starting user address
33243faa52c0SJohn Hubbard  * @nr_pages:	number of pages from start to pin
33253faa52c0SJohn Hubbard  * @gup_flags:	flags modifying lookup behaviour
33263faa52c0SJohn Hubbard  * @pages:	array that receives pointers to the pages pinned.
33270768c8deSYury Norov  *		Should be at least nr_pages long.
33283faa52c0SJohn Hubbard  * @locked:	pointer to lock flag indicating whether lock is held and
33293faa52c0SJohn Hubbard  *		subsequently whether VM_FAULT_RETRY functionality can be
33303faa52c0SJohn Hubbard  *		utilised. Lock must initially be held.
33313faa52c0SJohn Hubbard  *
33323faa52c0SJohn Hubbard  * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
33333faa52c0SJohn Hubbard  * get_user_pages_remote() for documentation on the function arguments, because
33343faa52c0SJohn Hubbard  * the arguments here are identical.
33353faa52c0SJohn Hubbard  *
33363faa52c0SJohn Hubbard  * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
333772ef5e52SMauro Carvalho Chehab  * see Documentation/core-api/pin_user_pages.rst for details.
3338c8070b78SDavid Howells  *
3339c8070b78SDavid Howells  * Note that if a zero_page is amongst the returned pages, it will not have
3340c8070b78SDavid Howells  * pins in it and unpin_user_page*() will not remove pins from it.
3341eddb1c22SJohn Hubbard  */
pin_user_pages_remote(struct mm_struct * mm,unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages,int * locked)334264019a2eSPeter Xu long pin_user_pages_remote(struct mm_struct *mm,
3343eddb1c22SJohn Hubbard 			   unsigned long start, unsigned long nr_pages,
3344eddb1c22SJohn Hubbard 			   unsigned int gup_flags, struct page **pages,
33450b295316SLorenzo Stoakes 			   int *locked)
3346eddb1c22SJohn Hubbard {
33479a863a6aSJason Gunthorpe 	int local_locked = 1;
33489a863a6aSJason Gunthorpe 
3349b2cac248SLorenzo Stoakes 	if (!is_valid_gup_args(pages, locked, &gup_flags,
3350d64e2dbcSJason Gunthorpe 			       FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
3351d64e2dbcSJason Gunthorpe 		return 0;
3352b2cac248SLorenzo Stoakes 	return __gup_longterm_locked(mm, start, nr_pages, pages,
33539a863a6aSJason Gunthorpe 				     locked ? locked : &local_locked,
3354d64e2dbcSJason Gunthorpe 				     gup_flags);
3355eddb1c22SJohn Hubbard }
3356eddb1c22SJohn Hubbard EXPORT_SYMBOL(pin_user_pages_remote);
3357eddb1c22SJohn Hubbard 
3358eddb1c22SJohn Hubbard /**
3359eddb1c22SJohn Hubbard  * pin_user_pages() - pin user pages in memory for use by other devices
3360eddb1c22SJohn Hubbard  *
33613faa52c0SJohn Hubbard  * @start:	starting user address
33623faa52c0SJohn Hubbard  * @nr_pages:	number of pages from start to pin
33633faa52c0SJohn Hubbard  * @gup_flags:	flags modifying lookup behaviour
33643faa52c0SJohn Hubbard  * @pages:	array that receives pointers to the pages pinned.
33650768c8deSYury Norov  *		Should be at least nr_pages long.
33663faa52c0SJohn Hubbard  *
33673faa52c0SJohn Hubbard  * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
33683faa52c0SJohn Hubbard  * FOLL_PIN is set.
33693faa52c0SJohn Hubbard  *
33703faa52c0SJohn Hubbard  * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
337172ef5e52SMauro Carvalho Chehab  * see Documentation/core-api/pin_user_pages.rst for details.
3372c8070b78SDavid Howells  *
3373c8070b78SDavid Howells  * Note that if a zero_page is amongst the returned pages, it will not have
3374c8070b78SDavid Howells  * pins in it and unpin_user_page*() will not remove pins from it.
3375eddb1c22SJohn Hubbard  */
pin_user_pages(unsigned long start,unsigned long nr_pages,unsigned int gup_flags,struct page ** pages)3376eddb1c22SJohn Hubbard long pin_user_pages(unsigned long start, unsigned long nr_pages,
33774c630f30SLorenzo Stoakes 		    unsigned int gup_flags, struct page **pages)
3378eddb1c22SJohn Hubbard {
33799a863a6aSJason Gunthorpe 	int locked = 1;
33809a863a6aSJason Gunthorpe 
3381b2cac248SLorenzo Stoakes 	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
3382d64e2dbcSJason Gunthorpe 		return 0;
338364019a2eSPeter Xu 	return __gup_longterm_locked(current->mm, start, nr_pages,
3384b2cac248SLorenzo Stoakes 				     pages, &locked, gup_flags);
3385eddb1c22SJohn Hubbard }
3386eddb1c22SJohn Hubbard EXPORT_SYMBOL(pin_user_pages);
338791429023SJohn Hubbard 
338891429023SJohn Hubbard /*
338991429023SJohn Hubbard  * pin_user_pages_unlocked() is the FOLL_PIN variant of
339091429023SJohn Hubbard  * get_user_pages_unlocked(). Behavior is the same, except that this one sets
339191429023SJohn Hubbard  * FOLL_PIN and rejects FOLL_GET.
3392c8070b78SDavid Howells  *
3393c8070b78SDavid Howells  * Note that if a zero_page is amongst the returned pages, it will not have
3394c8070b78SDavid Howells  * pins in it and unpin_user_page*() will not remove pins from it.
339591429023SJohn Hubbard  */
pin_user_pages_unlocked(unsigned long start,unsigned long nr_pages,struct page ** pages,unsigned int gup_flags)339691429023SJohn Hubbard long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
339791429023SJohn Hubbard 			     struct page **pages, unsigned int gup_flags)
339891429023SJohn Hubbard {
3399b2a72dffSJason Gunthorpe 	int locked = 0;
340091429023SJohn Hubbard 
3401b2cac248SLorenzo Stoakes 	if (!is_valid_gup_args(pages, NULL, &gup_flags,
3402f04740f5SJason Gunthorpe 			       FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
3403d64e2dbcSJason Gunthorpe 		return 0;
34040768c8deSYury Norov 
3405b2cac248SLorenzo Stoakes 	return __gup_longterm_locked(current->mm, start, nr_pages, pages,
3406b2a72dffSJason Gunthorpe 				     &locked, gup_flags);
340791429023SJohn Hubbard }
340891429023SJohn Hubbard EXPORT_SYMBOL(pin_user_pages_unlocked);
340989c1905dSVivek Kasireddy 
341089c1905dSVivek Kasireddy /**
341189c1905dSVivek Kasireddy  * memfd_pin_folios() - pin folios associated with a memfd
341289c1905dSVivek Kasireddy  * @memfd:      the memfd whose folios are to be pinned
341389c1905dSVivek Kasireddy  * @start:      the first memfd offset
341489c1905dSVivek Kasireddy  * @end:        the last memfd offset (inclusive)
341589c1905dSVivek Kasireddy  * @folios:     array that receives pointers to the folios pinned
341689c1905dSVivek Kasireddy  * @max_folios: maximum number of entries in @folios
341789c1905dSVivek Kasireddy  * @offset:     the offset into the first folio
341889c1905dSVivek Kasireddy  *
341989c1905dSVivek Kasireddy  * Attempt to pin folios associated with a memfd in the contiguous range
342089c1905dSVivek Kasireddy  * [start, end]. Given that a memfd is either backed by shmem or hugetlb,
342189c1905dSVivek Kasireddy  * the folios can either be found in the page cache or need to be allocated
342289c1905dSVivek Kasireddy  * if necessary. Once the folios are located, they are all pinned via
342389c1905dSVivek Kasireddy  * FOLL_PIN and @offset is populatedwith the offset into the first folio.
342489c1905dSVivek Kasireddy  * And, eventually, these pinned folios must be released either using
342589c1905dSVivek Kasireddy  * unpin_folios() or unpin_folio().
342689c1905dSVivek Kasireddy  *
342789c1905dSVivek Kasireddy  * It must be noted that the folios may be pinned for an indefinite amount
342889c1905dSVivek Kasireddy  * of time. And, in most cases, the duration of time they may stay pinned
342989c1905dSVivek Kasireddy  * would be controlled by the userspace. This behavior is effectively the
343089c1905dSVivek Kasireddy  * same as using FOLL_LONGTERM with other GUP APIs.
343189c1905dSVivek Kasireddy  *
343289c1905dSVivek Kasireddy  * Returns number of folios pinned, which could be less than @max_folios
343389c1905dSVivek Kasireddy  * as it depends on the folio sizes that cover the range [start, end].
343489c1905dSVivek Kasireddy  * If no folios were pinned, it returns -errno.
343589c1905dSVivek Kasireddy  */
memfd_pin_folios(struct file * memfd,loff_t start,loff_t end,struct folio ** folios,unsigned int max_folios,pgoff_t * offset)343689c1905dSVivek Kasireddy long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
343789c1905dSVivek Kasireddy 		      struct folio **folios, unsigned int max_folios,
343889c1905dSVivek Kasireddy 		      pgoff_t *offset)
343989c1905dSVivek Kasireddy {
344089c1905dSVivek Kasireddy 	unsigned int flags, nr_folios, nr_found;
344189c1905dSVivek Kasireddy 	unsigned int i, pgshift = PAGE_SHIFT;
344230f62b92SVishal Moola (Oracle) 	pgoff_t start_idx, end_idx;
344389c1905dSVivek Kasireddy 	struct folio *folio = NULL;
344489c1905dSVivek Kasireddy 	struct folio_batch fbatch;
3445dc677b5fSSteve Sistare 	struct hstate *h;
344689c1905dSVivek Kasireddy 	long ret = -EINVAL;
344789c1905dSVivek Kasireddy 
344889c1905dSVivek Kasireddy 	if (start < 0 || start > end || !max_folios)
344989c1905dSVivek Kasireddy 		return -EINVAL;
345089c1905dSVivek Kasireddy 
345189c1905dSVivek Kasireddy 	if (!memfd)
345289c1905dSVivek Kasireddy 		return -EINVAL;
345389c1905dSVivek Kasireddy 
345489c1905dSVivek Kasireddy 	if (!shmem_file(memfd) && !is_file_hugepages(memfd))
345589c1905dSVivek Kasireddy 		return -EINVAL;
345689c1905dSVivek Kasireddy 
345789c1905dSVivek Kasireddy 	if (end >= i_size_read(file_inode(memfd)))
345889c1905dSVivek Kasireddy 		return -EINVAL;
345989c1905dSVivek Kasireddy 
346089c1905dSVivek Kasireddy 	if (is_file_hugepages(memfd)) {
346189c1905dSVivek Kasireddy 		h = hstate_file(memfd);
346289c1905dSVivek Kasireddy 		pgshift = huge_page_shift(h);
346389c1905dSVivek Kasireddy 	}
346489c1905dSVivek Kasireddy 
346589c1905dSVivek Kasireddy 	flags = memalloc_pin_save();
346689c1905dSVivek Kasireddy 	do {
346789c1905dSVivek Kasireddy 		nr_folios = 0;
346889c1905dSVivek Kasireddy 		start_idx = start >> pgshift;
346989c1905dSVivek Kasireddy 		end_idx = end >> pgshift;
347089c1905dSVivek Kasireddy 		if (is_file_hugepages(memfd)) {
347189c1905dSVivek Kasireddy 			start_idx <<= huge_page_order(h);
347289c1905dSVivek Kasireddy 			end_idx <<= huge_page_order(h);
347389c1905dSVivek Kasireddy 		}
347489c1905dSVivek Kasireddy 
347589c1905dSVivek Kasireddy 		folio_batch_init(&fbatch);
347689c1905dSVivek Kasireddy 		while (start_idx <= end_idx && nr_folios < max_folios) {
347789c1905dSVivek Kasireddy 			/*
347889c1905dSVivek Kasireddy 			 * In most cases, we should be able to find the folios
347989c1905dSVivek Kasireddy 			 * in the page cache. If we cannot find them for some
348089c1905dSVivek Kasireddy 			 * reason, we try to allocate them and add them to the
348189c1905dSVivek Kasireddy 			 * page cache.
348289c1905dSVivek Kasireddy 			 */
348389c1905dSVivek Kasireddy 			nr_found = filemap_get_folios_contig(memfd->f_mapping,
348489c1905dSVivek Kasireddy 							     &start_idx,
348589c1905dSVivek Kasireddy 							     end_idx,
348689c1905dSVivek Kasireddy 							     &fbatch);
348789c1905dSVivek Kasireddy 			if (folio) {
348889c1905dSVivek Kasireddy 				folio_put(folio);
348989c1905dSVivek Kasireddy 				folio = NULL;
349089c1905dSVivek Kasireddy 			}
349189c1905dSVivek Kasireddy 
349289c1905dSVivek Kasireddy 			for (i = 0; i < nr_found; i++) {
3493fe488d34SVishal Moola (Oracle) 				folio = fbatch.folios[i];
349489c1905dSVivek Kasireddy 
349589c1905dSVivek Kasireddy 				if (try_grab_folio(folio, 1, FOLL_PIN)) {
349689c1905dSVivek Kasireddy 					folio_batch_release(&fbatch);
349789c1905dSVivek Kasireddy 					ret = -EINVAL;
349889c1905dSVivek Kasireddy 					goto err;
349989c1905dSVivek Kasireddy 				}
350089c1905dSVivek Kasireddy 
350189c1905dSVivek Kasireddy 				if (nr_folios == 0)
350289c1905dSVivek Kasireddy 					*offset = offset_in_folio(folio, start);
350389c1905dSVivek Kasireddy 
350489c1905dSVivek Kasireddy 				folios[nr_folios] = folio;
350589c1905dSVivek Kasireddy 				if (++nr_folios == max_folios)
350689c1905dSVivek Kasireddy 					break;
350789c1905dSVivek Kasireddy 			}
350889c1905dSVivek Kasireddy 
350989c1905dSVivek Kasireddy 			folio = NULL;
351089c1905dSVivek Kasireddy 			folio_batch_release(&fbatch);
351189c1905dSVivek Kasireddy 			if (!nr_found) {
351289c1905dSVivek Kasireddy 				folio = memfd_alloc_folio(memfd, start_idx);
351389c1905dSVivek Kasireddy 				if (IS_ERR(folio)) {
351489c1905dSVivek Kasireddy 					ret = PTR_ERR(folio);
351589c1905dSVivek Kasireddy 					if (ret != -EEXIST)
351689c1905dSVivek Kasireddy 						goto err;
3517ce645b9fSSteve Sistare 					folio = NULL;
351889c1905dSVivek Kasireddy 				}
351989c1905dSVivek Kasireddy 			}
352089c1905dSVivek Kasireddy 		}
352189c1905dSVivek Kasireddy 
352289c1905dSVivek Kasireddy 		ret = check_and_migrate_movable_folios(nr_folios, folios);
352389c1905dSVivek Kasireddy 	} while (ret == -EAGAIN);
352489c1905dSVivek Kasireddy 
352589c1905dSVivek Kasireddy 	memalloc_pin_restore(flags);
352689c1905dSVivek Kasireddy 	return ret ? ret : nr_folios;
352789c1905dSVivek Kasireddy err:
352889c1905dSVivek Kasireddy 	memalloc_pin_restore(flags);
352989c1905dSVivek Kasireddy 	unpin_folios(folios, nr_folios);
353089c1905dSVivek Kasireddy 
353189c1905dSVivek Kasireddy 	return ret;
353289c1905dSVivek Kasireddy }
353389c1905dSVivek Kasireddy EXPORT_SYMBOL_GPL(memfd_pin_folios);
3534a2ad1b81SSteve Sistare 
3535a2ad1b81SSteve Sistare /**
3536a2ad1b81SSteve Sistare  * folio_add_pins() - add pins to an already-pinned folio
3537a2ad1b81SSteve Sistare  * @folio: the folio to add more pins to
3538a2ad1b81SSteve Sistare  * @pins: number of pins to add
3539a2ad1b81SSteve Sistare  *
3540a2ad1b81SSteve Sistare  * Try to add more pins to an already-pinned folio. The semantics
3541a2ad1b81SSteve Sistare  * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot
3542a2ad1b81SSteve Sistare  * be changed.
3543a2ad1b81SSteve Sistare  *
3544a2ad1b81SSteve Sistare  * This function is helpful when having obtained a pin on a large folio
3545a2ad1b81SSteve Sistare  * using memfd_pin_folios(), but wanting to logically unpin parts
3546a2ad1b81SSteve Sistare  * (e.g., individual pages) of the folio later, for example, using
3547a2ad1b81SSteve Sistare  * unpin_user_page_range_dirty_lock().
3548a2ad1b81SSteve Sistare  *
3549a2ad1b81SSteve Sistare  * This is not the right interface to initially pin a folio.
3550a2ad1b81SSteve Sistare  */
folio_add_pins(struct folio * folio,unsigned int pins)3551a2ad1b81SSteve Sistare int folio_add_pins(struct folio *folio, unsigned int pins)
3552a2ad1b81SSteve Sistare {
3553a2ad1b81SSteve Sistare 	VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));
3554a2ad1b81SSteve Sistare 
3555a2ad1b81SSteve Sistare 	return try_grab_folio(folio, pins, FOLL_PIN);
3556a2ad1b81SSteve Sistare }
3557a2ad1b81SSteve Sistare EXPORT_SYMBOL_GPL(folio_add_pins);
3558