// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
11a520110eSChristoph Hellwig #include <linux/pagewalk.h>
12133ff0eaSJérôme Glisse #include <linux/hmm.h>
138cad4713SLeon Romanovsky #include <linux/hmm-dma.h>
14858b54daSJérôme Glisse #include <linux/init.h>
15da4c3c73SJérôme Glisse #include <linux/rmap.h>
16da4c3c73SJérôme Glisse #include <linux/swap.h>
17133ff0eaSJérôme Glisse #include <linux/slab.h>
18133ff0eaSJérôme Glisse #include <linux/sched.h>
194ef589dcSJérôme Glisse #include <linux/mmzone.h>
204ef589dcSJérôme Glisse #include <linux/pagemap.h>
210ac881efSLorenzo Stoakes #include <linux/leafops.h>
22da4c3c73SJérôme Glisse #include <linux/hugetlb.h>
234ef589dcSJérôme Glisse #include <linux/memremap.h>
24c8a53b2dSJason Gunthorpe #include <linux/sched/mm.h>
257b2d55d2SJérôme Glisse #include <linux/jump_label.h>
2655c0ece8SJérôme Glisse #include <linux/dma-mapping.h>
278cad4713SLeon Romanovsky #include <linux/pci-p2pdma.h>
28c0b12405SJérôme Glisse #include <linux/mmu_notifier.h>
294ef589dcSJérôme Glisse #include <linux/memory_hotplug.h>
304ef589dcSJérôme Glisse
31b756a3b5SAlistair Popple #include "internal.h"
32b756a3b5SAlistair Popple
/*
 * Per-walk state; the page-walk callbacks in this file retrieve it from
 * walk->private.
 */
struct hmm_vma_walk {
	/* Range being walked; results are stored in range->hmm_pfns[]. */
	struct hmm_range	*range;
	/* Address recorded by a callback right before it faults or waits. */
	unsigned long		last;
};
3774eee180SJérôme Glisse
/*
 * Bitmask returned by the *_need_fault() helpers and consumed by
 * hmm_vma_fault() to describe what kind of fault is still required.
 */
enum {
	HMM_NEED_FAULT		= 1 << 0,
	HMM_NEED_WRITE_FAULT	= 1 << 1,
	/* Every bit set: nothing stronger can be requested. */
	HMM_NEED_ALL_BITS	= HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
43a3eb13c1SJason Gunthorpe
44285e8718SLeon Romanovsky enum {
45285e8718SLeon Romanovsky /* These flags are carried from input-to-output */
468cad4713SLeon Romanovsky HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
478cad4713SLeon Romanovsky HMM_PFN_P2PDMA_BUS,
48285e8718SLeon Romanovsky };
49285e8718SLeon Romanovsky
hmm_pfns_fill(unsigned long addr,unsigned long end,struct hmm_range * range,unsigned long cpu_flags)50d28c2c9aSRalph Campbell static int hmm_pfns_fill(unsigned long addr, unsigned long end,
512733ea14SJason Gunthorpe struct hmm_range *range, unsigned long cpu_flags)
52da4c3c73SJérôme Glisse {
532733ea14SJason Gunthorpe unsigned long i = (addr - range->start) >> PAGE_SHIFT;
54da4c3c73SJérôme Glisse
55285e8718SLeon Romanovsky for (; addr < end; addr += PAGE_SIZE, i++) {
56285e8718SLeon Romanovsky range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
57285e8718SLeon Romanovsky range->hmm_pfns[i] |= cpu_flags;
58285e8718SLeon Romanovsky }
59da4c3c73SJérôme Glisse return 0;
60da4c3c73SJérôme Glisse }
61da4c3c73SJérôme Glisse
625504ed29SJérôme Glisse /*
63f8c888a3SChristoph Hellwig * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
64d2e8d551SRalph Campbell * @addr: range virtual start address (inclusive)
655504ed29SJérôme Glisse * @end: range virtual end address (exclusive)
66a3eb13c1SJason Gunthorpe * @required_fault: HMM_NEED_* flags
675504ed29SJérôme Glisse * @walk: mm_walk structure
68f8c888a3SChristoph Hellwig * Return: -EBUSY after page fault, or page fault error
695504ed29SJérôme Glisse *
705504ed29SJérôme Glisse * This function will be called whenever pmd_none() or pte_none() returns true,
715504ed29SJérôme Glisse * or whenever there is no page directory covering the virtual address range.
725504ed29SJérôme Glisse */
hmm_vma_fault(unsigned long addr,unsigned long end,unsigned int required_fault,struct mm_walk * walk)73f8c888a3SChristoph Hellwig static int hmm_vma_fault(unsigned long addr, unsigned long end,
74a3eb13c1SJason Gunthorpe unsigned int required_fault, struct mm_walk *walk)
75da4c3c73SJérôme Glisse {
7674eee180SJérôme Glisse struct hmm_vma_walk *hmm_vma_walk = walk->private;
775a0c38d3SChristoph Hellwig struct vm_area_struct *vma = walk->vma;
785a0c38d3SChristoph Hellwig unsigned int fault_flags = FAULT_FLAG_REMOTE;
79da4c3c73SJérôme Glisse
80a3eb13c1SJason Gunthorpe WARN_ON_ONCE(!required_fault);
8174eee180SJérôme Glisse hmm_vma_walk->last = addr;
8263d5066fSJérôme Glisse
83a3eb13c1SJason Gunthorpe if (required_fault & HMM_NEED_WRITE_FAULT) {
845a0c38d3SChristoph Hellwig if (!(vma->vm_flags & VM_WRITE))
85c18ce674SRalph Campbell return -EPERM;
865a0c38d3SChristoph Hellwig fault_flags |= FAULT_FLAG_WRITE;
8774eee180SJérôme Glisse }
8874eee180SJérôme Glisse
8953bfe17fSJason Gunthorpe for (; addr < end; addr += PAGE_SIZE)
90bce617edSPeter Xu if (handle_mm_fault(vma, addr, fault_flags, NULL) &
91bce617edSPeter Xu VM_FAULT_ERROR)
925a0c38d3SChristoph Hellwig return -EFAULT;
9353bfe17fSJason Gunthorpe return -EBUSY;
942aee09d8SJérôme Glisse }
952aee09d8SJérôme Glisse
hmm_pte_need_fault(const struct hmm_vma_walk * hmm_vma_walk,unsigned long pfn_req_flags,unsigned long cpu_flags)96a3eb13c1SJason Gunthorpe static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
972733ea14SJason Gunthorpe unsigned long pfn_req_flags,
982733ea14SJason Gunthorpe unsigned long cpu_flags)
992aee09d8SJérôme Glisse {
100f88a1e90SJérôme Glisse struct hmm_range *range = hmm_vma_walk->range;
101f88a1e90SJérôme Glisse
102023a019aSJérôme Glisse /*
103023a019aSJérôme Glisse * So we not only consider the individual per page request we also
104023a019aSJérôme Glisse * consider the default flags requested for the range. The API can
105d2e8d551SRalph Campbell * be used 2 ways. The first one where the HMM user coalesces
106d2e8d551SRalph Campbell * multiple page faults into one request and sets flags per pfn for
107d2e8d551SRalph Campbell * those faults. The second one where the HMM user wants to pre-
108023a019aSJérôme Glisse * fault a range with specific flags. For the latter one it is a
109023a019aSJérôme Glisse * waste to have the user pre-fill the pfn arrays with a default
110023a019aSJérôme Glisse * flags value.
111023a019aSJérôme Glisse */
1122733ea14SJason Gunthorpe pfn_req_flags &= range->pfn_flags_mask;
1132733ea14SJason Gunthorpe pfn_req_flags |= range->default_flags;
114023a019aSJérôme Glisse
1152aee09d8SJérôme Glisse /* We aren't ask to do anything ... */
1162733ea14SJason Gunthorpe if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
117a3eb13c1SJason Gunthorpe return 0;
118f88a1e90SJérôme Glisse
119f88a1e90SJérôme Glisse /* Need to write fault ? */
1202733ea14SJason Gunthorpe if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
1212733ea14SJason Gunthorpe !(cpu_flags & HMM_PFN_WRITE))
122a3eb13c1SJason Gunthorpe return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;
123a3eb13c1SJason Gunthorpe
124a3eb13c1SJason Gunthorpe /* If CPU page table is not valid then we need to fault */
1252733ea14SJason Gunthorpe if (!(cpu_flags & HMM_PFN_VALID))
126a3eb13c1SJason Gunthorpe return HMM_NEED_FAULT;
127a3eb13c1SJason Gunthorpe return 0;
1282aee09d8SJérôme Glisse }
1292aee09d8SJérôme Glisse
130a3eb13c1SJason Gunthorpe static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk * hmm_vma_walk,const unsigned long hmm_pfns[],unsigned long npages,unsigned long cpu_flags)131a3eb13c1SJason Gunthorpe hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
1322733ea14SJason Gunthorpe const unsigned long hmm_pfns[], unsigned long npages,
1332733ea14SJason Gunthorpe unsigned long cpu_flags)
1342aee09d8SJérôme Glisse {
1356bfef2f9SJason Gunthorpe struct hmm_range *range = hmm_vma_walk->range;
136a3eb13c1SJason Gunthorpe unsigned int required_fault = 0;
1372aee09d8SJérôme Glisse unsigned long i;
1382aee09d8SJérôme Glisse
1396bfef2f9SJason Gunthorpe /*
1406bfef2f9SJason Gunthorpe * If the default flags do not request to fault pages, and the mask does
1416bfef2f9SJason Gunthorpe * not allow for individual pages to be faulted, then
1426bfef2f9SJason Gunthorpe * hmm_pte_need_fault() will always return 0.
1436bfef2f9SJason Gunthorpe */
1446bfef2f9SJason Gunthorpe if (!((range->default_flags | range->pfn_flags_mask) &
1452733ea14SJason Gunthorpe HMM_PFN_REQ_FAULT))
146a3eb13c1SJason Gunthorpe return 0;
1472aee09d8SJérôme Glisse
1482aee09d8SJérôme Glisse for (i = 0; i < npages; ++i) {
1492733ea14SJason Gunthorpe required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
1502733ea14SJason Gunthorpe cpu_flags);
151a3eb13c1SJason Gunthorpe if (required_fault == HMM_NEED_ALL_BITS)
152a3eb13c1SJason Gunthorpe return required_fault;
1532aee09d8SJérôme Glisse }
154a3eb13c1SJason Gunthorpe return required_fault;
1552aee09d8SJérôme Glisse }
1562aee09d8SJérôme Glisse
/*
 * Handle [addr, end) when nothing is mapped there.
 *
 * If a fault is required: fault the range in when a VMA exists, otherwise
 * fail with -EFAULT. If no fault is required: fill the slots with 0 when a
 * VMA exists and with HMM_PFN_ERROR when there is none.
 */
static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     __always_unused int depth, struct mm_walk *walk)
{
	struct hmm_vma_walk *state = walk->private;
	struct hmm_range *range = state->range;
	const unsigned long first = (addr - range->start) >> PAGE_SHIFT;
	const unsigned long count = (end - addr) >> PAGE_SHIFT;
	unsigned int need;

	need = hmm_range_need_fault(state, &range->hmm_pfns[first], count, 0);
	if (need)
		return walk->vma ? hmm_vma_fault(addr, end, need, walk) :
				   -EFAULT;

	return hmm_pfns_fill(addr, end, range,
			     walk->vma ? 0 : HMM_PFN_ERROR);
}
1802aee09d8SJérôme Glisse
hmm_pfn_flags_order(unsigned long order)1813b50a6e5SRalph Campbell static inline unsigned long hmm_pfn_flags_order(unsigned long order)
1823b50a6e5SRalph Campbell {
1833b50a6e5SRalph Campbell return order << HMM_PFN_ORDER_SHIFT;
1843b50a6e5SRalph Campbell }
1853b50a6e5SRalph Campbell
186188cb385SAndy Shevchenko #ifdef CONFIG_TRANSPARENT_HUGEPAGE
pmd_to_hmm_pfn_flags(struct hmm_range * range,pmd_t pmd)1872733ea14SJason Gunthorpe static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
1882733ea14SJason Gunthorpe pmd_t pmd)
1892aee09d8SJérôme Glisse {
1902aee09d8SJérôme Glisse if (pmd_protnone(pmd))
1912aee09d8SJérôme Glisse return 0;
1923b50a6e5SRalph Campbell return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
1933b50a6e5SRalph Campbell HMM_PFN_VALID) |
1943b50a6e5SRalph Campbell hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
195da4c3c73SJérôme Glisse }
196da4c3c73SJérôme Glisse
hmm_vma_handle_pmd(struct mm_walk * walk,unsigned long addr,unsigned long end,unsigned long hmm_pfns[],pmd_t pmd)1979d3973d6SChristoph Hellwig static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
1982733ea14SJason Gunthorpe unsigned long end, unsigned long hmm_pfns[],
1992733ea14SJason Gunthorpe pmd_t pmd)
2009d3973d6SChristoph Hellwig {
20153f5c3f4SJérôme Glisse struct hmm_vma_walk *hmm_vma_walk = walk->private;
202f88a1e90SJérôme Glisse struct hmm_range *range = hmm_vma_walk->range;
2032aee09d8SJérôme Glisse unsigned long pfn, npages, i;
204a3eb13c1SJason Gunthorpe unsigned int required_fault;
2052733ea14SJason Gunthorpe unsigned long cpu_flags;
20653f5c3f4SJérôme Glisse
2072aee09d8SJérôme Glisse npages = (end - addr) >> PAGE_SHIFT;
208f88a1e90SJérôme Glisse cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
209a3eb13c1SJason Gunthorpe required_fault =
2102733ea14SJason Gunthorpe hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
211a3eb13c1SJason Gunthorpe if (required_fault)
212a3eb13c1SJason Gunthorpe return hmm_vma_fault(addr, end, required_fault, walk);
21353f5c3f4SJérôme Glisse
214309f9a4fSChristoph Hellwig pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
215285e8718SLeon Romanovsky for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
216285e8718SLeon Romanovsky hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
217285e8718SLeon Romanovsky hmm_pfns[i] |= pfn | cpu_flags;
218285e8718SLeon Romanovsky }
21953f5c3f4SJérôme Glisse return 0;
22053f5c3f4SJérôme Glisse }
2219d3973d6SChristoph Hellwig #else /* CONFIG_TRANSPARENT_HUGEPAGE */
2229d3973d6SChristoph Hellwig /* stub to allow the code below to compile */
2239d3973d6SChristoph Hellwig int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
2242733ea14SJason Gunthorpe unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
2259d3973d6SChristoph Hellwig #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
22653f5c3f4SJérôme Glisse
/*
 * Translate a pte into hmm pfn flags: 0 when the pte is none, not present
 * or protnone; otherwise HMM_PFN_VALID, with HMM_PFN_WRITE added when the
 * pte is writable. @range is not used.
 */
static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
						 pte_t pte)
{
	if (pte_none(pte))
		return 0;
	if (!pte_present(pte))
		return 0;
	if (pte_protnone(pte))
		return 0;

	if (pte_write(pte))
		return HMM_PFN_VALID | HMM_PFN_WRITE;
	return HMM_PFN_VALID;
}
2342aee09d8SJérôme Glisse
/*
 * Handle the single pte at @addr and store the result in *@hmm_pfn.
 *
 * Returns 0 with the slot updated and @ptep still mapped. Every non-zero
 * return path has already called pte_unmap(): -EBUSY after waiting on a
 * migration entry, -EFAULT for entries that cannot be resolved, or
 * whatever hmm_vma_fault() returns when the page had to be faulted.
 */
static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      unsigned long *hmm_pfn)
{
	struct hmm_vma_walk *state = walk->private;
	struct hmm_range *range = state->range;
	pte_t pte = ptep_get(ptep);
	uint64_t req_bits = *hmm_pfn;
	uint64_t out_bits = 0;
	unsigned long cpu_flags;
	unsigned int need;

	/*
	 * Only the UFFD WP marker needs special treatment here: any other
	 * marker ends up as a fault error that is handled correctly.
	 */
	if (pte_none(pte) || pte_is_uffd_wp_marker(pte)) {
		need = hmm_pte_need_fault(state, req_bits, 0);
		if (!need)
			goto out;
		goto fault;
	}

	if (!pte_present(pte)) {
		const softleaf_t entry = softleaf_from_pte(pte);

		/*
		 * Device private pages owned by the caller are reported as
		 * they are; they must not be faulted in.
		 */
		if (softleaf_is_device_private(entry) &&
		    page_pgmap(softleaf_to_page(entry))->owner ==
		    range->dev_private_owner) {
			cpu_flags = HMM_PFN_VALID;
			if (softleaf_is_device_private_write(entry))
				cpu_flags |= HMM_PFN_WRITE;
			out_bits = softleaf_to_pfn(entry) | cpu_flags;
			goto out;
		}

		need = hmm_pte_need_fault(state, req_bits, 0);
		if (!need)
			goto out;

		/* These entry kinds are resolved by faulting the page. */
		if (softleaf_is_swap(entry) ||
		    softleaf_is_device_private(entry) ||
		    softleaf_is_device_exclusive(entry))
			goto fault;

		if (softleaf_is_migration(entry)) {
			pte_unmap(ptep);
			state->last = addr;
			migration_entry_wait(walk->mm, pmdp, addr);
			return -EBUSY;
		}

		/* Anything else is an error. */
		pte_unmap(ptep);
		return -EFAULT;
	}

	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	need = hmm_pte_need_fault(state, req_bits, cpu_flags);
	if (need)
		goto fault;

	/*
	 * Every architecture provides a struct page for the zero page, so
	 * it is reported like a normal page.
	 */
	if (vm_normal_page(walk->vma, addr, pte) ||
	    is_zero_pfn(pte_pfn(pte))) {
		out_bits = pte_pfn(pte) | cpu_flags;
		goto out;
	}

	/* Neither a normal page nor the zero page. */
	if (hmm_pte_need_fault(state, req_bits, 0)) {
		pte_unmap(ptep);
		return -EFAULT;
	}
	out_bits = HMM_PFN_ERROR;

out:
	*hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | out_bits;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_fault(addr, end, need, walk);
}
33353f5c3f4SJérôme Glisse
33410b9feeeSFrancois Dugast #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
hmm_vma_handle_absent_pmd(struct mm_walk * walk,unsigned long start,unsigned long end,unsigned long * hmm_pfns,pmd_t pmd)33510b9feeeSFrancois Dugast static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
33610b9feeeSFrancois Dugast unsigned long end, unsigned long *hmm_pfns,
33710b9feeeSFrancois Dugast pmd_t pmd)
33810b9feeeSFrancois Dugast {
33910b9feeeSFrancois Dugast struct hmm_vma_walk *hmm_vma_walk = walk->private;
34010b9feeeSFrancois Dugast struct hmm_range *range = hmm_vma_walk->range;
34110b9feeeSFrancois Dugast unsigned long npages = (end - start) >> PAGE_SHIFT;
3420ac881efSLorenzo Stoakes const softleaf_t entry = softleaf_from_pmd(pmd);
34310b9feeeSFrancois Dugast unsigned long addr = start;
34410b9feeeSFrancois Dugast unsigned int required_fault;
34510b9feeeSFrancois Dugast
3460ac881efSLorenzo Stoakes if (softleaf_is_device_private(entry) &&
3470ac881efSLorenzo Stoakes softleaf_to_folio(entry)->pgmap->owner ==
34810b9feeeSFrancois Dugast range->dev_private_owner) {
34910b9feeeSFrancois Dugast unsigned long cpu_flags = HMM_PFN_VALID |
35010b9feeeSFrancois Dugast hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
3510ac881efSLorenzo Stoakes unsigned long pfn = softleaf_to_pfn(entry);
35210b9feeeSFrancois Dugast unsigned long i;
35310b9feeeSFrancois Dugast
3540ac881efSLorenzo Stoakes if (softleaf_is_device_private_write(entry))
35510b9feeeSFrancois Dugast cpu_flags |= HMM_PFN_WRITE;
35610b9feeeSFrancois Dugast
35710b9feeeSFrancois Dugast /*
35810b9feeeSFrancois Dugast * Fully populate the PFN list though subsequent PFNs could be
35910b9feeeSFrancois Dugast * inferred, because drivers which are not yet aware of large
36010b9feeeSFrancois Dugast * folios probably do not support sparsely populated PFN lists.
36110b9feeeSFrancois Dugast */
36210b9feeeSFrancois Dugast for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
36310b9feeeSFrancois Dugast hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
36410b9feeeSFrancois Dugast hmm_pfns[i] |= pfn | cpu_flags;
36510b9feeeSFrancois Dugast }
36610b9feeeSFrancois Dugast
36710b9feeeSFrancois Dugast return 0;
36810b9feeeSFrancois Dugast }
36910b9feeeSFrancois Dugast
37010b9feeeSFrancois Dugast required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
37110b9feeeSFrancois Dugast npages, 0);
37210b9feeeSFrancois Dugast if (required_fault) {
3730ac881efSLorenzo Stoakes if (softleaf_is_device_private(entry))
37410b9feeeSFrancois Dugast return hmm_vma_fault(addr, end, required_fault, walk);
37510b9feeeSFrancois Dugast else
37610b9feeeSFrancois Dugast return -EFAULT;
37710b9feeeSFrancois Dugast }
37810b9feeeSFrancois Dugast
37910b9feeeSFrancois Dugast return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
38010b9feeeSFrancois Dugast }
38110b9feeeSFrancois Dugast #else
hmm_vma_handle_absent_pmd(struct mm_walk * walk,unsigned long start,unsigned long end,unsigned long * hmm_pfns,pmd_t pmd)38210b9feeeSFrancois Dugast static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
38310b9feeeSFrancois Dugast unsigned long end, unsigned long *hmm_pfns,
38410b9feeeSFrancois Dugast pmd_t pmd)
38510b9feeeSFrancois Dugast {
38610b9feeeSFrancois Dugast struct hmm_vma_walk *hmm_vma_walk = walk->private;
38710b9feeeSFrancois Dugast struct hmm_range *range = hmm_vma_walk->range;
38810b9feeeSFrancois Dugast unsigned long npages = (end - start) >> PAGE_SHIFT;
38910b9feeeSFrancois Dugast
39010b9feeeSFrancois Dugast if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
39110b9feeeSFrancois Dugast return -EFAULT;
39210b9feeeSFrancois Dugast return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
39310b9feeeSFrancois Dugast }
39410b9feeeSFrancois Dugast #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
39510b9feeeSFrancois Dugast
/*
 * pmd-level walk callback for [start, end).
 *
 * Dispatches on the state of the pmd: none, migration entry, other
 * non-present entry, transparent huge, bad, or a regular page table whose
 * ptes are then handled one by one.
 */
static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *state = walk->private;
	struct hmm_range *range = state->range;
	unsigned long *slot =
		&range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
	const unsigned long count = (end - start) >> PAGE_SHIFT;
	unsigned long va = start;
	pte_t *pte_cursor;
	pmd_t pmd;

retry:
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, -1, walk);

	if (thp_migration_supported() && pmd_is_migration_entry(pmd)) {
		if (!hmm_range_need_fault(state, slot, count, 0))
			return hmm_pfns_fill(start, end, range, 0);

		/* A fault is wanted: wait for the migration, then retry. */
		state->last = va;
		pmd_migration_entry_wait(walk->mm, pmdp);
		return -EBUSY;
	}

	if (!pmd_present(pmd))
		return hmm_vma_handle_absent_pmd(walk, start, end, slot, pmd);

	if (pmd_trans_huge(pmd)) {
		/*
		 * pmd_lock is not needed: should another thread split the
		 * huge pmd, that event reaches us through the mmu_notifier
		 * callback.
		 *
		 * Simply re-read the pmd, make sure it is still a
		 * transparent huge one and compute the pfn values from it.
		 */
		pmd = pmdp_get_lockless(pmdp);
		if (!pmd_trans_huge(pmd))
			goto retry;

		return hmm_vma_handle_pmd(walk, va, end, slot, pmd);
	}

	/*
	 * The none, migration, non-present and transparent huge cases are
	 * all dealt with above. What is left is either a valid pmd pointing
	 * to a pte directory, or a bad pmd that will not recover.
	 */
	if (pmd_bad(pmd)) {
		if (hmm_range_need_fault(state, slot, count, 0))
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	pte_cursor = pte_offset_map(pmdp, va);
	if (!pte_cursor)
		goto retry;

	while (va < end) {
		int ret = hmm_vma_handle_pte(walk, va, end, pmdp, pte_cursor,
					     slot);

		/* hmm_vma_handle_pte() did pte_unmap() */
		if (ret)
			return ret;

		va += PAGE_SIZE;
		pte_cursor++;
		slot++;
	}
	pte_unmap(pte_cursor - 1);
	return 0;
}
472da4c3c73SJérôme Glisse
#if defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/*
 * Translate a pud into hmm pfn flags: 0 when not present, otherwise
 * HMM_PFN_VALID plus the PUD order, with HMM_PFN_WRITE added when the pud
 * is writable. @range is not used.
 */
static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
						 pud_t pud)
{
	unsigned long flags;

	if (!pud_present(pud))
		return 0;

	flags = HMM_PFN_VALID | hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
	if (pud_write(pud))
		flags |= HMM_PFN_WRITE;
	return flags;
}

/*
 * pud-level walk callback for [start, end).
 *
 * Returns 0 right away when pud_trans_huge_lock() yields no lock. A leaf
 * pud is reported directly (or faulted in when required); for a non-leaf
 * pud the walker is asked to descend via ACTION_SUBTREE.
 */
static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	struct hmm_vma_walk *state = walk->private;
	struct hmm_range *range = state->range;
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
	unsigned long *slots;
	unsigned long count, cpu_flags, pfn, idx;
	unsigned int need;
	pud_t pud;

	if (!ptl)
		return 0;

	/* Normally we don't want to split the huge page */
	walk->action = ACTION_CONTINUE;

	pud = pudp_get(pudp);
	if (!pud_present(pud)) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole(start, end, -1, walk);
	}

	if (!pud_leaf(pud)) {
		/* Ask for the PUD to be split */
		walk->action = ACTION_SUBTREE;
		spin_unlock(ptl);
		return 0;
	}

	slots = &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
	count = (end - start) >> PAGE_SHIFT;

	cpu_flags = pud_to_hmm_pfn_flags(range, pud);
	need = hmm_range_need_fault(state, slots, count, cpu_flags);
	if (need) {
		/* The lock must be dropped before faulting. */
		spin_unlock(ptl);
		return hmm_vma_fault(start, end, need, walk);
	}

	/* pfn of @start: base pfn of the pud plus the page offset within it. */
	pfn = pud_pfn(pud) + ((start & ~PUD_MASK) >> PAGE_SHIFT);
	for (idx = 0; idx < count; idx++, pfn++)
		slots[idx] = (slots[idx] & HMM_PFN_INOUT_FLAGS) |
			     pfn | cpu_flags;

	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_pud	NULL
#endif
541992de9a8SJérôme Glisse
#ifdef CONFIG_HUGETLB_PAGE
/*
 * hugetlb walk callback for [start, end), which is backed by the huge pte
 * @pte. The fault decision is taken from the request flags of the first
 * slot only; on success every slot of the range is filled from the same
 * huge pte.
 */
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	struct hmm_vma_walk *state = walk->private;
	struct hmm_range *range = state->range;
	struct vm_area_struct *vma = walk->vma;
	unsigned long *slot;
	unsigned long cpu_flags, pfn, va;
	unsigned int need;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(walk->mm, start, pte);

	slot = &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry) |
		    hmm_pfn_flags_order(huge_page_order(hstate_vma(vma)));
	need = hmm_pte_need_fault(state, *slot, cpu_flags);
	if (need) {
		int ret;

		spin_unlock(ptl);
		hugetlb_vma_unlock_read(vma);
		/*
		 * The vma lock has to be released before hmm_vma_fault() is
		 * called, because that call may itself take and drop the vma
		 * lock and would otherwise deadlock. Doing so is safe:
		 * neither pte nor ptl is used again here once the vma lock
		 * has been dropped.
		 */
		ret = hmm_vma_fault(start, end, need, walk);
		hugetlb_vma_lock_read(vma);
		return ret;
	}

	/* pfn of @start: base pfn of the huge page plus the offset within it. */
	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (va = start; va < end; va += PAGE_SIZE, slot++, pfn++)
		*slot = (*slot & HMM_PFN_INOUT_FLAGS) | pfn | cpu_flags;

	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */
59663d5066fSJérôme Glisse
hmm_vma_walk_test(unsigned long start,unsigned long end,struct mm_walk * walk)597d28c2c9aSRalph Campbell static int hmm_vma_walk_test(unsigned long start, unsigned long end,
598d28c2c9aSRalph Campbell struct mm_walk *walk)
59933cd47dcSJérôme Glisse {
600d28c2c9aSRalph Campbell struct hmm_vma_walk *hmm_vma_walk = walk->private;
601d28c2c9aSRalph Campbell struct hmm_range *range = hmm_vma_walk->range;
602d28c2c9aSRalph Campbell struct vm_area_struct *vma = walk->vma;
603d28c2c9aSRalph Campbell
60487c01d57SAlistair Popple if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
605a3eb13c1SJason Gunthorpe vma->vm_flags & VM_READ)
606a3eb13c1SJason Gunthorpe return 0;
607a3eb13c1SJason Gunthorpe
608d28c2c9aSRalph Campbell /*
609a3eb13c1SJason Gunthorpe * vma ranges that don't have struct page backing them or map I/O
610a3eb13c1SJason Gunthorpe * devices directly cannot be handled by hmm_range_fault().
611c2579c9cSJason Gunthorpe *
612d28c2c9aSRalph Campbell * If the vma does not allow read access, then assume that it does not
613c2579c9cSJason Gunthorpe * allow write access either. HMM does not support architectures that
614c2579c9cSJason Gunthorpe * allow write without read.
615a3eb13c1SJason Gunthorpe *
616a3eb13c1SJason Gunthorpe * If a fault is requested for an unsupported range then it is a hard
617a3eb13c1SJason Gunthorpe * failure.
618d28c2c9aSRalph Campbell */
619a3eb13c1SJason Gunthorpe if (hmm_range_need_fault(hmm_vma_walk,
6202733ea14SJason Gunthorpe range->hmm_pfns +
621d28c2c9aSRalph Campbell ((start - range->start) >> PAGE_SHIFT),
622a3eb13c1SJason Gunthorpe (end - start) >> PAGE_SHIFT, 0))
623d28c2c9aSRalph Campbell return -EFAULT;
624d28c2c9aSRalph Campbell
625c2579c9cSJason Gunthorpe hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
626d28c2c9aSRalph Campbell
627d28c2c9aSRalph Campbell /* Skip this vma and continue processing the next vma. */
628d28c2c9aSRalph Campbell return 1;
629d28c2c9aSRalph Campbell }
630d28c2c9aSRalph Campbell
6317b86ac33SChristoph Hellwig static const struct mm_walk_ops hmm_walk_ops = {
6327b86ac33SChristoph Hellwig .pud_entry = hmm_vma_walk_pud,
6337b86ac33SChristoph Hellwig .pmd_entry = hmm_vma_walk_pmd,
6347b86ac33SChristoph Hellwig .pte_hole = hmm_vma_walk_hole,
6357b86ac33SChristoph Hellwig .hugetlb_entry = hmm_vma_walk_hugetlb_entry,
636d28c2c9aSRalph Campbell .test_walk = hmm_vma_walk_test,
63749b06385SSuren Baghdasaryan .walk_lock = PGWALK_RDLOCK,
6387b86ac33SChristoph Hellwig };
6397b86ac33SChristoph Hellwig
6409a4903e4SChristoph Hellwig /**
6419a4903e4SChristoph Hellwig * hmm_range_fault - try to fault some address in a virtual address range
642f970b977SJason Gunthorpe * @range: argument structure
64373231612SJérôme Glisse *
644be957c88SJason Gunthorpe * Returns 0 on success or one of the following error codes:
6459a4903e4SChristoph Hellwig *
6469a4903e4SChristoph Hellwig * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma
6479a4903e4SChristoph Hellwig * (e.g., device file vma).
64873231612SJérôme Glisse * -ENOMEM: Out of memory.
6499a4903e4SChristoph Hellwig * -EPERM: Invalid permission (e.g., asking for write and range is read
6509a4903e4SChristoph Hellwig * only).
6519a4903e4SChristoph Hellwig * -EBUSY: The range has been invalidated and the caller needs to wait for
6529a4903e4SChristoph Hellwig * the invalidation to finish.
653f970b977SJason Gunthorpe * -EFAULT: A page was requested to be valid and could not be made valid
654f970b977SJason Gunthorpe * ie it has no backing VMA or it is illegal to access
65574eee180SJérôme Glisse *
656f970b977SJason Gunthorpe * This is similar to get_user_pages(), except that it can read the page tables
657f970b977SJason Gunthorpe * without mutating them (ie causing faults).
65874eee180SJérôme Glisse */
hmm_range_fault(struct hmm_range * range)659be957c88SJason Gunthorpe int hmm_range_fault(struct hmm_range *range)
66074eee180SJérôme Glisse {
661d28c2c9aSRalph Campbell struct hmm_vma_walk hmm_vma_walk = {
662d28c2c9aSRalph Campbell .range = range,
663d28c2c9aSRalph Campbell .last = range->start,
664d28c2c9aSRalph Campbell };
665a22dd506SJason Gunthorpe struct mm_struct *mm = range->notifier->mm;
66674eee180SJérôme Glisse int ret;
66774eee180SJérôme Glisse
66842fc5414SMichel Lespinasse mmap_assert_locked(mm);
669a3e0d41cSJérôme Glisse
670a3e0d41cSJérôme Glisse do {
671a3e0d41cSJérôme Glisse /* If range is no longer valid force retry. */
672a22dd506SJason Gunthorpe if (mmu_interval_check_retry(range->notifier,
673a22dd506SJason Gunthorpe range->notifier_seq))
6742bcbeaefSChristoph Hellwig return -EBUSY;
675d28c2c9aSRalph Campbell ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
6767b86ac33SChristoph Hellwig &hmm_walk_ops, &hmm_vma_walk);
677be957c88SJason Gunthorpe /*
678be957c88SJason Gunthorpe * When -EBUSY is returned the loop restarts with
679be957c88SJason Gunthorpe * hmm_vma_walk.last set to an address that has not been stored
680be957c88SJason Gunthorpe * in pfns. All entries < last in the pfn array are set to their
681be957c88SJason Gunthorpe * output, and all >= are still at their input values.
682be957c88SJason Gunthorpe */
683d28c2c9aSRalph Campbell } while (ret == -EBUSY);
68473231612SJérôme Glisse return ret;
68574eee180SJérôme Glisse }
68673231612SJérôme Glisse EXPORT_SYMBOL(hmm_range_fault);
6878cad4713SLeon Romanovsky
6888cad4713SLeon Romanovsky /**
6898cad4713SLeon Romanovsky * hmm_dma_map_alloc - Allocate HMM map structure
6908cad4713SLeon Romanovsky * @dev: device to allocate structure for
6918cad4713SLeon Romanovsky * @map: HMM map to allocate
6928cad4713SLeon Romanovsky * @nr_entries: number of entries in the map
6938cad4713SLeon Romanovsky * @dma_entry_size: size of the DMA entry in the map
6948cad4713SLeon Romanovsky *
6958cad4713SLeon Romanovsky * Allocate the HMM map structure and all the lists it contains.
6968cad4713SLeon Romanovsky * Return 0 on success, -ENOMEM on failure.
6978cad4713SLeon Romanovsky */
hmm_dma_map_alloc(struct device * dev,struct hmm_dma_map * map,size_t nr_entries,size_t dma_entry_size)6988cad4713SLeon Romanovsky int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
6998cad4713SLeon Romanovsky size_t nr_entries, size_t dma_entry_size)
7008cad4713SLeon Romanovsky {
7018cad4713SLeon Romanovsky bool dma_need_sync = false;
7028cad4713SLeon Romanovsky bool use_iova;
7038cad4713SLeon Romanovsky
704259e9bd0SDaisuke Matsuda WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size));
7058cad4713SLeon Romanovsky
7068cad4713SLeon Romanovsky /*
7078cad4713SLeon Romanovsky * The HMM API violates our normal DMA buffer ownership rules and can't
7088cad4713SLeon Romanovsky * transfer buffer ownership. The dma_addressing_limited() check is a
7098cad4713SLeon Romanovsky * best approximation to ensure no swiotlb buffering happens.
7108cad4713SLeon Romanovsky */
7118cad4713SLeon Romanovsky #ifdef CONFIG_DMA_NEED_SYNC
7128cad4713SLeon Romanovsky dma_need_sync = !dev->dma_skip_sync;
7138cad4713SLeon Romanovsky #endif /* CONFIG_DMA_NEED_SYNC */
7148cad4713SLeon Romanovsky if (dma_need_sync || dma_addressing_limited(dev))
7158cad4713SLeon Romanovsky return -EOPNOTSUPP;
7168cad4713SLeon Romanovsky
7178cad4713SLeon Romanovsky map->dma_entry_size = dma_entry_size;
7188cad4713SLeon Romanovsky map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
7198cad4713SLeon Romanovsky GFP_KERNEL | __GFP_NOWARN);
7208cad4713SLeon Romanovsky if (!map->pfn_list)
7218cad4713SLeon Romanovsky return -ENOMEM;
7228cad4713SLeon Romanovsky
7238cad4713SLeon Romanovsky use_iova = dma_iova_try_alloc(dev, &map->state, 0,
7248cad4713SLeon Romanovsky nr_entries * PAGE_SIZE);
7258cad4713SLeon Romanovsky if (!use_iova && dma_need_unmap(dev)) {
72669050f8dSKees Cook map->dma_list = kvzalloc_objs(*map->dma_list, nr_entries,
7278cad4713SLeon Romanovsky GFP_KERNEL | __GFP_NOWARN);
7288cad4713SLeon Romanovsky if (!map->dma_list)
7298cad4713SLeon Romanovsky goto err_dma;
7308cad4713SLeon Romanovsky }
7318cad4713SLeon Romanovsky return 0;
7328cad4713SLeon Romanovsky
7338cad4713SLeon Romanovsky err_dma:
7348cad4713SLeon Romanovsky kvfree(map->pfn_list);
7358cad4713SLeon Romanovsky return -ENOMEM;
7368cad4713SLeon Romanovsky }
7378cad4713SLeon Romanovsky EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
7388cad4713SLeon Romanovsky
7398cad4713SLeon Romanovsky /**
7408cad4713SLeon Romanovsky * hmm_dma_map_free - iFree HMM map structure
7418cad4713SLeon Romanovsky * @dev: device to free structure from
7428cad4713SLeon Romanovsky * @map: HMM map containing the various lists and state
7438cad4713SLeon Romanovsky *
7448cad4713SLeon Romanovsky * Free the HMM map structure and all the lists it contains.
7458cad4713SLeon Romanovsky */
hmm_dma_map_free(struct device * dev,struct hmm_dma_map * map)7468cad4713SLeon Romanovsky void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
7478cad4713SLeon Romanovsky {
7488cad4713SLeon Romanovsky if (dma_use_iova(&map->state))
7498cad4713SLeon Romanovsky dma_iova_free(dev, &map->state);
7508cad4713SLeon Romanovsky kvfree(map->pfn_list);
7518cad4713SLeon Romanovsky kvfree(map->dma_list);
7528cad4713SLeon Romanovsky }
7538cad4713SLeon Romanovsky EXPORT_SYMBOL_GPL(hmm_dma_map_free);
7548cad4713SLeon Romanovsky
7558cad4713SLeon Romanovsky /**
7568cad4713SLeon Romanovsky * hmm_dma_map_pfn - Map a physical HMM page to DMA address
7578cad4713SLeon Romanovsky * @dev: Device to map the page for
7588cad4713SLeon Romanovsky * @map: HMM map
7598cad4713SLeon Romanovsky * @idx: Index into the PFN and dma address arrays
7608cad4713SLeon Romanovsky * @p2pdma_state: PCI P2P state.
7618cad4713SLeon Romanovsky *
7628cad4713SLeon Romanovsky * dma_alloc_iova() allocates IOVA based on the size specified by their use in
7638cad4713SLeon Romanovsky * iova->size. Call this function after IOVA allocation to link whole @page
7648cad4713SLeon Romanovsky * to get the DMA address. Note that very first call to this function
7658cad4713SLeon Romanovsky * will have @offset set to 0 in the IOVA space allocated from
7668cad4713SLeon Romanovsky * dma_alloc_iova(). For subsequent calls to this function on same @iova,
7678cad4713SLeon Romanovsky * @offset needs to be advanced by the caller with the size of previous
7688cad4713SLeon Romanovsky * page that was linked + DMA address returned for the previous page that was
7698cad4713SLeon Romanovsky * linked by this function.
7708cad4713SLeon Romanovsky */
hmm_dma_map_pfn(struct device * dev,struct hmm_dma_map * map,size_t idx,struct pci_p2pdma_map_state * p2pdma_state)7718cad4713SLeon Romanovsky dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
7728cad4713SLeon Romanovsky size_t idx,
7738cad4713SLeon Romanovsky struct pci_p2pdma_map_state *p2pdma_state)
7748cad4713SLeon Romanovsky {
7758cad4713SLeon Romanovsky struct dma_iova_state *state = &map->state;
7768cad4713SLeon Romanovsky dma_addr_t *dma_addrs = map->dma_list;
7778cad4713SLeon Romanovsky unsigned long *pfns = map->pfn_list;
7788cad4713SLeon Romanovsky struct page *page = hmm_pfn_to_page(pfns[idx]);
7798cad4713SLeon Romanovsky phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
7808cad4713SLeon Romanovsky size_t offset = idx * map->dma_entry_size;
781*f5ebf241SLeon Romanovsky unsigned long attrs = DMA_ATTR_REQUIRE_COHERENT;
7828cad4713SLeon Romanovsky dma_addr_t dma_addr;
7838cad4713SLeon Romanovsky int ret;
7848cad4713SLeon Romanovsky
7858cad4713SLeon Romanovsky if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
7868cad4713SLeon Romanovsky !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
7878cad4713SLeon Romanovsky /*
7888cad4713SLeon Romanovsky * We are in this flow when there is a need to resync flags,
7898cad4713SLeon Romanovsky * for example when page was already linked in prefetch call
7908cad4713SLeon Romanovsky * with READ flag and now we need to add WRITE flag
7918cad4713SLeon Romanovsky *
7928cad4713SLeon Romanovsky * This page was already programmed to HW and we don't want/need
7938cad4713SLeon Romanovsky * to unlink and link it again just to resync flags.
7948cad4713SLeon Romanovsky */
7958cad4713SLeon Romanovsky if (dma_use_iova(state))
7968cad4713SLeon Romanovsky return state->addr + offset;
7978cad4713SLeon Romanovsky
7988cad4713SLeon Romanovsky /*
7998cad4713SLeon Romanovsky * Without dma_need_unmap, the dma_addrs array is NULL, thus we
8008cad4713SLeon Romanovsky * need to regenerate the address below even if there already
8018cad4713SLeon Romanovsky * was a mapping. But !dma_need_unmap implies that the
8028cad4713SLeon Romanovsky * mapping stateless, so this is fine.
8038cad4713SLeon Romanovsky */
8048cad4713SLeon Romanovsky if (dma_need_unmap(dev))
8058cad4713SLeon Romanovsky return dma_addrs[idx];
8068cad4713SLeon Romanovsky
8078cad4713SLeon Romanovsky /* Continue to remapping */
8088cad4713SLeon Romanovsky }
8098cad4713SLeon Romanovsky
8108cad4713SLeon Romanovsky switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
8118cad4713SLeon Romanovsky case PCI_P2PDMA_MAP_NONE:
8128cad4713SLeon Romanovsky break;
8138cad4713SLeon Romanovsky case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
814ec818caeSLeon Romanovsky attrs |= DMA_ATTR_MMIO;
8158cad4713SLeon Romanovsky pfns[idx] |= HMM_PFN_P2PDMA;
8168cad4713SLeon Romanovsky break;
8178cad4713SLeon Romanovsky case PCI_P2PDMA_MAP_BUS_ADDR:
8188cad4713SLeon Romanovsky pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
819d4504262SLeon Romanovsky return pci_p2pdma_bus_addr_map(p2pdma_state->mem, paddr);
8208cad4713SLeon Romanovsky default:
8218cad4713SLeon Romanovsky return DMA_MAPPING_ERROR;
8228cad4713SLeon Romanovsky }
8238cad4713SLeon Romanovsky
8248cad4713SLeon Romanovsky if (dma_use_iova(state)) {
8258cad4713SLeon Romanovsky ret = dma_iova_link(dev, state, paddr, offset,
8268cad4713SLeon Romanovsky map->dma_entry_size, DMA_BIDIRECTIONAL,
8278cad4713SLeon Romanovsky attrs);
8288cad4713SLeon Romanovsky if (ret)
8298cad4713SLeon Romanovsky goto error;
8308cad4713SLeon Romanovsky
8318cad4713SLeon Romanovsky ret = dma_iova_sync(dev, state, offset, map->dma_entry_size);
8328cad4713SLeon Romanovsky if (ret) {
8338cad4713SLeon Romanovsky dma_iova_unlink(dev, state, offset, map->dma_entry_size,
8348cad4713SLeon Romanovsky DMA_BIDIRECTIONAL, attrs);
8358cad4713SLeon Romanovsky goto error;
8368cad4713SLeon Romanovsky }
8378cad4713SLeon Romanovsky
8388cad4713SLeon Romanovsky dma_addr = state->addr + offset;
8398cad4713SLeon Romanovsky } else {
8408cad4713SLeon Romanovsky if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
8418cad4713SLeon Romanovsky goto error;
8428cad4713SLeon Romanovsky
843e1d69da2SLeon Romanovsky dma_addr = dma_map_phys(dev, paddr, map->dma_entry_size,
844ec818caeSLeon Romanovsky DMA_BIDIRECTIONAL, attrs);
8458cad4713SLeon Romanovsky if (dma_mapping_error(dev, dma_addr))
8468cad4713SLeon Romanovsky goto error;
8478cad4713SLeon Romanovsky
8488cad4713SLeon Romanovsky if (dma_need_unmap(dev))
8498cad4713SLeon Romanovsky dma_addrs[idx] = dma_addr;
8508cad4713SLeon Romanovsky }
8518cad4713SLeon Romanovsky pfns[idx] |= HMM_PFN_DMA_MAPPED;
8528cad4713SLeon Romanovsky return dma_addr;
8538cad4713SLeon Romanovsky error:
8548cad4713SLeon Romanovsky pfns[idx] &= ~HMM_PFN_P2PDMA;
8558cad4713SLeon Romanovsky return DMA_MAPPING_ERROR;
8568cad4713SLeon Romanovsky
8578cad4713SLeon Romanovsky }
8588cad4713SLeon Romanovsky EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
8598cad4713SLeon Romanovsky
8608cad4713SLeon Romanovsky /**
8618cad4713SLeon Romanovsky * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
8628cad4713SLeon Romanovsky * @dev: Device to unmap the page from
8638cad4713SLeon Romanovsky * @map: HMM map
8648cad4713SLeon Romanovsky * @idx: Index of the PFN to unmap
8658cad4713SLeon Romanovsky *
8668cad4713SLeon Romanovsky * Returns true if the PFN was mapped and has been unmapped, false otherwise.
8678cad4713SLeon Romanovsky */
hmm_dma_unmap_pfn(struct device * dev,struct hmm_dma_map * map,size_t idx)8688cad4713SLeon Romanovsky bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
8698cad4713SLeon Romanovsky {
8708cad4713SLeon Romanovsky const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED;
8718cad4713SLeon Romanovsky struct dma_iova_state *state = &map->state;
8728cad4713SLeon Romanovsky dma_addr_t *dma_addrs = map->dma_list;
8738cad4713SLeon Romanovsky unsigned long *pfns = map->pfn_list;
874*f5ebf241SLeon Romanovsky unsigned long attrs = DMA_ATTR_REQUIRE_COHERENT;
8758cad4713SLeon Romanovsky
8768cad4713SLeon Romanovsky if ((pfns[idx] & valid_dma) != valid_dma)
8778cad4713SLeon Romanovsky return false;
8788cad4713SLeon Romanovsky
879ec818caeSLeon Romanovsky if (pfns[idx] & HMM_PFN_P2PDMA)
880ec818caeSLeon Romanovsky attrs |= DMA_ATTR_MMIO;
881ec818caeSLeon Romanovsky
8828cad4713SLeon Romanovsky if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
8838cad4713SLeon Romanovsky ; /* no need to unmap bus address P2P mappings */
884ec818caeSLeon Romanovsky else if (dma_use_iova(state))
8858cad4713SLeon Romanovsky dma_iova_unlink(dev, state, idx * map->dma_entry_size,
8868cad4713SLeon Romanovsky map->dma_entry_size, DMA_BIDIRECTIONAL, attrs);
887ec818caeSLeon Romanovsky else if (dma_need_unmap(dev))
888e1d69da2SLeon Romanovsky dma_unmap_phys(dev, dma_addrs[idx], map->dma_entry_size,
889ec818caeSLeon Romanovsky DMA_BIDIRECTIONAL, attrs);
8908cad4713SLeon Romanovsky
8918cad4713SLeon Romanovsky pfns[idx] &=
8928cad4713SLeon Romanovsky ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
8938cad4713SLeon Romanovsky return true;
8948cad4713SLeon Romanovsky }
8958cad4713SLeon Romanovsky EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
896