1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * fs/dax.c - Direct Access filesystem code
4 * Copyright (c) 2013-2014 Intel Corporation
5 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
6 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
7 */
8
9 #include <linux/atomic.h>
10 #include <linux/blkdev.h>
11 #include <linux/buffer_head.h>
12 #include <linux/dax.h>
13 #include <linux/fs.h>
14 #include <linux/highmem.h>
15 #include <linux/memcontrol.h>
16 #include <linux/mm.h>
17 #include <linux/mutex.h>
18 #include <linux/sched.h>
19 #include <linux/sched/signal.h>
20 #include <linux/uio.h>
21 #include <linux/vmstat.h>
22 #include <linux/sizes.h>
23 #include <linux/mmu_notifier.h>
24 #include <linux/iomap.h>
25 #include <linux/rmap.h>
26 #include <linux/pgalloc.h>
27
28 #define CREATE_TRACE_POINTS
29 #include <trace/events/fs_dax.h>
30
/*
 * The wait table has 2^12 = 4096 buckets, the same size as the per-zone
 * page wait tables.
 */
#define DAX_WAIT_TABLE_BITS	12
#define DAX_WAIT_TABLE_ENTRIES	(1 << DAX_WAIT_TABLE_BITS)

/*
 * PG_PMD_NR is the number of base pages covered by one PMD; PG_PMD_COLOUR
 * masks the 'colour' (i.e. the low bits) of a page offset within a PMD.
 */
#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)
#define PG_PMD_COLOUR	(PG_PMD_NR - 1)
38
39 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
40
init_dax_wait_table(void)41 static int __init init_dax_wait_table(void)
42 {
43 int i;
44
45 for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
46 init_waitqueue_head(wait_table + i);
47 return 0;
48 }
49 fs_initcall(init_dax_wait_table);
50
/*
 * DAX page cache entries are stored as XArray value entries so that they
 * can never be confused with page pointers. The low DAX_SHIFT (four) bits
 * of the value are flags; the pfn lives above them:
 *
 *   DAX_LOCKED    - the entry is locked
 *   DAX_PMD       - the entry is PMD sized; otherwise it is PAGE_SIZE
 *   DAX_ZERO_PAGE - the entry represents a zero page
 *   DAX_EMPTY     - the entry is empty and exists only for locking
 *
 * An entry with neither DAX_ZERO_PAGE nor DAX_EMPTY set is a normal DAX
 * entry backed by a filesystem block allocation.
 */
#define DAX_SHIFT	4
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)
66
dax_to_pfn(void * entry)67 static unsigned long dax_to_pfn(void *entry)
68 {
69 return xa_to_value(entry) >> DAX_SHIFT;
70 }
71
/* Return the folio containing the page that a DAX entry's pfn refers to. */
static struct folio *dax_to_folio(void *entry)
{
	struct page *page = pfn_to_page(dax_to_pfn(entry));

	return page_folio(page);
}
76
dax_make_entry(unsigned long pfn,unsigned long flags)77 static void *dax_make_entry(unsigned long pfn, unsigned long flags)
78 {
79 return xa_mk_value(flags | (pfn << DAX_SHIFT));
80 }
81
dax_is_locked(void * entry)82 static bool dax_is_locked(void *entry)
83 {
84 return xa_to_value(entry) & DAX_LOCKED;
85 }
86
dax_entry_order(void * entry)87 static unsigned int dax_entry_order(void *entry)
88 {
89 if (xa_to_value(entry) & DAX_PMD)
90 return PMD_ORDER;
91 return 0;
92 }
93
dax_is_pmd_entry(void * entry)94 static unsigned long dax_is_pmd_entry(void *entry)
95 {
96 return xa_to_value(entry) & DAX_PMD;
97 }
98
dax_is_pte_entry(void * entry)99 static bool dax_is_pte_entry(void *entry)
100 {
101 return !(xa_to_value(entry) & DAX_PMD);
102 }
103
dax_is_zero_entry(void * entry)104 static int dax_is_zero_entry(void *entry)
105 {
106 return xa_to_value(entry) & DAX_ZERO_PAGE;
107 }
108
dax_is_empty_entry(void * entry)109 static int dax_is_empty_entry(void *entry)
110 {
111 return xa_to_value(entry) & DAX_EMPTY;
112 }
113
114 /*
115 * true if the entry that was found is of a smaller order than the entry
116 * we were looking for
117 */
dax_is_conflict(void * entry)118 static bool dax_is_conflict(void *entry)
119 {
120 return entry == XA_RETRY_ENTRY;
121 }
122
123 /*
124 * DAX page cache entry locking
125 */
126 struct exceptional_entry_key {
127 struct xarray *xa;
128 pgoff_t entry_start;
129 };
130
131 struct wait_exceptional_entry_queue {
132 wait_queue_entry_t wait;
133 struct exceptional_entry_key key;
134 };
135
/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake every waiter in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode {
	WAKE_ALL = 0,
	WAKE_NEXT = 1,
};
145
dax_entry_waitqueue(struct xa_state * xas,void * entry,struct exceptional_entry_key * key)146 static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
147 void *entry, struct exceptional_entry_key *key)
148 {
149 unsigned long hash;
150 unsigned long index = xas->xa_index;
151
152 /*
153 * If 'entry' is a PMD, align the 'index' that we use for the wait
154 * queue to the start of that PMD. This ensures that all offsets in
155 * the range covered by the PMD map to the same bit lock.
156 */
157 if (dax_is_pmd_entry(entry))
158 index &= ~PG_PMD_COLOUR;
159 key->xa = xas->xa;
160 key->entry_start = index;
161
162 hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
163 return wait_table + hash;
164 }
165
/*
 * Wake callback installed on DAX waiters. The waiter is woken (through
 * autoremove_wake_function()) only when the key passed by the waker
 * matches the waiter's own key; otherwise 0 is returned.
 */
static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
{
	struct wait_exceptional_entry_queue *waiter =
		container_of(wait, struct wait_exceptional_entry_queue, wait);
	struct exceptional_entry_key *woken = keyp;

	if (woken->xa == waiter->key.xa &&
	    woken->entry_start == waiter->key.entry_start)
		return autoremove_wake_function(wait, mode, sync, NULL);

	return 0;
}
178
179 /*
180 * @entry may no longer be the entry at the index in the mapping.
181 * The important information it's conveying is whether the entry at
182 * this index used to be a PMD entry.
183 */
dax_wake_entry(struct xa_state * xas,void * entry,enum dax_wake_mode mode)184 static void dax_wake_entry(struct xa_state *xas, void *entry,
185 enum dax_wake_mode mode)
186 {
187 struct exceptional_entry_key key;
188 wait_queue_head_t *wq;
189
190 wq = dax_entry_waitqueue(xas, entry, &key);
191
192 /*
193 * Checking for locked entry and prepare_to_wait_exclusive() happens
194 * under the i_pages lock, ditto for entry handling in our callers.
195 * So at this point all tasks that could have seen our entry locked
196 * must be in the waitqueue and the following check will see them.
197 */
198 if (waitqueue_active(wq))
199 __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
200 }
201
202 /*
203 * Look up entry in page cache, wait for it to become unlocked if it
204 * is a DAX entry and return it. The caller must subsequently call
205 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
206 * if it did. The entry returned may have a larger order than @order.
207 * If @order is larger than the order of the entry found in i_pages, this
208 * function returns a dax_is_conflict entry.
209 *
210 * Must be called with the i_pages lock held.
211 */
get_next_unlocked_entry(struct xa_state * xas,unsigned int order)212 static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order)
213 {
214 void *entry;
215 struct wait_exceptional_entry_queue ewait;
216 wait_queue_head_t *wq;
217
218 init_wait(&ewait.wait);
219 ewait.wait.func = wake_exceptional_entry_func;
220
221 for (;;) {
222 entry = xas_find_conflict(xas);
223 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
224 return entry;
225 if (dax_entry_order(entry) < order)
226 return XA_RETRY_ENTRY;
227 if (!dax_is_locked(entry))
228 return entry;
229
230 wq = dax_entry_waitqueue(xas, entry, &ewait.key);
231 prepare_to_wait_exclusive(wq, &ewait.wait,
232 TASK_UNINTERRUPTIBLE);
233 xas_unlock_irq(xas);
234 xas_reset(xas);
235 schedule();
236 finish_wait(wq, &ewait.wait);
237 xas_lock_irq(xas);
238 }
239 }
240
241 /*
242 * Wait for the given entry to become unlocked. Caller must hold the i_pages
243 * lock and call either put_unlocked_entry() if it did not lock the entry or
244 * dax_unlock_entry() if it did. Returns an unlocked entry if still present.
245 */
wait_entry_unlocked_exclusive(struct xa_state * xas,void * entry)246 static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry)
247 {
248 struct wait_exceptional_entry_queue ewait;
249 wait_queue_head_t *wq;
250
251 init_wait(&ewait.wait);
252 ewait.wait.func = wake_exceptional_entry_func;
253
254 while (unlikely(dax_is_locked(entry))) {
255 wq = dax_entry_waitqueue(xas, entry, &ewait.key);
256 prepare_to_wait_exclusive(wq, &ewait.wait,
257 TASK_UNINTERRUPTIBLE);
258 xas_reset(xas);
259 xas_unlock_irq(xas);
260 schedule();
261 finish_wait(wq, &ewait.wait);
262 xas_lock_irq(xas);
263 entry = xas_load(xas);
264 }
265
266 if (xa_is_internal(entry))
267 return NULL;
268
269 return entry;
270 }
271
272 /*
273 * The only thing keeping the address space around is the i_pages lock
274 * (it's cycled in clear_inode() after removing the entries from i_pages)
275 * After we call xas_unlock_irq(), we cannot touch xas->xa.
276 */
wait_entry_unlocked(struct xa_state * xas,void * entry)277 static void wait_entry_unlocked(struct xa_state *xas, void *entry)
278 {
279 struct wait_exceptional_entry_queue ewait;
280 wait_queue_head_t *wq;
281
282 init_wait(&ewait.wait);
283 ewait.wait.func = wake_exceptional_entry_func;
284
285 wq = dax_entry_waitqueue(xas, entry, &ewait.key);
286 /*
287 * Unlike get_next_unlocked_entry() there is no guarantee that this
288 * path ever successfully retrieves an unlocked entry before an
289 * inode dies. Perform a non-exclusive wait in case this path
290 * never successfully performs its own wake up.
291 */
292 prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
293 xas_unlock_irq(xas);
294 schedule();
295 finish_wait(wq, &ewait.wait);
296 }
297
/*
 * Release an entry that was looked up but not locked by the caller: wake
 * waiters according to @mode. Nothing is done for a NULL or conflict entry.
 */
static void put_unlocked_entry(struct xa_state *xas, void *entry,
			       enum dax_wake_mode mode)
{
	if (!entry || dax_is_conflict(entry))
		return;

	dax_wake_entry(xas, entry, mode);
}
304
305 /*
306 * We used the xa_state to get the entry, but then we locked the entry and
307 * dropped the xa_lock, so we know the xa_state is stale and must be reset
308 * before use.
309 */
dax_unlock_entry(struct xa_state * xas,void * entry)310 static void dax_unlock_entry(struct xa_state *xas, void *entry)
311 {
312 void *old;
313
314 BUG_ON(dax_is_locked(entry));
315 xas_reset(xas);
316 xas_lock_irq(xas);
317 old = xas_store(xas, entry);
318 xas_unlock_irq(xas);
319 BUG_ON(!dax_is_locked(old));
320 dax_wake_entry(xas, entry, WAKE_NEXT);
321 }
322
323 /*
324 * Return: The entry stored at this location before it was locked.
325 */
dax_lock_entry(struct xa_state * xas,void * entry)326 static void *dax_lock_entry(struct xa_state *xas, void *entry)
327 {
328 unsigned long v = xa_to_value(entry);
329 return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
330 }
331
dax_entry_size(void * entry)332 static unsigned long dax_entry_size(void *entry)
333 {
334 if (dax_is_zero_entry(entry))
335 return 0;
336 else if (dax_is_empty_entry(entry))
337 return 0;
338 else if (dax_is_pmd_entry(entry))
339 return PMD_SIZE;
340 else
341 return PAGE_SIZE;
342 }
343
344 /*
345 * A DAX folio is considered shared if it has no mapping set and ->share (which
346 * shares the ->index field) is non-zero. Note this may return false even if the
347 * page is shared between multiple files but has not yet actually been mapped
348 * into multiple address spaces.
349 */
dax_folio_is_shared(struct folio * folio)350 static inline bool dax_folio_is_shared(struct folio *folio)
351 {
352 return !folio->mapping && folio->share;
353 }
354
355 /*
356 * When it is called by dax_insert_entry(), the shared flag will indicate
357 * whether this entry is shared by multiple files. If the page has not
358 * previously been associated with any mappings the ->mapping and ->index
359 * fields will be set. If it has already been associated with a mapping
360 * the mapping will be cleared and the share count set. It's then up to
361 * reverse map users like memory_failure() to call back into the filesystem to
362 * recover ->mapping and ->index information. For example by implementing
363 * dax_holder_operations.
364 */
dax_folio_make_shared(struct folio * folio)365 static void dax_folio_make_shared(struct folio *folio)
366 {
367 /*
368 * folio is not currently shared so mark it as shared by clearing
369 * folio->mapping.
370 */
371 folio->mapping = NULL;
372
373 /*
374 * folio has previously been mapped into one address space so set the
375 * share count.
376 */
377 folio->share = 1;
378 }
379
380 /**
381 * dax_folio_reset_order - Reset a compound DAX folio to order-0 pages
382 * @folio: The folio to reset
383 *
384 * Splits a compound folio back into individual order-0 pages,
385 * clearing compound state and restoring pgmap pointers.
386 *
387 * Returns: the original folio order (0 if already order-0)
388 */
dax_folio_reset_order(struct folio * folio)389 int dax_folio_reset_order(struct folio *folio)
390 {
391 struct dev_pagemap *pgmap = page_pgmap(&folio->page);
392 int order = folio_order(folio);
393
394 /*
395 * DAX maintains the invariant that folio->share != 0 only when
396 * folio->mapping == NULL (enforced by dax_folio_make_shared()).
397 * Equivalently: folio->mapping != NULL implies folio->share == 0.
398 * Callers ensure share has been decremented to zero before
399 * calling here, so unconditionally clearing both fields is
400 * correct.
401 */
402 folio->mapping = NULL;
403 folio->share = 0;
404
405 if (!order) {
406 /*
407 * Restore pgmap explicitly even for order-0 folios. For the
408 * dax_folio_put() caller this is a no-op (same value), but
409 * fsdev_clear_folio_state() may call this on folios that
410 * were previously compound and need pgmap re-established.
411 */
412 folio->pgmap = pgmap;
413 return 0;
414 }
415
416 folio_reset_order(folio);
417
418 for (int i = 0; i < (1UL << order); i++) {
419 struct page *page = folio_page(folio, i);
420 struct folio *f = (struct folio *)page;
421
422 ClearPageHead(page);
423 clear_compound_head(page);
424 f->mapping = NULL;
425 f->share = 0;
426 f->pgmap = pgmap;
427 }
428
429 return order;
430 }
431 EXPORT_SYMBOL_GPL(dax_folio_reset_order);
432
dax_folio_put(struct folio * folio)433 static inline unsigned long dax_folio_put(struct folio *folio)
434 {
435 unsigned long ref;
436 int order, i;
437
438 if (!dax_folio_is_shared(folio))
439 ref = 0;
440 else
441 ref = --folio->share;
442
443 if (ref)
444 return ref;
445
446 order = dax_folio_reset_order(folio);
447
448 /* Debug check: verify refcounts are zero for all sub-folios */
449 for (i = 0; i < (1UL << order); i++) {
450 struct page *page = folio_page(folio, i);
451
452 WARN_ON_ONCE(folio_ref_count((struct folio *)page));
453 }
454
455 return ref;
456 }
457
dax_folio_init(void * entry)458 static void dax_folio_init(void *entry)
459 {
460 struct folio *folio = dax_to_folio(entry);
461 int order = dax_entry_order(entry);
462
463 /*
464 * Folio should have been split back to order-0 pages in
465 * dax_folio_put() when they were removed from their
466 * final mapping.
467 */
468 WARN_ON_ONCE(folio_order(folio));
469
470 if (order > 0) {
471 prep_compound_page(&folio->page, order);
472 if (order > 1)
473 INIT_LIST_HEAD(&folio->_deferred_list);
474 WARN_ON_ONCE(folio_ref_count(folio));
475 }
476 }
477
/*
 * Associate the folio behind @entry with @mapping.
 *
 * @entry:   DAX entry being inserted
 * @mapping: address space the entry belongs to
 * @vma:     VMA of the fault, used to compute the file index
 * @address: faulting address; aligned down to the entry size
 * @shared:  whether the entry is shared by multiple files
 *
 * Zero-page and empty entries have no backing folio and are ignored. For
 * a shared entry whose folio already has a mapping or is already shared,
 * the folio is switched to (or kept in) shared mode and its share count is
 * incremented. Otherwise the folio is initialised and ->mapping/->index
 * are set.
 */
static void dax_associate_entry(void *entry, struct address_space *mapping,
				struct vm_area_struct *vma,
				unsigned long address, bool shared)
{
	unsigned long size, index;
	struct folio *folio;

	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
		return;

	/*
	 * Only look the folio up once we know the entry carries a real pfn:
	 * empty entries are created with pfn 0, and translating that into a
	 * struct page / folio would read memmap for a meaningless pfn.
	 */
	size = dax_entry_size(entry);
	folio = dax_to_folio(entry);

	index = linear_page_index(vma, address & ~(size - 1));
	if (shared && (folio->mapping || dax_folio_is_shared(folio))) {
		if (folio->mapping)
			dax_folio_make_shared(folio);

		WARN_ON_ONCE(!folio->share);
		WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio));
		folio->share++;
	} else {
		WARN_ON_ONCE(folio->mapping);
		dax_folio_init(entry);
		/* Look the folio up again now that it has been initialised. */
		folio = dax_to_folio(entry);
		folio->mapping = mapping;
		folio->index = index;
	}
}
504
/*
 * Drop the association between the folio behind @entry and its mapping by
 * calling dax_folio_put(). Zero-page and empty entries have no backing
 * folio and are ignored.
 *
 * @mapping and @trunc are currently unused by this function.
 */
static void dax_disassociate_entry(void *entry, struct address_space *mapping,
				bool trunc)
{
	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
		return;

	/*
	 * The folio lookup is deliberately done after the check above:
	 * empty entries are created with pfn 0, and translating that into a
	 * struct page / folio would read memmap for a meaningless pfn.
	 */
	dax_folio_put(dax_to_folio(entry));
}
515
dax_busy_page(void * entry)516 static struct page *dax_busy_page(void *entry)
517 {
518 struct folio *folio = dax_to_folio(entry);
519
520 if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
521 return NULL;
522
523 if (folio_ref_count(folio) - folio_mapcount(folio))
524 return &folio->page;
525 else
526 return NULL;
527 }
528
529 /**
530 * dax_lock_folio - Lock the DAX entry corresponding to a folio
531 * @folio: The folio whose entry we want to lock
532 *
533 * Context: Process context.
534 * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
535 * not be locked.
536 */
dax_lock_folio(struct folio * folio)537 dax_entry_t dax_lock_folio(struct folio *folio)
538 {
539 XA_STATE(xas, NULL, 0);
540 void *entry;
541
542 /* Ensure folio->mapping isn't freed while we look at it */
543 rcu_read_lock();
544 for (;;) {
545 struct address_space *mapping = READ_ONCE(folio->mapping);
546
547 entry = NULL;
548 if (!mapping || !dax_mapping(mapping))
549 break;
550
551 /*
552 * In the device-dax case there's no need to lock, a
553 * struct dev_pagemap pin is sufficient to keep the
554 * inode alive, and we assume we have dev_pagemap pin
555 * otherwise we would not have a valid pfn_to_page()
556 * translation.
557 */
558 entry = (void *)~0UL;
559 if (S_ISCHR(mapping->host->i_mode))
560 break;
561
562 xas.xa = &mapping->i_pages;
563 xas_lock_irq(&xas);
564 if (mapping != folio->mapping) {
565 xas_unlock_irq(&xas);
566 continue;
567 }
568 xas_set(&xas, folio->index);
569 entry = xas_load(&xas);
570 if (dax_is_locked(entry)) {
571 rcu_read_unlock();
572 wait_entry_unlocked(&xas, entry);
573 rcu_read_lock();
574 continue;
575 }
576 dax_lock_entry(&xas, entry);
577 xas_unlock_irq(&xas);
578 break;
579 }
580 rcu_read_unlock();
581 return (dax_entry_t)entry;
582 }
583
/*
 * Undo dax_lock_folio(): store the entry encoded in @cookie back, unlocked.
 * Nothing is done when the mapping's host inode is a character device,
 * since dax_lock_folio() does not lock an entry in that case.
 */
void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
	struct address_space *mapping = folio->mapping;
	XA_STATE(xas, &mapping->i_pages, folio->index);

	if (!S_ISCHR(mapping->host->i_mode))
		dax_unlock_entry(&xas, (void *)cookie);
}
594
595 /*
596 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
597 * @mapping: the file's mapping whose entry we want to lock
598 * @index: the offset within this file
599 * @page: output the dax page corresponding to this dax entry
600 *
601 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
602 * could not be locked.
603 */
dax_lock_mapping_entry(struct address_space * mapping,pgoff_t index,struct page ** page)604 dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
605 struct page **page)
606 {
607 XA_STATE(xas, NULL, 0);
608 void *entry;
609
610 rcu_read_lock();
611 for (;;) {
612 entry = NULL;
613 if (!dax_mapping(mapping))
614 break;
615
616 xas.xa = &mapping->i_pages;
617 xas_lock_irq(&xas);
618 xas_set(&xas, index);
619 entry = xas_load(&xas);
620 if (dax_is_locked(entry)) {
621 rcu_read_unlock();
622 wait_entry_unlocked(&xas, entry);
623 rcu_read_lock();
624 continue;
625 }
626 if (!entry ||
627 dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
628 /*
629 * Because we are looking for entry from file's mapping
630 * and index, so the entry may not be inserted for now,
631 * or even a zero/empty entry. We don't think this is
632 * an error case. So, return a special value and do
633 * not output @page.
634 */
635 entry = (void *)~0UL;
636 } else {
637 *page = pfn_to_page(dax_to_pfn(entry));
638 dax_lock_entry(&xas, entry);
639 }
640 xas_unlock_irq(&xas);
641 break;
642 }
643 rcu_read_unlock();
644 return (dax_entry_t)entry;
645 }
646
/*
 * Undo dax_lock_mapping_entry(). The special cookie ~0UL means no entry
 * was locked, so there is nothing to unlock.
 */
void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
		dax_entry_t cookie)
{
	XA_STATE(xas, &mapping->i_pages, index);

	if (cookie != ~0UL)
		dax_unlock_entry(&xas, (void *)cookie);
}
657
658 /*
659 * Find page cache entry at given index. If it is a DAX entry, return it
660 * with the entry locked. If the page cache doesn't contain an entry at
661 * that index, add a locked empty entry.
662 *
663 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
664 * either return that locked entry or will return VM_FAULT_FALLBACK.
665 * This will happen if there are any PTE entries within the PMD range
666 * that we are requesting.
667 *
668 * We always favor PTE entries over PMD entries. There isn't a flow where we
669 * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
670 * insertion will fail if it finds any PTE entries already in the tree, and a
671 * PTE insertion will cause an existing PMD entry to be unmapped and
672 * downgraded to PTE entries. This happens for both PMD zero pages as
673 * well as PMD empty entries.
674 *
675 * The exception to this downgrade path is for PMD entries that have
676 * real storage backing them. We will leave these real PMD entries in
677 * the tree, and PTE writes will simply dirty the entire PMD entry.
678 *
679 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
680 * persistent memory the benefit is doubtful. We can add that later if we can
681 * show it helps.
682 *
683 * On error, this function does not return an ERR_PTR. Instead it returns
684 * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
685 * overlap with xarray value entries.
686 */
grab_mapping_entry(struct xa_state * xas,struct address_space * mapping,unsigned int order)687 static void *grab_mapping_entry(struct xa_state *xas,
688 struct address_space *mapping, unsigned int order)
689 {
690 unsigned long index = xas->xa_index;
691 bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
692 void *entry;
693
694 retry:
695 pmd_downgrade = false;
696 xas_lock_irq(xas);
697 entry = get_next_unlocked_entry(xas, order);
698
699 if (entry) {
700 if (dax_is_conflict(entry))
701 goto fallback;
702 if (!xa_is_value(entry)) {
703 xas_set_err(xas, -EIO);
704 goto out_unlock;
705 }
706
707 if (order == 0) {
708 if (dax_is_pmd_entry(entry) &&
709 (dax_is_zero_entry(entry) ||
710 dax_is_empty_entry(entry))) {
711 pmd_downgrade = true;
712 }
713 }
714 }
715
716 if (pmd_downgrade) {
717 /*
718 * Make sure 'entry' remains valid while we drop
719 * the i_pages lock.
720 */
721 dax_lock_entry(xas, entry);
722
723 /*
724 * Besides huge zero pages the only other thing that gets
725 * downgraded are empty entries which don't need to be
726 * unmapped.
727 */
728 if (dax_is_zero_entry(entry)) {
729 xas_unlock_irq(xas);
730 unmap_mapping_pages(mapping,
731 xas->xa_index & ~PG_PMD_COLOUR,
732 PG_PMD_NR, false);
733 xas_reset(xas);
734 xas_lock_irq(xas);
735 }
736
737 dax_disassociate_entry(entry, mapping, false);
738 xas_store(xas, NULL); /* undo the PMD join */
739 dax_wake_entry(xas, entry, WAKE_ALL);
740 mapping->nrpages -= PG_PMD_NR;
741 entry = NULL;
742 xas_set(xas, index);
743 }
744
745 if (entry) {
746 dax_lock_entry(xas, entry);
747 } else {
748 unsigned long flags = DAX_EMPTY;
749
750 if (order > 0)
751 flags |= DAX_PMD;
752 entry = dax_make_entry(0, flags);
753 dax_lock_entry(xas, entry);
754 if (xas_error(xas))
755 goto out_unlock;
756 mapping->nrpages += 1UL << order;
757 }
758
759 out_unlock:
760 xas_unlock_irq(xas);
761 if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
762 goto retry;
763 if (xas->xa_node == XA_ERROR(-ENOMEM))
764 return xa_mk_internal(VM_FAULT_OOM);
765 if (xas_error(xas))
766 return xa_mk_internal(VM_FAULT_SIGBUS);
767 return entry;
768 fallback:
769 xas_unlock_irq(xas);
770 return xa_mk_internal(VM_FAULT_FALLBACK);
771 }
772
773 /**
774 * dax_layout_busy_page_range - find first pinned page in @mapping
775 * @mapping: address space to scan for a page with ref count > 1
776 * @start: Starting offset. Page containing 'start' is included.
777 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
778 * pages from 'start' till the end of file are included.
779 *
780 * DAX requires ZONE_DEVICE mapped pages. These pages are never
781 * 'onlined' to the page allocator so they are considered idle when
782 * page->count == 1. A filesystem uses this interface to determine if
783 * any page in the mapping is busy, i.e. for DMA, or other
784 * get_user_pages() usages.
785 *
786 * It is expected that the filesystem is holding locks to block the
787 * establishment of new mappings in this address_space. I.e. it expects
788 * to be able to run unmap_mapping_range() and subsequently not race
789 * mapping_mapped() becoming true.
790 */
dax_layout_busy_page_range(struct address_space * mapping,loff_t start,loff_t end)791 struct page *dax_layout_busy_page_range(struct address_space *mapping,
792 loff_t start, loff_t end)
793 {
794 void *entry;
795 unsigned int scanned = 0;
796 struct page *page = NULL;
797 pgoff_t start_idx = start >> PAGE_SHIFT;
798 pgoff_t end_idx;
799 XA_STATE(xas, &mapping->i_pages, start_idx);
800
801 if (!dax_mapping(mapping))
802 return NULL;
803
804 /* If end == LLONG_MAX, all pages from start to till end of file */
805 if (end == LLONG_MAX)
806 end_idx = ULONG_MAX;
807 else
808 end_idx = end >> PAGE_SHIFT;
809 /*
810 * If we race get_user_pages_fast() here either we'll see the
811 * elevated page count in the iteration and wait, or
812 * get_user_pages_fast() will see that the page it took a reference
813 * against is no longer mapped in the page tables and bail to the
814 * get_user_pages() slow path. The slow path is protected by
815 * pte_lock() and pmd_lock(). New references are not taken without
816 * holding those locks, and unmap_mapping_pages() will not zero the
817 * pte or pmd without holding the respective lock, so we are
818 * guaranteed to either see new references or prevent new
819 * references from being established.
820 */
821 unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
822
823 xas_lock_irq(&xas);
824 xas_for_each(&xas, entry, end_idx) {
825 if (WARN_ON_ONCE(!xa_is_value(entry)))
826 continue;
827 entry = wait_entry_unlocked_exclusive(&xas, entry);
828 if (entry)
829 page = dax_busy_page(entry);
830 put_unlocked_entry(&xas, entry, WAKE_NEXT);
831 if (page)
832 break;
833 if (++scanned % XA_CHECK_SCHED)
834 continue;
835
836 xas_pause(&xas);
837 xas_unlock_irq(&xas);
838 cond_resched();
839 xas_lock_irq(&xas);
840 }
841 xas_unlock_irq(&xas);
842 return page;
843 }
844 EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
845
dax_layout_busy_page(struct address_space * mapping)846 struct page *dax_layout_busy_page(struct address_space *mapping)
847 {
848 return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
849 }
850 EXPORT_SYMBOL_GPL(dax_layout_busy_page);
851
/*
 * Remove the DAX entry at @index from @mapping after waiting for it to be
 * unlocked. Unless @trunc is set, an entry tagged dirty or towrite is left
 * in place.
 *
 * Returns 1 if an entry was removed, 0 otherwise.
 */
static int __dax_invalidate_entry(struct address_space *mapping,
				  pgoff_t index, bool trunc)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *entry;
	int removed = 0;

	xas_lock_irq(&xas);
	entry = get_next_unlocked_entry(&xas, 0);
	if (entry && !WARN_ON_ONCE(!xa_is_value(entry))) {
		if (trunc ||
		    (!xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) &&
		     !xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) {
			dax_disassociate_entry(entry, mapping, trunc);
			xas_store(&xas, NULL);
			mapping->nrpages -= 1UL << dax_entry_order(entry);
			removed = 1;
		}
	}
	put_unlocked_entry(&xas, entry, WAKE_ALL);
	xas_unlock_irq(&xas);

	return removed;
}
876
/*
 * Clear the dirty and towrite tags of every entry in [@start, @end] of
 * @mapping, waiting for each entry to be unlocked first. Always returns 0.
 */
static int __dax_clear_dirty_range(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	unsigned int nr_done = 0;
	void *entry;

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end) {
		entry = wait_entry_unlocked_exclusive(&xas, entry);
		if (!entry)
			continue;

		xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);

		/* Every XA_CHECK_SCHED entries, drop the lock and reschedule. */
		if (++nr_done % XA_CHECK_SCHED == 0) {
			xas_pause(&xas);
			xas_unlock_irq(&xas);
			cond_resched();
			xas_lock_irq(&xas);
		}
	}
	xas_unlock_irq(&xas);

	return 0;
}
905
906 /*
907 * Delete DAX entry at @index from @mapping. Wait for it
908 * to be unlocked before deleting it.
909 */
dax_delete_mapping_entry(struct address_space * mapping,pgoff_t index)910 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
911 {
912 int ret = __dax_invalidate_entry(mapping, index, true);
913
914 /*
915 * This gets called from truncate / punch_hole path. As such, the caller
916 * must hold locks protecting against concurrent modifications of the
917 * page cache (usually fs-private i_mmap_sem for writing). Since the
918 * caller has seen a DAX entry for this index, we better find it
919 * at that index as well...
920 */
921 WARN_ON_ONCE(!ret);
922 return ret;
923 }
924
/*
 * Remove every DAX entry of @mapping whose index lies in the page range
 * covering [@start, @end], waiting for each entry to be unlocked first.
 * Passing LLONG_MAX as @end removes everything from @start to the end of
 * the file.
 */
void dax_delete_mapping_range(struct address_space *mapping,
				loff_t start, loff_t end)
{
	pgoff_t first = start >> PAGE_SHIFT;
	pgoff_t last;
	void *entry;
	XA_STATE(xas, &mapping->i_pages, first);

	/* end == LLONG_MAX means: everything from start to the end of file */
	last = (end == LLONG_MAX) ? ULONG_MAX : (pgoff_t)(end >> PAGE_SHIFT);

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, last) {
		if (!xa_is_value(entry))
			continue;

		entry = wait_entry_unlocked_exclusive(&xas, entry);
		if (entry) {
			dax_disassociate_entry(entry, mapping, true);
			xas_store(&xas, NULL);
			mapping->nrpages -= 1UL << dax_entry_order(entry);
			put_unlocked_entry(&xas, entry, WAKE_ALL);
		}
	}
	xas_unlock_irq(&xas);
}
EXPORT_SYMBOL_GPL(dax_delete_mapping_range);
954
wait_page_idle(struct page * page,void (cb)(struct inode *),struct inode * inode)955 static int wait_page_idle(struct page *page,
956 void (cb)(struct inode *),
957 struct inode *inode)
958 {
959 return ___wait_var_event(page, dax_page_is_idle(page),
960 TASK_INTERRUPTIBLE, 0, 0, cb(inode));
961 }
962
wait_page_idle_uninterruptible(struct page * page,struct inode * inode)963 static void wait_page_idle_uninterruptible(struct page *page,
964 struct inode *inode)
965 {
966 ___wait_var_event(page, dax_page_is_idle(page),
967 TASK_UNINTERRUPTIBLE, 0, 0, schedule());
968 }
969
970 /*
971 * Unmaps the inode and waits for any DMA to complete prior to deleting the
972 * DAX mapping entries for the range.
973 *
974 * For NOWAIT behavior, pass @cb as NULL to early-exit on first found
975 * busy page
976 */
dax_break_layout(struct inode * inode,loff_t start,loff_t end,void (cb)(struct inode *))977 int dax_break_layout(struct inode *inode, loff_t start, loff_t end,
978 void (cb)(struct inode *))
979 {
980 struct page *page;
981 int error = 0;
982
983 if (!dax_mapping(inode->i_mapping))
984 return 0;
985
986 do {
987 page = dax_layout_busy_page_range(inode->i_mapping, start, end);
988 if (!page)
989 break;
990 if (!cb) {
991 error = -ERESTARTSYS;
992 break;
993 }
994
995 error = wait_page_idle(page, cb, inode);
996 } while (error == 0);
997
998 if (!page)
999 dax_delete_mapping_range(inode->i_mapping, start, end);
1000
1001 return error;
1002 }
1003 EXPORT_SYMBOL_GPL(dax_break_layout);
1004
dax_break_layout_final(struct inode * inode)1005 void dax_break_layout_final(struct inode *inode)
1006 {
1007 struct page *page;
1008
1009 if (!dax_mapping(inode->i_mapping))
1010 return;
1011
1012 do {
1013 page = dax_layout_busy_page_range(inode->i_mapping, 0,
1014 LLONG_MAX);
1015 if (!page)
1016 break;
1017
1018 wait_page_idle_uninterruptible(page, inode);
1019 } while (true);
1020
1021 if (!page)
1022 dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
1023 }
1024 EXPORT_SYMBOL_GPL(dax_break_layout_final);
1025
/*
 * Invalidate DAX entry if it is clean.
 *
 * @mapping: address space holding the entry
 * @index:   page offset of the entry within @mapping
 *
 * Thin wrapper that calls __dax_invalidate_entry() with its last argument
 * set to false and returns that function's result unchanged.
 * NOTE(review): the "only if clean" behaviour is presumably what the false
 * argument selects - confirm against __dax_invalidate_entry().
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_entry(mapping, index, false);
}
1034
dax_iomap_pgoff(const struct iomap * iomap,loff_t pos)1035 static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
1036 {
1037 return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
1038 }
1039
copy_cow_page_dax(struct vm_fault * vmf,const struct iomap_iter * iter)1040 static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
1041 {
1042 pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
1043 void *vto, *kaddr;
1044 long rc;
1045 int id;
1046
1047 id = dax_read_lock();
1048 rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
1049 &kaddr, NULL);
1050 if (rc < 0) {
1051 dax_read_unlock(id);
1052 return rc;
1053 }
1054 vto = kmap_atomic(vmf->cow_page);
1055 copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
1056 kunmap_atomic(vto);
1057 dax_read_unlock(id);
1058 return 0;
1059 }
1060
1061 /*
1062 * MAP_SYNC on a dax mapping guarantees dirty metadata is
1063 * flushed on write-faults (non-cow), but not read-faults.
1064 */
dax_fault_is_synchronous(const struct iomap_iter * iter,struct vm_area_struct * vma)1065 static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
1066 struct vm_area_struct *vma)
1067 {
1068 return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
1069 (iter->iomap.flags & IOMAP_F_DIRTY);
1070 }
1071
/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 *
 * @xas:   xa_state positioned at the faulting index; reset and re-walked here
 * @vmf:   the fault being handled (supplies the VMA, mapping and address)
 * @iter:  iomap iteration; its flags decide write/dirty/shared handling
 * @entry: current entry value for the slot, in its unlocked form (the tree
 *         is expected to hold the same value with DAX_LOCKED set)
 * @pfn:   page frame number to encode in the new entry
 * @flags: DAX_* flags to encode in the new entry
 *
 * Returns the entry value that now describes the slot: the newly made entry
 * if it was stored, otherwise @entry unchanged.
 */
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void *entry, unsigned long pfn,
		unsigned long flags)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	void *new_entry = dax_make_entry(pfn, flags);
	bool write = iter->flags & IOMAP_WRITE;
	/* synchronous (MAP_SYNC) write faults are not tagged dirty here */
	bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
	bool shared = iter->iomap.flags & IOMAP_F_SHARED;

	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/*
	 * Existing user mappings of this range must go before the entry is
	 * replaced; this is done before the i_pages lock is taken below.
	 */
	if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
		unsigned long index = xas->xa_index;
		/* we are replacing a zero page with block mapping */
		if (dax_is_pmd_entry(entry))
			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
		else /* pte entry */
			unmap_mapping_pages(mapping, index, 1, false);
	}

	xas_reset(xas);
	xas_lock_irq(xas);
	if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		void *old;

		dax_disassociate_entry(entry, mapping, false);
		dax_associate_entry(new_entry, mapping, vmf->vma,
					vmf->address, shared);

		/*
		 * Only swap our new entry into the page cache if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the cache, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		old = dax_lock_entry(xas, new_entry);
		/* the slot must have held @entry with its lock bit set */
		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
					DAX_LOCKED));
		entry = new_entry;
	} else {
		xas_load(xas);	/* Walk the xa_state */
	}

	if (dirty)
		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

	/* writes to shared extents are additionally tagged for writeback */
	if (write && shared)
		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);

	xas_unlock_irq(xas);
	return entry;
}
1136
/*
 * Write back a single TOWRITE-tagged DAX entry.
 *
 * @xas:     xa_state positioned on the entry
 * @dax_dev: device whose caches are flushed for the entry's range
 * @mapping: address space the entry belongs to
 * @entry:   the entry found by the caller's tree walk
 *
 * The only visible caller, dax_writeback_mapping_range(), invokes this with
 * the i_pages lock held.  On the writeback path the lock is dropped while
 * user mappings are write-protected and the range is flushed, and is taken
 * again before returning, so the function returns with the lock held.
 *
 * Returns 0 on success or when there was nothing to do, -EIO if the entry
 * is not a value entry or turned out to be an empty/zero entry.
 */
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
		struct address_space *mapping, void *entry)
{
	unsigned long pfn, index, count, end;
	long ret = 0;
	struct vm_area_struct *vma;

	/*
	 * A page got tagged dirty in DAX mapping? Something is seriously
	 * wrong.
	 */
	if (WARN_ON(!xa_is_value(entry)))
		return -EIO;

	if (unlikely(dax_is_locked(entry))) {
		void *old_entry = entry;

		entry = get_next_unlocked_entry(xas, 0);

		/* Entry got punched out / reallocated? */
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			goto put_unlocked;
		/*
		 * Entry got reallocated elsewhere? No need to writeback.
		 * We have to compare pfns as we must not bail out due to
		 * difference in lockbit or entry type.
		 */
		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
			goto put_unlocked;
		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
					dax_is_zero_entry(entry))) {
			ret = -EIO;
			goto put_unlocked;
		}

		/* Another fsync thread may have already done this entry */
		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
			goto put_unlocked;
	}

	/* Lock the entry to serialize with page faults */
	dax_lock_entry(xas, entry);

	/*
	 * We can clear the tag now but we have to be careful so that concurrent
	 * dax_writeback_one() calls for the same index cannot finish before we
	 * actually flush the caches. This is achieved as the calls will look
	 * at the entry only under the i_pages lock and once they do that
	 * they will see the entry locked and wait for it to unlock.
	 */
	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
	xas_unlock_irq(xas);

	/*
	 * If dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we use needs to be
	 * aligned to the start of the PMD.
	 * This allows us to flush for PMD_SIZE and not have to worry about
	 * partial PMD writebacks.
	 */
	pfn = dax_to_pfn(entry);
	count = 1UL << dax_entry_order(entry);
	index = xas->xa_index & ~(count - 1);
	end = index + count - 1;

	/* Walk all mappings of a given index of a file and writeprotect them */
	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
		pfn_mkclean_range(pfn, count, index, vma);
		cond_resched();
	}
	i_mmap_unlock_read(mapping);

	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
	/*
	 * After we have flushed the cache, we can clear the dirty tag. There
	 * cannot be new dirty data in the pfn after the flush has completed as
	 * the pfn mappings are writeprotected and fault waits for mapping
	 * entry lock.
	 */
	xas_reset(xas);
	xas_lock_irq(xas);
	/* storing the unlocked value releases the entry lock taken above */
	xas_store(xas, entry);
	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
	dax_wake_entry(xas, entry, WAKE_NEXT);

	trace_dax_writeback_one(mapping->host, index, count);
	return ret;

 put_unlocked:
	put_unlocked_entry(xas, entry, WAKE_NEXT);
	return ret;
}
1230
/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 *
 * @mapping: address space to write back
 * @dax_dev: DAX device backing @mapping
 * @wbc:     writeback control; range_start/range_end give the byte range and
 *           only WB_SYNC_ALL requests are acted upon
 *
 * Returns 0 on success, or if the mapping is empty or @wbc is not
 * WB_SYNC_ALL; -EIO if the inode's block size is not PAGE_SIZE; otherwise
 * the first negative error from dax_writeback_one(), which is also recorded
 * on the mapping with mapping_set_error().
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct dax_device *dax_dev, struct writeback_control *wbc)
{
	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
	struct inode *inode = mapping->host;
	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
	void *entry;
	int ret = 0;
	unsigned int scanned = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	trace_dax_writeback_range(inode, xas.xa_index, end_index);

	/* the loop below only visits entries carrying the TOWRITE tag */
	tag_pages_for_writeback(mapping, xas.xa_index, end_index);

	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
		if (ret < 0) {
			mapping_set_error(mapping, ret);
			break;
		}
		if (++scanned % XA_CHECK_SCHED)
			continue;

		/*
		 * Every XA_CHECK_SCHED entries: pause the walk, drop the lock
		 * and give the scheduler a chance before continuing.
		 */
		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
1276
/*
 * Look up the kernel address and/or pfn backing @size bytes at file
 * position @pos of @iomap, under dax_read_lock().
 *
 * @kaddr: if non-NULL, passed to dax_direct_access() to receive the kernel
 *         address; a NULL address stored there is reported as -EFAULT
 * @pfnp:  if non-NULL, passed to dax_direct_access() to receive the pfn; the
 *         mapping must then cover at least @size bytes and the pfn must be
 *         aligned to @size, otherwise -EINVAL is returned
 *
 * Returns 0 on success, the negative value from dax_direct_access(),
 * -EINVAL or -EFAULT as described above.
 */
static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
		size_t size, void **kaddr, unsigned long *pfnp)
{
	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
	long avail;
	int lock_id;
	int rc = 0;

	lock_id = dax_read_lock();
	avail = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
				  DAX_ACCESS, kaddr, pfnp);
	if (avail < 0) {
		rc = avail;
	} else if (pfnp && (PFN_PHYS(avail) < size ||
			    (*pfnp & (PHYS_PFN(size)-1)))) {
		/* too short, or pfn not aligned to the requested size */
		rc = -EINVAL;
	} else if (kaddr && !*kaddr) {
		rc = -EFAULT;
	}
	dax_read_unlock(lock_id);

	return rc;
}
1310
/**
 * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
 * by copying the data before and after the range to be written.
 * @pos:	file position where the range being written starts.
 * @length:	length in bytes of the range being written.
 * @align_size:	alignment unit (either PMD_SIZE or PAGE_SIZE); the head before
 *		@pos and the tail after @pos + @length are filled in up to
 *		@align_size boundaries.
 * @srcmap:	iomap srcmap
 * @daddr:	destination address, corresponding to @pos rounded down to
 *		@align_size.
 *
 * This can be called from two places. Either during DAX write fault (page
 * aligned), to copy the length size data to daddr. Or, while doing normal DAX
 * write operation, dax_iomap_iter() might call this to do the copy of either
 * start or end unaligned address. In the latter case the rest of the copy of
 * aligned ranges is taken care by dax_iomap_iter() itself.
 * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
 * area to make sure no old data remains.
 *
 * Return: 0 on success, -EIO if a copy from the source fails, or the result
 * of dax_mem2blk_err() if the source cannot be accessed.
 */
static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
		const struct iomap *srcmap, void *daddr)
{
	loff_t head_off = pos & (align_size - 1);
	size_t size = ALIGN(head_off + length, align_size);
	loff_t end = pos + length;
	loff_t pg_end = round_up(end, align_size);
	/* copy_all is usually in page fault case */
	bool copy_all = head_off == 0 && end == pg_end;
	/*
	 * Zero the edges instead of copying when @srcmap itself carries
	 * IOMAP_F_SHARED or is IOMAP_UNWRITTEN.
	 * NOTE(review): the IOMAP_F_SHARED test presumably identifies the
	 * "no separate source extent, i.e. source is a hole" case - confirm
	 * against iomap_iter_srcmap().
	 */
	bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
			 srcmap->type == IOMAP_UNWRITTEN;
	void *saddr = NULL;
	int ret = 0;

	/* the source address is only needed when data is actually copied */
	if (!zero_edge) {
		ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
		if (ret)
			return dax_mem2blk_err(ret);
	}

	if (copy_all) {
		if (zero_edge)
			memset(daddr, 0, size);
		else
			ret = copy_mc_to_kernel(daddr, saddr, length);
		goto out;
	}

	/* Copy the head part of the range */
	if (head_off) {
		if (zero_edge)
			memset(daddr, 0, head_off);
		else {
			ret = copy_mc_to_kernel(daddr, saddr, head_off);
			if (ret)
				return -EIO;
		}
	}

	/* Copy the tail part of the range */
	if (end < pg_end) {
		loff_t tail_off = head_off + length;
		loff_t tail_len = pg_end - end;

		if (zero_edge)
			memset(daddr + tail_off, 0, tail_len);
		else {
			ret = copy_mc_to_kernel(daddr + tail_off,
						saddr + tail_off, tail_len);
			if (ret)
				return -EIO;
		}
	}
out:
	/* only the memset() paths are flushed here */
	if (zero_edge)
		dax_flush(srcmap->dax_dev, daddr, size);
	return ret ? -EIO : 0;
}
1387
1388 /*
1389 * The user has performed a load from a hole in the file. Allocating a new
1390 * page in the file would cause excessive storage usage for workloads with
1391 * sparse files. Instead we insert a read-only mapping of the 4k zero page.
1392 * If this page is ever written to we will re-fault and change the mapping to
1393 * point to real DAX storage instead.
1394 */
dax_load_hole(struct xa_state * xas,struct vm_fault * vmf,const struct iomap_iter * iter,void ** entry)1395 static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1396 const struct iomap_iter *iter, void **entry)
1397 {
1398 struct inode *inode = iter->inode;
1399 unsigned long vaddr = vmf->address;
1400 unsigned long pfn = zero_pfn(vaddr);
1401 vm_fault_t ret;
1402
1403 *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
1404
1405 ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), false);
1406 trace_dax_load_hole(inode, vmf, ret);
1407 return ret;
1408 }
1409
1410 #ifdef CONFIG_FS_DAX_PMD
dax_pmd_load_hole(struct xa_state * xas,struct vm_fault * vmf,const struct iomap_iter * iter,void ** entry)1411 static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1412 const struct iomap_iter *iter, void **entry)
1413 {
1414 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
1415 struct inode *inode = mapping->host;
1416 struct folio *zero_folio;
1417 vm_fault_t ret;
1418
1419 zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
1420
1421 if (unlikely(!zero_folio)) {
1422 trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
1423 return VM_FAULT_FALLBACK;
1424 }
1425
1426 *entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio),
1427 DAX_PMD | DAX_ZERO_PAGE);
1428
1429 ret = vmf_insert_folio_pmd(vmf, zero_folio, false);
1430 if (ret == VM_FAULT_NOPAGE)
1431 trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
1432 return ret;
1433 }
1434 #else
dax_pmd_load_hole(struct xa_state * xas,struct vm_fault * vmf,const struct iomap_iter * iter,void ** entry)1435 static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1436 const struct iomap_iter *iter, void **entry)
1437 {
1438 return VM_FAULT_FALLBACK;
1439 }
1440 #endif /* CONFIG_FS_DAX_PMD */
1441
dax_unshare_iter(struct iomap_iter * iter)1442 static int dax_unshare_iter(struct iomap_iter *iter)
1443 {
1444 struct iomap *iomap = &iter->iomap;
1445 const struct iomap *srcmap = iomap_iter_srcmap(iter);
1446 loff_t copy_pos = iter->pos;
1447 u64 copy_len = iomap_length(iter);
1448 u32 mod;
1449 int id = 0;
1450 s64 ret;
1451 void *daddr = NULL, *saddr = NULL;
1452
1453 if (!iomap_want_unshare_iter(iter))
1454 return iomap_iter_advance_full(iter);
1455
1456 /*
1457 * Extend the file range to be aligned to fsblock/pagesize, because
1458 * we need to copy entire blocks, not just the byte range specified.
1459 * Invalidate the mapping because we're about to CoW.
1460 */
1461 mod = offset_in_page(copy_pos);
1462 if (mod) {
1463 copy_len += mod;
1464 copy_pos -= mod;
1465 }
1466
1467 mod = offset_in_page(copy_pos + copy_len);
1468 if (mod)
1469 copy_len += PAGE_SIZE - mod;
1470
1471 invalidate_inode_pages2_range(iter->inode->i_mapping,
1472 copy_pos >> PAGE_SHIFT,
1473 (copy_pos + copy_len - 1) >> PAGE_SHIFT);
1474
1475 id = dax_read_lock();
1476 ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL);
1477 if (ret < 0)
1478 goto out_unlock;
1479
1480 ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL);
1481 if (ret < 0)
1482 goto out_unlock;
1483
1484 if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0)
1485 ret = -EIO;
1486
1487 out_unlock:
1488 dax_read_unlock(id);
1489 if (ret < 0)
1490 return dax_mem2blk_err(ret);
1491 return iomap_iter_advance_full(iter);
1492 }
1493
/*
 * Unshare the byte range [@pos, @pos + @len) of @inode, clamped to the
 * current file size, by running dax_unshare_iter() over every extent that
 * @ops reports.
 *
 * Returns 0 if @pos is negative or at/after EOF, otherwise the final value
 * returned by iomap_iter() (0 or a negative errno).
 */
int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{
	loff_t isize = i_size_read(inode);
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.flags		= IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
	};
	int ret;

	if (pos < 0 || pos >= isize)
		return 0;

	/* never unshare beyond EOF */
	iter.len = min(len, isize - pos);

	for (;;) {
		ret = iomap_iter(&iter, ops);
		if (ret <= 0)
			break;
		iter.status = dax_unshare_iter(&iter);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);
1514
/*
 * Zero @size bytes at file position @pos inside a single DAX page.
 *
 * @iter: iomap iteration describing the extent that contains @pos
 * @pos:  file position to start zeroing at
 * @size: number of bytes to zero; the caller keeps the range within one page
 *
 * For a shared (IOMAP_F_SHARED) extent the rest of the page is then filled
 * in from the source mapping by dax_iomap_copy_around(); otherwise only the
 * zeroed bytes are flushed.
 *
 * Returns 0 on success or a negative errno.  The positive page count that
 * dax_direct_access() returns on success is deliberately not propagated, so
 * that both success paths report 0.
 */
static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	unsigned offset = offset_in_page(pos);
	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
	void *kaddr;
	long ret;

	ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
				NULL);
	if (ret < 0)
		return dax_mem2blk_err(ret);

	memset(kaddr + offset, 0, size);
	if (iomap->flags & IOMAP_F_SHARED)
		return dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
					     kaddr);

	dax_flush(iomap->dax_dev, kaddr + offset, size);
	return 0;
}
1537
/*
 * Zero the part of the current iomap extent covered by @iter, one page (or
 * partial page) at a time.
 *
 * Extents whose source is a hole or unwritten are skipped, because they
 * already read back as zeroes.  *@did_zero, if non-NULL, is set to true once
 * the whole extent has been zeroed.
 *
 * Returns 0 on success or a negative errno.
 */
static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	u64 remaining = iomap_length(iter);
	int ret;

	/* already zeroed?  we're done. */
	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
		return iomap_iter_advance(iter, remaining);

	/*
	 * invalidate the pages whose sharing state is to be changed
	 * because of CoW.
	 */
	if (iomap->flags & IOMAP_F_SHARED) {
		const pgoff_t first = iter->pos >> PAGE_SHIFT;
		const pgoff_t last = (iter->pos + remaining - 1) >> PAGE_SHIFT;

		invalidate_inode_pages2_range(iter->inode->i_mapping,
					      first, last);
	}

	do {
		const loff_t pos = iter->pos;
		const unsigned int in_page = offset_in_page(pos);
		const pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		/* never cross a page boundary in one step */
		const u64 chunk = min_t(u64, PAGE_SIZE - in_page, remaining);
		int id;

		id = dax_read_lock();
		if (!in_page && chunk == PAGE_SIZE)
			ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
		else
			ret = dax_memzero(iter, pos, chunk);
		dax_read_unlock(id);

		if (ret < 0)
			return ret;

		ret = iomap_iter_advance(iter, chunk);
		if (ret)
			return ret;

		remaining = iomap_length(iter);
	} while (remaining > 0);

	if (did_zero)
		*did_zero = true;
	return 0;
}
1585
/*
 * Zero @len bytes of @inode starting at @pos by running dax_zero_iter()
 * over every extent that @ops reports for the range.
 *
 * @did_zero is passed through to dax_zero_iter().  Returns the final value
 * returned by iomap_iter() (0 or a negative errno).
 */
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.len		= len,
		.flags		= IOMAP_DAX | IOMAP_ZERO,
	};
	int ret;

	for (;;) {
		ret = iomap_iter(&iter, ops);
		if (ret <= 0)
			break;
		iter.status = dax_zero_iter(&iter, did_zero);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);
1602
/*
 * Zero from @pos up to the end of the filesystem block that contains it.
 * If @pos already sits on a block boundary there is nothing to zero and 0
 * is returned; otherwise the result of dax_zero_range() is returned.
 */
int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{
	const unsigned int blocksize = i_blocksize(inode);
	const unsigned int in_block = pos & (blocksize - 1);

	if (in_block)
		return dax_zero_range(inode, pos, blocksize - in_block,
				      did_zero, ops);

	/* Block boundary? Nothing to do */
	return 0;
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
1615
/*
 * Perform the read or write described by @iter against one iomap extent.
 *
 * @iomi: iomap iteration positioned on the extent; advanced by the number
 *        of bytes transferred
 * @iter: user I/O vector; its direction selects read or write
 *
 * Reads are clamped to i_size and reads of holes/unwritten extents are
 * satisfied by zero-filling @iter.  Everything else is copied directly
 * to/from the address returned by dax_direct_access().
 *
 * Returns 0 or a negative errno; progress is reported through @iomi.
 */
static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
{
	const struct iomap *iomap = &iomi->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iomi);
	loff_t length = iomap_length(iomi);
	loff_t pos = iomi->pos;
	struct dax_device *dax_dev = iomap->dax_dev;
	loff_t end = pos + length, done = 0;
	bool write = iov_iter_rw(iter) == WRITE;
	bool cow = write && iomap->flags & IOMAP_F_SHARED;
	ssize_t ret = 0;
	size_t xfer;
	int id;

	if (!write) {
		/* reads stop at EOF */
		end = min(end, i_size_read(iomi->inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
			done = iov_iter_zero(min(length, end - pos), iter);
			return iomap_iter_advance(iomi, done);
		}
	}

	/*
	 * In DAX mode, enforce either pure overwrites of written extents, or
	 * writes to unwritten extents as part of a copy-on-write operation.
	 */
	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
			!(iomap->flags & IOMAP_F_SHARED)))
		return -EIO;

	/*
	 * Write can allocate block for an area which has a hole page mapped
	 * into page tables. We have to tear down these mappings so that data
	 * written by write(2) is visible in mmap.
	 */
	if (iomap->flags & IOMAP_F_NEW || cow) {
		/*
		 * Filesystem allows CoW on non-shared extents. The src extents
		 * may have been mmapped with dirty mark before. To be able to
		 * invalidate its dax entries, we need to clear the dirty mark
		 * in advance.
		 */
		if (cow)
			__dax_clear_dirty_range(iomi->inode->i_mapping,
						pos >> PAGE_SHIFT,
						(end - 1) >> PAGE_SHIFT);
		invalidate_inode_pages2_range(iomi->inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}

	id = dax_read_lock();
	while ((pos = iomi->pos) < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		const size_t size = ALIGN(length + offset, PAGE_SIZE);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		ssize_t map_len;
		bool recovery = false;
		void *kaddr;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
				DAX_ACCESS, &kaddr, NULL);
		/* for writes, retry a poisoned range in recovery-write mode */
		if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
			map_len = dax_direct_access(dax_dev, pgoff,
					PHYS_PFN(size), DAX_RECOVERY_WRITE,
					&kaddr, NULL);
			if (map_len > 0)
				recovery = true;
		}
		if (map_len < 0) {
			ret = dax_mem2blk_err(map_len);
			break;
		}

		/* CoW write: fill in the unaligned head/tail of the pages first */
		if (cow) {
			ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
						    srcmap, kaddr);
			if (ret)
				break;
		}

		/* convert pages to bytes, skip to @pos, and clamp to @end */
		map_len = PFN_PHYS(map_len);
		kaddr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (recovery)
			xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
					map_len, iter);
		else if (write)
			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
					map_len, iter);
		else
			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
					map_len, iter);

		ret = iomap_iter_advance(iomi, xfer);
		/* no progress at all is reported as a fault on the user buffer */
		if (!ret && xfer == 0)
			ret = -EFAULT;
		/* short copy: stop here */
		if (xfer < map_len)
			break;
		length = iomap_length(iomi);
	}
	dax_read_unlock(id);

	return ret;
}
1732
1733 /**
1734 * dax_iomap_rw - Perform I/O to a DAX file
1735 * @iocb: The control block for this I/O
1736 * @iter: The addresses to do I/O from or to
1737 * @ops: iomap ops passed from the file system
1738 *
1739 * This function performs read and write operations to directly mapped
1740 * persistent memory. The callers needs to take care of read/write exclusion
1741 * and evicting any page cache pages in the region under I/O.
1742 */
1743 ssize_t
dax_iomap_rw(struct kiocb * iocb,struct iov_iter * iter,const struct iomap_ops * ops)1744 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
1745 const struct iomap_ops *ops)
1746 {
1747 struct iomap_iter iomi = {
1748 .inode = iocb->ki_filp->f_mapping->host,
1749 .pos = iocb->ki_pos,
1750 .len = iov_iter_count(iter),
1751 .flags = IOMAP_DAX,
1752 };
1753 loff_t done = 0;
1754 int ret;
1755
1756 if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC))
1757 return -EIO;
1758
1759 if (!iomi.len)
1760 return 0;
1761
1762 if (iov_iter_rw(iter) == WRITE) {
1763 lockdep_assert_held_write(&iomi.inode->i_rwsem);
1764 iomi.flags |= IOMAP_WRITE;
1765 } else if (!sb_rdonly(iomi.inode->i_sb)) {
1766 lockdep_assert_held(&iomi.inode->i_rwsem);
1767 }
1768
1769 if (iocb->ki_flags & IOCB_NOWAIT)
1770 iomi.flags |= IOMAP_NOWAIT;
1771
1772 while ((ret = iomap_iter(&iomi, ops)) > 0)
1773 iomi.status = dax_iomap_iter(&iomi, iter);
1774
1775 done = iomi.pos - iocb->ki_pos;
1776 iocb->ki_pos = iomi.pos;
1777 return done ? done : ret;
1778 }
1779 EXPORT_SYMBOL_GPL(dax_iomap_rw);
1780
dax_fault_return(int error)1781 static vm_fault_t dax_fault_return(int error)
1782 {
1783 if (error == 0)
1784 return VM_FAULT_NOPAGE;
1785 return vmf_error(error);
1786 }
1787
1788 /*
1789 * When handling a synchronous page fault and the inode need a fsync, we can
1790 * insert the PTE/PMD into page tables only after that fsync happened. Skip
1791 * insertion for now and return the pfn so that caller can insert it after the
1792 * fsync is done.
1793 */
dax_fault_synchronous_pfnp(unsigned long * pfnp,unsigned long pfn)1794 static vm_fault_t dax_fault_synchronous_pfnp(unsigned long *pfnp,
1795 unsigned long pfn)
1796 {
1797 if (WARN_ON_ONCE(!pfnp))
1798 return VM_FAULT_SIGBUS;
1799 *pfnp = pfn;
1800 return VM_FAULT_NEEDDSYNC;
1801 }
1802
dax_fault_cow_page(struct vm_fault * vmf,const struct iomap_iter * iter)1803 static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
1804 const struct iomap_iter *iter)
1805 {
1806 vm_fault_t ret;
1807 int error = 0;
1808
1809 switch (iter->iomap.type) {
1810 case IOMAP_HOLE:
1811 case IOMAP_UNWRITTEN:
1812 clear_user_highpage(vmf->cow_page, vmf->address);
1813 break;
1814 case IOMAP_MAPPED:
1815 error = copy_cow_page_dax(vmf, iter);
1816 break;
1817 default:
1818 WARN_ON_ONCE(1);
1819 error = -EIO;
1820 break;
1821 }
1822
1823 if (error)
1824 return dax_fault_return(error);
1825
1826 __SetPageUptodate(vmf->cow_page);
1827 ret = finish_fault(vmf);
1828 if (!ret)
1829 return VM_FAULT_DONE_COW;
1830 return ret;
1831 }
1832
1833 /**
1834 * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
1835 * @vmf: vm fault instance
1836 * @iter: iomap iter
1837 * @pfnp: pfn to be returned
1838 * @xas: the dax mapping tree of a file
1839 * @entry: an unlocked dax entry to be inserted
1840 * @pmd: distinguish whether it is a pmd fault
1841 */
dax_fault_iter(struct vm_fault * vmf,const struct iomap_iter * iter,unsigned long * pfnp,struct xa_state * xas,void ** entry,bool pmd)1842 static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
1843 const struct iomap_iter *iter, unsigned long *pfnp,
1844 struct xa_state *xas, void **entry, bool pmd)
1845 {
1846 const struct iomap *iomap = &iter->iomap;
1847 const struct iomap *srcmap = iomap_iter_srcmap(iter);
1848 size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
1849 loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
1850 bool write = iter->flags & IOMAP_WRITE;
1851 unsigned long entry_flags = pmd ? DAX_PMD : 0;
1852 struct folio *folio;
1853 int ret, err = 0;
1854 unsigned long pfn;
1855 void *kaddr;
1856
1857 if (!pmd && vmf->cow_page)
1858 return dax_fault_cow_page(vmf, iter);
1859
1860 /* if we are reading UNWRITTEN and HOLE, return a hole. */
1861 if (!write &&
1862 (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
1863 if (!pmd)
1864 return dax_load_hole(xas, vmf, iter, entry);
1865 return dax_pmd_load_hole(xas, vmf, iter, entry);
1866 }
1867
1868 if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
1869 WARN_ON_ONCE(1);
1870 return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
1871 }
1872
1873 err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
1874 if (err)
1875 return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
1876
1877 *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
1878
1879 if (write && iomap->flags & IOMAP_F_SHARED) {
1880 err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
1881 if (err)
1882 return dax_fault_return(err);
1883 }
1884
1885 folio = dax_to_folio(*entry);
1886 if (dax_fault_is_synchronous(iter, vmf->vma))
1887 return dax_fault_synchronous_pfnp(pfnp, pfn);
1888
1889 folio_ref_inc(folio);
1890 if (pmd)
1891 ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write);
1892 else
1893 ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write);
1894 folio_put(folio);
1895
1896 return ret;
1897 }
1898
/*
 * Handle a PTE-sized DAX fault.
 *
 * @vmf:        the fault being handled
 * @pfnp:       passed to dax_fault_iter(); receives the pfn for synchronous
 *              faults
 * @iomap_errp: if non-NULL, receives the final iomap_iter() return value
 * @ops:        filesystem iomap operations
 *
 * The mapping entry for vmf->pgoff is obtained (locked) with
 * grab_mapping_entry() and released with dax_unlock_entry() before
 * returning, except when grabbing it failed.
 *
 * Returns a vm_fault_t code.
 */
static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp,
			       int *iomap_errp, const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
	struct iomap_iter iter = {
		.inode		= mapping->host,
		.pos		= (loff_t)vmf->pgoff << PAGE_SHIFT,
		.len		= PAGE_SIZE,
		.flags		= IOMAP_DAX | IOMAP_FAULT,
	};
	vm_fault_t ret = 0;
	void *entry;
	int error;

	trace_dax_pte_fault(iter.inode, vmf, ret);
	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (iter.pos >= i_size_read(iter.inode)) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	/* a write fault with a cow_page only reads from the file */
	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		iter.flags |= IOMAP_WRITE;

	entry = grab_mapping_entry(&xas, mapping, 0);
	/* an internal entry encodes a vm_fault_t error instead of an entry */
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);
		goto out;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PMD fault that overlaps with
	 * the PTE we need to set up.  If so just return and the fault will be
	 * retried.
	 */
	if (pmd_trans_huge(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto unlock_entry;
	}

	while ((error = iomap_iter(&iter, ops)) > 0) {
		if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
			iter.status = -EIO;	/* fs corruption? */
			continue;
		}

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
		/* a newly allocated block makes this a major fault */
		if (ret != VM_FAULT_SIGBUS &&
		    (iter.iomap.flags & IOMAP_F_NEW)) {
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
			ret |= VM_FAULT_MAJOR;
		}

		if (!(ret & VM_FAULT_ERROR))
			iter.status = iomap_iter_advance(&iter, PAGE_SIZE);
	}

	if (iomap_errp)
		*iomap_errp = error;
	/* only let the iomap error decide when no fault code was produced */
	if (!ret && error)
		ret = dax_fault_return(error);

unlock_entry:
	dax_unlock_entry(&xas, entry);
out:
	trace_dax_pte_fault_done(iter.inode, vmf, ret);
	return ret;
}
1974
1975 #ifdef CONFIG_FS_DAX_PMD
dax_fault_check_fallback(struct vm_fault * vmf,struct xa_state * xas,pgoff_t max_pgoff)1976 static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
1977 pgoff_t max_pgoff)
1978 {
1979 unsigned long pmd_addr = vmf->address & PMD_MASK;
1980 bool write = vmf->flags & FAULT_FLAG_WRITE;
1981
1982 /*
1983 * Make sure that the faulting address's PMD offset (color) matches
1984 * the PMD offset from the start of the file. This is necessary so
1985 * that a PMD range in the page table overlaps exactly with a PMD
1986 * range in the page cache.
1987 */
1988 if ((vmf->pgoff & PG_PMD_COLOUR) !=
1989 ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
1990 return true;
1991
1992 /* Fall back to PTEs if we're going to COW */
1993 if (write && !(vmf->vma->vm_flags & VM_SHARED))
1994 return true;
1995
1996 /* If the PMD would extend outside the VMA */
1997 if (pmd_addr < vmf->vma->vm_start)
1998 return true;
1999 if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
2000 return true;
2001
2002 /* If the PMD would extend beyond the file size */
2003 if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
2004 return true;
2005
2006 return false;
2007 }
2008
dax_iomap_pmd_fault(struct vm_fault * vmf,unsigned long * pfnp,const struct iomap_ops * ops)2009 static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
2010 const struct iomap_ops *ops)
2011 {
2012 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
2013 XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
2014 struct iomap_iter iter = {
2015 .inode = mapping->host,
2016 .len = PMD_SIZE,
2017 .flags = IOMAP_DAX | IOMAP_FAULT,
2018 };
2019 vm_fault_t ret = VM_FAULT_FALLBACK;
2020 pgoff_t max_pgoff;
2021 void *entry;
2022
2023 if (vmf->flags & FAULT_FLAG_WRITE)
2024 iter.flags |= IOMAP_WRITE;
2025
2026 /*
2027 * Check whether offset isn't beyond end of file now. Caller is
2028 * supposed to hold locks serializing us with truncate / punch hole so
2029 * this is a reliable test.
2030 */
2031 max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);
2032
2033 trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);
2034
2035 if (xas.xa_index >= max_pgoff) {
2036 ret = VM_FAULT_SIGBUS;
2037 goto out;
2038 }
2039
2040 if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
2041 goto fallback;
2042
2043 /*
2044 * grab_mapping_entry() will make sure we get an empty PMD entry,
2045 * a zero PMD entry or a DAX PMD. If it can't (because a PTE
2046 * entry is already in the array, for instance), it will return
2047 * VM_FAULT_FALLBACK.
2048 */
2049 entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
2050 if (xa_is_internal(entry)) {
2051 ret = xa_to_internal(entry);
2052 goto fallback;
2053 }
2054
2055 /*
2056 * It is possible, particularly with mixed reads & writes to private
2057 * mappings, that we have raced with a PTE fault that overlaps with
2058 * the PMD we need to set up. If so just return and the fault will be
2059 * retried.
2060 */
2061 if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) {
2062 ret = 0;
2063 goto unlock_entry;
2064 }
2065
2066 iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
2067 while (iomap_iter(&iter, ops) > 0) {
2068 if (iomap_length(&iter) < PMD_SIZE)
2069 continue; /* actually breaks out of the loop */
2070
2071 ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
2072 if (ret != VM_FAULT_FALLBACK)
2073 iter.status = iomap_iter_advance(&iter, PMD_SIZE);
2074 }
2075
2076 unlock_entry:
2077 dax_unlock_entry(&xas, entry);
2078 fallback:
2079 if (ret == VM_FAULT_FALLBACK) {
2080 split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
2081 count_vm_event(THP_FAULT_FALLBACK);
2082 }
2083 out:
2084 trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
2085 return ret;
2086 }
2087 #else
dax_iomap_pmd_fault(struct vm_fault * vmf,unsigned long * pfnp,const struct iomap_ops * ops)2088 static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
2089 const struct iomap_ops *ops)
2090 {
2091 return VM_FAULT_FALLBACK;
2092 }
2093 #endif /* CONFIG_FS_DAX_PMD */
2094
2095 /**
2096 * dax_iomap_fault - handle a page fault on a DAX file
2097 * @vmf: The description of the fault
2098 * @order: Order of the page to fault in
2099 * @pfnp: PFN to insert for synchronous faults if fsync is required
2100 * @iomap_errp: Storage for detailed error code in case of error
2101 * @ops: Iomap ops passed from the file system
2102 *
2103 * When a page fault occurs, filesystems may call this helper in
2104 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
2105 * has done all the necessary locking for page fault to proceed
2106 * successfully.
2107 */
dax_iomap_fault(struct vm_fault * vmf,unsigned int order,unsigned long * pfnp,int * iomap_errp,const struct iomap_ops * ops)2108 vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
2109 unsigned long *pfnp, int *iomap_errp,
2110 const struct iomap_ops *ops)
2111 {
2112 if (order == 0)
2113 return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
2114 else if (order == PMD_ORDER)
2115 return dax_iomap_pmd_fault(vmf, pfnp, ops);
2116 else
2117 return VM_FAULT_FALLBACK;
2118 }
2119 EXPORT_SYMBOL_GPL(dax_iomap_fault);
2120
2121 /*
2122 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
2123 * @vmf: The description of the fault
2124 * @pfn: PFN to insert
2125 * @order: Order of entry to insert.
2126 *
2127 * This function inserts a writeable PTE or PMD entry into the page tables
2128 * for an mmaped DAX file. It also marks the page cache entry as dirty.
2129 */
dax_insert_pfn_mkwrite(struct vm_fault * vmf,unsigned long pfn,unsigned int order)2130 static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
2131 unsigned long pfn, unsigned int order)
2132 {
2133 struct address_space *mapping = vmf->vma->vm_file->f_mapping;
2134 XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
2135 struct folio *folio;
2136 void *entry;
2137 vm_fault_t ret;
2138
2139 xas_lock_irq(&xas);
2140 entry = get_next_unlocked_entry(&xas, order);
2141 /* Did we race with someone splitting entry or so? */
2142 if (!entry || dax_is_conflict(entry) ||
2143 (order == 0 && !dax_is_pte_entry(entry))) {
2144 put_unlocked_entry(&xas, entry, WAKE_NEXT);
2145 xas_unlock_irq(&xas);
2146 trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
2147 VM_FAULT_NOPAGE);
2148 return VM_FAULT_NOPAGE;
2149 }
2150 xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
2151 dax_lock_entry(&xas, entry);
2152 xas_unlock_irq(&xas);
2153 folio = pfn_folio(pfn);
2154 folio_ref_inc(folio);
2155 if (order == 0)
2156 ret = vmf_insert_page_mkwrite(vmf, &folio->page, true);
2157 #ifdef CONFIG_FS_DAX_PMD
2158 else if (order == PMD_ORDER)
2159 ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE);
2160 #endif
2161 else
2162 ret = VM_FAULT_FALLBACK;
2163 folio_put(folio);
2164 dax_unlock_entry(&xas, entry);
2165 trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
2166 return ret;
2167 }
2168
2169 /**
2170 * dax_finish_sync_fault - finish synchronous page fault
2171 * @vmf: The description of the fault
2172 * @order: Order of entry to be inserted
2173 * @pfn: PFN to insert
2174 *
2175 * This function ensures that the file range touched by the page fault is
2176 * stored persistently on the media and handles inserting of appropriate page
2177 * table entry.
2178 */
dax_finish_sync_fault(struct vm_fault * vmf,unsigned int order,unsigned long pfn)2179 vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
2180 unsigned long pfn)
2181 {
2182 int err;
2183 loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
2184 size_t len = PAGE_SIZE << order;
2185
2186 err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
2187 if (err)
2188 return VM_FAULT_SIGBUS;
2189 return dax_insert_pfn_mkwrite(vmf, pfn, order);
2190 }
2191 EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
2192
/*
 * Compare the data behind the current mappings of two iomap iterators.
 *
 * @it_src:  iterator positioned on the source range
 * @it_dest: iterator positioned on the destination range
 * @len:     upper bound on the number of bytes to compare; it is clamped to
 *           the length of both current mappings
 * @same:    set to true when the compared bytes match (or both mappings are
 *           holes), false otherwise
 *
 * Returns 0 without advancing when exactly one side is a hole, -EIO when
 * either mapping cannot be accessed, and otherwise the result of advancing
 * both iterators (by @len on a match, by 0 on a mismatch).
 */
static int dax_range_compare_iter(struct iomap_iter *it_src,
		struct iomap_iter *it_dest, u64 len, bool *same)
{
	const struct iomap *src_map = &it_src->iomap;
	const struct iomap *dst_map = &it_dest->iomap;
	const bool src_hole = src_map->type == IOMAP_HOLE;
	const bool dst_hole = dst_map->type == IOMAP_HOLE;
	loff_t src_pos = it_src->pos, dst_pos = it_dest->pos;
	void *src_addr, *dst_addr;
	int lock_id, err;

	len = min(len, min(src_map->length, dst_map->length));

	/* A hole on one side only can never match data on the other. */
	if (src_hole != dst_hole) {
		*same = false;
		return 0;
	}

	if (src_hole) {
		/* Hole against hole: equal, nothing to read. */
		*same = true;
	} else {
		lock_id = dax_read_lock();

		err = dax_iomap_direct_access(src_map, src_pos,
					      ALIGN(src_pos + len, PAGE_SIZE),
					      &src_addr, NULL);
		if (err >= 0)
			err = dax_iomap_direct_access(dst_map, dst_pos,
					ALIGN(dst_pos + len, PAGE_SIZE),
					&dst_addr, NULL);
		if (err < 0) {
			/* Any access failure is reported as -EIO. */
			dax_read_unlock(lock_id);
			return -EIO;
		}

		*same = !memcmp(src_addr, dst_addr, len);
		/* On a mismatch the iterators are advanced by nothing. */
		if (!*same)
			len = 0;

		dax_read_unlock(lock_id);
	}

	err = iomap_iter_advance(it_src, len);
	if (err)
		return err;
	return iomap_iter_advance(it_dest, len);
}
2240
/*
 * Compare @len bytes of @src starting at @srcoff with @len bytes of @dst
 * starting at @dstoff, walking both files in lock step with @ops.
 *
 * @same is written by dax_range_compare_iter() for every pair of mappings
 * that gets compared.
 *
 * Returns the final iomap_iter() result (0 once iteration is complete, a
 * negative errno if iteration failed), or the negative errno reported by
 * dax_range_compare_iter().
 */
int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
		struct inode *dst, loff_t dstoff, loff_t len, bool *same,
		const struct iomap_ops *ops)
{
	struct iomap_iter src_iter = {
		.inode		= src,
		.pos		= srcoff,
		.len		= len,
		.flags		= IOMAP_DAX,
	};
	struct iomap_iter dst_iter = {
		.inode		= dst,
		.pos		= dstoff,
		.len		= len,
		.flags		= IOMAP_DAX,
	};
	int ret, status;

	while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
	       (ret = iomap_iter(&dst_iter, ops)) > 0) {
		status = dax_range_compare_iter(&src_iter, &dst_iter,
				min(src_iter.len, dst_iter.len), same);
		/*
		 * Inside the loop body ret is the positive value from the
		 * iomap_iter() call that let us in, so the error to report
		 * is status, not ret.
		 */
		if (status < 0)
			return status;
		src_iter.status = dst_iter.status = status;
	}
	return ret;
}
2269
/*
 * DAX entry point for the remap-range preparation step: every argument,
 * including the filesystem's iomap @ops, is passed on to
 * __generic_remap_file_range_prep() and its result is returned unchanged.
 */
int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
			      struct file *file_out, loff_t pos_out,
			      loff_t *len, unsigned int remap_flags,
			      const struct iomap_ops *ops)
{
	return __generic_remap_file_range_prep(file_in, pos_in,
					       file_out, pos_out,
					       len, remap_flags, ops);
}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
2279