xref: /linux/mm/memfd_luo.c (revision 0fc8f6200d2313278fbf4539bbab74677c685531)
1b3749f17SPratyush Yadav // SPDX-License-Identifier: GPL-2.0
2b3749f17SPratyush Yadav 
3b3749f17SPratyush Yadav /*
4b3749f17SPratyush Yadav  * Copyright (c) 2025, Google LLC.
5b3749f17SPratyush Yadav  * Pasha Tatashin <pasha.tatashin@soleen.com>
6b3749f17SPratyush Yadav  *
7b3749f17SPratyush Yadav  * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
8b3749f17SPratyush Yadav  * Pratyush Yadav <ptyadav@amazon.de>
9b3749f17SPratyush Yadav  */
10b3749f17SPratyush Yadav 
11b3749f17SPratyush Yadav /**
12b3749f17SPratyush Yadav  * DOC: Memfd Preservation via LUO
13b3749f17SPratyush Yadav  *
14b3749f17SPratyush Yadav  * Overview
15b3749f17SPratyush Yadav  * ========
16b3749f17SPratyush Yadav  *
17b3749f17SPratyush Yadav  * Memory file descriptors (memfd) can be preserved over a kexec using the Live
18b3749f17SPratyush Yadav  * Update Orchestrator (LUO) file preservation. This allows userspace to
19b3749f17SPratyush Yadav  * transfer its memory contents to the next kernel after a kexec.
20b3749f17SPratyush Yadav  *
21b3749f17SPratyush Yadav  * The preservation is not intended to be transparent. Only select properties of
22b3749f17SPratyush Yadav  * the file are preserved. All others are reset to default. The preserved
23b3749f17SPratyush Yadav  * properties are described below.
24b3749f17SPratyush Yadav  *
25b3749f17SPratyush Yadav  * .. note::
26b3749f17SPratyush Yadav  *    The LUO API is not stabilized yet, so the preserved properties of a memfd
27b3749f17SPratyush Yadav  *    are also not stable and are subject to backwards incompatible changes.
28b3749f17SPratyush Yadav  *
29b3749f17SPratyush Yadav  * .. note::
30b3749f17SPratyush Yadav  *    Currently a memfd backed by Hugetlb is not supported. Memfds created
31b3749f17SPratyush Yadav  *    with ``MFD_HUGETLB`` will be rejected.
32b3749f17SPratyush Yadav  *
33b3749f17SPratyush Yadav  * Preserved Properties
34b3749f17SPratyush Yadav  * ====================
35b3749f17SPratyush Yadav  *
36b3749f17SPratyush Yadav  * The following properties of the memfd are preserved across kexec:
37b3749f17SPratyush Yadav  *
38b3749f17SPratyush Yadav  * File Contents
39b3749f17SPratyush Yadav  *   All data stored in the file is preserved.
40b3749f17SPratyush Yadav  *
41b3749f17SPratyush Yadav  * File Size
42b3749f17SPratyush Yadav  *   The size of the file is preserved. Holes in the file are filled by
43b3749f17SPratyush Yadav  *   allocating pages for them during preservation.
44b3749f17SPratyush Yadav  *
45b3749f17SPratyush Yadav  * File Position
46b3749f17SPratyush Yadav  *   The current file position is preserved, allowing applications to continue
47b3749f17SPratyush Yadav  *   reading/writing from their last position.
48b3749f17SPratyush Yadav  *
49b3749f17SPratyush Yadav  * File Status Flags
50b3749f17SPratyush Yadav  *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property
51b3749f17SPratyush Yadav  *   is maintained.
52b3749f17SPratyush Yadav  *
53b3749f17SPratyush Yadav  * Non-Preserved Properties
54b3749f17SPratyush Yadav  * ========================
55b3749f17SPratyush Yadav  *
56b3749f17SPratyush Yadav  * All properties which are not preserved must be assumed to be reset to
57b3749f17SPratyush Yadav  * default. This section describes some of those properties which may be more of
58b3749f17SPratyush Yadav  * note.
59b3749f17SPratyush Yadav  *
60b3749f17SPratyush Yadav  * ``FD_CLOEXEC`` flag
61b3749f17SPratyush Yadav  *   A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the
62b3749f17SPratyush Yadav  *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
63b3749f17SPratyush Yadav  *   again after restore via ``fcntl()``.
64b3749f17SPratyush Yadav  *
 * Seals
 *   Unlike the other properties in this section, file seals are preserved:
 *   the seals set on the file at preservation time are added back to the file
 *   on restore. Preservation is refused if the file carries a seal that this
 *   version does not support.
68b3749f17SPratyush Yadav  */
69b3749f17SPratyush Yadav 
70b3749f17SPratyush Yadav #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
71b3749f17SPratyush Yadav 
72b3749f17SPratyush Yadav #include <linux/bits.h>
73b3749f17SPratyush Yadav #include <linux/err.h>
74b3749f17SPratyush Yadav #include <linux/file.h>
75b3749f17SPratyush Yadav #include <linux/io.h>
76b3749f17SPratyush Yadav #include <linux/kexec_handover.h>
77b3749f17SPratyush Yadav #include <linux/kho/abi/memfd.h>
78b3749f17SPratyush Yadav #include <linux/liveupdate.h>
79b3749f17SPratyush Yadav #include <linux/shmem_fs.h>
80b3749f17SPratyush Yadav #include <linux/vmalloc.h>
8102e117b8SPratyush Yadav (Google) #include <linux/memfd.h>
828a552d68SPratyush Yadav (Google) #include <uapi/linux/memfd.h>
838a552d68SPratyush Yadav (Google) 
84b3749f17SPratyush Yadav #include "internal.h"
85b3749f17SPratyush Yadav 
/*
 * Bring a pinned folio into the state recorded in its serialized entry:
 * dirty and uptodate. The folio lock is held around the update.
 */
static void memfd_luo_prepare_folio(struct folio *folio)
{
	folio_lock(folio);

	/*
	 * A clean folio carries no user data and may be dropped by reclaim.
	 * The dirty bit cannot be sampled reliably: it may change after
	 * preserve, and at freeze() there may still be a mapping whose dirty
	 * state has not been synced yet. If a folio were recorded as clean and
	 * dirtied afterwards, the next kernel could reclaim it and lose user
	 * data.
	 *
	 * So mark every folio dirty unconditionally. The price is that
	 * genuinely clean folios become un-reclaimable after live update.
	 */
	folio_mark_dirty(folio);

	/*
	 * A folio that is not uptodate was fallocated but never used. That
	 * state can also change after preserve, so it cannot simply be
	 * recorded. Having already paid for allocating and pinning folios for
	 * every hole, pay a little more and zero such folios now.
	 *
	 * NOTE: this is a good place to look when optimizing preserve time.
	 */
	if (!folio_test_uptodate(folio)) {
		folio_zero_range(folio, 0, folio_size(folio));
		flush_dcache_folio(folio);
		folio_mark_uptodate(folio);
	}

	folio_unlock(folio);
}

/*
 * Pin and KHO-preserve every folio backing @file and build the serialized
 * folio table describing them.
 *
 * @file:           file whose folios are preserved.
 * @kho_vmalloc:    KHO descriptor filled in for the serialized folio table.
 * @out_folios_ser: set to the vmalloc'ed folio table (NULL for an empty file).
 * @nr_foliosp:     set to the number of entries in the table.
 *
 * Return: 0 on success, a negative errno otherwise. On failure everything
 * acquired here is released again.
 */
static int memfd_luo_preserve_folios(struct file *file,
				     struct kho_vmalloc *kho_vmalloc,
				     struct memfd_luo_folio_ser **out_folios_ser,
				     u64 *nr_foliosp)
{
	struct inode *inode = file_inode(file);
	struct memfd_luo_folio_ser *ser_tbl;
	struct folio **pinned;
	unsigned int max_folios;
	long idx, bytes, pin_ret;
	pgoff_t offset;
	u64 count;
	int err = -EINVAL;

	bytes = i_size_read(inode);

	/* An empty file has no folio table at all. */
	if (!bytes) {
		*nr_foliosp = 0;
		*out_folios_ser = NULL;
		return 0;
	}

	/*
	 * Upper bound: one folio per page of file size. Higher order folios
	 * make the real count smaller.
	 */
	max_folios = PAGE_ALIGN(bytes) / PAGE_SIZE;
	pinned = kvmalloc_objs(*pinned, max_folios);
	if (!pinned)
		return -ENOMEM;

	/*
	 * Pinning keeps the folios from moving around behind our back. It also
	 * ensures none of them are in CMA, and therefore none fall in KHO
	 * scratch memory, and it brings swapped out folios back to memory.
	 *
	 * As a side effect a folio gets allocated for every index of the file,
	 * which may waste memory on sparse memfds. Should that become a real
	 * problem, a memfd_pin_folios() variant that leaves empty slots alone
	 * could be used instead.
	 */
	pin_ret = memfd_pin_folios(file, 0, bytes - 1, pinned, max_folios,
				   &offset);
	if (pin_ret < 0) {
		err = pin_ret;
		pr_err("failed to pin folios: %d\n", err);
		goto err_free_folios;
	}
	count = pin_ret;

	ser_tbl = vcalloc(count, sizeof(*ser_tbl));
	if (!ser_tbl) {
		err = -ENOMEM;
		goto err_unpin;
	}

	for (idx = 0; idx < count; idx++) {
		struct folio *folio = pinned[idx];

		err = kho_preserve_folio(folio);
		if (err)
			goto err_unpreserve;

		memfd_luo_prepare_folio(folio);

		ser_tbl[idx].pfn = folio_pfn(folio);
		ser_tbl[idx].flags = MEMFD_LUO_FOLIO_DIRTY | MEMFD_LUO_FOLIO_UPTODATE;
		ser_tbl[idx].index = folio->index;
	}

	err = kho_preserve_vmalloc(ser_tbl, kho_vmalloc);
	if (err)
		goto err_unpreserve;

	kvfree(pinned);
	*nr_foliosp = count;
	*out_folios_ser = ser_tbl;

	/*
	 * The folio table is deliberately kept: it is KHO-preserved memory.
	 * The unpreserve path gets this vmap pointer back via private_data.
	 */
	return 0;

err_unpreserve:
	/* idx is one past the last folio that was successfully preserved. */
	while (--idx >= 0)
		kho_unpreserve_folio(pinned[idx]);
	vfree(ser_tbl);
err_unpin:
	unpin_folios(pinned, count);
err_free_folios:
	kvfree(pinned);

	return err;
}
229b3749f17SPratyush Yadav 
/*
 * Undo memfd_luo_preserve_folios(): drop the KHO preservation of the folio
 * table and of every folio it lists, unpin those folios, and free the table.
 *
 * @kho_vmalloc: KHO descriptor of the serialized folio table.
 * @folios_ser:  the folio table itself.
 * @nr_folios:   number of entries in @folios_ser; nothing is done when zero.
 */
static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
					struct memfd_luo_folio_ser *folios_ser,
					u64 nr_folios)
{
	const struct memfd_luo_folio_ser *entry, *end;

	if (!nr_folios)
		return;

	kho_unpreserve_vmalloc(kho_vmalloc);

	end = folios_ser + nr_folios;
	for (entry = folios_ser; entry < end; entry++) {
		struct folio *folio;

		/* Entries without a pfn describe no folio. */
		if (entry->pfn) {
			folio = pfn_folio(entry->pfn);

			kho_unpreserve_folio(folio);
			unpin_folio(folio);
		}
	}

	vfree(folios_ser);
}
256b3749f17SPratyush Yadav 
memfd_luo_preserve(struct liveupdate_file_op_args * args)257b3749f17SPratyush Yadav static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
258b3749f17SPratyush Yadav {
259b3749f17SPratyush Yadav 	struct inode *inode = file_inode(args->file);
260b3749f17SPratyush Yadav 	struct memfd_luo_folio_ser *folios_ser;
261b3749f17SPratyush Yadav 	struct memfd_luo_ser *ser;
262b3749f17SPratyush Yadav 	u64 nr_folios;
2638a552d68SPratyush Yadav (Google) 	int err = 0, seals;
264b3749f17SPratyush Yadav 
265b3749f17SPratyush Yadav 	inode_lock(inode);
266b3749f17SPratyush Yadav 	shmem_freeze(inode, true);
267b3749f17SPratyush Yadav 
268b3749f17SPratyush Yadav 	/* Allocate the main serialization structure in preserved memory */
269b3749f17SPratyush Yadav 	ser = kho_alloc_preserve(sizeof(*ser));
270b3749f17SPratyush Yadav 	if (IS_ERR(ser)) {
271b3749f17SPratyush Yadav 		err = PTR_ERR(ser);
272b3749f17SPratyush Yadav 		goto err_unlock;
273b3749f17SPratyush Yadav 	}
274b3749f17SPratyush Yadav 
2758a552d68SPratyush Yadav (Google) 	seals = memfd_get_seals(args->file);
2768a552d68SPratyush Yadav (Google) 	if (seals < 0) {
2778a552d68SPratyush Yadav (Google) 		err = seals;
2788a552d68SPratyush Yadav (Google) 		goto err_free_ser;
2798a552d68SPratyush Yadav (Google) 	}
2808a552d68SPratyush Yadav (Google) 
2818a552d68SPratyush Yadav (Google) 	/* Make sure the file only has the seals supported by this version. */
2828a552d68SPratyush Yadav (Google) 	if (seals & ~MEMFD_LUO_ALL_SEALS) {
2838a552d68SPratyush Yadav (Google) 		err = -EOPNOTSUPP;
2848a552d68SPratyush Yadav (Google) 		goto err_free_ser;
2858a552d68SPratyush Yadav (Google) 	}
2868a552d68SPratyush Yadav (Google) 
287b3749f17SPratyush Yadav 	ser->pos = args->file->f_pos;
288b3749f17SPratyush Yadav 	ser->size = i_size_read(inode);
2898a552d68SPratyush Yadav (Google) 	ser->seals = seals;
290b3749f17SPratyush Yadav 
291b3749f17SPratyush Yadav 	err = memfd_luo_preserve_folios(args->file, &ser->folios,
292b3749f17SPratyush Yadav 					&folios_ser, &nr_folios);
293b3749f17SPratyush Yadav 	if (err)
294b3749f17SPratyush Yadav 		goto err_free_ser;
295b3749f17SPratyush Yadav 
296b3749f17SPratyush Yadav 	ser->nr_folios = nr_folios;
297b3749f17SPratyush Yadav 	inode_unlock(inode);
298b3749f17SPratyush Yadav 
299b3749f17SPratyush Yadav 	args->private_data = folios_ser;
300b3749f17SPratyush Yadav 	args->serialized_data = virt_to_phys(ser);
301b3749f17SPratyush Yadav 
302b3749f17SPratyush Yadav 	return 0;
303b3749f17SPratyush Yadav 
304b3749f17SPratyush Yadav err_free_ser:
305b3749f17SPratyush Yadav 	kho_unpreserve_free(ser);
306b3749f17SPratyush Yadav err_unlock:
307b3749f17SPratyush Yadav 	shmem_freeze(inode, false);
308b3749f17SPratyush Yadav 	inode_unlock(inode);
309b3749f17SPratyush Yadav 	return err;
310b3749f17SPratyush Yadav }
311b3749f17SPratyush Yadav 
memfd_luo_freeze(struct liveupdate_file_op_args * args)312b3749f17SPratyush Yadav static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
313b3749f17SPratyush Yadav {
314b3749f17SPratyush Yadav 	struct memfd_luo_ser *ser;
315b3749f17SPratyush Yadav 
316b3749f17SPratyush Yadav 	if (WARN_ON_ONCE(!args->serialized_data))
317b3749f17SPratyush Yadav 		return -EINVAL;
318b3749f17SPratyush Yadav 
319b3749f17SPratyush Yadav 	ser = phys_to_virt(args->serialized_data);
320b3749f17SPratyush Yadav 
321b3749f17SPratyush Yadav 	/*
322b3749f17SPratyush Yadav 	 * The pos might have changed since prepare. Everything else stays the
323b3749f17SPratyush Yadav 	 * same.
324b3749f17SPratyush Yadav 	 */
325b3749f17SPratyush Yadav 	ser->pos = args->file->f_pos;
326b3749f17SPratyush Yadav 
327b3749f17SPratyush Yadav 	return 0;
328b3749f17SPratyush Yadav }
329b3749f17SPratyush Yadav 
memfd_luo_unpreserve(struct liveupdate_file_op_args * args)330b3749f17SPratyush Yadav static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
331b3749f17SPratyush Yadav {
332b3749f17SPratyush Yadav 	struct inode *inode = file_inode(args->file);
333b3749f17SPratyush Yadav 	struct memfd_luo_ser *ser;
334b3749f17SPratyush Yadav 
335b3749f17SPratyush Yadav 	if (WARN_ON_ONCE(!args->serialized_data))
336b3749f17SPratyush Yadav 		return;
337b3749f17SPratyush Yadav 
338b3749f17SPratyush Yadav 	inode_lock(inode);
339b3749f17SPratyush Yadav 	shmem_freeze(inode, false);
340b3749f17SPratyush Yadav 
341b3749f17SPratyush Yadav 	ser = phys_to_virt(args->serialized_data);
342b3749f17SPratyush Yadav 
343b3749f17SPratyush Yadav 	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
344b3749f17SPratyush Yadav 				    ser->nr_folios);
345b3749f17SPratyush Yadav 
346b3749f17SPratyush Yadav 	kho_unpreserve_free(ser);
347b3749f17SPratyush Yadav 	inode_unlock(inode);
348b3749f17SPratyush Yadav }
349b3749f17SPratyush Yadav 
/*
 * Restore every folio listed in @folios_ser from KHO and immediately drop the
 * reference, for preserved folios that will not be put into a file.
 *
 * @folios_ser: serialized folio table.
 * @nr_folios:  number of entries in @folios_ser.
 *
 * Folios that cannot be restored are reported (rate limited) and skipped.
 */
static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	const struct memfd_luo_folio_ser *entry = folios_ser;
	u64 remaining;

	for (remaining = nr_folios; remaining; remaining--, entry++) {
		struct folio *folio;
		phys_addr_t phys;

		/* Entries without a pfn describe no folio. */
		if (!entry->pfn)
			continue;

		phys = PFN_PHYS(entry->pfn);
		folio = kho_restore_folio(phys);
		if (folio) {
			folio_put(folio);
			continue;
		}

		pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
				    phys);
	}
}
374b3749f17SPratyush Yadav 
memfd_luo_finish(struct liveupdate_file_op_args * args)375b3749f17SPratyush Yadav static void memfd_luo_finish(struct liveupdate_file_op_args *args)
376b3749f17SPratyush Yadav {
377b3749f17SPratyush Yadav 	struct memfd_luo_folio_ser *folios_ser;
378b3749f17SPratyush Yadav 	struct memfd_luo_ser *ser;
379b3749f17SPratyush Yadav 
380f85b1c6aSPratyush Yadav (Google) 	/*
381f85b1c6aSPratyush Yadav (Google) 	 * If retrieve was successful, nothing to do. If it failed, retrieve()
382f85b1c6aSPratyush Yadav (Google) 	 * already cleaned up everything it could. So nothing to do there
383f85b1c6aSPratyush Yadav (Google) 	 * either. Only need to clean up when retrieve was not called.
384f85b1c6aSPratyush Yadav (Google) 	 */
385f85b1c6aSPratyush Yadav (Google) 	if (args->retrieve_status)
386b3749f17SPratyush Yadav 		return;
387b3749f17SPratyush Yadav 
388b3749f17SPratyush Yadav 	ser = phys_to_virt(args->serialized_data);
389b3749f17SPratyush Yadav 	if (!ser)
390b3749f17SPratyush Yadav 		return;
391b3749f17SPratyush Yadav 
392b3749f17SPratyush Yadav 	if (ser->nr_folios) {
393b3749f17SPratyush Yadav 		folios_ser = kho_restore_vmalloc(&ser->folios);
394b3749f17SPratyush Yadav 		if (!folios_ser)
395b3749f17SPratyush Yadav 			goto out;
396b3749f17SPratyush Yadav 
397b3749f17SPratyush Yadav 		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
398b3749f17SPratyush Yadav 		vfree(folios_ser);
399b3749f17SPratyush Yadav 	}
400b3749f17SPratyush Yadav 
401b3749f17SPratyush Yadav out:
402b3749f17SPratyush Yadav 	kho_restore_free(ser);
403b3749f17SPratyush Yadav }
404b3749f17SPratyush Yadav 
/*
 * Restore the preserved folios listed in @folios_ser and insert them into the
 * page cache of @file.
 *
 * @file:       newly created memfd receiving the folios.
 * @folios_ser: serialized folio table from the previous kernel.
 * @nr_folios:  number of entries in @folios_ser.
 *
 * Each folio is restored from KHO, charged to the memory cgroup, added to the
 * shmem page cache at its recorded index, given its recorded uptodate/dirty
 * state, accounted to the inode and put on the LRU.
 *
 * Return: 0 on success. On failure a negative errno is returned: -EIO if a
 * folio could not be restored from KHO, otherwise the error of the failing
 * step. Folios already added to the file stay there and are freed together
 * with the file; the folios not yet processed are restored and released here.
 */
static int memfd_luo_retrieve_folios(struct file *file,
				     struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	long npages, nr_added_pages = 0;
	int err;
	long i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		phys_addr_t phys;
		u64 index;
		int flags;

		/* Entries without a pfn describe no folio. */
		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_err("Unable to restore folio at physical address: %llx\n",
			       phys);
			/*
			 * Set the error here: after a successful earlier
			 * iteration err holds 0, and this failure would
			 * otherwise be reported as success.
			 */
			err = -EIO;
			goto put_folios;
		}
		index = pfolio->index;
		flags = pfolio->flags;

		/* Set up the folio for insertion. */
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);

		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to charge folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
					      mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
			folio_mark_uptodate(folio);
		if (flags & MEMFD_LUO_FOLIO_DIRTY)
			folio_mark_dirty(folio);

		npages = folio_nr_pages(folio);
		err = shmem_inode_acct_blocks(inode, npages);
		if (err) {
			pr_err("shmem: failed to account folio index %ld(%ld pages): %d\n",
			       i, npages, err);
			/* The folio is already in the page cache; take it out. */
			goto remove_from_cache;
		}

		nr_added_pages += npages;
		folio_add_lru(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	shmem_recalc_inode(inode, nr_added_pages, 0);

	return 0;

remove_from_cache:
	filemap_remove_folio(folio);
unlock_folio:
	folio_unlock(folio);
	folio_put(folio);
put_folios:
	/*
	 * Note: don't free the folios already added to the file. They will be
	 * freed when the file is freed. Free the ones not added yet here.
	 */
	for (long j = i + 1; j < nr_folios; j++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];
		phys_addr_t phys;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (folio)
			folio_put(folio);
	}

	/* Account for the pages that did make it into the file. */
	shmem_recalc_inode(inode, nr_added_pages, 0);

	return err;
}
504b3749f17SPratyush Yadav 
memfd_luo_retrieve(struct liveupdate_file_op_args * args)505b3749f17SPratyush Yadav static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
506b3749f17SPratyush Yadav {
507b3749f17SPratyush Yadav 	struct memfd_luo_folio_ser *folios_ser;
508b3749f17SPratyush Yadav 	struct memfd_luo_ser *ser;
509b3749f17SPratyush Yadav 	struct file *file;
510b3749f17SPratyush Yadav 	int err;
511b3749f17SPratyush Yadav 
512b3749f17SPratyush Yadav 	ser = phys_to_virt(args->serialized_data);
513b3749f17SPratyush Yadav 	if (!ser)
514b3749f17SPratyush Yadav 		return -EINVAL;
515b3749f17SPratyush Yadav 
5168a552d68SPratyush Yadav (Google) 	/* Make sure the file only has seals supported by this version. */
5178a552d68SPratyush Yadav (Google) 	if (ser->seals & ~MEMFD_LUO_ALL_SEALS) {
5188a552d68SPratyush Yadav (Google) 		err = -EOPNOTSUPP;
5198a552d68SPratyush Yadav (Google) 		goto free_ser;
5208a552d68SPratyush Yadav (Google) 	}
5218a552d68SPratyush Yadav (Google) 
5228a552d68SPratyush Yadav (Google) 	/*
5238a552d68SPratyush Yadav (Google) 	 * The seals are preserved. Allow sealing here so they can be added
5248a552d68SPratyush Yadav (Google) 	 * later.
5258a552d68SPratyush Yadav (Google) 	 */
5268a552d68SPratyush Yadav (Google) 	file = memfd_alloc_file("", MFD_ALLOW_SEALING);
527b3749f17SPratyush Yadav 	if (IS_ERR(file)) {
528b3749f17SPratyush Yadav 		pr_err("failed to setup file: %pe\n", file);
529c657c5dcSPratyush Yadav (Google) 		err = PTR_ERR(file);
530c657c5dcSPratyush Yadav (Google) 		goto free_ser;
531b3749f17SPratyush Yadav 	}
532b3749f17SPratyush Yadav 
5338a552d68SPratyush Yadav (Google) 	err = memfd_add_seals(file, ser->seals);
5348a552d68SPratyush Yadav (Google) 	if (err) {
5358a552d68SPratyush Yadav (Google) 		pr_err("failed to add seals: %pe\n", ERR_PTR(err));
5368a552d68SPratyush Yadav (Google) 		goto put_file;
5378a552d68SPratyush Yadav (Google) 	}
5388a552d68SPratyush Yadav (Google) 
539b3749f17SPratyush Yadav 	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
54032f6cec5SChenghao Duan 	i_size_write(file_inode(file), ser->size);
541b3749f17SPratyush Yadav 
542b3749f17SPratyush Yadav 	if (ser->nr_folios) {
543b3749f17SPratyush Yadav 		folios_ser = kho_restore_vmalloc(&ser->folios);
544b3749f17SPratyush Yadav 		if (!folios_ser) {
545b3749f17SPratyush Yadav 			err = -EINVAL;
546b3749f17SPratyush Yadav 			goto put_file;
547b3749f17SPratyush Yadav 		}
548b3749f17SPratyush Yadav 
549b3749f17SPratyush Yadav 		err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
550b3749f17SPratyush Yadav 		vfree(folios_ser);
551b3749f17SPratyush Yadav 		if (err)
552b3749f17SPratyush Yadav 			goto put_file;
553b3749f17SPratyush Yadav 	}
554b3749f17SPratyush Yadav 
555b3749f17SPratyush Yadav 	args->file = file;
556b3749f17SPratyush Yadav 	kho_restore_free(ser);
557b3749f17SPratyush Yadav 
558b3749f17SPratyush Yadav 	return 0;
559b3749f17SPratyush Yadav 
560b3749f17SPratyush Yadav put_file:
561b3749f17SPratyush Yadav 	fput(file);
562c657c5dcSPratyush Yadav (Google) free_ser:
563c657c5dcSPratyush Yadav (Google) 	kho_restore_free(ser);
564b3749f17SPratyush Yadav 	return err;
565b3749f17SPratyush Yadav }
566b3749f17SPratyush Yadav 
memfd_luo_can_preserve(struct liveupdate_file_handler * handler,struct file * file)567b3749f17SPratyush Yadav static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
568b3749f17SPratyush Yadav 				   struct file *file)
569b3749f17SPratyush Yadav {
570b3749f17SPratyush Yadav 	struct inode *inode = file_inode(file);
571b3749f17SPratyush Yadav 
572b3749f17SPratyush Yadav 	return shmem_file(file) && !inode->i_nlink;
573b3749f17SPratyush Yadav }
574b3749f17SPratyush Yadav 
/*
 * LUO get_id callback: the identifier of a memfd is the address of its inode.
 */
static unsigned long memfd_luo_get_id(struct file *file)
{
	struct inode *inode = file_inode(file);

	return (unsigned long)inode;
}
579bc3a5763SPasha Tatashin 
580b3749f17SPratyush Yadav static const struct liveupdate_file_ops memfd_luo_file_ops = {
581b3749f17SPratyush Yadav 	.freeze = memfd_luo_freeze,
582b3749f17SPratyush Yadav 	.finish = memfd_luo_finish,
583b3749f17SPratyush Yadav 	.retrieve = memfd_luo_retrieve,
584b3749f17SPratyush Yadav 	.preserve = memfd_luo_preserve,
585b3749f17SPratyush Yadav 	.unpreserve = memfd_luo_unpreserve,
586b3749f17SPratyush Yadav 	.can_preserve = memfd_luo_can_preserve,
587bc3a5763SPasha Tatashin 	.get_id = memfd_luo_get_id,
588b3749f17SPratyush Yadav 	.owner = THIS_MODULE,
589b3749f17SPratyush Yadav };
590b3749f17SPratyush Yadav 
591b3749f17SPratyush Yadav static struct liveupdate_file_handler memfd_luo_handler = {
592b3749f17SPratyush Yadav 	.ops = &memfd_luo_file_ops,
593b3749f17SPratyush Yadav 	.compatible = MEMFD_LUO_FH_COMPATIBLE,
594b3749f17SPratyush Yadav };
595b3749f17SPratyush Yadav 
memfd_luo_init(void)596b3749f17SPratyush Yadav static int __init memfd_luo_init(void)
597b3749f17SPratyush Yadav {
598b3749f17SPratyush Yadav 	int err = liveupdate_register_file_handler(&memfd_luo_handler);
599b3749f17SPratyush Yadav 
600b3749f17SPratyush Yadav 	if (err && err != -EOPNOTSUPP) {
601b3749f17SPratyush Yadav 		pr_err("Could not register luo filesystem handler: %pe\n",
602b3749f17SPratyush Yadav 		       ERR_PTR(err));
603b3749f17SPratyush Yadav 
604b3749f17SPratyush Yadav 		return err;
605b3749f17SPratyush Yadav 	}
606b3749f17SPratyush Yadav 
607b3749f17SPratyush Yadav 	return 0;
608b3749f17SPratyush Yadav }
609b3749f17SPratyush Yadav late_initcall(memfd_luo_init);
610