17a338472SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
2f8af4da3SHugh Dickins /*
331dbd01fSIzik Eidus * Memory merging support.
431dbd01fSIzik Eidus *
531dbd01fSIzik Eidus * This code enables dynamic sharing of identical pages found in different
631dbd01fSIzik Eidus * memory areas, even if they are not shared by fork()
731dbd01fSIzik Eidus *
836b2528dSIzik Eidus * Copyright (C) 2008-2009 Red Hat, Inc.
931dbd01fSIzik Eidus * Authors:
1031dbd01fSIzik Eidus * Izik Eidus
1131dbd01fSIzik Eidus * Andrea Arcangeli
1231dbd01fSIzik Eidus * Chris Wright
1336b2528dSIzik Eidus * Hugh Dickins
14f8af4da3SHugh Dickins */
15f8af4da3SHugh Dickins
16f8af4da3SHugh Dickins #include <linux/errno.h>
1731dbd01fSIzik Eidus #include <linux/mm.h>
1836090defSArnd Bergmann #include <linux/mm_inline.h>
1931dbd01fSIzik Eidus #include <linux/fs.h>
20f8af4da3SHugh Dickins #include <linux/mman.h>
2131dbd01fSIzik Eidus #include <linux/sched.h>
226e84f315SIngo Molnar #include <linux/sched/mm.h>
234e5fa4f5SStefan Roesch #include <linux/sched/cputime.h>
2431dbd01fSIzik Eidus #include <linux/rwsem.h>
2531dbd01fSIzik Eidus #include <linux/pagemap.h>
2631dbd01fSIzik Eidus #include <linux/rmap.h>
2731dbd01fSIzik Eidus #include <linux/spinlock.h>
2859e1a2f4STimofey Titovets #include <linux/xxhash.h>
2931dbd01fSIzik Eidus #include <linux/delay.h>
3031dbd01fSIzik Eidus #include <linux/kthread.h>
3131dbd01fSIzik Eidus #include <linux/wait.h>
3231dbd01fSIzik Eidus #include <linux/slab.h>
3331dbd01fSIzik Eidus #include <linux/rbtree.h>
3462b61f61SHugh Dickins #include <linux/memory.h>
3531dbd01fSIzik Eidus #include <linux/mmu_notifier.h>
362c6854fdSIzik Eidus #include <linux/swap.h>
37f8af4da3SHugh Dickins #include <linux/ksm.h>
384ca3a69bSSasha Levin #include <linux/hashtable.h>
39878aee7dSAndrea Arcangeli #include <linux/freezer.h>
4072788c38SDavid Rientjes #include <linux/oom.h>
4190bd6fd3SPetr Holasek #include <linux/numa.h>
42d7c0e68dSDavid Hildenbrand #include <linux/pagewalk.h>
43f8af4da3SHugh Dickins
4431dbd01fSIzik Eidus #include <asm/tlbflush.h>
4573848b46SHugh Dickins #include "internal.h"
4658730ab6SQi Zheng #include "mm_slot.h"
4731dbd01fSIzik Eidus
48739100c8SStefan Roesch #define CREATE_TRACE_POINTS
49739100c8SStefan Roesch #include <trace/events/ksm.h>
50739100c8SStefan Roesch
/*
 * NUMA(x) evaluates to x on CONFIG_NUMA builds and to the constant 0
 * otherwise. DO_NUMA(x) runs x as a statement on CONFIG_NUMA builds and
 * expands to an empty statement otherwise.
 */
#ifndef CONFIG_NUMA
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#else
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#endif
58e850dcf5SHugh Dickins
595e924ff5SStefan Roesch typedef u8 rmap_age_t;
605e924ff5SStefan Roesch
615a2ca3efSMike Rapoport /**
625a2ca3efSMike Rapoport * DOC: Overview
635a2ca3efSMike Rapoport *
6431dbd01fSIzik Eidus * A few notes about the KSM scanning process,
6531dbd01fSIzik Eidus * to make it easier to understand the data structures below:
6631dbd01fSIzik Eidus *
6731dbd01fSIzik Eidus * In order to reduce excessive scanning, KSM sorts the memory pages by their
6831dbd01fSIzik Eidus * contents into a data structure that holds pointers to the pages' locations.
6931dbd01fSIzik Eidus *
7031dbd01fSIzik Eidus * Since the contents of the pages may change at any moment, KSM cannot just
7131dbd01fSIzik Eidus * insert the pages into a normal sorted tree and expect it to find anything.
7231dbd01fSIzik Eidus * Therefore KSM uses two data structures - the stable and the unstable tree.
7331dbd01fSIzik Eidus *
7431dbd01fSIzik Eidus * The stable tree holds pointers to all the merged pages (ksm pages), sorted
7531dbd01fSIzik Eidus * by their contents. Because each such page is write-protected, searching on
7631dbd01fSIzik Eidus * this tree is fully assured to be working (except when pages are unmapped),
7731dbd01fSIzik Eidus * and therefore this tree is called the stable tree.
7831dbd01fSIzik Eidus *
795a2ca3efSMike Rapoport * The stable tree node includes information required for reverse
805a2ca3efSMike Rapoport * mapping from a KSM page to virtual addresses that map this page.
815a2ca3efSMike Rapoport *
825a2ca3efSMike Rapoport * In order to avoid large latencies of the rmap walks on KSM pages,
835a2ca3efSMike Rapoport * KSM maintains two types of nodes in the stable tree:
845a2ca3efSMike Rapoport *
855a2ca3efSMike Rapoport * * the regular nodes that keep the reverse mapping structures in a
865a2ca3efSMike Rapoport * linked list
875a2ca3efSMike Rapoport * * the "chains" that link nodes ("dups") that represent the same
885a2ca3efSMike Rapoport * write protected memory content, but each "dup" corresponds to a
895a2ca3efSMike Rapoport * different KSM page copy of that content
905a2ca3efSMike Rapoport *
915a2ca3efSMike Rapoport * Internally, the regular nodes, "dups" and "chains" are represented
9221fbd591SQi Zheng * using the same struct ksm_stable_node structure.
935a2ca3efSMike Rapoport *
9431dbd01fSIzik Eidus * In addition to the stable tree, KSM uses a second data structure called the
9531dbd01fSIzik Eidus * unstable tree: this tree holds pointers to pages which have been found to
9631dbd01fSIzik Eidus * be "unchanged for a period of time". The unstable tree sorts these pages
9731dbd01fSIzik Eidus * by their contents, but since they are not write-protected, KSM cannot rely
9831dbd01fSIzik Eidus * upon the unstable tree to work correctly - the unstable tree is liable to
9931dbd01fSIzik Eidus * be corrupted as its contents are modified, and so it is called unstable.
10031dbd01fSIzik Eidus *
10131dbd01fSIzik Eidus * KSM solves this problem by several techniques:
10231dbd01fSIzik Eidus *
10331dbd01fSIzik Eidus * 1) The unstable tree is flushed every time KSM completes scanning all
10431dbd01fSIzik Eidus * memory areas, and then the tree is rebuilt again from the beginning.
10531dbd01fSIzik Eidus * 2) KSM will only insert into the unstable tree, pages whose hash value
10631dbd01fSIzik Eidus * has not changed since the previous scan of all memory areas.
10731dbd01fSIzik Eidus * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
10831dbd01fSIzik Eidus * colors of the nodes and not on their contents, assuring that even when
10931dbd01fSIzik Eidus * the tree gets "corrupted" it won't get out of balance, so scanning time
11031dbd01fSIzik Eidus * remains the same (also, searching and inserting nodes in an rbtree uses
11131dbd01fSIzik Eidus * the same algorithm, so we have no overhead when we flush and rebuild).
11231dbd01fSIzik Eidus * 4) KSM never flushes the stable tree, which means that even if it were to
11331dbd01fSIzik Eidus * take 10 attempts to find a page in the unstable tree, once it is found,
11431dbd01fSIzik Eidus * it is secured in the stable tree. (When we scan a new page, we first
11531dbd01fSIzik Eidus * compare it against the stable tree, and then against the unstable tree.)
1168fdb3dbfSHugh Dickins *
1178fdb3dbfSHugh Dickins * If the merge_across_nodes tunable is unset, then KSM maintains multiple
1188fdb3dbfSHugh Dickins * stable trees and multiple unstable trees: one of each for each NUMA node.
11931dbd01fSIzik Eidus */
12031dbd01fSIzik Eidus
12131dbd01fSIzik Eidus /**
12221fbd591SQi Zheng * struct ksm_mm_slot - ksm information per mm that is being scanned
12358730ab6SQi Zheng * @slot: hash lookup from mm to mm_slot
1246514d511SHugh Dickins * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
12531dbd01fSIzik Eidus */
12621fbd591SQi Zheng struct ksm_mm_slot {
12758730ab6SQi Zheng struct mm_slot slot;
12821fbd591SQi Zheng struct ksm_rmap_item *rmap_list;
12931dbd01fSIzik Eidus };
13031dbd01fSIzik Eidus
/**
 * struct ksm_scan - the scan cursor
 * @mm_slot: mm_slot that is currently being scanned
 * @address: next address to scan inside that mm_slot
 * @rmap_list: link to the next rmap_item to scan in the rmap_list
 * @seqnr: number of completed full scans (needed when removing an unstable
 *         node)
 *
 * Exactly one instance of this cursor structure exists.
 */
struct ksm_scan {
	struct ksm_mm_slot *mm_slot;
	unsigned long address;
	struct ksm_rmap_item **rmap_list;
	unsigned long seqnr;
};
14631dbd01fSIzik Eidus
14731dbd01fSIzik Eidus /**
14821fbd591SQi Zheng * struct ksm_stable_node - node of the stable rbtree
1497b6ba2c7SHugh Dickins * @node: rb node of this ksm page in the stable tree
1504146d2d6SHugh Dickins * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
1512c653d0eSAndrea Arcangeli * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
1524146d2d6SHugh Dickins * @list: linked into migrate_nodes, pending placement in the proper node tree
1537b6ba2c7SHugh Dickins * @hlist: hlist head of rmap_items using this ksm page
1544146d2d6SHugh Dickins * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
1552c653d0eSAndrea Arcangeli * @chain_prune_time: time of the last full garbage collection
1562c653d0eSAndrea Arcangeli * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
1574146d2d6SHugh Dickins * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
1587b6ba2c7SHugh Dickins */
15921fbd591SQi Zheng struct ksm_stable_node {
1604146d2d6SHugh Dickins union {
1614146d2d6SHugh Dickins struct rb_node node; /* when node of stable tree */
1624146d2d6SHugh Dickins struct { /* when listed for migration */
1634146d2d6SHugh Dickins struct list_head *head;
1642c653d0eSAndrea Arcangeli struct {
1652c653d0eSAndrea Arcangeli struct hlist_node hlist_dup;
1664146d2d6SHugh Dickins struct list_head list;
1674146d2d6SHugh Dickins };
1684146d2d6SHugh Dickins };
1692c653d0eSAndrea Arcangeli };
1707b6ba2c7SHugh Dickins struct hlist_head hlist;
1712c653d0eSAndrea Arcangeli union {
17262b61f61SHugh Dickins unsigned long kpfn;
1732c653d0eSAndrea Arcangeli unsigned long chain_prune_time;
1742c653d0eSAndrea Arcangeli };
1752c653d0eSAndrea Arcangeli /*
1762c653d0eSAndrea Arcangeli * STABLE_NODE_CHAIN can be any negative number in
1772c653d0eSAndrea Arcangeli * rmap_hlist_len negative range, but better not -1 to be able
1782c653d0eSAndrea Arcangeli * to reliably detect underflows.
1792c653d0eSAndrea Arcangeli */
1802c653d0eSAndrea Arcangeli #define STABLE_NODE_CHAIN -1024
1812c653d0eSAndrea Arcangeli int rmap_hlist_len;
1824146d2d6SHugh Dickins #ifdef CONFIG_NUMA
1834146d2d6SHugh Dickins int nid;
1844146d2d6SHugh Dickins #endif
1857b6ba2c7SHugh Dickins };
1867b6ba2c7SHugh Dickins
1877b6ba2c7SHugh Dickins /**
18821fbd591SQi Zheng * struct ksm_rmap_item - reverse mapping item for virtual addresses
1896514d511SHugh Dickins * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
190db114b83SHugh Dickins * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
191bc56620bSHugh Dickins * @nid: NUMA node id of unstable tree in which linked (may not match page)
19231dbd01fSIzik Eidus * @mm: the memory structure this rmap_item is pointing into
19331dbd01fSIzik Eidus * @address: the virtual address this rmap_item tracks (+ flags in low bits)
19431dbd01fSIzik Eidus * @oldchecksum: previous checksum of the page at that virtual address
1957b6ba2c7SHugh Dickins * @node: rb node of this rmap_item in the unstable tree
1967b6ba2c7SHugh Dickins * @head: pointer to stable_node heading this list in the stable tree
1977b6ba2c7SHugh Dickins * @hlist: link into hlist of rmap_items hanging off that stable_node
1985e924ff5SStefan Roesch * @age: number of scan iterations since creation
1995e924ff5SStefan Roesch * @remaining_skips: how many scans to skip
20031dbd01fSIzik Eidus */
20121fbd591SQi Zheng struct ksm_rmap_item {
20221fbd591SQi Zheng struct ksm_rmap_item *rmap_list;
203bc56620bSHugh Dickins union {
204db114b83SHugh Dickins struct anon_vma *anon_vma; /* when stable */
205bc56620bSHugh Dickins #ifdef CONFIG_NUMA
206bc56620bSHugh Dickins int nid; /* when node of unstable tree */
207bc56620bSHugh Dickins #endif
208bc56620bSHugh Dickins };
20931dbd01fSIzik Eidus struct mm_struct *mm;
21031dbd01fSIzik Eidus unsigned long address; /* + low bits used for flags below */
21131dbd01fSIzik Eidus unsigned int oldchecksum; /* when unstable */
2125e924ff5SStefan Roesch rmap_age_t age;
2135e924ff5SStefan Roesch rmap_age_t remaining_skips;
21431dbd01fSIzik Eidus union {
2157b6ba2c7SHugh Dickins struct rb_node node; /* when node of unstable tree */
2167b6ba2c7SHugh Dickins struct { /* when listed from stable tree */
21721fbd591SQi Zheng struct ksm_stable_node *head;
2187b6ba2c7SHugh Dickins struct hlist_node hlist;
2197b6ba2c7SHugh Dickins };
22031dbd01fSIzik Eidus };
22131dbd01fSIzik Eidus };
22231dbd01fSIzik Eidus
22331dbd01fSIzik Eidus #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
2247b6ba2c7SHugh Dickins #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
2257b6ba2c7SHugh Dickins #define STABLE_FLAG 0x200 /* is listed from the stable tree */
22631dbd01fSIzik Eidus
22731dbd01fSIzik Eidus /* The stable and unstable tree heads */
228ef53d16cSHugh Dickins static struct rb_root one_stable_tree[1] = { RB_ROOT };
229ef53d16cSHugh Dickins static struct rb_root one_unstable_tree[1] = { RB_ROOT };
230ef53d16cSHugh Dickins static struct rb_root *root_stable_tree = one_stable_tree;
231ef53d16cSHugh Dickins static struct rb_root *root_unstable_tree = one_unstable_tree;
23231dbd01fSIzik Eidus
2334146d2d6SHugh Dickins /* Recently migrated nodes of stable tree, pending proper placement */
2344146d2d6SHugh Dickins static LIST_HEAD(migrate_nodes);
2352c653d0eSAndrea Arcangeli #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
2364146d2d6SHugh Dickins
2374ca3a69bSSasha Levin #define MM_SLOTS_HASH_BITS 10
2384ca3a69bSSasha Levin static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
23931dbd01fSIzik Eidus
24021fbd591SQi Zheng static struct ksm_mm_slot ksm_mm_head = {
24158730ab6SQi Zheng .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node),
24231dbd01fSIzik Eidus };
24331dbd01fSIzik Eidus static struct ksm_scan ksm_scan = {
24431dbd01fSIzik Eidus .mm_slot = &ksm_mm_head,
24531dbd01fSIzik Eidus };
24631dbd01fSIzik Eidus
24731dbd01fSIzik Eidus static struct kmem_cache *rmap_item_cache;
2487b6ba2c7SHugh Dickins static struct kmem_cache *stable_node_cache;
24931dbd01fSIzik Eidus static struct kmem_cache *mm_slot_cache;
25031dbd01fSIzik Eidus
2514e5fa4f5SStefan Roesch /* Default number of pages to scan per batch */
2524e5fa4f5SStefan Roesch #define DEFAULT_PAGES_TO_SCAN 100
2534e5fa4f5SStefan Roesch
254b348b5feSStefan Roesch /* The number of pages scanned */
255b348b5feSStefan Roesch static unsigned long ksm_pages_scanned;
256b348b5feSStefan Roesch
25731dbd01fSIzik Eidus /* The number of nodes in the stable tree */
258b4028260SHugh Dickins static unsigned long ksm_pages_shared;
25931dbd01fSIzik Eidus
260e178dfdeSHugh Dickins /* The number of page slots additionally sharing those nodes */
261b4028260SHugh Dickins static unsigned long ksm_pages_sharing;
26231dbd01fSIzik Eidus
263473b0ce4SHugh Dickins /* The number of nodes in the unstable tree */
264473b0ce4SHugh Dickins static unsigned long ksm_pages_unshared;
265473b0ce4SHugh Dickins
266473b0ce4SHugh Dickins /* The number of rmap_items in use: to calculate pages_volatile */
267473b0ce4SHugh Dickins static unsigned long ksm_rmap_items;
268473b0ce4SHugh Dickins
2692c653d0eSAndrea Arcangeli /* The number of stable_node chains */
2702c653d0eSAndrea Arcangeli static unsigned long ksm_stable_node_chains;
2712c653d0eSAndrea Arcangeli
2722c653d0eSAndrea Arcangeli /* The number of stable_node dups linked to the stable_node chains */
2732c653d0eSAndrea Arcangeli static unsigned long ksm_stable_node_dups;
2742c653d0eSAndrea Arcangeli
2752c653d0eSAndrea Arcangeli /* Delay in pruning stale stable_node_dups in the stable_node_chains */
276584ff0dfSZhansaya Bagdauletkyzy static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;
2772c653d0eSAndrea Arcangeli
2782c653d0eSAndrea Arcangeli /* Maximum number of page slots sharing a stable node */
2792c653d0eSAndrea Arcangeli static int ksm_max_page_sharing = 256;
2802c653d0eSAndrea Arcangeli
28131dbd01fSIzik Eidus /* Number of pages ksmd should scan in one batch */
2824e5fa4f5SStefan Roesch static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;
28331dbd01fSIzik Eidus
28431dbd01fSIzik Eidus /* Milliseconds ksmd should sleep between batches */
2852ffd8679SHugh Dickins static unsigned int ksm_thread_sleep_millisecs = 20;
28631dbd01fSIzik Eidus
287e86c59b1SClaudio Imbrenda /* Checksum of an empty (zeroed) page */
288e86c59b1SClaudio Imbrenda static unsigned int zero_checksum __read_mostly;
289e86c59b1SClaudio Imbrenda
290e86c59b1SClaudio Imbrenda /* Whether to merge empty (zeroed) pages with actual zero pages */
291e86c59b1SClaudio Imbrenda static bool ksm_use_zero_pages __read_mostly;
292e86c59b1SClaudio Imbrenda
2935e924ff5SStefan Roesch /* Skip pages that couldn't be de-duplicated previously */
2945e924ff5SStefan Roesch /* Default to true at least temporarily, for testing */
2955e924ff5SStefan Roesch static bool ksm_smart_scan = true;
2965e924ff5SStefan Roesch
297e2942062Sxu xin /* The number of zero pages which is placed by KSM */
298c2dc78b8SChengming Zhou atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0);
299e2942062Sxu xin
300e5a68991SStefan Roesch /* The number of pages that have been skipped due to "smart scanning" */
301e5a68991SStefan Roesch static unsigned long ksm_pages_skipped;
302e5a68991SStefan Roesch
3034e5fa4f5SStefan Roesch /* Don't scan more than max pages per batch. */
3044e5fa4f5SStefan Roesch static unsigned long ksm_advisor_max_pages_to_scan = 30000;
3054e5fa4f5SStefan Roesch
3064e5fa4f5SStefan Roesch /* Min CPU for scanning pages per scan */
3074e5fa4f5SStefan Roesch #define KSM_ADVISOR_MIN_CPU 10
3084e5fa4f5SStefan Roesch
3094e5fa4f5SStefan Roesch /* Max CPU for scanning pages per scan */
3104e5fa4f5SStefan Roesch static unsigned int ksm_advisor_max_cpu = 70;
3114e5fa4f5SStefan Roesch
3124e5fa4f5SStefan Roesch /* Target scan time in seconds to analyze all KSM candidate pages. */
3134e5fa4f5SStefan Roesch static unsigned long ksm_advisor_target_scan_time = 200;
3144e5fa4f5SStefan Roesch
3154e5fa4f5SStefan Roesch /* Exponentially weighted moving average. */
3164e5fa4f5SStefan Roesch #define EWMA_WEIGHT 30
3174e5fa4f5SStefan Roesch
3184e5fa4f5SStefan Roesch /**
3194e5fa4f5SStefan Roesch * struct advisor_ctx - metadata for KSM advisor
3204e5fa4f5SStefan Roesch * @start_scan: start time of the current scan
3214e5fa4f5SStefan Roesch * @scan_time: scan time of previous scan
3224e5fa4f5SStefan Roesch * @change: change in percent to pages_to_scan parameter
3234e5fa4f5SStefan Roesch * @cpu_time: cpu time consumed by the ksmd thread in the previous scan
3244e5fa4f5SStefan Roesch */
3254e5fa4f5SStefan Roesch struct advisor_ctx {
3264e5fa4f5SStefan Roesch ktime_t start_scan;
3274e5fa4f5SStefan Roesch unsigned long scan_time;
3284e5fa4f5SStefan Roesch unsigned long change;
3294e5fa4f5SStefan Roesch unsigned long long cpu_time;
3304e5fa4f5SStefan Roesch };
3314e5fa4f5SStefan Roesch static struct advisor_ctx advisor_ctx;
3324e5fa4f5SStefan Roesch
/* The available advisors */
enum ksm_advisor_type {
	KSM_ADVISOR_NONE,	/* no advisor */
	KSM_ADVISOR_SCAN_TIME,	/* scan time advisor */
};

/* Currently selected advisor; zero-initialised, i.e. KSM_ADVISOR_NONE. */
static enum ksm_advisor_type ksm_advisor;
3394e5fa4f5SStefan Roesch
#ifdef CONFIG_SYSFS
/* At least scan this many pages per batch. */
static unsigned long ksm_advisor_min_pages_to_scan = 500;

/*
 * Reset the scan rate to the default that belongs to the selected advisor.
 * Only called through the sysfs control interface.
 *
 * Without an advisor, pages_to_scan returns to DEFAULT_PAGES_TO_SCAN. With
 * the scan time advisor, the advisor context is cleared and pages_to_scan
 * starts from ksm_advisor_min_pages_to_scan.
 */
static void set_advisor_defaults(void)
{
	switch (ksm_advisor) {
	case KSM_ADVISOR_NONE:
		ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;
		break;
	case KSM_ADVISOR_SCAN_TIME:
		advisor_ctx = (const struct advisor_ctx){ 0 };
		ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan;
		break;
	default:
		break;
	}
}
#endif /* CONFIG_SYSFS */
35866790e9aSStefan Roesch
advisor_start_scan(void)3594e5fa4f5SStefan Roesch static inline void advisor_start_scan(void)
3604e5fa4f5SStefan Roesch {
3614e5fa4f5SStefan Roesch if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
3624e5fa4f5SStefan Roesch advisor_ctx.start_scan = ktime_get();
3634e5fa4f5SStefan Roesch }
3644e5fa4f5SStefan Roesch
3654e5fa4f5SStefan Roesch /*
3664e5fa4f5SStefan Roesch * Use previous scan time if available, otherwise use current scan time as an
3674e5fa4f5SStefan Roesch * approximation for the previous scan time.
3684e5fa4f5SStefan Roesch */
prev_scan_time(struct advisor_ctx * ctx,unsigned long scan_time)3694e5fa4f5SStefan Roesch static inline unsigned long prev_scan_time(struct advisor_ctx *ctx,
3704e5fa4f5SStefan Roesch unsigned long scan_time)
3714e5fa4f5SStefan Roesch {
3724e5fa4f5SStefan Roesch return ctx->scan_time ? ctx->scan_time : scan_time;
3734e5fa4f5SStefan Roesch }
3744e5fa4f5SStefan Roesch
3754e5fa4f5SStefan Roesch /* Calculate exponential weighted moving average */
ewma(unsigned long prev,unsigned long curr)3764e5fa4f5SStefan Roesch static unsigned long ewma(unsigned long prev, unsigned long curr)
3774e5fa4f5SStefan Roesch {
3784e5fa4f5SStefan Roesch return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100;
3794e5fa4f5SStefan Roesch }
3804e5fa4f5SStefan Roesch
3814e5fa4f5SStefan Roesch /*
3824e5fa4f5SStefan Roesch * The scan time advisor is based on the current scan rate and the target
3834e5fa4f5SStefan Roesch * scan rate.
3844e5fa4f5SStefan Roesch *
3854e5fa4f5SStefan Roesch * new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time)
3864e5fa4f5SStefan Roesch *
3874e5fa4f5SStefan Roesch * To avoid perturbations it calculates a change factor of previous changes.
3884e5fa4f5SStefan Roesch * A new change factor is calculated for each iteration and it uses an
3894e5fa4f5SStefan Roesch * exponentially weighted moving average. The new pages_to_scan value is
3904e5fa4f5SStefan Roesch * multiplied with that change factor:
3914e5fa4f5SStefan Roesch *
392b6c46600Sjianyun.gao * new_pages_to_scan *= change factor
3934e5fa4f5SStefan Roesch *
3944e5fa4f5SStefan Roesch * The new_pages_to_scan value is limited by the cpu min and max values. It
3954e5fa4f5SStefan Roesch * calculates the cpu percent for the last scan and calculates the new
3964e5fa4f5SStefan Roesch * estimated cpu percent cost for the next scan. That value is capped by the
3974e5fa4f5SStefan Roesch * cpu min and max setting.
3984e5fa4f5SStefan Roesch *
3994e5fa4f5SStefan Roesch * In addition the new pages_to_scan value is capped by the max and min
4004e5fa4f5SStefan Roesch * limits.
4014e5fa4f5SStefan Roesch */
scan_time_advisor(void)4024e5fa4f5SStefan Roesch static void scan_time_advisor(void)
4034e5fa4f5SStefan Roesch {
4044e5fa4f5SStefan Roesch unsigned int cpu_percent;
4054e5fa4f5SStefan Roesch unsigned long cpu_time;
4064e5fa4f5SStefan Roesch unsigned long cpu_time_diff;
4074e5fa4f5SStefan Roesch unsigned long cpu_time_diff_ms;
4084e5fa4f5SStefan Roesch unsigned long pages;
4094e5fa4f5SStefan Roesch unsigned long per_page_cost;
4104e5fa4f5SStefan Roesch unsigned long factor;
4114e5fa4f5SStefan Roesch unsigned long change;
4124e5fa4f5SStefan Roesch unsigned long last_scan_time;
4134e5fa4f5SStefan Roesch unsigned long scan_time;
4144e5fa4f5SStefan Roesch
4154e5fa4f5SStefan Roesch /* Convert scan time to seconds */
4164e5fa4f5SStefan Roesch scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan),
4174e5fa4f5SStefan Roesch MSEC_PER_SEC);
4184e5fa4f5SStefan Roesch scan_time = scan_time ? scan_time : 1;
4194e5fa4f5SStefan Roesch
4204e5fa4f5SStefan Roesch /* Calculate CPU consumption of ksmd background thread */
4214e5fa4f5SStefan Roesch cpu_time = task_sched_runtime(current);
4224e5fa4f5SStefan Roesch cpu_time_diff = cpu_time - advisor_ctx.cpu_time;
4234e5fa4f5SStefan Roesch cpu_time_diff_ms = cpu_time_diff / 1000 / 1000;
4244e5fa4f5SStefan Roesch
4254e5fa4f5SStefan Roesch cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000);
4264e5fa4f5SStefan Roesch cpu_percent = cpu_percent ? cpu_percent : 1;
4274e5fa4f5SStefan Roesch last_scan_time = prev_scan_time(&advisor_ctx, scan_time);
4284e5fa4f5SStefan Roesch
4294e5fa4f5SStefan Roesch /* Calculate scan time as percentage of target scan time */
4304e5fa4f5SStefan Roesch factor = ksm_advisor_target_scan_time * 100 / scan_time;
4314e5fa4f5SStefan Roesch factor = factor ? factor : 1;
4324e5fa4f5SStefan Roesch
4334e5fa4f5SStefan Roesch /*
4344e5fa4f5SStefan Roesch * Calculate scan time as percentage of last scan time and use
4354e5fa4f5SStefan Roesch * exponentially weighted average to smooth it
4364e5fa4f5SStefan Roesch */
4374e5fa4f5SStefan Roesch change = scan_time * 100 / last_scan_time;
4384e5fa4f5SStefan Roesch change = change ? change : 1;
4394e5fa4f5SStefan Roesch change = ewma(advisor_ctx.change, change);
4404e5fa4f5SStefan Roesch
4414e5fa4f5SStefan Roesch /* Calculate new scan rate based on target scan rate. */
4424e5fa4f5SStefan Roesch pages = ksm_thread_pages_to_scan * 100 / factor;
4434e5fa4f5SStefan Roesch /* Update pages_to_scan by weighted change percentage. */
4444e5fa4f5SStefan Roesch pages = pages * change / 100;
4454e5fa4f5SStefan Roesch
4464e5fa4f5SStefan Roesch /* Cap new pages_to_scan value */
4474e5fa4f5SStefan Roesch per_page_cost = ksm_thread_pages_to_scan / cpu_percent;
4484e5fa4f5SStefan Roesch per_page_cost = per_page_cost ? per_page_cost : 1;
4494e5fa4f5SStefan Roesch
4504e5fa4f5SStefan Roesch pages = min(pages, per_page_cost * ksm_advisor_max_cpu);
4514e5fa4f5SStefan Roesch pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU);
4524e5fa4f5SStefan Roesch pages = min(pages, ksm_advisor_max_pages_to_scan);
4534e5fa4f5SStefan Roesch
4544e5fa4f5SStefan Roesch /* Update advisor context */
4554e5fa4f5SStefan Roesch advisor_ctx.change = change;
4564e5fa4f5SStefan Roesch advisor_ctx.scan_time = scan_time;
4574e5fa4f5SStefan Roesch advisor_ctx.cpu_time = cpu_time;
4584e5fa4f5SStefan Roesch
4594e5fa4f5SStefan Roesch ksm_thread_pages_to_scan = pages;
4605088b497SStefan Roesch trace_ksm_advisor(scan_time, pages, cpu_percent);
4614e5fa4f5SStefan Roesch }
4624e5fa4f5SStefan Roesch
advisor_stop_scan(void)4634e5fa4f5SStefan Roesch static void advisor_stop_scan(void)
4644e5fa4f5SStefan Roesch {
4654e5fa4f5SStefan Roesch if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
4664e5fa4f5SStefan Roesch scan_time_advisor();
4674e5fa4f5SStefan Roesch }
4684e5fa4f5SStefan Roesch
#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
/* Without CONFIG_NUMA both values are compile-time constants */
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

/* Values of ksm_run */
#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

/* Wait queues and locks used by KSM */
static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);
48931dbd01fSIzik Eidus
ksm_slab_init(void)49031dbd01fSIzik Eidus static int __init ksm_slab_init(void)
49131dbd01fSIzik Eidus {
492aa1b9489SKefeng Wang rmap_item_cache = KMEM_CACHE(ksm_rmap_item, 0);
49331dbd01fSIzik Eidus if (!rmap_item_cache)
49431dbd01fSIzik Eidus goto out;
49531dbd01fSIzik Eidus
496aa1b9489SKefeng Wang stable_node_cache = KMEM_CACHE(ksm_stable_node, 0);
4977b6ba2c7SHugh Dickins if (!stable_node_cache)
4987b6ba2c7SHugh Dickins goto out_free1;
4997b6ba2c7SHugh Dickins
500aa1b9489SKefeng Wang mm_slot_cache = KMEM_CACHE(ksm_mm_slot, 0);
50131dbd01fSIzik Eidus if (!mm_slot_cache)
5027b6ba2c7SHugh Dickins goto out_free2;
50331dbd01fSIzik Eidus
50431dbd01fSIzik Eidus return 0;
50531dbd01fSIzik Eidus
5067b6ba2c7SHugh Dickins out_free2:
5077b6ba2c7SHugh Dickins kmem_cache_destroy(stable_node_cache);
5087b6ba2c7SHugh Dickins out_free1:
50931dbd01fSIzik Eidus kmem_cache_destroy(rmap_item_cache);
51031dbd01fSIzik Eidus out:
51131dbd01fSIzik Eidus return -ENOMEM;
51231dbd01fSIzik Eidus }
51331dbd01fSIzik Eidus
ksm_slab_free(void)51431dbd01fSIzik Eidus static void __init ksm_slab_free(void)
51531dbd01fSIzik Eidus {
51631dbd01fSIzik Eidus kmem_cache_destroy(mm_slot_cache);
5177b6ba2c7SHugh Dickins kmem_cache_destroy(stable_node_cache);
51831dbd01fSIzik Eidus kmem_cache_destroy(rmap_item_cache);
51931dbd01fSIzik Eidus mm_slot_cache = NULL;
52031dbd01fSIzik Eidus }
52131dbd01fSIzik Eidus
is_stable_node_chain(struct ksm_stable_node * chain)52221fbd591SQi Zheng static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain)
5232c653d0eSAndrea Arcangeli {
5242c653d0eSAndrea Arcangeli return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
5252c653d0eSAndrea Arcangeli }
5262c653d0eSAndrea Arcangeli
is_stable_node_dup(struct ksm_stable_node * dup)52721fbd591SQi Zheng static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup)
5282c653d0eSAndrea Arcangeli {
5292c653d0eSAndrea Arcangeli return dup->head == STABLE_NODE_DUP_HEAD;
5302c653d0eSAndrea Arcangeli }
5312c653d0eSAndrea Arcangeli
stable_node_chain_add_dup(struct ksm_stable_node * dup,struct ksm_stable_node * chain)53221fbd591SQi Zheng static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup,
53321fbd591SQi Zheng struct ksm_stable_node *chain)
5342c653d0eSAndrea Arcangeli {
5352c653d0eSAndrea Arcangeli VM_BUG_ON(is_stable_node_dup(dup));
5362c653d0eSAndrea Arcangeli dup->head = STABLE_NODE_DUP_HEAD;
5372c653d0eSAndrea Arcangeli VM_BUG_ON(!is_stable_node_chain(chain));
5382c653d0eSAndrea Arcangeli hlist_add_head(&dup->hlist_dup, &chain->hlist);
5392c653d0eSAndrea Arcangeli ksm_stable_node_dups++;
5402c653d0eSAndrea Arcangeli }
5412c653d0eSAndrea Arcangeli
__stable_node_dup_del(struct ksm_stable_node * dup)54221fbd591SQi Zheng static inline void __stable_node_dup_del(struct ksm_stable_node *dup)
5432c653d0eSAndrea Arcangeli {
544b4fecc67SAndrea Arcangeli VM_BUG_ON(!is_stable_node_dup(dup));
5452c653d0eSAndrea Arcangeli hlist_del(&dup->hlist_dup);
5462c653d0eSAndrea Arcangeli ksm_stable_node_dups--;
5472c653d0eSAndrea Arcangeli }
5482c653d0eSAndrea Arcangeli
stable_node_dup_del(struct ksm_stable_node * dup)54921fbd591SQi Zheng static inline void stable_node_dup_del(struct ksm_stable_node *dup)
5502c653d0eSAndrea Arcangeli {
5512c653d0eSAndrea Arcangeli VM_BUG_ON(is_stable_node_chain(dup));
5522c653d0eSAndrea Arcangeli if (is_stable_node_dup(dup))
5532c653d0eSAndrea Arcangeli __stable_node_dup_del(dup);
5542c653d0eSAndrea Arcangeli else
5552c653d0eSAndrea Arcangeli rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
5562c653d0eSAndrea Arcangeli #ifdef CONFIG_DEBUG_VM
5572c653d0eSAndrea Arcangeli dup->head = NULL;
5582c653d0eSAndrea Arcangeli #endif
5592c653d0eSAndrea Arcangeli }
5602c653d0eSAndrea Arcangeli
alloc_rmap_item(void)56121fbd591SQi Zheng static inline struct ksm_rmap_item *alloc_rmap_item(void)
56231dbd01fSIzik Eidus {
56321fbd591SQi Zheng struct ksm_rmap_item *rmap_item;
564473b0ce4SHugh Dickins
5655b398e41Szhong jiang rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
5665b398e41Szhong jiang __GFP_NORETRY | __GFP_NOWARN);
567473b0ce4SHugh Dickins if (rmap_item)
568473b0ce4SHugh Dickins ksm_rmap_items++;
569473b0ce4SHugh Dickins return rmap_item;
57031dbd01fSIzik Eidus }
57131dbd01fSIzik Eidus
free_rmap_item(struct ksm_rmap_item * rmap_item)57221fbd591SQi Zheng static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
57331dbd01fSIzik Eidus {
574473b0ce4SHugh Dickins ksm_rmap_items--;
575cb4df4caSxu xin rmap_item->mm->ksm_rmap_items--;
57631dbd01fSIzik Eidus rmap_item->mm = NULL; /* debug safety */
57731dbd01fSIzik Eidus kmem_cache_free(rmap_item_cache, rmap_item);
57831dbd01fSIzik Eidus }
57931dbd01fSIzik Eidus
alloc_stable_node(void)58021fbd591SQi Zheng static inline struct ksm_stable_node *alloc_stable_node(void)
5817b6ba2c7SHugh Dickins {
5826213055fSzhong jiang /*
5836213055fSzhong jiang * The allocation can take too long with GFP_KERNEL when memory is under
5846213055fSzhong jiang * pressure, which may lead to hung task warnings. Adding __GFP_HIGH
5856213055fSzhong jiang * grants access to memory reserves, helping to avoid this problem.
5866213055fSzhong jiang */
5876213055fSzhong jiang return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
5887b6ba2c7SHugh Dickins }
5897b6ba2c7SHugh Dickins
free_stable_node(struct ksm_stable_node * stable_node)59021fbd591SQi Zheng static inline void free_stable_node(struct ksm_stable_node *stable_node)
5917b6ba2c7SHugh Dickins {
5922c653d0eSAndrea Arcangeli VM_BUG_ON(stable_node->rmap_hlist_len &&
5932c653d0eSAndrea Arcangeli !is_stable_node_chain(stable_node));
5947b6ba2c7SHugh Dickins kmem_cache_free(stable_node_cache, stable_node);
5957b6ba2c7SHugh Dickins }
5967b6ba2c7SHugh Dickins
59731dbd01fSIzik Eidus /*
598a913e182SHugh Dickins * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
599a913e182SHugh Dickins * page tables after it has passed through ksm_exit() - which, if necessary,
600c1e8d7c6SMichel Lespinasse * takes mmap_lock briefly to serialize against them. ksm_exit() does not set
601a913e182SHugh Dickins * a special flag: they can just back out as soon as mm_users goes to zero.
602a913e182SHugh Dickins * ksm_test_exit() is used throughout to make this test for exit: in some
603a913e182SHugh Dickins * places for correctness, in some places just to avoid unnecessary work.
604a913e182SHugh Dickins */
ksm_test_exit(struct mm_struct * mm)605a913e182SHugh Dickins static inline bool ksm_test_exit(struct mm_struct *mm)
606a913e182SHugh Dickins {
607a913e182SHugh Dickins return atomic_read(&mm->mm_users) == 0;
608a913e182SHugh Dickins }
609a913e182SHugh Dickins
/*
 * pmd_entry callback shared by break_ksm_ops and break_ksm_lock_vma_ops.
 *
 * Scans the ptes covering [@addr, @end) under the pte lock, looking for
 * the first one that maps a KSM folio (directly, or through a migration
 * entry) or a KSM-placed zero page.  On a hit, the address is stored
 * through walk->private, which must point to an unsigned long.
 *
 * Return: 1 when such a pte was found, 0 when none was found (also when
 * the mm is exiting or the page table could not be mapped), -ERESTARTSYS
 * when a signal is pending for the current task.
 */
static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
	unsigned long *result = walk->private;
	pte_t *first_pte, *cur;
	spinlock_t *lock;
	int hit = 0;

	if (ksm_test_exit(walk->mm))
		return 0;
	if (signal_pending(current))
		return -ERESTARTSYS;

	first_pte = pte_offset_map_lock(walk->mm, pmdp, addr, &lock);
	if (!first_pte)
		return 0;

	cur = first_pte;
	while (addr < end) {
		const pte_t pte = ptep_get(cur);
		const bool present = pte_present(pte);
		struct folio *folio = NULL;

		if (present) {
			folio = vm_normal_folio(walk->vma, addr, pte);
		} else if (!pte_none(pte)) {
			const softleaf_t entry = softleaf_from_pte(pte);

			/*
			 * A KSM page stays a KSM page until it is freed, so
			 * there is no need to wait for migration to finish.
			 */
			if (softleaf_is_migration(entry))
				folio = softleaf_to_folio(entry);
		}

		/* A hit is a normal ksm page or a KSM-placed zero page. */
		if (folio && folio_test_ksm(folio))
			hit = 1;
		else if (present && is_ksm_zero_pte(pte))
			hit = 1;

		if (hit) {
			*result = addr;
			break;
		}

		cur++;
		addr += PAGE_SIZE;
	}

	/*
	 * Unmap with the pointer pte_offset_map_lock() handed out, not with
	 * the cursor that has been advanced by the loop.
	 */
	pte_unmap_unlock(first_pte, lock);
	return hit;
}
656912aa825SPedro Demarchi Gomes
657912aa825SPedro Demarchi Gomes static const struct mm_walk_ops break_ksm_ops = {
658912aa825SPedro Demarchi Gomes .pmd_entry = break_ksm_pmd_entry,
659912aa825SPedro Demarchi Gomes .walk_lock = PGWALK_RDLOCK,
660912aa825SPedro Demarchi Gomes };
661912aa825SPedro Demarchi Gomes
662912aa825SPedro Demarchi Gomes static const struct mm_walk_ops break_ksm_lock_vma_ops = {
663912aa825SPedro Demarchi Gomes .pmd_entry = break_ksm_pmd_entry,
664912aa825SPedro Demarchi Gomes .walk_lock = PGWALK_WRLOCK,
665912aa825SPedro Demarchi Gomes };
666912aa825SPedro Demarchi Gomes
667a913e182SHugh Dickins /*
66805c3fa9cSPedro Demarchi Gomes * Though it's very tempting to unmerge rmap_items from stable tree rather
66905c3fa9cSPedro Demarchi Gomes * than check every pte of a given vma, the locking doesn't quite work for
67005c3fa9cSPedro Demarchi Gomes * that - an rmap_item is assigned to the stable tree after inserting ksm
67105c3fa9cSPedro Demarchi Gomes * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
67205c3fa9cSPedro Demarchi Gomes * rmap_items from parent to child at fork time (so as not to waste time
67305c3fa9cSPedro Demarchi Gomes * if exit comes before the next scan reaches it).
67405c3fa9cSPedro Demarchi Gomes *
67505c3fa9cSPedro Demarchi Gomes * Similarly, although we'd like to remove rmap_items (so updating counts
67605c3fa9cSPedro Demarchi Gomes * and freeing memory) when unmerging an area, it's easier to leave that
67705c3fa9cSPedro Demarchi Gomes * to the next pass of ksmd - consider, for example, how ksmd might be
67805c3fa9cSPedro Demarchi Gomes * in cmp_and_merge_page on one of the rmap_items we would be removing.
67905c3fa9cSPedro Demarchi Gomes *
6806cce3314SDavid Hildenbrand * We use break_ksm to break COW on a ksm page by triggering unsharing,
6816cce3314SDavid Hildenbrand * such that the ksm page will get replaced by an exclusive anonymous page.
68231dbd01fSIzik Eidus *
6836cce3314SDavid Hildenbrand * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
68431dbd01fSIzik Eidus * in case the application has unmapped and remapped mm,addr meanwhile.
68531dbd01fSIzik Eidus * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
686bbcd53c9SDavid Hildenbrand * mmap of /dev/mem, where we would not want to touch it.
6871b2ee126SDave Hansen *
6886cce3314SDavid Hildenbrand * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
6891b2ee126SDave Hansen * of the process that owns 'vma'. We also do not want to enforce
6901b2ee126SDave Hansen * protection keys here anyway.
69131dbd01fSIzik Eidus */
/*
 * break_ksm - unshare every KSM page mapped in [@addr, @end) of @vma
 * @vma:      the vma to operate on
 * @addr:     start of the range
 * @end:      end of the range (exclusive)
 * @lock_vma: walk with break_ksm_lock_vma_ops (PGWALK_WRLOCK) when true,
 *            with break_ksm_ops (PGWALK_RDLOCK) when false
 *
 * Return: 0 when no KSM page is left to break or when the fault failed
 * with SIGBUS/SIGSEGV, -ENOMEM when the fault failed with VM_FAULT_OOM,
 * or a negative value propagated from the page walk (the walk callback
 * returns -ERESTARTSYS when a signal is pending).
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
		     unsigned long end, bool lock_vma)
{
	const vm_fault_t fatal = VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV |
				 VM_FAULT_OOM;
	const struct mm_walk_ops *ops;
	vm_fault_t ret;

	if (lock_vma)
		ops = &break_ksm_lock_vma_ops;
	else
		ops = &break_ksm_ops;

	for (;;) {
		int ksm_page;

		cond_resched();

		/*
		 * The walk callback writes the address of the KSM page it
		 * found back into addr, so the fault below targets that page
		 * and the next walk resumes from there.
		 */
		ksm_page = walk_page_range_vma(vma, addr, end, ops, &addr);
		if (ksm_page <= 0)
			return ksm_page;

		ret = handle_mm_fault(vma, addr,
				      FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
				      NULL);
		if (ret & fatal)
			break;
	}

	/*
	 * The loop has to continue until no KSM page is found any more,
	 * because handle_mm_fault() may back out if there's any difficulty,
	 * e.g. if the pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the KSM page for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but ksm_do_scan
	 * will retry to break_cow on each pass, so should recover the page
	 * in due course.  The important thing is to not let VM_MERGEABLE
	 * be cleared while any such pages might remain in the area.
	 */
	if (ret & VM_FAULT_OOM)
		return -ENOMEM;
	return 0;
}
73731dbd01fSIzik Eidus
ksm_compatible(const struct file * file,vma_flags_t vma_flags)738*3a6455d5SLorenzo Stoakes (Oracle) static bool ksm_compatible(const struct file *file, vma_flags_t vma_flags)
739d7597f59SStefan Roesch {
740*3a6455d5SLorenzo Stoakes (Oracle) /* Just ignore the advice. */
741*3a6455d5SLorenzo Stoakes (Oracle) if (vma_flags_test_any(&vma_flags, VMA_SHARED_BIT, VMA_MAYSHARE_BIT,
742*3a6455d5SLorenzo Stoakes (Oracle) VMA_HUGETLB_BIT))
743*3a6455d5SLorenzo Stoakes (Oracle) return false;
744*3a6455d5SLorenzo Stoakes (Oracle) if (vma_flags_test_single_mask(&vma_flags, VMA_DROPPABLE))
745*3a6455d5SLorenzo Stoakes (Oracle) return false;
746*3a6455d5SLorenzo Stoakes (Oracle) if (vma_flags_test_any_mask(&vma_flags, VMA_SPECIAL_FLAGS))
747*3a6455d5SLorenzo Stoakes (Oracle) return false;
748de195c67SLorenzo Stoakes if (file_is_dax(file))
749d7597f59SStefan Roesch return false;
750d7597f59SStefan Roesch #ifdef VM_SAO
751*3a6455d5SLorenzo Stoakes (Oracle) if (vma_flags_test(&vma_flags, VMA_SAO_BIT))
752d7597f59SStefan Roesch return false;
753d7597f59SStefan Roesch #endif
754d7597f59SStefan Roesch #ifdef VM_SPARC_ADI
755*3a6455d5SLorenzo Stoakes (Oracle) if (vma_flags_test(&vma_flags, VMA_SPARC_ADI_BIT))
756d7597f59SStefan Roesch return false;
757d7597f59SStefan Roesch #endif
758d7597f59SStefan Roesch
759d7597f59SStefan Roesch return true;
760d7597f59SStefan Roesch }
761d7597f59SStefan Roesch
vma_ksm_compatible(struct vm_area_struct * vma)762de195c67SLorenzo Stoakes static bool vma_ksm_compatible(struct vm_area_struct *vma)
763de195c67SLorenzo Stoakes {
764*3a6455d5SLorenzo Stoakes (Oracle) return ksm_compatible(vma->vm_file, vma->flags);
765de195c67SLorenzo Stoakes }
766de195c67SLorenzo Stoakes
find_mergeable_vma(struct mm_struct * mm,unsigned long addr)767ef694222SBob Liu static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
768ef694222SBob Liu unsigned long addr)
769ef694222SBob Liu {
770ef694222SBob Liu struct vm_area_struct *vma;
771ef694222SBob Liu if (ksm_test_exit(mm))
772ef694222SBob Liu return NULL;
773ff69fb81SLiam Howlett vma = vma_lookup(mm, addr);
774ff69fb81SLiam Howlett if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
775ef694222SBob Liu return NULL;
776ef694222SBob Liu return vma;
777ef694222SBob Liu }
778ef694222SBob Liu
break_cow(struct ksm_rmap_item * rmap_item)77921fbd591SQi Zheng static void break_cow(struct ksm_rmap_item *rmap_item)
78031dbd01fSIzik Eidus {
7818dd3557aSHugh Dickins struct mm_struct *mm = rmap_item->mm;
7828dd3557aSHugh Dickins unsigned long addr = rmap_item->address;
78331dbd01fSIzik Eidus struct vm_area_struct *vma;
78431dbd01fSIzik Eidus
7854035c07aSHugh Dickins /*
7864035c07aSHugh Dickins * It is not an accident that whenever we want to break COW
7874035c07aSHugh Dickins * to undo, we also need to drop a reference to the anon_vma.
7884035c07aSHugh Dickins */
7899e60109fSPeter Zijlstra put_anon_vma(rmap_item->anon_vma);
7904035c07aSHugh Dickins
791d8ed45c5SMichel Lespinasse mmap_read_lock(mm);
792ef694222SBob Liu vma = find_mergeable_vma(mm, addr);
793ef694222SBob Liu if (vma)
7945d4939fcSPedro Demarchi Gomes break_ksm(vma, addr, addr + PAGE_SIZE, false);
795d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
79631dbd01fSIzik Eidus }
79731dbd01fSIzik Eidus
get_mergeable_page(struct ksm_rmap_item * rmap_item)79821fbd591SQi Zheng static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
79931dbd01fSIzik Eidus {
80031dbd01fSIzik Eidus struct mm_struct *mm = rmap_item->mm;
80131dbd01fSIzik Eidus unsigned long addr = rmap_item->address;
80231dbd01fSIzik Eidus struct vm_area_struct *vma;
803184e916cSDavid Hildenbrand struct page *page = NULL;
804184e916cSDavid Hildenbrand struct folio_walk fw;
805184e916cSDavid Hildenbrand struct folio *folio;
80631dbd01fSIzik Eidus
807d8ed45c5SMichel Lespinasse mmap_read_lock(mm);
808ef694222SBob Liu vma = find_mergeable_vma(mm, addr);
809ef694222SBob Liu if (!vma)
81031dbd01fSIzik Eidus goto out;
81131dbd01fSIzik Eidus
812184e916cSDavid Hildenbrand folio = folio_walk_start(&fw, vma, addr, 0);
813184e916cSDavid Hildenbrand if (folio) {
814184e916cSDavid Hildenbrand if (!folio_is_zone_device(folio) &&
815184e916cSDavid Hildenbrand folio_test_anon(folio)) {
816184e916cSDavid Hildenbrand folio_get(folio);
817184e916cSDavid Hildenbrand page = fw.page;
818184e916cSDavid Hildenbrand }
819184e916cSDavid Hildenbrand folio_walk_end(&fw, vma);
820184e916cSDavid Hildenbrand }
821184e916cSDavid Hildenbrand out:
822184e916cSDavid Hildenbrand if (page) {
82331dbd01fSIzik Eidus flush_anon_page(vma, page, addr);
82431dbd01fSIzik Eidus flush_dcache_page(page);
82531dbd01fSIzik Eidus }
826d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
82731dbd01fSIzik Eidus return page;
82831dbd01fSIzik Eidus }
82931dbd01fSIzik Eidus
83090bd6fd3SPetr Holasek /*
83190bd6fd3SPetr Holasek * This helper is used for getting right index into array of tree roots.
83290bd6fd3SPetr Holasek * When merge_across_nodes knob is set to 1, there are only two rb-trees for
83390bd6fd3SPetr Holasek * stable and unstable pages from all nodes with roots in index 0. Otherwise,
83490bd6fd3SPetr Holasek * every node has its own stable and unstable tree.
83590bd6fd3SPetr Holasek */
get_kpfn_nid(unsigned long kpfn)83690bd6fd3SPetr Holasek static inline int get_kpfn_nid(unsigned long kpfn)
83790bd6fd3SPetr Holasek {
838d8fc16a8SHugh Dickins return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
83990bd6fd3SPetr Holasek }
84090bd6fd3SPetr Holasek
alloc_stable_node_chain(struct ksm_stable_node * dup,struct rb_root * root)84121fbd591SQi Zheng static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup,
8422c653d0eSAndrea Arcangeli struct rb_root *root)
8432c653d0eSAndrea Arcangeli {
84421fbd591SQi Zheng struct ksm_stable_node *chain = alloc_stable_node();
8452c653d0eSAndrea Arcangeli VM_BUG_ON(is_stable_node_chain(dup));
8462c653d0eSAndrea Arcangeli if (likely(chain)) {
8472c653d0eSAndrea Arcangeli INIT_HLIST_HEAD(&chain->hlist);
8482c653d0eSAndrea Arcangeli chain->chain_prune_time = jiffies;
8492c653d0eSAndrea Arcangeli chain->rmap_hlist_len = STABLE_NODE_CHAIN;
8502c653d0eSAndrea Arcangeli #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
85198fa15f3SAnshuman Khandual chain->nid = NUMA_NO_NODE; /* debug */
8522c653d0eSAndrea Arcangeli #endif
8532c653d0eSAndrea Arcangeli ksm_stable_node_chains++;
8542c653d0eSAndrea Arcangeli
8552c653d0eSAndrea Arcangeli /*
8562c653d0eSAndrea Arcangeli * Put the stable node chain in the first dimension of
8572c653d0eSAndrea Arcangeli * the stable tree and at the same time remove the old
8582c653d0eSAndrea Arcangeli * stable node.
8592c653d0eSAndrea Arcangeli */
8602c653d0eSAndrea Arcangeli rb_replace_node(&dup->node, &chain->node, root);
8612c653d0eSAndrea Arcangeli
8622c653d0eSAndrea Arcangeli /*
8632c653d0eSAndrea Arcangeli * Move the old stable node to the second dimension
8642c653d0eSAndrea Arcangeli * queued in the hlist_dup. The invariant is that all
8652c653d0eSAndrea Arcangeli * dup stable_nodes in the chain->hlist point to pages
866457aef94SEthon Paul * that are write protected and have the exact same
8672c653d0eSAndrea Arcangeli * content.
8682c653d0eSAndrea Arcangeli */
8692c653d0eSAndrea Arcangeli stable_node_chain_add_dup(dup, chain);
8702c653d0eSAndrea Arcangeli }
8712c653d0eSAndrea Arcangeli return chain;
8722c653d0eSAndrea Arcangeli }
8732c653d0eSAndrea Arcangeli
free_stable_node_chain(struct ksm_stable_node * chain,struct rb_root * root)87421fbd591SQi Zheng static inline void free_stable_node_chain(struct ksm_stable_node *chain,
8752c653d0eSAndrea Arcangeli struct rb_root *root)
8762c653d0eSAndrea Arcangeli {
8772c653d0eSAndrea Arcangeli rb_erase(&chain->node, root);
8782c653d0eSAndrea Arcangeli free_stable_node(chain);
8792c653d0eSAndrea Arcangeli ksm_stable_node_chains--;
8802c653d0eSAndrea Arcangeli }
8812c653d0eSAndrea Arcangeli
remove_node_from_stable_tree(struct ksm_stable_node * stable_node)88221fbd591SQi Zheng static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
8834035c07aSHugh Dickins {
88421fbd591SQi Zheng struct ksm_rmap_item *rmap_item;
8854035c07aSHugh Dickins
8862c653d0eSAndrea Arcangeli /* check it's not STABLE_NODE_CHAIN or negative */
8872c653d0eSAndrea Arcangeli BUG_ON(stable_node->rmap_hlist_len < 0);
8882c653d0eSAndrea Arcangeli
889b67bfe0dSSasha Levin hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
890739100c8SStefan Roesch if (rmap_item->hlist.next) {
8914035c07aSHugh Dickins ksm_pages_sharing--;
892739100c8SStefan Roesch trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm);
893739100c8SStefan Roesch } else {
8944035c07aSHugh Dickins ksm_pages_shared--;
895739100c8SStefan Roesch }
89676093853Sxu xin
89776093853Sxu xin rmap_item->mm->ksm_merging_pages--;
89876093853Sxu xin
8992c653d0eSAndrea Arcangeli VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
9002c653d0eSAndrea Arcangeli stable_node->rmap_hlist_len--;
9019e60109fSPeter Zijlstra put_anon_vma(rmap_item->anon_vma);
9024035c07aSHugh Dickins rmap_item->address &= PAGE_MASK;
9034035c07aSHugh Dickins cond_resched();
9044035c07aSHugh Dickins }
9054035c07aSHugh Dickins
9062c653d0eSAndrea Arcangeli /*
9072c653d0eSAndrea Arcangeli * We need the second aligned pointer of the migrate_nodes
9082c653d0eSAndrea Arcangeli * list_head to stay clear from the rb_parent_color union
9092c653d0eSAndrea Arcangeli * (aligned and different than any node) and also different
9102c653d0eSAndrea Arcangeli * from &migrate_nodes. This will verify that future list.h changes
911815f0ddbSNick Desaulniers * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
9122c653d0eSAndrea Arcangeli */
9132c653d0eSAndrea Arcangeli BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
9142c653d0eSAndrea Arcangeli BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
9152c653d0eSAndrea Arcangeli
916739100c8SStefan Roesch trace_ksm_remove_ksm_page(stable_node->kpfn);
9174146d2d6SHugh Dickins if (stable_node->head == &migrate_nodes)
9184146d2d6SHugh Dickins list_del(&stable_node->list);
9194146d2d6SHugh Dickins else
9202c653d0eSAndrea Arcangeli stable_node_dup_del(stable_node);
9214035c07aSHugh Dickins free_stable_node(stable_node);
9224035c07aSHugh Dickins }
9234035c07aSHugh Dickins
/* How ksm_get_folio() should treat the folio lock before returning. */
enum ksm_get_folio_flags {
	/* Return the folio without touching its lock. */
	KSM_GET_FOLIO_NOLOCK,
	/* Take the folio lock with folio_lock(). */
	KSM_GET_FOLIO_LOCK,
	/* Use folio_trylock(); ksm_get_folio() returns ERR_PTR(-EBUSY) if it fails. */
	KSM_GET_FOLIO_TRYLOCK,
};
9292cee57d1SYang Shi
9304035c07aSHugh Dickins /*
931b91f9472SAlex Shi (tencent) * ksm_get_folio: checks if the page indicated by the stable node
9324035c07aSHugh Dickins * is still its ksm page, despite having held no reference to it.
9334035c07aSHugh Dickins * In which case we can trust the content of the page, and it
9344035c07aSHugh Dickins * returns the gotten page; but if the page has now been zapped,
9354035c07aSHugh Dickins * remove the stale node from the stable tree and return NULL.
936c8d6553bSHugh Dickins * But beware, the stable node's page might be being migrated.
9374035c07aSHugh Dickins *
9384035c07aSHugh Dickins * You would expect the stable_node to hold a reference to the ksm page.
9394035c07aSHugh Dickins * But if it increments the page's count, swapping out has to wait for
9404035c07aSHugh Dickins * ksmd to come around again before it can free the page, which may take
9414035c07aSHugh Dickins * seconds or even minutes: much too unresponsive. So instead we use a
9424035c07aSHugh Dickins * "keyhole reference": access to the ksm page from the stable node peeps
9434035c07aSHugh Dickins * out through its keyhole to see if that page still holds the right key,
9444035c07aSHugh Dickins * pointing back to this stable node. This relies on freeing a PageAnon
9454035c07aSHugh Dickins * page to reset its page->mapping to NULL, and relies on no other use of
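 * a page to put something that might look like our key in page->mapping.
 *
 * Note: folio->mapping may still hold our key for a while after the refcount
 * has been frozen in __remove_mapping(): this shouldn't be a problem, the page
 * is on its way to being freed; but it is an anomaly to bear in mind.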
9484035c07aSHugh Dickins */
ksm_get_folio(struct ksm_stable_node * stable_node,enum ksm_get_folio_flags flags)949b91f9472SAlex Shi (tencent) static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node,
95085b67b01SDavid Hildenbrand enum ksm_get_folio_flags flags)
9514035c07aSHugh Dickins {
952b91f9472SAlex Shi (tencent) struct folio *folio;
9534035c07aSHugh Dickins void *expected_mapping;
954c8d6553bSHugh Dickins unsigned long kpfn;
9554035c07aSHugh Dickins
956bda807d4SMinchan Kim expected_mapping = (void *)((unsigned long)stable_node |
957df25569dSDavid Hildenbrand FOLIO_MAPPING_KSM);
958c8d6553bSHugh Dickins again:
95908df4774SPaul E. McKenney kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
960b91f9472SAlex Shi (tencent) folio = pfn_folio(kpfn);
961b91f9472SAlex Shi (tencent) if (READ_ONCE(folio->mapping) != expected_mapping)
9624035c07aSHugh Dickins goto stale;
963c8d6553bSHugh Dickins
964c8d6553bSHugh Dickins /*
965c8d6553bSHugh Dickins * We cannot do anything with the page while its refcount is 0.
966c8d6553bSHugh Dickins * Usually 0 means free, or tail of a higher-order page: in which
967c8d6553bSHugh Dickins * case this node is no longer referenced, and should be freed;
9681c4c3b99SJiang Biao * however, it might mean that the page is under page_ref_freeze().
969c8d6553bSHugh Dickins * The __remove_mapping() case is easy, again the node is now stale;
97052d1e606SKirill Tkhai * the same is in reuse_ksm_page() case; but if page is swapcache
9719800562fSMatthew Wilcox (Oracle) * in folio_migrate_mapping(), it might still be our page,
97252d1e606SKirill Tkhai * in which case it's essential to keep the node.
973c8d6553bSHugh Dickins */
974b91f9472SAlex Shi (tencent) while (!folio_try_get(folio)) {
975c8d6553bSHugh Dickins /*
97632f51eadSMatthew Wilcox (Oracle) * Another check for folio->mapping != expected_mapping
97732f51eadSMatthew Wilcox (Oracle) * would work here too. We have chosen to test the
97832f51eadSMatthew Wilcox (Oracle) * swapcache flag to optimize the common case, when the
97932f51eadSMatthew Wilcox (Oracle) * folio is or is about to be freed: the swapcache flag
98032f51eadSMatthew Wilcox (Oracle) * is cleared (under spin_lock_irq) in the ref_freeze
98132f51eadSMatthew Wilcox (Oracle) * section of __remove_mapping(); but anon folio->mapping
98232f51eadSMatthew Wilcox (Oracle) * is reset to NULL later, in free_pages_prepare().
983c8d6553bSHugh Dickins */
984b91f9472SAlex Shi (tencent) if (!folio_test_swapcache(folio))
9854035c07aSHugh Dickins goto stale;
986c8d6553bSHugh Dickins cpu_relax();
987c8d6553bSHugh Dickins }
988c8d6553bSHugh Dickins
989b91f9472SAlex Shi (tencent) if (READ_ONCE(folio->mapping) != expected_mapping) {
990b91f9472SAlex Shi (tencent) folio_put(folio);
9914035c07aSHugh Dickins goto stale;
9924035c07aSHugh Dickins }
993c8d6553bSHugh Dickins
99485b67b01SDavid Hildenbrand if (flags == KSM_GET_FOLIO_TRYLOCK) {
995b91f9472SAlex Shi (tencent) if (!folio_trylock(folio)) {
996b91f9472SAlex Shi (tencent) folio_put(folio);
9972cee57d1SYang Shi return ERR_PTR(-EBUSY);
9982cee57d1SYang Shi }
99985b67b01SDavid Hildenbrand } else if (flags == KSM_GET_FOLIO_LOCK)
1000b91f9472SAlex Shi (tencent) folio_lock(folio);
10012cee57d1SYang Shi
100285b67b01SDavid Hildenbrand if (flags != KSM_GET_FOLIO_NOLOCK) {
1003b91f9472SAlex Shi (tencent) if (READ_ONCE(folio->mapping) != expected_mapping) {
1004b91f9472SAlex Shi (tencent) folio_unlock(folio);
1005b91f9472SAlex Shi (tencent) folio_put(folio);
10068aafa6a4SHugh Dickins goto stale;
10078aafa6a4SHugh Dickins }
10088aafa6a4SHugh Dickins }
1009b91f9472SAlex Shi (tencent) return folio;
1010c8d6553bSHugh Dickins
10114035c07aSHugh Dickins stale:
1012c8d6553bSHugh Dickins /*
101332f51eadSMatthew Wilcox (Oracle) * We come here from above when folio->mapping or the swapcache flag
1014c8d6553bSHugh Dickins * suggests that the node is stale; but it might be under migration.
101519138349SMatthew Wilcox (Oracle) * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
1016c8d6553bSHugh Dickins * before checking whether node->kpfn has been changed.
1017c8d6553bSHugh Dickins */
1018c8d6553bSHugh Dickins smp_rmb();
10194db0c3c2SJason Low if (READ_ONCE(stable_node->kpfn) != kpfn)
1020c8d6553bSHugh Dickins goto again;
10214035c07aSHugh Dickins remove_node_from_stable_tree(stable_node);
10224035c07aSHugh Dickins return NULL;
10234035c07aSHugh Dickins }
10244035c07aSHugh Dickins
102531dbd01fSIzik Eidus /*
102631dbd01fSIzik Eidus * Removing rmap_item from stable or unstable tree.
102731dbd01fSIzik Eidus * This function will clean the information from the stable/unstable tree.
102831dbd01fSIzik Eidus */
remove_rmap_item_from_tree(struct ksm_rmap_item * rmap_item)102921fbd591SQi Zheng static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
103031dbd01fSIzik Eidus {
10317b6ba2c7SHugh Dickins if (rmap_item->address & STABLE_FLAG) {
103221fbd591SQi Zheng struct ksm_stable_node *stable_node;
1033f39b6e2dSAlex Shi (tencent) struct folio *folio;
103431dbd01fSIzik Eidus
10357b6ba2c7SHugh Dickins stable_node = rmap_item->head;
103685b67b01SDavid Hildenbrand folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK);
1037f39b6e2dSAlex Shi (tencent) if (!folio)
10384035c07aSHugh Dickins goto out;
10395ad64688SHugh Dickins
10407b6ba2c7SHugh Dickins hlist_del(&rmap_item->hlist);
1041f39b6e2dSAlex Shi (tencent) folio_unlock(folio);
1042f39b6e2dSAlex Shi (tencent) folio_put(folio);
104308beca44SHugh Dickins
104498666f8aSAndrea Arcangeli if (!hlist_empty(&stable_node->hlist))
10454035c07aSHugh Dickins ksm_pages_sharing--;
10464035c07aSHugh Dickins else
1047b4028260SHugh Dickins ksm_pages_shared--;
104876093853Sxu xin
104976093853Sxu xin rmap_item->mm->ksm_merging_pages--;
105076093853Sxu xin
10512c653d0eSAndrea Arcangeli VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
10522c653d0eSAndrea Arcangeli stable_node->rmap_hlist_len--;
105331dbd01fSIzik Eidus
10549e60109fSPeter Zijlstra put_anon_vma(rmap_item->anon_vma);
1055c89a384eSMiaohe Lin rmap_item->head = NULL;
105693d17715SHugh Dickins rmap_item->address &= PAGE_MASK;
105731dbd01fSIzik Eidus
10587b6ba2c7SHugh Dickins } else if (rmap_item->address & UNSTABLE_FLAG) {
105931dbd01fSIzik Eidus unsigned char age;
106031dbd01fSIzik Eidus /*
10619ba69294SHugh Dickins * Usually ksmd can and must skip the rb_erase, because
106231dbd01fSIzik Eidus * root_unstable_tree was already reset to RB_ROOT.
10639ba69294SHugh Dickins * But be careful when an mm is exiting: do the rb_erase
10649ba69294SHugh Dickins * if this rmap_item was inserted by this scan, rather
10659ba69294SHugh Dickins * than left over from before.
106631dbd01fSIzik Eidus */
106731dbd01fSIzik Eidus age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
1068cd551f97SHugh Dickins BUG_ON(age > 1);
106931dbd01fSIzik Eidus if (!age)
107090bd6fd3SPetr Holasek rb_erase(&rmap_item->node,
1071ef53d16cSHugh Dickins root_unstable_tree + NUMA(rmap_item->nid));
107293d17715SHugh Dickins ksm_pages_unshared--;
107331dbd01fSIzik Eidus rmap_item->address &= PAGE_MASK;
107493d17715SHugh Dickins }
10754035c07aSHugh Dickins out:
107631dbd01fSIzik Eidus cond_resched(); /* we're called from many long loops */
107731dbd01fSIzik Eidus }
107831dbd01fSIzik Eidus
remove_trailing_rmap_items(struct ksm_rmap_item ** rmap_list)107921fbd591SQi Zheng static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
108031dbd01fSIzik Eidus {
10816514d511SHugh Dickins while (*rmap_list) {
108221fbd591SQi Zheng struct ksm_rmap_item *rmap_item = *rmap_list;
10836514d511SHugh Dickins *rmap_list = rmap_item->rmap_list;
108431dbd01fSIzik Eidus remove_rmap_item_from_tree(rmap_item);
108531dbd01fSIzik Eidus free_rmap_item(rmap_item);
108631dbd01fSIzik Eidus }
108731dbd01fSIzik Eidus }
108831dbd01fSIzik Eidus
108968158bfaSMatthew Wilcox (Oracle) static inline
folio_stable_node(const struct folio * folio)109068158bfaSMatthew Wilcox (Oracle) struct ksm_stable_node *folio_stable_node(const struct folio *folio)
109119138349SMatthew Wilcox (Oracle) {
109219138349SMatthew Wilcox (Oracle) return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
109319138349SMatthew Wilcox (Oracle) }
109419138349SMatthew Wilcox (Oracle)
folio_set_stable_node(struct folio * folio,struct ksm_stable_node * stable_node)1095b8b0ff24SAlex Shi (tencent) static inline void folio_set_stable_node(struct folio *folio,
109621fbd591SQi Zheng struct ksm_stable_node *stable_node)
109788484826SMike Rapoport {
1098452e862fSAlex Shi (tencent) VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio);
1099df25569dSDavid Hildenbrand folio->mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM);
110088484826SMike Rapoport }
110188484826SMike Rapoport
11022ffd8679SHugh Dickins #ifdef CONFIG_SYSFS
11032ffd8679SHugh Dickins /*
11042ffd8679SHugh Dickins * Only called through the sysfs control interface:
11052ffd8679SHugh Dickins */
remove_stable_node(struct ksm_stable_node * stable_node)110621fbd591SQi Zheng static int remove_stable_node(struct ksm_stable_node *stable_node)
1107cbf86cfeSHugh Dickins {
11089d5cc140SAlex Shi (tencent) struct folio *folio;
1109cbf86cfeSHugh Dickins int err;
1110cbf86cfeSHugh Dickins
111185b67b01SDavid Hildenbrand folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK);
11129d5cc140SAlex Shi (tencent) if (!folio) {
1113cbf86cfeSHugh Dickins /*
11149d5cc140SAlex Shi (tencent) * ksm_get_folio did remove_node_from_stable_tree itself.
1115cbf86cfeSHugh Dickins */
1116cbf86cfeSHugh Dickins return 0;
1117cbf86cfeSHugh Dickins }
1118cbf86cfeSHugh Dickins
1119cbf86cfeSHugh Dickins /*
11209a63236fSAndrey Ryabinin * Page could be still mapped if this races with __mmput() running in
11219a63236fSAndrey Ryabinin * between ksm_exit() and exit_mmap(). Just refuse to let
11229a63236fSAndrey Ryabinin * merge_across_nodes/max_page_sharing be switched.
11238fdb3dbfSHugh Dickins */
11248fdb3dbfSHugh Dickins err = -EBUSY;
11259d5cc140SAlex Shi (tencent) if (!folio_mapped(folio)) {
11268fdb3dbfSHugh Dickins /*
11279d5cc140SAlex Shi (tencent) * The stable node did not yet appear stale to ksm_get_folio(),
11289d5cc140SAlex Shi (tencent) * since that allows for an unmapped ksm folio to be recognized
11298fdb3dbfSHugh Dickins * right up until it is freed; but the node is safe to remove.
11309d5cc140SAlex Shi (tencent) * This folio might be in an LRU cache waiting to be freed,
11319d5cc140SAlex Shi (tencent) * or it might be in the swapcache (perhaps under writeback),
1132cbf86cfeSHugh Dickins * or it might have been removed from swapcache a moment ago.
1133cbf86cfeSHugh Dickins */
11349d5cc140SAlex Shi (tencent) folio_set_stable_node(folio, NULL);
1135cbf86cfeSHugh Dickins remove_node_from_stable_tree(stable_node);
1136cbf86cfeSHugh Dickins err = 0;
1137cbf86cfeSHugh Dickins }
1138cbf86cfeSHugh Dickins
11399d5cc140SAlex Shi (tencent) folio_unlock(folio);
11409d5cc140SAlex Shi (tencent) folio_put(folio);
1141cbf86cfeSHugh Dickins return err;
1142cbf86cfeSHugh Dickins }
1143cbf86cfeSHugh Dickins
remove_stable_node_chain(struct ksm_stable_node * stable_node,struct rb_root * root)114421fbd591SQi Zheng static int remove_stable_node_chain(struct ksm_stable_node *stable_node,
11452c653d0eSAndrea Arcangeli struct rb_root *root)
11462c653d0eSAndrea Arcangeli {
114721fbd591SQi Zheng struct ksm_stable_node *dup;
11482c653d0eSAndrea Arcangeli struct hlist_node *hlist_safe;
11492c653d0eSAndrea Arcangeli
11502c653d0eSAndrea Arcangeli if (!is_stable_node_chain(stable_node)) {
11512c653d0eSAndrea Arcangeli VM_BUG_ON(is_stable_node_dup(stable_node));
11522c653d0eSAndrea Arcangeli if (remove_stable_node(stable_node))
11532c653d0eSAndrea Arcangeli return true;
11542c653d0eSAndrea Arcangeli else
11552c653d0eSAndrea Arcangeli return false;
11562c653d0eSAndrea Arcangeli }
11572c653d0eSAndrea Arcangeli
11582c653d0eSAndrea Arcangeli hlist_for_each_entry_safe(dup, hlist_safe,
11592c653d0eSAndrea Arcangeli &stable_node->hlist, hlist_dup) {
11602c653d0eSAndrea Arcangeli VM_BUG_ON(!is_stable_node_dup(dup));
11612c653d0eSAndrea Arcangeli if (remove_stable_node(dup))
11622c653d0eSAndrea Arcangeli return true;
11632c653d0eSAndrea Arcangeli }
11642c653d0eSAndrea Arcangeli BUG_ON(!hlist_empty(&stable_node->hlist));
11652c653d0eSAndrea Arcangeli free_stable_node_chain(stable_node, root);
11662c653d0eSAndrea Arcangeli return false;
11672c653d0eSAndrea Arcangeli }
11682c653d0eSAndrea Arcangeli
remove_all_stable_nodes(void)1169cbf86cfeSHugh Dickins static int remove_all_stable_nodes(void)
1170cbf86cfeSHugh Dickins {
117121fbd591SQi Zheng struct ksm_stable_node *stable_node, *next;
1172cbf86cfeSHugh Dickins int nid;
1173cbf86cfeSHugh Dickins int err = 0;
1174cbf86cfeSHugh Dickins
1175ef53d16cSHugh Dickins for (nid = 0; nid < ksm_nr_node_ids; nid++) {
1176cbf86cfeSHugh Dickins while (root_stable_tree[nid].rb_node) {
1177cbf86cfeSHugh Dickins stable_node = rb_entry(root_stable_tree[nid].rb_node,
117821fbd591SQi Zheng struct ksm_stable_node, node);
11792c653d0eSAndrea Arcangeli if (remove_stable_node_chain(stable_node,
11802c653d0eSAndrea Arcangeli root_stable_tree + nid)) {
1181cbf86cfeSHugh Dickins err = -EBUSY;
1182cbf86cfeSHugh Dickins break; /* proceed to next nid */
1183cbf86cfeSHugh Dickins }
1184cbf86cfeSHugh Dickins cond_resched();
1185cbf86cfeSHugh Dickins }
1186cbf86cfeSHugh Dickins }
118703640418SGeliang Tang list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
11884146d2d6SHugh Dickins if (remove_stable_node(stable_node))
11894146d2d6SHugh Dickins err = -EBUSY;
11904146d2d6SHugh Dickins cond_resched();
11914146d2d6SHugh Dickins }
1192cbf86cfeSHugh Dickins return err;
1193cbf86cfeSHugh Dickins }
1194cbf86cfeSHugh Dickins
unmerge_and_remove_all_rmap_items(void)1195d952b791SHugh Dickins static int unmerge_and_remove_all_rmap_items(void)
119631dbd01fSIzik Eidus {
119721fbd591SQi Zheng struct ksm_mm_slot *mm_slot;
119858730ab6SQi Zheng struct mm_slot *slot;
119931dbd01fSIzik Eidus struct mm_struct *mm;
120031dbd01fSIzik Eidus struct vm_area_struct *vma;
1201d952b791SHugh Dickins int err = 0;
120231dbd01fSIzik Eidus
1203d952b791SHugh Dickins spin_lock(&ksm_mmlist_lock);
120458730ab6SQi Zheng slot = list_entry(ksm_mm_head.slot.mm_node.next,
120558730ab6SQi Zheng struct mm_slot, mm_node);
120658730ab6SQi Zheng ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
1207d952b791SHugh Dickins spin_unlock(&ksm_mmlist_lock);
1208d952b791SHugh Dickins
1209a5f18ba0SMatthew Wilcox (Oracle) for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head;
1210a5f18ba0SMatthew Wilcox (Oracle) mm_slot = ksm_scan.mm_slot) {
121158730ab6SQi Zheng VMA_ITERATOR(vmi, mm_slot->slot.mm, 0);
1212a5f18ba0SMatthew Wilcox (Oracle)
121358730ab6SQi Zheng mm = mm_slot->slot.mm;
1214d8ed45c5SMichel Lespinasse mmap_read_lock(mm);
12156db504ceSLiam R. Howlett
12166db504ceSLiam R. Howlett /*
12176db504ceSLiam R. Howlett * Exit right away if mm is exiting to avoid lockdep issue in
12186db504ceSLiam R. Howlett * the maple tree
12196db504ceSLiam R. Howlett */
12209ba69294SHugh Dickins if (ksm_test_exit(mm))
12216db504ceSLiam R. Howlett goto mm_exiting;
12226db504ceSLiam R. Howlett
12236db504ceSLiam R. Howlett for_each_vma(vmi, vma) {
122431dbd01fSIzik Eidus if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
122531dbd01fSIzik Eidus continue;
122605c3fa9cSPedro Demarchi Gomes err = break_ksm(vma, vma->vm_start, vma->vm_end, false);
12279ba69294SHugh Dickins if (err)
12289ba69294SHugh Dickins goto error;
1229d952b791SHugh Dickins }
12309ba69294SHugh Dickins
12316db504ceSLiam R. Howlett mm_exiting:
1232420be4edSChengyang Fan remove_trailing_rmap_items(&mm_slot->rmap_list);
1233d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
123431dbd01fSIzik Eidus
123531dbd01fSIzik Eidus spin_lock(&ksm_mmlist_lock);
123658730ab6SQi Zheng slot = list_entry(mm_slot->slot.mm_node.next,
123758730ab6SQi Zheng struct mm_slot, mm_node);
123858730ab6SQi Zheng ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
12399ba69294SHugh Dickins if (ksm_test_exit(mm)) {
124058730ab6SQi Zheng hash_del(&mm_slot->slot.hash);
124158730ab6SQi Zheng list_del(&mm_slot->slot.mm_node);
124231dbd01fSIzik Eidus spin_unlock(&ksm_mmlist_lock);
12439ba69294SHugh Dickins
124458730ab6SQi Zheng mm_slot_free(mm_slot_cache, mm_slot);
124512e423baSLorenzo Stoakes mm_flags_clear(MMF_VM_MERGEABLE, mm);
124612e423baSLorenzo Stoakes mm_flags_clear(MMF_VM_MERGE_ANY, mm);
12479ba69294SHugh Dickins mmdrop(mm);
12487496fea9SZhou Chengming } else
12499ba69294SHugh Dickins spin_unlock(&ksm_mmlist_lock);
125031dbd01fSIzik Eidus }
125131dbd01fSIzik Eidus
1252cbf86cfeSHugh Dickins /* Clean up stable nodes, but don't worry if some are still busy */
1253cbf86cfeSHugh Dickins remove_all_stable_nodes();
1254d952b791SHugh Dickins ksm_scan.seqnr = 0;
12559ba69294SHugh Dickins return 0;
12569ba69294SHugh Dickins
12579ba69294SHugh Dickins error:
1258d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
1259d952b791SHugh Dickins spin_lock(&ksm_mmlist_lock);
1260d952b791SHugh Dickins ksm_scan.mm_slot = &ksm_mm_head;
1261d952b791SHugh Dickins spin_unlock(&ksm_mmlist_lock);
1262d952b791SHugh Dickins return err;
1263d952b791SHugh Dickins }
12642ffd8679SHugh Dickins #endif /* CONFIG_SYSFS */
1265d952b791SHugh Dickins
calc_checksum(struct page * page)126631dbd01fSIzik Eidus static u32 calc_checksum(struct page *page)
126731dbd01fSIzik Eidus {
126831dbd01fSIzik Eidus u32 checksum;
1269b3351989SFabio M. De Francesco void *addr = kmap_local_page(page);
127059e1a2f4STimofey Titovets checksum = xxhash(addr, PAGE_SIZE, 0);
1271b3351989SFabio M. De Francesco kunmap_local(addr);
127231dbd01fSIzik Eidus return checksum;
127331dbd01fSIzik Eidus }
127431dbd01fSIzik Eidus
/*
 * write_protect_page - make the pte mapping @folio in @vma clean and read-only
 * @vma: the vma in which @folio is looked up
 * @folio: the (small) folio to write-protect; a large folio triggers a
 *         one-time warning and fails
 * @orig_pte: on success, receives the pte value that is left installed
 *
 * Returns 0 on success, -EFAULT otherwise.  On failure after the pte
 * was cleared, the cleared value is put back unchanged.
 */
static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
			      pte_t *orig_pte)
{
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0);
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	bool anon_exclusive;
	int in_swapcache;
	pte_t pteval;
	int ret = -EFAULT;

	if (WARN_ON_ONCE(folio_test_large(folio)))
		return -EFAULT;

	pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma);
	if (pvmw.address == -EFAULT)
		return -EFAULT;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address,
				pvmw.address + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (!page_vma_mapped_walk(&pvmw))
		goto notifier_end;
	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
		goto walk_done;

	pteval = ptep_get(pvmw.pte);
	/*
	 * Handle PFN swap PTEs, such as device-exclusive ones, that actually
	 * map pages: give up just like the next folio_walk would.
	 */
	if (unlikely(!pte_present(pteval)))
		goto walk_done;

	anon_exclusive = PageAnonExclusive(&folio->page);
	if (pte_write(pteval) || pte_dirty(pteval) ||
	    anon_exclusive || mm_tlb_flush_pending(mm)) {
		in_swapcache = folio_test_swapcache(folio);
		flush_cache_page(vma, pvmw.address, folio_pfn(folio));
		/*
		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racy and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check
		 * this assure us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 *
		 * No need to notify as we are downgrading page table to read
		 * only not changing it to point to a new page.
		 *
		 * See Documentation/mm/mmu_notifier.rst
		 */
		pteval = ptep_clear_flush(vma, pvmw.address, pvmw.pte);

		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (folio_mapcount(folio) + 1 + in_swapcache !=
		    folio_ref_count(folio))
			goto restore_pte;

		/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
		if (anon_exclusive &&
		    folio_try_share_anon_rmap_pte(folio, &folio->page))
			goto restore_pte;

		/* Carry the pte dirty bit over to the folio before cleaning. */
		if (pte_dirty(pteval))
			folio_mark_dirty(folio);
		pteval = pte_mkclean(pteval);

		if (pte_write(pteval))
			pteval = pte_wrprotect(pteval);

		set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
	}
	*orig_pte = pteval;
	ret = 0;
	goto walk_done;

restore_pte:
	/* Put back exactly what ptep_clear_flush() returned. */
	set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
walk_done:
	page_vma_mapped_walk_done(&pvmw);
notifier_end:
	mmu_notifier_invalidate_range_end(&range);
	return ret;
}
136531dbd01fSIzik Eidus
136631dbd01fSIzik Eidus /**
136731dbd01fSIzik Eidus * replace_page - replace page in vma by new ksm page
13688dd3557aSHugh Dickins * @vma: vma that holds the pte pointing to page
13698dd3557aSHugh Dickins * @page: the page we are replacing by kpage
13708dd3557aSHugh Dickins * @kpage: the ksm page we replace page by
137131dbd01fSIzik Eidus * @orig_pte: the original value of the pte
137231dbd01fSIzik Eidus *
137331dbd01fSIzik Eidus * Returns 0 on success, -EFAULT on failure.
137431dbd01fSIzik Eidus */
replace_page(struct vm_area_struct * vma,struct page * page,struct page * kpage,pte_t orig_pte)13758dd3557aSHugh Dickins static int replace_page(struct vm_area_struct *vma, struct page *page,
13768dd3557aSHugh Dickins struct page *kpage, pte_t orig_pte)
137731dbd01fSIzik Eidus {
137897729534SDavid Hildenbrand struct folio *kfolio = page_folio(kpage);
137931dbd01fSIzik Eidus struct mm_struct *mm = vma->vm_mm;
1380713da0b3SMatthew Wilcox (Oracle) struct folio *folio = page_folio(page);
138131dbd01fSIzik Eidus pmd_t *pmd;
138250722804SZach O'Keefe pmd_t pmde;
138331dbd01fSIzik Eidus pte_t *ptep;
1384e86c59b1SClaudio Imbrenda pte_t newpte;
138531dbd01fSIzik Eidus spinlock_t *ptl;
138631dbd01fSIzik Eidus unsigned long addr;
138731dbd01fSIzik Eidus int err = -EFAULT;
1388ac46d4f3SJérôme Glisse struct mmu_notifier_range range;
138931dbd01fSIzik Eidus
1390713da0b3SMatthew Wilcox (Oracle) addr = page_address_in_vma(folio, page, vma);
139131dbd01fSIzik Eidus if (addr == -EFAULT)
139231dbd01fSIzik Eidus goto out;
139331dbd01fSIzik Eidus
13946219049aSBob Liu pmd = mm_find_pmd(mm, addr);
13956219049aSBob Liu if (!pmd)
139631dbd01fSIzik Eidus goto out;
139750722804SZach O'Keefe /*
139850722804SZach O'Keefe * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
139950722804SZach O'Keefe * without holding anon_vma lock for write. So when looking for a
140050722804SZach O'Keefe * genuine pmde (in which to find pte), test present and !THP together.
140150722804SZach O'Keefe */
140226e1a0c3SHugh Dickins pmde = pmdp_get_lockless(pmd);
140350722804SZach O'Keefe if (!pmd_present(pmde) || pmd_trans_huge(pmde))
140450722804SZach O'Keefe goto out;
140531dbd01fSIzik Eidus
14067d4a8be0SAlistair Popple mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
14076f4f13e8SJérôme Glisse addr + PAGE_SIZE);
1408ac46d4f3SJérôme Glisse mmu_notifier_invalidate_range_start(&range);
14096bdb913fSHaggai Eran
141031dbd01fSIzik Eidus ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
141104dee9e8SHugh Dickins if (!ptep)
141204dee9e8SHugh Dickins goto out_mn;
1413c33c7948SRyan Roberts if (!pte_same(ptep_get(ptep), orig_pte)) {
141431dbd01fSIzik Eidus pte_unmap_unlock(ptep, ptl);
14156bdb913fSHaggai Eran goto out_mn;
141631dbd01fSIzik Eidus }
14176c287605SDavid Hildenbrand VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
141897729534SDavid Hildenbrand VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage),
141997729534SDavid Hildenbrand kfolio);
142031dbd01fSIzik Eidus
1421e86c59b1SClaudio Imbrenda /*
1422e86c59b1SClaudio Imbrenda * No need to check ksm_use_zero_pages here: we can only have a
1423457aef94SEthon Paul * zero_page here if ksm_use_zero_pages was enabled already.
1424e86c59b1SClaudio Imbrenda */
1425e86c59b1SClaudio Imbrenda if (!is_zero_pfn(page_to_pfn(kpage))) {
142697729534SDavid Hildenbrand folio_get(kfolio);
142797729534SDavid Hildenbrand folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE);
1428e86c59b1SClaudio Imbrenda newpte = mk_pte(kpage, vma->vm_page_prot);
1429e86c59b1SClaudio Imbrenda } else {
143079271476Sxu xin /*
143179271476Sxu xin * Use pte_mkdirty to mark the zero page mapped by KSM, and then
143279271476Sxu xin * we can easily track all KSM-placed zero pages by checking if
143379271476Sxu xin * the dirty bit in zero page's PTE is set.
143479271476Sxu xin */
143579271476Sxu xin newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
1436c2dc78b8SChengming Zhou ksm_map_zero_page(mm);
1437a38c015fSClaudio Imbrenda /*
1438a38c015fSClaudio Imbrenda * We're replacing an anonymous page with a zero page, which is
1439a38c015fSClaudio Imbrenda * not anonymous. We need to do proper accounting otherwise we
1440a38c015fSClaudio Imbrenda * will get wrong values in /proc, and a BUG message in dmesg
1441a38c015fSClaudio Imbrenda * when tearing down the mm.
1442a38c015fSClaudio Imbrenda */
1443a38c015fSClaudio Imbrenda dec_mm_counter(mm, MM_ANONPAGES);
1444e86c59b1SClaudio Imbrenda }
144531dbd01fSIzik Eidus
1446c33c7948SRyan Roberts flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
14470f10851eSJérôme Glisse /*
14480f10851eSJérôme Glisse * No need to notify as we are replacing a read only page with another
14490f10851eSJérôme Glisse * read only page with the same content.
14500f10851eSJérôme Glisse *
1451ee65728eSMike Rapoport * See Documentation/mm/mmu_notifier.rst
14520f10851eSJérôme Glisse */
14530f10851eSJérôme Glisse ptep_clear_flush(vma, addr, ptep);
1454f7842747SPaolo Bonzini set_pte_at(mm, addr, ptep, newpte);
145531dbd01fSIzik Eidus
145618e8612eSDavid Hildenbrand folio_remove_rmap_pte(folio, page, vma);
1457b4e6f66eSMatthew Wilcox (Oracle) if (!folio_mapped(folio))
1458b4e6f66eSMatthew Wilcox (Oracle) folio_free_swap(folio);
1459b4e6f66eSMatthew Wilcox (Oracle) folio_put(folio);
146031dbd01fSIzik Eidus
146131dbd01fSIzik Eidus pte_unmap_unlock(ptep, ptl);
146231dbd01fSIzik Eidus err = 0;
14636bdb913fSHaggai Eran out_mn:
1464ac46d4f3SJérôme Glisse mmu_notifier_invalidate_range_end(&range);
146531dbd01fSIzik Eidus out:
146631dbd01fSIzik Eidus return err;
146731dbd01fSIzik Eidus }
146831dbd01fSIzik Eidus
146931dbd01fSIzik Eidus /*
147031dbd01fSIzik Eidus * try_to_merge_one_page - take two pages and merge them into one
14718dd3557aSHugh Dickins * @vma: the vma that holds the pte pointing to page
14728dd3557aSHugh Dickins * @page: the PageAnon page that we want to replace with kpage
1473b9a25635SMatthew Wilcox (Oracle) * @kpage: the KSM page that we want to map instead of page,
147480e14822SHugh Dickins * or NULL the first time when we want to use page as kpage.
147531dbd01fSIzik Eidus *
147631dbd01fSIzik Eidus * This function returns 0 if the pages were merged, -EFAULT otherwise.
147731dbd01fSIzik Eidus */
try_to_merge_one_page(struct vm_area_struct * vma,struct page * page,struct page * kpage)147831dbd01fSIzik Eidus static int try_to_merge_one_page(struct vm_area_struct *vma,
14798dd3557aSHugh Dickins struct page *page, struct page *kpage)
148031dbd01fSIzik Eidus {
14819c0a1b99SMatthew Wilcox (Oracle) struct folio *folio = page_folio(page);
148231dbd01fSIzik Eidus pte_t orig_pte = __pte(0);
148331dbd01fSIzik Eidus int err = -EFAULT;
148431dbd01fSIzik Eidus
1485db114b83SHugh Dickins if (page == kpage) /* ksm page forked */
1486db114b83SHugh Dickins return 0;
1487db114b83SHugh Dickins
14889c0a1b99SMatthew Wilcox (Oracle) if (!folio_test_anon(folio))
148931dbd01fSIzik Eidus goto out;
149031dbd01fSIzik Eidus
149131dbd01fSIzik Eidus /*
149232f51eadSMatthew Wilcox (Oracle) * We need the folio lock to read a stable swapcache flag in
14939c0a1b99SMatthew Wilcox (Oracle) * write_protect_page(). We trylock because we don't want to wait
14949c0a1b99SMatthew Wilcox (Oracle) * here - we prefer to continue scanning and merging different
14959c0a1b99SMatthew Wilcox (Oracle) * pages, then come back to this page when it is unlocked.
149631dbd01fSIzik Eidus */
14979c0a1b99SMatthew Wilcox (Oracle) if (!folio_trylock(folio))
149831e855eaSHugh Dickins goto out;
1499f765f540SKirill A. Shutemov
15009c0a1b99SMatthew Wilcox (Oracle) if (folio_test_large(folio)) {
1501a7306c34SAndrea Arcangeli if (split_huge_page(page))
1502f765f540SKirill A. Shutemov goto out_unlock;
15039c0a1b99SMatthew Wilcox (Oracle) folio = page_folio(page);
1504f765f540SKirill A. Shutemov }
1505f765f540SKirill A. Shutemov
150631dbd01fSIzik Eidus /*
150731dbd01fSIzik Eidus * If this anonymous page is mapped only here, its pte may need
150831dbd01fSIzik Eidus * to be write-protected. If it's mapped elsewhere, all of its
150931dbd01fSIzik Eidus * ptes are necessarily already write-protected. But in either
151031dbd01fSIzik Eidus * case, we need to lock and check page_count is not raised.
151131dbd01fSIzik Eidus */
15129c0a1b99SMatthew Wilcox (Oracle) if (write_protect_page(vma, folio, &orig_pte) == 0) {
151380e14822SHugh Dickins if (!kpage) {
151480e14822SHugh Dickins /*
15159c0a1b99SMatthew Wilcox (Oracle) * While we hold folio lock, upgrade folio from
15169c0a1b99SMatthew Wilcox (Oracle) * anon to a NULL stable_node with the KSM flag set:
151780e14822SHugh Dickins * stable_tree_insert() will update stable_node.
151880e14822SHugh Dickins */
15199c0a1b99SMatthew Wilcox (Oracle) folio_set_stable_node(folio, NULL);
15209c0a1b99SMatthew Wilcox (Oracle) folio_mark_accessed(folio);
1521337ed7ebSMinchan Kim /*
15229c0a1b99SMatthew Wilcox (Oracle) * Page reclaim just frees a clean folio with no dirty
1523337ed7ebSMinchan Kim * ptes: make sure that the ksm page would be swapped.
1524337ed7ebSMinchan Kim */
15259c0a1b99SMatthew Wilcox (Oracle) if (!folio_test_dirty(folio))
15269c0a1b99SMatthew Wilcox (Oracle) folio_mark_dirty(folio);
152780e14822SHugh Dickins err = 0;
152880e14822SHugh Dickins } else if (pages_identical(page, kpage))
15298dd3557aSHugh Dickins err = replace_page(vma, page, kpage, orig_pte);
153080e14822SHugh Dickins }
153131dbd01fSIzik Eidus
1532f765f540SKirill A. Shutemov out_unlock:
15339c0a1b99SMatthew Wilcox (Oracle) folio_unlock(folio);
153431dbd01fSIzik Eidus out:
153531dbd01fSIzik Eidus return err;
153631dbd01fSIzik Eidus }
153731dbd01fSIzik Eidus
153831dbd01fSIzik Eidus /*
1539ac90c56bSChengming Zhou * This function returns 0 if the pages were merged or if they are
1540ac90c56bSChengming Zhou * no longer merging candidates (e.g., VMA stale), -EFAULT otherwise.
1541ac90c56bSChengming Zhou */
try_to_merge_with_zero_page(struct ksm_rmap_item * rmap_item,struct page * page)1542ac90c56bSChengming Zhou static int try_to_merge_with_zero_page(struct ksm_rmap_item *rmap_item,
1543ac90c56bSChengming Zhou struct page *page)
1544ac90c56bSChengming Zhou {
1545ac90c56bSChengming Zhou struct mm_struct *mm = rmap_item->mm;
1546ac90c56bSChengming Zhou int err = -EFAULT;
1547ac90c56bSChengming Zhou
1548ac90c56bSChengming Zhou /*
1549ac90c56bSChengming Zhou * Same checksum as an empty page. We attempt to merge it with the
1550ac90c56bSChengming Zhou * appropriate zero page if the user enabled this via sysfs.
1551ac90c56bSChengming Zhou */
1552ac90c56bSChengming Zhou if (ksm_use_zero_pages && (rmap_item->oldchecksum == zero_checksum)) {
1553ac90c56bSChengming Zhou struct vm_area_struct *vma;
1554ac90c56bSChengming Zhou
1555ac90c56bSChengming Zhou mmap_read_lock(mm);
1556ac90c56bSChengming Zhou vma = find_mergeable_vma(mm, rmap_item->address);
1557ac90c56bSChengming Zhou if (vma) {
1558ac90c56bSChengming Zhou err = try_to_merge_one_page(vma, page,
1559ac90c56bSChengming Zhou ZERO_PAGE(rmap_item->address));
1560ac90c56bSChengming Zhou trace_ksm_merge_one_page(
1561ac90c56bSChengming Zhou page_to_pfn(ZERO_PAGE(rmap_item->address)),
1562ac90c56bSChengming Zhou rmap_item, mm, err);
1563ac90c56bSChengming Zhou } else {
1564ac90c56bSChengming Zhou /*
1565ac90c56bSChengming Zhou * If the vma is out of date, we do not need to
1566ac90c56bSChengming Zhou * continue.
1567ac90c56bSChengming Zhou */
1568ac90c56bSChengming Zhou err = 0;
1569ac90c56bSChengming Zhou }
1570ac90c56bSChengming Zhou mmap_read_unlock(mm);
1571ac90c56bSChengming Zhou }
1572ac90c56bSChengming Zhou
1573ac90c56bSChengming Zhou return err;
1574ac90c56bSChengming Zhou }
1575ac90c56bSChengming Zhou
1576ac90c56bSChengming Zhou /*
157781464e30SHugh Dickins * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
157881464e30SHugh Dickins * but no new kernel page is allocated: kpage must already be a ksm page.
15798dd3557aSHugh Dickins *
15808dd3557aSHugh Dickins * This function returns 0 if the pages were merged, -EFAULT otherwise.
158181464e30SHugh Dickins */
try_to_merge_with_ksm_page(struct ksm_rmap_item * rmap_item,struct page * page,struct page * kpage)158221fbd591SQi Zheng static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
15838dd3557aSHugh Dickins struct page *page, struct page *kpage)
158481464e30SHugh Dickins {
15858dd3557aSHugh Dickins struct mm_struct *mm = rmap_item->mm;
158681464e30SHugh Dickins struct vm_area_struct *vma;
158781464e30SHugh Dickins int err = -EFAULT;
158881464e30SHugh Dickins
1589d8ed45c5SMichel Lespinasse mmap_read_lock(mm);
159085c6e8ddSAndrea Arcangeli vma = find_mergeable_vma(mm, rmap_item->address);
159185c6e8ddSAndrea Arcangeli if (!vma)
15929ba69294SHugh Dickins goto out;
15939ba69294SHugh Dickins
15948dd3557aSHugh Dickins err = try_to_merge_one_page(vma, page, kpage);
1595db114b83SHugh Dickins if (err)
1596db114b83SHugh Dickins goto out;
1597db114b83SHugh Dickins
1598bc56620bSHugh Dickins /* Unstable nid is in union with stable anon_vma: remove first */
1599bc56620bSHugh Dickins remove_rmap_item_from_tree(rmap_item);
1600bc56620bSHugh Dickins
1601c1e8d7c6SMichel Lespinasse /* Must get reference to anon_vma while still holding mmap_lock */
16029e60109fSPeter Zijlstra rmap_item->anon_vma = vma->anon_vma;
16039e60109fSPeter Zijlstra get_anon_vma(vma->anon_vma);
160481464e30SHugh Dickins out:
1605d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
1606739100c8SStefan Roesch trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page),
1607739100c8SStefan Roesch rmap_item, mm, err);
160881464e30SHugh Dickins return err;
160981464e30SHugh Dickins }
161081464e30SHugh Dickins
161181464e30SHugh Dickins /*
161231dbd01fSIzik Eidus * try_to_merge_two_pages - take two identical pages and prepare them
161331dbd01fSIzik Eidus * to be merged into one page.
161431dbd01fSIzik Eidus *
16158dd3557aSHugh Dickins * This function returns the kpage if we successfully merged two identical
16168dd3557aSHugh Dickins * pages into one ksm page, NULL otherwise.
161731dbd01fSIzik Eidus *
161880e14822SHugh Dickins * Note that this function upgrades page to ksm page: if one of the pages
161931dbd01fSIzik Eidus * is already a ksm page, try_to_merge_with_ksm_page should be used.
162031dbd01fSIzik Eidus */
try_to_merge_two_pages(struct ksm_rmap_item * rmap_item,struct page * page,struct ksm_rmap_item * tree_rmap_item,struct page * tree_page)162198c3ca00SMatthew Wilcox (Oracle) static struct folio *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
16228dd3557aSHugh Dickins struct page *page,
162321fbd591SQi Zheng struct ksm_rmap_item *tree_rmap_item,
16248dd3557aSHugh Dickins struct page *tree_page)
162531dbd01fSIzik Eidus {
162680e14822SHugh Dickins int err;
162731dbd01fSIzik Eidus
162880e14822SHugh Dickins err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
162931dbd01fSIzik Eidus if (!err) {
16308dd3557aSHugh Dickins err = try_to_merge_with_ksm_page(tree_rmap_item,
163180e14822SHugh Dickins tree_page, page);
163231dbd01fSIzik Eidus /*
163381464e30SHugh Dickins * If that fails, we have a ksm page with only one pte
163481464e30SHugh Dickins * pointing to it: so break it.
163531dbd01fSIzik Eidus */
16364035c07aSHugh Dickins if (err)
16378dd3557aSHugh Dickins break_cow(rmap_item);
163831dbd01fSIzik Eidus }
163998c3ca00SMatthew Wilcox (Oracle) return err ? NULL : page_folio(page);
164031dbd01fSIzik Eidus }
164131dbd01fSIzik Eidus
16422c653d0eSAndrea Arcangeli static __always_inline
__is_page_sharing_candidate(struct ksm_stable_node * stable_node,int offset)164321fbd591SQi Zheng bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset)
16442c653d0eSAndrea Arcangeli {
16452c653d0eSAndrea Arcangeli VM_BUG_ON(stable_node->rmap_hlist_len < 0);
16462c653d0eSAndrea Arcangeli /*
16472c653d0eSAndrea Arcangeli * Check that at least one mapping still exists, otherwise
16482c653d0eSAndrea Arcangeli * there's no much point to merge and share with this
16492c653d0eSAndrea Arcangeli * stable_node, as the underlying tree_page of the other
16502c653d0eSAndrea Arcangeli * sharer is going to be freed soon.
16512c653d0eSAndrea Arcangeli */
16522c653d0eSAndrea Arcangeli return stable_node->rmap_hlist_len &&
16532c653d0eSAndrea Arcangeli stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
16542c653d0eSAndrea Arcangeli }
16552c653d0eSAndrea Arcangeli
16562c653d0eSAndrea Arcangeli static __always_inline
is_page_sharing_candidate(struct ksm_stable_node * stable_node)165721fbd591SQi Zheng bool is_page_sharing_candidate(struct ksm_stable_node *stable_node)
16582c653d0eSAndrea Arcangeli {
16592c653d0eSAndrea Arcangeli return __is_page_sharing_candidate(stable_node, 0);
16602c653d0eSAndrea Arcangeli }
16612c653d0eSAndrea Arcangeli
/*
 * stable_node_dup - select one dup out of a stable_node chain.
 *
 * @_stable_node_dup: output, set to the selected dup (NULL if none).
 * @_stable_node: in/out, the chain; overwritten with the selected dup when
 *		   the chain is collapsed.
 * @root: stable rbtree the chain lives in.
 * @prune_stale_stable_nodes: request a full walk of the chain; honoured
 *		   only once ksm_stable_node_chains_prune_millisecs have
 *		   elapsed since the chain's chain_prune_time.
 *
 * Returns the folio obtained from ksm_get_folio() for the selected dup, or
 * NULL when no dup yielded a folio (the chain is then freed).
 */
static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
				     struct ksm_stable_node **_stable_node,
				     struct rb_root *root,
				     bool prune_stale_stable_nodes)
{
	struct ksm_stable_node *stable_node = *_stable_node;
	struct ksm_stable_node *found = NULL;
	struct ksm_stable_node *dup;
	struct hlist_node *next;
	struct folio *tree_folio = NULL;
	struct folio *folio;
	int found_rmap_hlist_len = 0;

	/* Rate-limit the full pruning walk of this chain. */
	if (prune_stale_stable_nodes) {
		if (time_before(jiffies, stable_node->chain_prune_time +
				msecs_to_jiffies(
					ksm_stable_node_chains_prune_millisecs)))
			prune_stale_stable_nodes = false;
		else
			stable_node->chain_prune_time = jiffies;
	}

	hlist_for_each_entry_safe(dup, next, &stable_node->hlist, hlist_dup) {
		bool take;

		cond_resched();
		/*
		 * All dups have to be visited in order to prune the stale
		 * ones during lookup.
		 *
		 * ksm_get_folio can unlink nodes from stable_node->hlist
		 * when they point to freed pages, hence the _safe walk.
		 * When it returns NULL, "dup" itself has been freed from
		 * under us and must not be touched any more.
		 */
		folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK);
		if (!folio)
			continue;

		/*
		 * The first usable dup is always taken; afterwards a dup
		 * only replaces the current pick if it is a sharing
		 * candidate and the pick either is not one or has fewer
		 * mappings.
		 */
		take = !found;
		if (!take && is_page_sharing_candidate(dup))
			take = !is_page_sharing_candidate(found) ||
			       dup->rmap_hlist_len > found_rmap_hlist_len;
		if (!take) {
			folio_put(folio);
			continue;
		}

		/* Release the folio of the pick that is being replaced. */
		if (found)
			folio_put(tree_folio);
		found = dup;
		found_rmap_hlist_len = found->rmap_hlist_len;
		tree_folio = folio;

		/*
		 * The folio of the current pick is deliberately not put.
		 * Without pruning, the first sharing candidate ends the
		 * walk.
		 */
		if (!prune_stale_stable_nodes &&
		    is_page_sharing_candidate(found))
			break;
	}

	if (!found) {
		/* Its hlist must be empty if no one found. */
		free_stable_node_chain(stable_node, root);
	} else if (hlist_is_singular_node(&found->hlist_dup,
					  &stable_node->hlist)) {
		/*
		 * More than one entry here would corrupt memory, so
		 * better BUG_ON. In KSM context with no lock held it's
		 * not even fatal.
		 */
		BUG_ON(stable_node->hlist.first->next);

		/*
		 * A single dup is left: drop the chain and put the dup
		 * in its place in the rbtree.
		 */
		rb_replace_node(&stable_node->node, &found->node, root);
		free_stable_node(stable_node);
		ksm_stable_node_chains--;
		ksm_stable_node_dups--;
		/*
		 * NOTE: the caller relies on stable_node being equal to
		 * stable_node_dup when the chain was collapsed.
		 */
		*_stable_node = found;
		/*
		 * Robustness only: the chain was just freed, so do not
		 * keep a pointer to it. The compiler shall optimize this
		 * store away at build time.
		 */
		stable_node = NULL;
	} else if (stable_node->hlist.first != &found->hlist_dup &&
		   __is_page_sharing_candidate(found, 1)) {
		/*
		 * The selected dup can accept one more future merge (on
		 * top of the one underway) and is not at the head of the
		 * chain: move it there so the next search is quicker in
		 * the !prune_stale_stable_nodes case.
		 *
		 * NOTE: the hlist.first pointer is compared directly
		 * because that is what tells whether the dup already
		 * sits at the head of the chain.
		 */
		hlist_del(&found->hlist_dup);
		hlist_add_head(&found->hlist_dup, &stable_node->hlist);
	}

	*_stable_node_dup = found;
	return tree_folio;
}
17752c653d0eSAndrea Arcangeli
17768dc5ffcdSAndrea Arcangeli /*
177779899cceSAlex Shi (tencent) * Like for ksm_get_folio, this function can free the *_stable_node and
17788dc5ffcdSAndrea Arcangeli * *_stable_node_dup if the returned tree_page is NULL.
17798dc5ffcdSAndrea Arcangeli *
17808dc5ffcdSAndrea Arcangeli * It can also free and overwrite *_stable_node with the found
17818dc5ffcdSAndrea Arcangeli * stable_node_dup if the chain is collapsed (in which case
17828dc5ffcdSAndrea Arcangeli * *_stable_node will be equal to *_stable_node_dup like if the chain
17838dc5ffcdSAndrea Arcangeli * never existed). It's up to the caller to verify tree_page is not
17848dc5ffcdSAndrea Arcangeli * NULL before dereferencing *_stable_node or *_stable_node_dup.
17858dc5ffcdSAndrea Arcangeli *
17868dc5ffcdSAndrea Arcangeli * *_stable_node_dup is really a second output parameter of this
17878dc5ffcdSAndrea Arcangeli * function and will be overwritten in all cases, the caller doesn't
17888dc5ffcdSAndrea Arcangeli * need to initialize it.
17898dc5ffcdSAndrea Arcangeli */
__stable_node_chain(struct ksm_stable_node ** _stable_node_dup,struct ksm_stable_node ** _stable_node,struct rb_root * root,bool prune_stale_stable_nodes)179079899cceSAlex Shi (tencent) static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
179121fbd591SQi Zheng struct ksm_stable_node **_stable_node,
17922c653d0eSAndrea Arcangeli struct rb_root *root,
17932c653d0eSAndrea Arcangeli bool prune_stale_stable_nodes)
17942c653d0eSAndrea Arcangeli {
179521fbd591SQi Zheng struct ksm_stable_node *stable_node = *_stable_node;
1796a0b856b6SChengming Zhou
17972c653d0eSAndrea Arcangeli if (!is_stable_node_chain(stable_node)) {
17988dc5ffcdSAndrea Arcangeli *_stable_node_dup = stable_node;
179985b67b01SDavid Hildenbrand return ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK);
18002c653d0eSAndrea Arcangeli }
18018dc5ffcdSAndrea Arcangeli return stable_node_dup(_stable_node_dup, _stable_node, root,
18022c653d0eSAndrea Arcangeli prune_stale_stable_nodes);
18032c653d0eSAndrea Arcangeli }
18042c653d0eSAndrea Arcangeli
chain_prune(struct ksm_stable_node ** s_n_d,struct ksm_stable_node ** s_n,struct rb_root * root)180579899cceSAlex Shi (tencent) static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d,
180621fbd591SQi Zheng struct ksm_stable_node **s_n,
18072c653d0eSAndrea Arcangeli struct rb_root *root)
18082c653d0eSAndrea Arcangeli {
18098dc5ffcdSAndrea Arcangeli return __stable_node_chain(s_n_d, s_n, root, true);
18102c653d0eSAndrea Arcangeli }
18112c653d0eSAndrea Arcangeli
chain(struct ksm_stable_node ** s_n_d,struct ksm_stable_node ** s_n,struct rb_root * root)181279899cceSAlex Shi (tencent) static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d,
1813a0b856b6SChengming Zhou struct ksm_stable_node **s_n,
18142c653d0eSAndrea Arcangeli struct rb_root *root)
18152c653d0eSAndrea Arcangeli {
1816a0b856b6SChengming Zhou return __stable_node_chain(s_n_d, s_n, root, false);
18172c653d0eSAndrea Arcangeli }
18182c653d0eSAndrea Arcangeli
/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the folio of the stable tree node of identical
 * content if found, ERR_PTR(-EBUSY) if the stable node's page is being
 * migrated, NULL otherwise.
 *
 * When @page itself is already a KSM page (it has a stable node), the
 * folio returned can be @page's own folio, with an extra reference taken
 * through folio_get(). A KSM page whose stable node sits on the
 * migrate_nodes list gets that node re-linked into the stable tree of
 * the current nid by this function.
 */
static struct folio *stable_tree_search(struct page *page)
{
	int nid;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct ksm_stable_node *stable_node, *stable_node_dup;
	struct ksm_stable_node *page_node;
	struct folio *folio;

	folio = page_folio(page);
	page_node = folio_stable_node(folio);
	if (page_node && page_node->head != &migrate_nodes) {
		/* ksm page forked */
		folio_get(folio);
		return folio;
	}

	nid = get_kpfn_nid(folio_pfn(folio));
	root = root_stable_tree + nid;
again:
	/* (Re)start the descent from the root of this nid's stable tree. */
	new = &root->rb_node;
	parent = NULL;

	while (*new) {
		struct folio *tree_folio;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct ksm_stable_node, node);
		tree_folio = chain_prune(&stable_node_dup, &stable_node, root);
		if (!tree_folio) {
			/*
			 * If we walked over a stale stable_node,
			 * ksm_get_folio() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(page, &tree_folio->page);
		/* The reference was only needed for the comparison. */
		folio_put(tree_folio);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			if (page_node) {
				VM_BUG_ON(page_node->head != &migrate_nodes);
				/*
				 * If the mapcount of our migrated KSM folio is
				 * at most 1, we can merge it with another
				 * KSM folio where we know that we have space
				 * for one more mapping without exceeding the
				 * ksm_max_page_sharing limit: see
				 * chain_prune(). This way, we can avoid adding
				 * this stable node to the chain.
				 */
				if (folio_mapcount(folio) > 1)
					goto chain_append;
			}

			if (!is_page_sharing_candidate(stable_node_dup)) {
				/*
				 * If the stable_node is a chain and
				 * we got a payload match in memcmp
				 * but we cannot merge the scanned
				 * page in any of the existing
				 * stable_node dups because they're
				 * all full, we need to wait the
				 * scanned page to find itself a match
				 * in the unstable tree to create a
				 * brand new KSM page to add later to
				 * the dups of this stable_node.
				 */
				return NULL;
			}

			/*
			 * Lock and unlock the stable_node's page (which
			 * might already have been migrated) so that page
			 * migration is sure to notice its raised count.
			 * It would be more elegant to return stable_node
			 * than kpage, but that involves more changes.
			 */
			tree_folio = ksm_get_folio(stable_node_dup,
						   KSM_GET_FOLIO_TRYLOCK);

			if (PTR_ERR(tree_folio) == -EBUSY)
				return ERR_PTR(-EBUSY);

			if (unlikely(!tree_folio))
				/*
				 * The tree may have been rebalanced,
				 * so re-evaluate parent and new.
				 */
				goto again;
			folio_unlock(tree_folio);

			/*
			 * The matching dup's pfn no longer belongs to the
			 * nid recorded in the node: take it out of this
			 * tree (see "replace" below).
			 */
			if (get_kpfn_nid(stable_node_dup->kpfn) !=
			    NUMA(stable_node_dup->nid)) {
				folio_put(tree_folio);
				goto replace;
			}
			return tree_folio;
		}
	}

	/* No match in the tree: only a migrated KSM page gets inserted. */
	if (!page_node)
		return NULL;

	/* Move page_node off its current list and link it at the leaf. */
	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	rb_link_node(&page_node->node, parent, new);
	rb_insert_color(&page_node->node, root);
out:
	/* Hand back our own folio only if it can still take a merge. */
	if (is_page_sharing_candidate(page_node)) {
		folio_get(folio);
		return folio;
	} else
		return NULL;

replace:
	/*
	 * If stable_node was a chain and chain_prune collapsed it,
	 * stable_node has been updated to be the new regular
	 * stable_node. A collapse of the chain is indistinguishable
	 * from the case there was no chain in the stable
	 * rbtree. Otherwise stable_node is the chain and
	 * stable_node_dup is the dup to replace.
	 */
	if (stable_node_dup == stable_node) {
		VM_BUG_ON(is_stable_node_chain(stable_node_dup));
		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
		/* there is no chain */
		if (page_node) {
			VM_BUG_ON(page_node->head != &migrate_nodes);
			list_del(&page_node->list);
			DO_NUMA(page_node->nid = nid);
			rb_replace_node(&stable_node_dup->node,
					&page_node->node,
					root);
			if (is_page_sharing_candidate(page_node))
				folio_get(folio);
			else
				folio = NULL;
		} else {
			rb_erase(&stable_node_dup->node, root);
			folio = NULL;
		}
	} else {
		VM_BUG_ON(!is_stable_node_chain(stable_node));
		__stable_node_dup_del(stable_node_dup);
		if (page_node) {
			VM_BUG_ON(page_node->head != &migrate_nodes);
			list_del(&page_node->list);
			DO_NUMA(page_node->nid = nid);
			stable_node_chain_add_dup(page_node, stable_node);
			if (is_page_sharing_candidate(page_node))
				folio_get(folio);
			else
				folio = NULL;
		} else {
			folio = NULL;
		}
	}
	/* Park the displaced dup on the migrate_nodes list. */
	stable_node_dup->head = &migrate_nodes;
	list_add(&stable_node_dup->list, stable_node_dup->head);
	return folio;

chain_append:
	/*
	 * If stable_node was a chain and chain_prune collapsed it,
	 * stable_node has been updated to be the new regular
	 * stable_node. A collapse of the chain is indistinguishable
	 * from the case there was no chain in the stable
	 * rbtree. Otherwise stable_node is the chain that page_node
	 * is appended to below.
	 */
	if (stable_node_dup == stable_node) {
		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
		/* chain is missing so create it */
		stable_node = alloc_stable_node_chain(stable_node_dup,
						      root);
		if (!stable_node)
			return NULL;
	}
	/*
	 * Add this stable_node dup that was
	 * migrated to the stable_node chain
	 * of the current nid for this page
	 * content.
	 */
	VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
	VM_BUG_ON(page_node->head != &migrate_nodes);
	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	stable_node_chain_add_dup(page_node, stable_node);
	goto out;
}
203431dbd01fSIzik Eidus
203531dbd01fSIzik Eidus /*
2036e850dcf5SHugh Dickins * stable_tree_insert - insert stable tree node pointing to new ksm page
203731dbd01fSIzik Eidus * into the stable tree.
203831dbd01fSIzik Eidus *
20397b6ba2c7SHugh Dickins * This function returns the stable tree node just allocated on success,
20407b6ba2c7SHugh Dickins * NULL otherwise.
204131dbd01fSIzik Eidus */
stable_tree_insert(struct folio * kfolio)204279899cceSAlex Shi (tencent) static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio)
204331dbd01fSIzik Eidus {
204490bd6fd3SPetr Holasek int nid;
204590bd6fd3SPetr Holasek unsigned long kpfn;
2046ef53d16cSHugh Dickins struct rb_root *root;
204790bd6fd3SPetr Holasek struct rb_node **new;
2048f2e5ff85SAndrea Arcangeli struct rb_node *parent;
2049a0b856b6SChengming Zhou struct ksm_stable_node *stable_node, *stable_node_dup;
20502c653d0eSAndrea Arcangeli bool need_chain = false;
205131dbd01fSIzik Eidus
205279899cceSAlex Shi (tencent) kpfn = folio_pfn(kfolio);
205390bd6fd3SPetr Holasek nid = get_kpfn_nid(kpfn);
2054ef53d16cSHugh Dickins root = root_stable_tree + nid;
2055f2e5ff85SAndrea Arcangeli again:
2056f2e5ff85SAndrea Arcangeli parent = NULL;
2057ef53d16cSHugh Dickins new = &root->rb_node;
205890bd6fd3SPetr Holasek
205931dbd01fSIzik Eidus while (*new) {
206079899cceSAlex Shi (tencent) struct folio *tree_folio;
206131dbd01fSIzik Eidus int ret;
206231dbd01fSIzik Eidus
206331dbd01fSIzik Eidus cond_resched();
206421fbd591SQi Zheng stable_node = rb_entry(*new, struct ksm_stable_node, node);
2065a0b856b6SChengming Zhou tree_folio = chain(&stable_node_dup, &stable_node, root);
206679899cceSAlex Shi (tencent) if (!tree_folio) {
2067f2e5ff85SAndrea Arcangeli /*
2068f2e5ff85SAndrea Arcangeli * If we walked over a stale stable_node,
206979899cceSAlex Shi (tencent) * ksm_get_folio() will call rb_erase() and it
2070f2e5ff85SAndrea Arcangeli * may rebalance the tree from under us. So
2071f2e5ff85SAndrea Arcangeli * restart the search from scratch. Returning
2072f2e5ff85SAndrea Arcangeli * NULL would be safe too, but we'd generate
2073f2e5ff85SAndrea Arcangeli * false negative insertions just because some
2074f2e5ff85SAndrea Arcangeli * stable_node was stale.
2075f2e5ff85SAndrea Arcangeli */
2076f2e5ff85SAndrea Arcangeli goto again;
2077f2e5ff85SAndrea Arcangeli }
207831dbd01fSIzik Eidus
207979899cceSAlex Shi (tencent) ret = memcmp_pages(&kfolio->page, &tree_folio->page);
208079899cceSAlex Shi (tencent) folio_put(tree_folio);
208131dbd01fSIzik Eidus
208231dbd01fSIzik Eidus parent = *new;
208331dbd01fSIzik Eidus if (ret < 0)
208431dbd01fSIzik Eidus new = &parent->rb_left;
208531dbd01fSIzik Eidus else if (ret > 0)
208631dbd01fSIzik Eidus new = &parent->rb_right;
208731dbd01fSIzik Eidus else {
20882c653d0eSAndrea Arcangeli need_chain = true;
20892c653d0eSAndrea Arcangeli break;
209031dbd01fSIzik Eidus }
209131dbd01fSIzik Eidus }
209231dbd01fSIzik Eidus
20932c653d0eSAndrea Arcangeli stable_node_dup = alloc_stable_node();
20942c653d0eSAndrea Arcangeli if (!stable_node_dup)
20957b6ba2c7SHugh Dickins return NULL;
209631dbd01fSIzik Eidus
20972c653d0eSAndrea Arcangeli INIT_HLIST_HEAD(&stable_node_dup->hlist);
20982c653d0eSAndrea Arcangeli stable_node_dup->kpfn = kpfn;
20992c653d0eSAndrea Arcangeli stable_node_dup->rmap_hlist_len = 0;
21002c653d0eSAndrea Arcangeli DO_NUMA(stable_node_dup->nid = nid);
21012c653d0eSAndrea Arcangeli if (!need_chain) {
21022c653d0eSAndrea Arcangeli rb_link_node(&stable_node_dup->node, parent, new);
21032c653d0eSAndrea Arcangeli rb_insert_color(&stable_node_dup->node, root);
21042c653d0eSAndrea Arcangeli } else {
21052c653d0eSAndrea Arcangeli if (!is_stable_node_chain(stable_node)) {
210621fbd591SQi Zheng struct ksm_stable_node *orig = stable_node;
21072c653d0eSAndrea Arcangeli /* chain is missing so create it */
21082c653d0eSAndrea Arcangeli stable_node = alloc_stable_node_chain(orig, root);
21092c653d0eSAndrea Arcangeli if (!stable_node) {
21102c653d0eSAndrea Arcangeli free_stable_node(stable_node_dup);
21112c653d0eSAndrea Arcangeli return NULL;
21122c653d0eSAndrea Arcangeli }
21132c653d0eSAndrea Arcangeli }
21142c653d0eSAndrea Arcangeli stable_node_chain_add_dup(stable_node_dup, stable_node);
21152c653d0eSAndrea Arcangeli }
211608beca44SHugh Dickins
211790e82349SChengming Zhou folio_set_stable_node(kfolio, stable_node_dup);
211890e82349SChengming Zhou
21192c653d0eSAndrea Arcangeli return stable_node_dup;
212031dbd01fSIzik Eidus }
212131dbd01fSIzik Eidus
212231dbd01fSIzik Eidus /*
21238dd3557aSHugh Dickins * unstable_tree_search_insert - search for identical page,
21248dd3557aSHugh Dickins * else insert rmap_item into the unstable tree.
212531dbd01fSIzik Eidus *
212631dbd01fSIzik Eidus * This function searches for a page in the unstable tree identical to the
212731dbd01fSIzik Eidus * page currently being scanned; and if no identical page is found in the
212831dbd01fSIzik Eidus * tree, we insert rmap_item as a new object into the unstable tree.
212931dbd01fSIzik Eidus *
213031dbd01fSIzik Eidus * This function returns pointer to rmap_item found to be identical
213131dbd01fSIzik Eidus * to the currently scanned page, NULL otherwise.
213231dbd01fSIzik Eidus *
213331dbd01fSIzik Eidus * This function does both searching and inserting, because they share
213431dbd01fSIzik Eidus * the same walking algorithm in an rbtree.
213531dbd01fSIzik Eidus */
21368dd3557aSHugh Dickins static
unstable_tree_search_insert(struct ksm_rmap_item * rmap_item,struct page * page,struct page ** tree_pagep)213721fbd591SQi Zheng struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item,
21388dd3557aSHugh Dickins struct page *page,
21398dd3557aSHugh Dickins struct page **tree_pagep)
214031dbd01fSIzik Eidus {
214190bd6fd3SPetr Holasek struct rb_node **new;
214290bd6fd3SPetr Holasek struct rb_root *root;
214331dbd01fSIzik Eidus struct rb_node *parent = NULL;
214490bd6fd3SPetr Holasek int nid;
214590bd6fd3SPetr Holasek
214690bd6fd3SPetr Holasek nid = get_kpfn_nid(page_to_pfn(page));
2147ef53d16cSHugh Dickins root = root_unstable_tree + nid;
214890bd6fd3SPetr Holasek new = &root->rb_node;
214931dbd01fSIzik Eidus
215031dbd01fSIzik Eidus while (*new) {
215121fbd591SQi Zheng struct ksm_rmap_item *tree_rmap_item;
21528dd3557aSHugh Dickins struct page *tree_page;
215331dbd01fSIzik Eidus int ret;
215431dbd01fSIzik Eidus
2155d178f27fSHugh Dickins cond_resched();
215621fbd591SQi Zheng tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node);
21578dd3557aSHugh Dickins tree_page = get_mergeable_page(tree_rmap_item);
2158c8f95ed1SAndrea Arcangeli if (!tree_page)
215931dbd01fSIzik Eidus return NULL;
216031dbd01fSIzik Eidus
216131dbd01fSIzik Eidus /*
21628dd3557aSHugh Dickins * Don't substitute a ksm page for a forked page.
216331dbd01fSIzik Eidus */
21648dd3557aSHugh Dickins if (page == tree_page) {
21658dd3557aSHugh Dickins put_page(tree_page);
216631dbd01fSIzik Eidus return NULL;
216731dbd01fSIzik Eidus }
216831dbd01fSIzik Eidus
21698dd3557aSHugh Dickins ret = memcmp_pages(page, tree_page);
217031dbd01fSIzik Eidus
217131dbd01fSIzik Eidus parent = *new;
217231dbd01fSIzik Eidus if (ret < 0) {
21738dd3557aSHugh Dickins put_page(tree_page);
217431dbd01fSIzik Eidus new = &parent->rb_left;
217531dbd01fSIzik Eidus } else if (ret > 0) {
21768dd3557aSHugh Dickins put_page(tree_page);
217731dbd01fSIzik Eidus new = &parent->rb_right;
2178b599cbdfSHugh Dickins } else if (!ksm_merge_across_nodes &&
2179b599cbdfSHugh Dickins page_to_nid(tree_page) != nid) {
2180b599cbdfSHugh Dickins /*
2181b599cbdfSHugh Dickins * If tree_page has been migrated to another NUMA node,
2182b599cbdfSHugh Dickins * it will be flushed out and put in the right unstable
2183b599cbdfSHugh Dickins * tree next time: only merge with it when across_nodes.
2184b599cbdfSHugh Dickins */
2185b599cbdfSHugh Dickins put_page(tree_page);
2186b599cbdfSHugh Dickins return NULL;
218731dbd01fSIzik Eidus } else {
21888dd3557aSHugh Dickins *tree_pagep = tree_page;
218931dbd01fSIzik Eidus return tree_rmap_item;
219031dbd01fSIzik Eidus }
219131dbd01fSIzik Eidus }
219231dbd01fSIzik Eidus
21937b6ba2c7SHugh Dickins rmap_item->address |= UNSTABLE_FLAG;
219431dbd01fSIzik Eidus rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
2195e850dcf5SHugh Dickins DO_NUMA(rmap_item->nid = nid);
219631dbd01fSIzik Eidus rb_link_node(&rmap_item->node, parent, new);
219790bd6fd3SPetr Holasek rb_insert_color(&rmap_item->node, root);
219831dbd01fSIzik Eidus
2199473b0ce4SHugh Dickins ksm_pages_unshared++;
220031dbd01fSIzik Eidus return NULL;
220131dbd01fSIzik Eidus }
220231dbd01fSIzik Eidus
220331dbd01fSIzik Eidus /*
220431dbd01fSIzik Eidus * stable_tree_append - add another rmap_item to the linked list of
220531dbd01fSIzik Eidus * rmap_items hanging off a given node of the stable tree, all sharing
220631dbd01fSIzik Eidus * the same ksm page.
220731dbd01fSIzik Eidus */
stable_tree_append(struct ksm_rmap_item * rmap_item,struct ksm_stable_node * stable_node,bool max_page_sharing_bypass)220821fbd591SQi Zheng static void stable_tree_append(struct ksm_rmap_item *rmap_item,
220921fbd591SQi Zheng struct ksm_stable_node *stable_node,
22102c653d0eSAndrea Arcangeli bool max_page_sharing_bypass)
221131dbd01fSIzik Eidus {
22122c653d0eSAndrea Arcangeli /*
22132c653d0eSAndrea Arcangeli * rmap won't find this mapping if we don't insert the
22142c653d0eSAndrea Arcangeli * rmap_item in the right stable_node
22152c653d0eSAndrea Arcangeli * duplicate. page_migration could break later if rmap breaks,
22162c653d0eSAndrea Arcangeli * so we can as well crash here. We really need to check for
22172c653d0eSAndrea Arcangeli * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
2218457aef94SEthon Paul * for other negative values as an underflow if detected here
22192c653d0eSAndrea Arcangeli * for the first time (and not when decreasing rmap_hlist_len)
22202c653d0eSAndrea Arcangeli * would be sign of memory corruption in the stable_node.
22212c653d0eSAndrea Arcangeli */
22222c653d0eSAndrea Arcangeli BUG_ON(stable_node->rmap_hlist_len < 0);
22232c653d0eSAndrea Arcangeli
22242c653d0eSAndrea Arcangeli stable_node->rmap_hlist_len++;
22252c653d0eSAndrea Arcangeli if (!max_page_sharing_bypass)
22262c653d0eSAndrea Arcangeli /* possibly non fatal but unexpected overflow, only warn */
22272c653d0eSAndrea Arcangeli WARN_ON_ONCE(stable_node->rmap_hlist_len >
22282c653d0eSAndrea Arcangeli ksm_max_page_sharing);
22292c653d0eSAndrea Arcangeli
22307b6ba2c7SHugh Dickins rmap_item->head = stable_node;
223131dbd01fSIzik Eidus rmap_item->address |= STABLE_FLAG;
22327b6ba2c7SHugh Dickins hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
2233e178dfdeSHugh Dickins
22347b6ba2c7SHugh Dickins if (rmap_item->hlist.next)
2235e178dfdeSHugh Dickins ksm_pages_sharing++;
22367b6ba2c7SHugh Dickins else
22377b6ba2c7SHugh Dickins ksm_pages_shared++;
223876093853Sxu xin
223976093853Sxu xin rmap_item->mm->ksm_merging_pages++;
224031dbd01fSIzik Eidus }
224131dbd01fSIzik Eidus
224231dbd01fSIzik Eidus /*
224381464e30SHugh Dickins * cmp_and_merge_page - first see if page can be merged into the stable tree;
224481464e30SHugh Dickins * if not, compare checksum to previous and if it's the same, see if page can
224581464e30SHugh Dickins * be inserted into the unstable tree, or merged with a page already there and
224681464e30SHugh Dickins * both transferred to the stable tree.
224731dbd01fSIzik Eidus *
224831dbd01fSIzik Eidus * @page: the page that we are searching identical page to.
224931dbd01fSIzik Eidus * @rmap_item: the reverse mapping into the virtual address of this page
225031dbd01fSIzik Eidus */
cmp_and_merge_page(struct page * page,struct ksm_rmap_item * rmap_item)225121fbd591SQi Zheng static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
225231dbd01fSIzik Eidus {
22531b00ab48SMatthew Wilcox (Oracle) struct folio *folio = page_folio(page);
225421fbd591SQi Zheng struct ksm_rmap_item *tree_rmap_item;
22558dd3557aSHugh Dickins struct page *tree_page = NULL;
225621fbd591SQi Zheng struct ksm_stable_node *stable_node;
225798c3ca00SMatthew Wilcox (Oracle) struct folio *kfolio;
225831dbd01fSIzik Eidus unsigned int checksum;
225931dbd01fSIzik Eidus int err;
22602c653d0eSAndrea Arcangeli bool max_page_sharing_bypass = false;
226131dbd01fSIzik Eidus
22621b00ab48SMatthew Wilcox (Oracle) stable_node = folio_stable_node(folio);
22634146d2d6SHugh Dickins if (stable_node) {
22644146d2d6SHugh Dickins if (stable_node->head != &migrate_nodes &&
22652c653d0eSAndrea Arcangeli get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
22662c653d0eSAndrea Arcangeli NUMA(stable_node->nid)) {
22672c653d0eSAndrea Arcangeli stable_node_dup_del(stable_node);
22684146d2d6SHugh Dickins stable_node->head = &migrate_nodes;
22694146d2d6SHugh Dickins list_add(&stable_node->list, stable_node->head);
22704146d2d6SHugh Dickins }
22714146d2d6SHugh Dickins if (stable_node->head != &migrate_nodes &&
22724146d2d6SHugh Dickins rmap_item->head == stable_node)
22734146d2d6SHugh Dickins return;
22742c653d0eSAndrea Arcangeli /*
22752c653d0eSAndrea Arcangeli * If it's a KSM fork, allow it to go over the sharing limit
22762c653d0eSAndrea Arcangeli * without warnings.
22772c653d0eSAndrea Arcangeli */
22782c653d0eSAndrea Arcangeli if (!is_page_sharing_candidate(stable_node))
22792c653d0eSAndrea Arcangeli max_page_sharing_bypass = true;
2280d58a361bSChengming Zhou } else {
2281d58a361bSChengming Zhou remove_rmap_item_from_tree(rmap_item);
2282d58a361bSChengming Zhou
2283d58a361bSChengming Zhou /*
2284d58a361bSChengming Zhou * If the hash value of the page has changed from the last time
2285d58a361bSChengming Zhou * we calculated it, this page is changing frequently: therefore we
2286d58a361bSChengming Zhou * don't want to insert it in the unstable tree, and we don't want
2287d58a361bSChengming Zhou * to waste our time searching for something identical to it there.
2288d58a361bSChengming Zhou */
2289d58a361bSChengming Zhou checksum = calc_checksum(page);
2290d58a361bSChengming Zhou if (rmap_item->oldchecksum != checksum) {
2291d58a361bSChengming Zhou rmap_item->oldchecksum = checksum;
2292d58a361bSChengming Zhou return;
2293d58a361bSChengming Zhou }
2294d58a361bSChengming Zhou
2295d58a361bSChengming Zhou if (!try_to_merge_with_zero_page(rmap_item, page))
2296d58a361bSChengming Zhou return;
22974146d2d6SHugh Dickins }
229831dbd01fSIzik Eidus
229998c3ca00SMatthew Wilcox (Oracle) /* Start by searching for the folio in the stable tree */
230098c3ca00SMatthew Wilcox (Oracle) kfolio = stable_tree_search(page);
23011b00ab48SMatthew Wilcox (Oracle) if (kfolio == folio && rmap_item->head == stable_node) {
230298c3ca00SMatthew Wilcox (Oracle) folio_put(kfolio);
23034146d2d6SHugh Dickins return;
23044146d2d6SHugh Dickins }
23054146d2d6SHugh Dickins
23064146d2d6SHugh Dickins remove_rmap_item_from_tree(rmap_item);
23074146d2d6SHugh Dickins
230898c3ca00SMatthew Wilcox (Oracle) if (kfolio) {
230998c3ca00SMatthew Wilcox (Oracle) if (kfolio == ERR_PTR(-EBUSY))
23102cee57d1SYang Shi return;
23112cee57d1SYang Shi
231298c3ca00SMatthew Wilcox (Oracle) err = try_to_merge_with_ksm_page(rmap_item, page, &kfolio->page);
231331dbd01fSIzik Eidus if (!err) {
231431dbd01fSIzik Eidus /*
231531dbd01fSIzik Eidus * The page was successfully merged:
231631dbd01fSIzik Eidus * add its rmap_item to the stable tree.
231731dbd01fSIzik Eidus */
231898c3ca00SMatthew Wilcox (Oracle) folio_lock(kfolio);
231998c3ca00SMatthew Wilcox (Oracle) stable_tree_append(rmap_item, folio_stable_node(kfolio),
23202c653d0eSAndrea Arcangeli max_page_sharing_bypass);
232198c3ca00SMatthew Wilcox (Oracle) folio_unlock(kfolio);
232231dbd01fSIzik Eidus }
232398c3ca00SMatthew Wilcox (Oracle) folio_put(kfolio);
232431dbd01fSIzik Eidus return;
232531dbd01fSIzik Eidus }
232631dbd01fSIzik Eidus
23278dd3557aSHugh Dickins tree_rmap_item =
23288dd3557aSHugh Dickins unstable_tree_search_insert(rmap_item, page, &tree_page);
232931dbd01fSIzik Eidus if (tree_rmap_item) {
233077da2ba0SClaudio Imbrenda bool split;
233177da2ba0SClaudio Imbrenda
233298c3ca00SMatthew Wilcox (Oracle) kfolio = try_to_merge_two_pages(rmap_item, page,
23338dd3557aSHugh Dickins tree_rmap_item, tree_page);
233477da2ba0SClaudio Imbrenda /*
233577da2ba0SClaudio Imbrenda * If both pages we tried to merge belong to the same compound
233677da2ba0SClaudio Imbrenda * page, then we actually ended up increasing the reference
233777da2ba0SClaudio Imbrenda * count of the same compound page twice, and split_huge_page
233877da2ba0SClaudio Imbrenda * failed.
233977da2ba0SClaudio Imbrenda * Here we set a flag if that happened, and we use it later to
234077da2ba0SClaudio Imbrenda * try split_huge_page again. Since we call put_page right
234177da2ba0SClaudio Imbrenda * afterwards, the reference count will be correct and
234277da2ba0SClaudio Imbrenda * split_huge_page should succeed.
234377da2ba0SClaudio Imbrenda */
234477da2ba0SClaudio Imbrenda split = PageTransCompound(page)
234577da2ba0SClaudio Imbrenda && compound_head(page) == compound_head(tree_page);
23468dd3557aSHugh Dickins put_page(tree_page);
234798c3ca00SMatthew Wilcox (Oracle) if (kfolio) {
2348bc56620bSHugh Dickins /*
2349bc56620bSHugh Dickins * The pages were successfully merged: insert new
2350bc56620bSHugh Dickins * node in the stable tree and add both rmap_items.
2351bc56620bSHugh Dickins */
235298c3ca00SMatthew Wilcox (Oracle) folio_lock(kfolio);
235398c3ca00SMatthew Wilcox (Oracle) stable_node = stable_tree_insert(kfolio);
23547b6ba2c7SHugh Dickins if (stable_node) {
23552c653d0eSAndrea Arcangeli stable_tree_append(tree_rmap_item, stable_node,
23562c653d0eSAndrea Arcangeli false);
23572c653d0eSAndrea Arcangeli stable_tree_append(rmap_item, stable_node,
23582c653d0eSAndrea Arcangeli false);
23597b6ba2c7SHugh Dickins }
236098c3ca00SMatthew Wilcox (Oracle) folio_unlock(kfolio);
23617b6ba2c7SHugh Dickins
236231dbd01fSIzik Eidus /*
236331dbd01fSIzik Eidus * If we fail to insert the page into the stable tree,
236431dbd01fSIzik Eidus * we will have 2 virtual addresses that are pointing
236531dbd01fSIzik Eidus * to a ksm page left outside the stable tree,
236631dbd01fSIzik Eidus * in which case we need to break_cow on both.
236731dbd01fSIzik Eidus */
23687b6ba2c7SHugh Dickins if (!stable_node) {
23698dd3557aSHugh Dickins break_cow(tree_rmap_item);
23708dd3557aSHugh Dickins break_cow(rmap_item);
237131dbd01fSIzik Eidus }
237277da2ba0SClaudio Imbrenda } else if (split) {
237377da2ba0SClaudio Imbrenda /*
237477da2ba0SClaudio Imbrenda * We are here if we tried to merge two pages and
237577da2ba0SClaudio Imbrenda * failed because they both belonged to the same
237677da2ba0SClaudio Imbrenda * compound page. We will split the page now, but no
237777da2ba0SClaudio Imbrenda * merging will take place.
237877da2ba0SClaudio Imbrenda * We do not want to add the cost of a full lock; if
237977da2ba0SClaudio Imbrenda * the page is locked, it is better to skip it and
238077da2ba0SClaudio Imbrenda * perhaps try again later.
238177da2ba0SClaudio Imbrenda */
23821b00ab48SMatthew Wilcox (Oracle) if (!folio_trylock(folio))
238377da2ba0SClaudio Imbrenda return;
238477da2ba0SClaudio Imbrenda split_huge_page(page);
23851b00ab48SMatthew Wilcox (Oracle) folio = page_folio(page);
23861b00ab48SMatthew Wilcox (Oracle) folio_unlock(folio);
238731dbd01fSIzik Eidus }
238831dbd01fSIzik Eidus }
238931dbd01fSIzik Eidus }
239031dbd01fSIzik Eidus
get_next_rmap_item(struct ksm_mm_slot * mm_slot,struct ksm_rmap_item ** rmap_list,unsigned long addr)239121fbd591SQi Zheng static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
239221fbd591SQi Zheng struct ksm_rmap_item **rmap_list,
239331dbd01fSIzik Eidus unsigned long addr)
239431dbd01fSIzik Eidus {
239521fbd591SQi Zheng struct ksm_rmap_item *rmap_item;
239631dbd01fSIzik Eidus
23976514d511SHugh Dickins while (*rmap_list) {
23986514d511SHugh Dickins rmap_item = *rmap_list;
239993d17715SHugh Dickins if ((rmap_item->address & PAGE_MASK) == addr)
240031dbd01fSIzik Eidus return rmap_item;
240131dbd01fSIzik Eidus if (rmap_item->address > addr)
240231dbd01fSIzik Eidus break;
24036514d511SHugh Dickins *rmap_list = rmap_item->rmap_list;
240431dbd01fSIzik Eidus remove_rmap_item_from_tree(rmap_item);
240531dbd01fSIzik Eidus free_rmap_item(rmap_item);
240631dbd01fSIzik Eidus }
240731dbd01fSIzik Eidus
240831dbd01fSIzik Eidus rmap_item = alloc_rmap_item();
240931dbd01fSIzik Eidus if (rmap_item) {
241031dbd01fSIzik Eidus /* It has already been zeroed */
241158730ab6SQi Zheng rmap_item->mm = mm_slot->slot.mm;
2412cb4df4caSxu xin rmap_item->mm->ksm_rmap_items++;
241331dbd01fSIzik Eidus rmap_item->address = addr;
24146514d511SHugh Dickins rmap_item->rmap_list = *rmap_list;
24156514d511SHugh Dickins *rmap_list = rmap_item;
241631dbd01fSIzik Eidus }
241731dbd01fSIzik Eidus return rmap_item;
241831dbd01fSIzik Eidus }
241931dbd01fSIzik Eidus
24205e924ff5SStefan Roesch /*
24215e924ff5SStefan Roesch * Calculate skip age for the ksm page age. The age determines how often
24225e924ff5SStefan Roesch * de-duplicating has already been tried unsuccessfully. If the age is
24235e924ff5SStefan Roesch * smaller, the scanning of this page is skipped for less scans.
24245e924ff5SStefan Roesch *
24255e924ff5SStefan Roesch * @age: rmap_item age of page
24265e924ff5SStefan Roesch */
skip_age(rmap_age_t age)24275e924ff5SStefan Roesch static unsigned int skip_age(rmap_age_t age)
24285e924ff5SStefan Roesch {
24295e924ff5SStefan Roesch if (age <= 3)
24305e924ff5SStefan Roesch return 1;
24315e924ff5SStefan Roesch if (age <= 5)
24325e924ff5SStefan Roesch return 2;
24335e924ff5SStefan Roesch if (age <= 8)
24345e924ff5SStefan Roesch return 4;
24355e924ff5SStefan Roesch
24365e924ff5SStefan Roesch return 8;
24375e924ff5SStefan Roesch }
24385e924ff5SStefan Roesch
24395e924ff5SStefan Roesch /*
24405e924ff5SStefan Roesch * Determines if a page should be skipped for the current scan.
24415e924ff5SStefan Roesch *
244276f1a826SMatthew Wilcox (Oracle) * @folio: folio containing the page to check
24435e924ff5SStefan Roesch * @rmap_item: associated rmap_item of page
24445e924ff5SStefan Roesch */
should_skip_rmap_item(struct folio * folio,struct ksm_rmap_item * rmap_item)244576f1a826SMatthew Wilcox (Oracle) static bool should_skip_rmap_item(struct folio *folio,
24465e924ff5SStefan Roesch struct ksm_rmap_item *rmap_item)
24475e924ff5SStefan Roesch {
24485e924ff5SStefan Roesch rmap_age_t age;
24495e924ff5SStefan Roesch
24505e924ff5SStefan Roesch if (!ksm_smart_scan)
24515e924ff5SStefan Roesch return false;
24525e924ff5SStefan Roesch
24535e924ff5SStefan Roesch /*
24545e924ff5SStefan Roesch * Never skip pages that are already KSM; pages cmp_and_merge_page()
24555e924ff5SStefan Roesch * will essentially ignore them, but we still have to process them
24565e924ff5SStefan Roesch * properly.
24575e924ff5SStefan Roesch */
245876f1a826SMatthew Wilcox (Oracle) if (folio_test_ksm(folio))
24595e924ff5SStefan Roesch return false;
24605e924ff5SStefan Roesch
24615e924ff5SStefan Roesch age = rmap_item->age;
24625e924ff5SStefan Roesch if (age != U8_MAX)
24635e924ff5SStefan Roesch rmap_item->age++;
24645e924ff5SStefan Roesch
24655e924ff5SStefan Roesch /*
24665e924ff5SStefan Roesch * Smaller ages are not skipped, they need to get a chance to go
24675e924ff5SStefan Roesch * through the different phases of the KSM merging.
24685e924ff5SStefan Roesch */
24695e924ff5SStefan Roesch if (age < 3)
24705e924ff5SStefan Roesch return false;
24715e924ff5SStefan Roesch
24725e924ff5SStefan Roesch /*
24735e924ff5SStefan Roesch * Are we still allowed to skip? If not, then don't skip it
24745e924ff5SStefan Roesch * and determine how much more often we are allowed to skip next.
24755e924ff5SStefan Roesch */
24765e924ff5SStefan Roesch if (!rmap_item->remaining_skips) {
24775e924ff5SStefan Roesch rmap_item->remaining_skips = skip_age(age);
24785e924ff5SStefan Roesch return false;
24795e924ff5SStefan Roesch }
24805e924ff5SStefan Roesch
24815e924ff5SStefan Roesch /* Skip this page */
2482e5a68991SStefan Roesch ksm_pages_skipped++;
24835e924ff5SStefan Roesch rmap_item->remaining_skips--;
24845e924ff5SStefan Roesch remove_rmap_item_from_tree(rmap_item);
24855e924ff5SStefan Roesch return true;
24865e924ff5SStefan Roesch }
24875e924ff5SStefan Roesch
/*
 * Result slot filled in by ksm_next_page_pmd_entry() when it finds a
 * candidate page.
 */
struct ksm_next_page_arg {
	/* Folio containing @page; a reference is taken before it is stored. */
	struct folio *folio;
	/* The candidate page itself. */
	struct page *page;
	/* Virtual address at which @page was found. */
	unsigned long addr;
};
2493f5548c31SPedro Demarchi Gomes
/*
 * ksm_next_page_pmd_entry - pmd_entry callback looking for the next
 * candidate page in [@addr, @end).
 *
 * A candidate is a normal page whose folio is anonymous and not a
 * zone-device folio.  For a leaf PMD only the page mapped at @addr is
 * considered; otherwise the PTEs of the range are examined in order.
 *
 * On success a folio reference is taken, page/folio/addr are stored in the
 * struct ksm_next_page_arg at walk->private and 1 is returned.  Returns 0
 * when nothing was found, when the mm is exiting, or when the PMD/PTE table
 * is not there.
 */
static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
		struct mm_walk *walk)
{
	struct ksm_next_page_arg *result = walk->private;
	struct vm_area_struct *vma = walk->vma;
	struct mm_struct *mm = walk->mm;
	pte_t *pte_base = NULL;
	struct folio *folio;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep;
	pte_t pte;
	pmd_t pmd;

	if (ksm_test_exit(mm))
		return 0;

	cond_resched();

	pmd = pmdp_get_lockless(pmdp);
	if (!pmd_present(pmd))
		return 0;

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
		/* Re-read the PMD under its lock before trusting it. */
		ptl = pmd_lock(mm, pmdp);
		pmd = pmdp_get(pmdp);

		if (!pmd_present(pmd))
			goto not_found_unlock;

		if (pmd_leaf(pmd)) {
			page = vm_normal_page_pmd(vma, addr, pmd);
			if (!page)
				goto not_found_unlock;

			folio = page_folio(page);
			if (folio_is_zone_device(folio) ||
			    !folio_test_anon(folio))
				goto not_found_unlock;

			/* Advance to the subpage that maps @addr. */
			page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
			goto found_unlock;
		}

		/* No longer a leaf: fall through to the PTE scan. */
		spin_unlock(ptl);
	}

	pte_base = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!pte_base)
		return 0;

	for (ptep = pte_base; addr < end; ptep++, addr += PAGE_SIZE) {
		pte = ptep_get(ptep);
		if (!pte_present(pte))
			continue;

		page = vm_normal_page(vma, addr, pte);
		if (!page)
			continue;

		folio = page_folio(page);
		if (!folio_is_zone_device(folio) && folio_test_anon(folio))
			goto found_unlock;
	}

not_found_unlock:
	spin_unlock(ptl);
	if (pte_base)
		pte_unmap(pte_base);
	return 0;

found_unlock:
	/* Pin the folio before the page table lock is dropped. */
	folio_get(folio);
	spin_unlock(ptl);
	if (pte_base)
		pte_unmap(pte_base);

	result->page = page;
	result->folio = folio;
	result->addr = addr;
	return 1;
}
2571f5548c31SPedro Demarchi Gomes
2572f5548c31SPedro Demarchi Gomes static struct mm_walk_ops ksm_next_page_ops = {
2573f5548c31SPedro Demarchi Gomes .pmd_entry = ksm_next_page_pmd_entry,
2574f5548c31SPedro Demarchi Gomes .walk_lock = PGWALK_RDLOCK,
2575f5548c31SPedro Demarchi Gomes };
2576f5548c31SPedro Demarchi Gomes
scan_get_next_rmap_item(struct page ** page)257721fbd591SQi Zheng static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
257831dbd01fSIzik Eidus {
257931dbd01fSIzik Eidus struct mm_struct *mm;
258058730ab6SQi Zheng struct ksm_mm_slot *mm_slot;
258158730ab6SQi Zheng struct mm_slot *slot;
258231dbd01fSIzik Eidus struct vm_area_struct *vma;
258321fbd591SQi Zheng struct ksm_rmap_item *rmap_item;
2584a5f18ba0SMatthew Wilcox (Oracle) struct vma_iterator vmi;
258590bd6fd3SPetr Holasek int nid;
258631dbd01fSIzik Eidus
258758730ab6SQi Zheng if (list_empty(&ksm_mm_head.slot.mm_node))
258831dbd01fSIzik Eidus return NULL;
258931dbd01fSIzik Eidus
259058730ab6SQi Zheng mm_slot = ksm_scan.mm_slot;
259158730ab6SQi Zheng if (mm_slot == &ksm_mm_head) {
25924e5fa4f5SStefan Roesch advisor_start_scan();
2593739100c8SStefan Roesch trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
2594739100c8SStefan Roesch
25952919bfd0SHugh Dickins /*
25961fec6890SMatthew Wilcox (Oracle) * A number of pages can hang around indefinitely in per-cpu
25971fec6890SMatthew Wilcox (Oracle) * LRU cache, raised page count preventing write_protect_page
25982919bfd0SHugh Dickins * from merging them. Though it doesn't really matter much,
25992919bfd0SHugh Dickins * it is puzzling to see some stuck in pages_volatile until
26002919bfd0SHugh Dickins * other activity jostles them out, and they also prevented
26012919bfd0SHugh Dickins * LTP's KSM test from succeeding deterministically; so drain
26022919bfd0SHugh Dickins * them here (here rather than on entry to ksm_do_scan(),
26032919bfd0SHugh Dickins * so we don't IPI too often when pages_to_scan is set low).
26042919bfd0SHugh Dickins */
26052919bfd0SHugh Dickins lru_add_drain_all();
26062919bfd0SHugh Dickins
26074146d2d6SHugh Dickins /*
26084146d2d6SHugh Dickins * Whereas stale stable_nodes on the stable_tree itself
26094146d2d6SHugh Dickins * get pruned in the regular course of stable_tree_search(),
26104146d2d6SHugh Dickins * those moved out to the migrate_nodes list can accumulate:
26114146d2d6SHugh Dickins * so prune them once before each full scan.
26124146d2d6SHugh Dickins */
26134146d2d6SHugh Dickins if (!ksm_merge_across_nodes) {
261421fbd591SQi Zheng struct ksm_stable_node *stable_node, *next;
261572556a4cSAlex Shi (tencent) struct folio *folio;
26164146d2d6SHugh Dickins
261703640418SGeliang Tang list_for_each_entry_safe(stable_node, next,
261803640418SGeliang Tang &migrate_nodes, list) {
261972556a4cSAlex Shi (tencent) folio = ksm_get_folio(stable_node,
262085b67b01SDavid Hildenbrand KSM_GET_FOLIO_NOLOCK);
262172556a4cSAlex Shi (tencent) if (folio)
262272556a4cSAlex Shi (tencent) folio_put(folio);
26234146d2d6SHugh Dickins cond_resched();
26244146d2d6SHugh Dickins }
26254146d2d6SHugh Dickins }
26264146d2d6SHugh Dickins
2627ef53d16cSHugh Dickins for (nid = 0; nid < ksm_nr_node_ids; nid++)
262890bd6fd3SPetr Holasek root_unstable_tree[nid] = RB_ROOT;
262931dbd01fSIzik Eidus
263031dbd01fSIzik Eidus spin_lock(&ksm_mmlist_lock);
263158730ab6SQi Zheng slot = list_entry(mm_slot->slot.mm_node.next,
263258730ab6SQi Zheng struct mm_slot, mm_node);
263358730ab6SQi Zheng mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
263458730ab6SQi Zheng ksm_scan.mm_slot = mm_slot;
263531dbd01fSIzik Eidus spin_unlock(&ksm_mmlist_lock);
26362b472611SHugh Dickins /*
26372b472611SHugh Dickins * Although we tested list_empty() above, a racing __ksm_exit
26382b472611SHugh Dickins * of the last mm on the list may have removed it since then.
26392b472611SHugh Dickins */
264058730ab6SQi Zheng if (mm_slot == &ksm_mm_head)
26412b472611SHugh Dickins return NULL;
264231dbd01fSIzik Eidus next_mm:
264331dbd01fSIzik Eidus ksm_scan.address = 0;
264458730ab6SQi Zheng ksm_scan.rmap_list = &mm_slot->rmap_list;
264531dbd01fSIzik Eidus }
264631dbd01fSIzik Eidus
264758730ab6SQi Zheng slot = &mm_slot->slot;
264831dbd01fSIzik Eidus mm = slot->mm;
2649a5f18ba0SMatthew Wilcox (Oracle) vma_iter_init(&vmi, mm, ksm_scan.address);
2650a5f18ba0SMatthew Wilcox (Oracle)
2651d8ed45c5SMichel Lespinasse mmap_read_lock(mm);
26529ba69294SHugh Dickins if (ksm_test_exit(mm))
2653a5f18ba0SMatthew Wilcox (Oracle) goto no_vmas;
26549ba69294SHugh Dickins
2655a5f18ba0SMatthew Wilcox (Oracle) for_each_vma(vmi, vma) {
265631dbd01fSIzik Eidus if (!(vma->vm_flags & VM_MERGEABLE))
265731dbd01fSIzik Eidus continue;
265831dbd01fSIzik Eidus if (ksm_scan.address < vma->vm_start)
265931dbd01fSIzik Eidus ksm_scan.address = vma->vm_start;
266031dbd01fSIzik Eidus if (!vma->anon_vma)
266131dbd01fSIzik Eidus ksm_scan.address = vma->vm_end;
266231dbd01fSIzik Eidus
266331dbd01fSIzik Eidus while (ksm_scan.address < vma->vm_end) {
2664f5548c31SPedro Demarchi Gomes struct ksm_next_page_arg ksm_next_page_arg;
2665b1d3e9bbSDavid Hildenbrand struct page *tmp_page = NULL;
2666b1d3e9bbSDavid Hildenbrand struct folio *folio;
2667b1d3e9bbSDavid Hildenbrand
26689ba69294SHugh Dickins if (ksm_test_exit(mm))
26699ba69294SHugh Dickins break;
2670b1d3e9bbSDavid Hildenbrand
2671f5548c31SPedro Demarchi Gomes int found;
2672f5548c31SPedro Demarchi Gomes
2673f5548c31SPedro Demarchi Gomes found = walk_page_range_vma(vma, ksm_scan.address,
2674f5548c31SPedro Demarchi Gomes vma->vm_end,
2675f5548c31SPedro Demarchi Gomes &ksm_next_page_ops,
2676f5548c31SPedro Demarchi Gomes &ksm_next_page_arg);
2677f5548c31SPedro Demarchi Gomes
2678f5548c31SPedro Demarchi Gomes if (found > 0) {
2679f5548c31SPedro Demarchi Gomes folio = ksm_next_page_arg.folio;
2680f5548c31SPedro Demarchi Gomes tmp_page = ksm_next_page_arg.page;
2681f5548c31SPedro Demarchi Gomes ksm_scan.address = ksm_next_page_arg.addr;
2682f5548c31SPedro Demarchi Gomes } else {
2683f5548c31SPedro Demarchi Gomes VM_WARN_ON_ONCE(found < 0);
2684f5548c31SPedro Demarchi Gomes ksm_scan.address = vma->vm_end - PAGE_SIZE;
2685b1d3e9bbSDavid Hildenbrand }
2686b1d3e9bbSDavid Hildenbrand
2687b1d3e9bbSDavid Hildenbrand if (tmp_page) {
2688b1d3e9bbSDavid Hildenbrand flush_anon_page(vma, tmp_page, ksm_scan.address);
2689b1d3e9bbSDavid Hildenbrand flush_dcache_page(tmp_page);
269058730ab6SQi Zheng rmap_item = get_next_rmap_item(mm_slot,
26916514d511SHugh Dickins ksm_scan.rmap_list, ksm_scan.address);
269231dbd01fSIzik Eidus if (rmap_item) {
26936514d511SHugh Dickins ksm_scan.rmap_list =
26946514d511SHugh Dickins &rmap_item->rmap_list;
26955e924ff5SStefan Roesch
269676f1a826SMatthew Wilcox (Oracle) if (should_skip_rmap_item(folio, rmap_item)) {
2697b1d3e9bbSDavid Hildenbrand folio_put(folio);
26985e924ff5SStefan Roesch goto next_page;
2699b1d3e9bbSDavid Hildenbrand }
27005e924ff5SStefan Roesch
270131dbd01fSIzik Eidus ksm_scan.address += PAGE_SIZE;
2702b1d3e9bbSDavid Hildenbrand *page = tmp_page;
2703b1d3e9bbSDavid Hildenbrand } else {
2704b1d3e9bbSDavid Hildenbrand folio_put(folio);
2705b1d3e9bbSDavid Hildenbrand }
2706d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
270731dbd01fSIzik Eidus return rmap_item;
270831dbd01fSIzik Eidus }
2709f7091ed6SHaiyue Wang next_page:
271031dbd01fSIzik Eidus ksm_scan.address += PAGE_SIZE;
271131dbd01fSIzik Eidus cond_resched();
271231dbd01fSIzik Eidus }
271331dbd01fSIzik Eidus }
271431dbd01fSIzik Eidus
27159ba69294SHugh Dickins if (ksm_test_exit(mm)) {
2716a5f18ba0SMatthew Wilcox (Oracle) no_vmas:
27179ba69294SHugh Dickins ksm_scan.address = 0;
271858730ab6SQi Zheng ksm_scan.rmap_list = &mm_slot->rmap_list;
27199ba69294SHugh Dickins }
272031dbd01fSIzik Eidus /*
272131dbd01fSIzik Eidus * Nuke all the rmap_items that are above this current rmap:
272231dbd01fSIzik Eidus * because there were no VM_MERGEABLE vmas with such addresses.
272331dbd01fSIzik Eidus */
2724420be4edSChengyang Fan remove_trailing_rmap_items(ksm_scan.rmap_list);
272531dbd01fSIzik Eidus
272631dbd01fSIzik Eidus spin_lock(&ksm_mmlist_lock);
272758730ab6SQi Zheng slot = list_entry(mm_slot->slot.mm_node.next,
272858730ab6SQi Zheng struct mm_slot, mm_node);
272958730ab6SQi Zheng ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
2730cd551f97SHugh Dickins if (ksm_scan.address == 0) {
2731cd551f97SHugh Dickins /*
2732c1e8d7c6SMichel Lespinasse * We've completed a full scan of all vmas, holding mmap_lock
2733cd551f97SHugh Dickins * throughout, and found no VM_MERGEABLE: so do the same as
2734cd551f97SHugh Dickins * __ksm_exit does to remove this mm from all our lists now.
27359ba69294SHugh Dickins * This applies either when cleaning up after __ksm_exit
27369ba69294SHugh Dickins * (but beware: we can reach here even before __ksm_exit),
27379ba69294SHugh Dickins * or when all VM_MERGEABLE areas have been unmapped (and
2738c1e8d7c6SMichel Lespinasse * mmap_lock then protects against race with MADV_MERGEABLE).
2739cd551f97SHugh Dickins */
274058730ab6SQi Zheng hash_del(&mm_slot->slot.hash);
274158730ab6SQi Zheng list_del(&mm_slot->slot.mm_node);
27429ba69294SHugh Dickins spin_unlock(&ksm_mmlist_lock);
27439ba69294SHugh Dickins
274458730ab6SQi Zheng mm_slot_free(mm_slot_cache, mm_slot);
2745590c03caSxu xin /*
2746590c03caSxu xin * Only clear MMF_VM_MERGEABLE. We must not clear
2747590c03caSxu xin * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process,
2748590c03caSxu xin * perhaps their mm_struct has just been added to ksm_mm_slot
2749590c03caSxu xin * list, and its process has not yet officially started running
2750590c03caSxu xin * or has not yet performed mmap/brk to allocate anonymous VMAS.
2751590c03caSxu xin */
275212e423baSLorenzo Stoakes mm_flags_clear(MMF_VM_MERGEABLE, mm);
2753d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
27549ba69294SHugh Dickins mmdrop(mm);
27559ba69294SHugh Dickins } else {
2756d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
27577496fea9SZhou Chengming /*
27583e4e28c5SMichel Lespinasse * mmap_read_unlock(mm) first because after
27597496fea9SZhou Chengming * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
27607496fea9SZhou Chengming * already have been freed under us by __ksm_exit()
27617496fea9SZhou Chengming * because the "mm_slot" is still hashed and
27627496fea9SZhou Chengming * ksm_scan.mm_slot doesn't point to it anymore.
27637496fea9SZhou Chengming */
27647496fea9SZhou Chengming spin_unlock(&ksm_mmlist_lock);
27659ba69294SHugh Dickins }
276631dbd01fSIzik Eidus
276731dbd01fSIzik Eidus /* Repeat until we've completed scanning the whole list */
276858730ab6SQi Zheng mm_slot = ksm_scan.mm_slot;
276958730ab6SQi Zheng if (mm_slot != &ksm_mm_head)
277031dbd01fSIzik Eidus goto next_mm;
277131dbd01fSIzik Eidus
27724e5fa4f5SStefan Roesch advisor_stop_scan();
27734e5fa4f5SStefan Roesch
2774739100c8SStefan Roesch trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
277531dbd01fSIzik Eidus ksm_scan.seqnr++;
277631dbd01fSIzik Eidus return NULL;
277731dbd01fSIzik Eidus }
277831dbd01fSIzik Eidus
277931dbd01fSIzik Eidus /**
278031dbd01fSIzik Eidus * ksm_do_scan - the ksm scanner main worker function.
2781b7701a5fSMike Rapoport * @scan_npages: number of pages we want to scan before we return.
278231dbd01fSIzik Eidus */
ksm_do_scan(unsigned int scan_npages)278331dbd01fSIzik Eidus static void ksm_do_scan(unsigned int scan_npages)
278431dbd01fSIzik Eidus {
278521fbd591SQi Zheng struct ksm_rmap_item *rmap_item;
27863f649ab7SKees Cook struct page *page;
278731dbd01fSIzik Eidus
2788730cdc2cSChengming Zhou while (scan_npages-- && likely(!freezing(current))) {
278931dbd01fSIzik Eidus cond_resched();
279031dbd01fSIzik Eidus rmap_item = scan_get_next_rmap_item(&page);
279131dbd01fSIzik Eidus if (!rmap_item)
279231dbd01fSIzik Eidus return;
279331dbd01fSIzik Eidus cmp_and_merge_page(page, rmap_item);
279431dbd01fSIzik Eidus put_page(page);
2795730cdc2cSChengming Zhou ksm_pages_scanned++;
279631dbd01fSIzik Eidus }
279731dbd01fSIzik Eidus }
279831dbd01fSIzik Eidus
ksmd_should_run(void)27996e158384SHugh Dickins static int ksmd_should_run(void)
28006e158384SHugh Dickins {
280158730ab6SQi Zheng return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node);
28026e158384SHugh Dickins }
28036e158384SHugh Dickins
ksm_scan_thread(void * nothing)280431dbd01fSIzik Eidus static int ksm_scan_thread(void *nothing)
280531dbd01fSIzik Eidus {
2806fcf9a0efSKirill Tkhai unsigned int sleep_ms;
2807fcf9a0efSKirill Tkhai
2808878aee7dSAndrea Arcangeli set_freezable();
2809339aa624SIzik Eidus set_user_nice(current, 5);
281031dbd01fSIzik Eidus
281131dbd01fSIzik Eidus while (!kthread_should_stop()) {
281231dbd01fSIzik Eidus mutex_lock(&ksm_thread_mutex);
2813ef4d43a8SHugh Dickins wait_while_offlining();
28146e158384SHugh Dickins if (ksmd_should_run())
281531dbd01fSIzik Eidus ksm_do_scan(ksm_thread_pages_to_scan);
281631dbd01fSIzik Eidus mutex_unlock(&ksm_thread_mutex);
28176e158384SHugh Dickins
28186e158384SHugh Dickins if (ksmd_should_run()) {
2819fcf9a0efSKirill Tkhai sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
2820f55afd95SKevin Hao wait_event_freezable_timeout(ksm_iter_wait,
2821fcf9a0efSKirill Tkhai sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
2822fcf9a0efSKirill Tkhai msecs_to_jiffies(sleep_ms));
282331dbd01fSIzik Eidus } else {
2824878aee7dSAndrea Arcangeli wait_event_freezable(ksm_thread_wait,
28256e158384SHugh Dickins ksmd_should_run() || kthread_should_stop());
282631dbd01fSIzik Eidus }
282731dbd01fSIzik Eidus }
282831dbd01fSIzik Eidus return 0;
282931dbd01fSIzik Eidus }
283031dbd01fSIzik Eidus
__ksm_should_add_vma(const struct file * file,vma_flags_t vma_flags)2831*3a6455d5SLorenzo Stoakes (Oracle) static bool __ksm_should_add_vma(const struct file *file, vma_flags_t vma_flags)
2832de195c67SLorenzo Stoakes {
2833*3a6455d5SLorenzo Stoakes (Oracle) if (vma_flags_test(&vma_flags, VMA_MERGEABLE_BIT))
2834de195c67SLorenzo Stoakes return false;
2835de195c67SLorenzo Stoakes
2836*3a6455d5SLorenzo Stoakes (Oracle) return ksm_compatible(file, vma_flags);
2837de195c67SLorenzo Stoakes }
2838de195c67SLorenzo Stoakes
__ksm_add_vma(struct vm_area_struct * vma)2839d7597f59SStefan Roesch static void __ksm_add_vma(struct vm_area_struct *vma)
2840d7597f59SStefan Roesch {
2841*3a6455d5SLorenzo Stoakes (Oracle) if (__ksm_should_add_vma(vma->vm_file, vma->flags))
2842d7597f59SStefan Roesch vm_flags_set(vma, VM_MERGEABLE);
2843d7597f59SStefan Roesch }
2844d7597f59SStefan Roesch
__ksm_del_vma(struct vm_area_struct * vma)284524139c07SDavid Hildenbrand static int __ksm_del_vma(struct vm_area_struct *vma)
284624139c07SDavid Hildenbrand {
284724139c07SDavid Hildenbrand int err;
284824139c07SDavid Hildenbrand
284924139c07SDavid Hildenbrand if (!(vma->vm_flags & VM_MERGEABLE))
285024139c07SDavid Hildenbrand return 0;
285124139c07SDavid Hildenbrand
285224139c07SDavid Hildenbrand if (vma->anon_vma) {
285305c3fa9cSPedro Demarchi Gomes err = break_ksm(vma, vma->vm_start, vma->vm_end, true);
285424139c07SDavid Hildenbrand if (err)
285524139c07SDavid Hildenbrand return err;
285624139c07SDavid Hildenbrand }
285724139c07SDavid Hildenbrand
285824139c07SDavid Hildenbrand vm_flags_clear(vma, VM_MERGEABLE);
285924139c07SDavid Hildenbrand return 0;
286024139c07SDavid Hildenbrand }
2861d7597f59SStefan Roesch /**
2862cf7e7a35SLorenzo Stoakes * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible
2863d7597f59SStefan Roesch *
2864cf7e7a35SLorenzo Stoakes * @mm: Proposed VMA's mm_struct
2865cf7e7a35SLorenzo Stoakes * @file: Proposed VMA's file-backed mapping, if any.
2866*3a6455d5SLorenzo Stoakes (Oracle) * @vma_flags: Proposed VMA"s flags.
2867cf7e7a35SLorenzo Stoakes *
2868*3a6455d5SLorenzo Stoakes (Oracle) * Returns: @vma_flags possibly updated to mark mergeable.
2869d7597f59SStefan Roesch */
ksm_vma_flags(struct mm_struct * mm,const struct file * file,vma_flags_t vma_flags)2870*3a6455d5SLorenzo Stoakes (Oracle) vma_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file,
2871*3a6455d5SLorenzo Stoakes (Oracle) vma_flags_t vma_flags)
2872d7597f59SStefan Roesch {
287312e423baSLorenzo Stoakes if (mm_flags_test(MMF_VM_MERGE_ANY, mm) &&
2874*3a6455d5SLorenzo Stoakes (Oracle) __ksm_should_add_vma(file, vma_flags)) {
2875*3a6455d5SLorenzo Stoakes (Oracle) vma_flags_set(&vma_flags, VMA_MERGEABLE_BIT);
2876590c03caSxu xin /*
2877590c03caSxu xin * Generally, the flags here always include MMF_VM_MERGEABLE.
2878590c03caSxu xin * However, in rare cases, this flag may be cleared by ksmd who
2879590c03caSxu xin * scans a cycle without finding any mergeable vma.
2880590c03caSxu xin */
2881590c03caSxu xin if (unlikely(!mm_flags_test(MMF_VM_MERGEABLE, mm)))
2882590c03caSxu xin __ksm_enter(mm);
2883590c03caSxu xin }
2884d7597f59SStefan Roesch
2885*3a6455d5SLorenzo Stoakes (Oracle) return vma_flags;
2886d7597f59SStefan Roesch }
2887d7597f59SStefan Roesch
ksm_add_vmas(struct mm_struct * mm)2888d7597f59SStefan Roesch static void ksm_add_vmas(struct mm_struct *mm)
2889d7597f59SStefan Roesch {
2890d7597f59SStefan Roesch struct vm_area_struct *vma;
2891d7597f59SStefan Roesch
2892d7597f59SStefan Roesch VMA_ITERATOR(vmi, mm, 0);
2893d7597f59SStefan Roesch for_each_vma(vmi, vma)
2894d7597f59SStefan Roesch __ksm_add_vma(vma);
2895d7597f59SStefan Roesch }
2896d7597f59SStefan Roesch
ksm_del_vmas(struct mm_struct * mm)289724139c07SDavid Hildenbrand static int ksm_del_vmas(struct mm_struct *mm)
289824139c07SDavid Hildenbrand {
289924139c07SDavid Hildenbrand struct vm_area_struct *vma;
290024139c07SDavid Hildenbrand int err;
290124139c07SDavid Hildenbrand
290224139c07SDavid Hildenbrand VMA_ITERATOR(vmi, mm, 0);
290324139c07SDavid Hildenbrand for_each_vma(vmi, vma) {
290424139c07SDavid Hildenbrand err = __ksm_del_vma(vma);
290524139c07SDavid Hildenbrand if (err)
290624139c07SDavid Hildenbrand return err;
290724139c07SDavid Hildenbrand }
290824139c07SDavid Hildenbrand return 0;
290924139c07SDavid Hildenbrand }
291024139c07SDavid Hildenbrand
2911d7597f59SStefan Roesch /**
2912d7597f59SStefan Roesch * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
2913d7597f59SStefan Roesch * compatible VMA's
2914d7597f59SStefan Roesch *
2915d7597f59SStefan Roesch * @mm: Pointer to mm
2916d7597f59SStefan Roesch *
2917d7597f59SStefan Roesch * Returns 0 on success, otherwise error code
2918d7597f59SStefan Roesch */
ksm_enable_merge_any(struct mm_struct * mm)2919d7597f59SStefan Roesch int ksm_enable_merge_any(struct mm_struct *mm)
2920d7597f59SStefan Roesch {
2921d7597f59SStefan Roesch int err;
2922d7597f59SStefan Roesch
292312e423baSLorenzo Stoakes if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
2924d7597f59SStefan Roesch return 0;
2925d7597f59SStefan Roesch
292612e423baSLorenzo Stoakes if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) {
2927d7597f59SStefan Roesch err = __ksm_enter(mm);
2928d7597f59SStefan Roesch if (err)
2929d7597f59SStefan Roesch return err;
2930d7597f59SStefan Roesch }
2931d7597f59SStefan Roesch
293212e423baSLorenzo Stoakes mm_flags_set(MMF_VM_MERGE_ANY, mm);
2933d7597f59SStefan Roesch ksm_add_vmas(mm);
2934d7597f59SStefan Roesch
2935d7597f59SStefan Roesch return 0;
2936d7597f59SStefan Roesch }
2937d7597f59SStefan Roesch
293824139c07SDavid Hildenbrand /**
293924139c07SDavid Hildenbrand * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm,
294024139c07SDavid Hildenbrand * previously enabled via ksm_enable_merge_any().
294124139c07SDavid Hildenbrand *
294224139c07SDavid Hildenbrand * Disabling merging implies unmerging any merged pages, like setting
294324139c07SDavid Hildenbrand * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and
294424139c07SDavid Hildenbrand * merging on all compatible VMA's remains enabled.
294524139c07SDavid Hildenbrand *
294624139c07SDavid Hildenbrand * @mm: Pointer to mm
294724139c07SDavid Hildenbrand *
294824139c07SDavid Hildenbrand * Returns 0 on success, otherwise error code
294924139c07SDavid Hildenbrand */
ksm_disable_merge_any(struct mm_struct * mm)295024139c07SDavid Hildenbrand int ksm_disable_merge_any(struct mm_struct *mm)
295124139c07SDavid Hildenbrand {
295224139c07SDavid Hildenbrand int err;
295324139c07SDavid Hildenbrand
295412e423baSLorenzo Stoakes if (!mm_flags_test(MMF_VM_MERGE_ANY, mm))
295524139c07SDavid Hildenbrand return 0;
295624139c07SDavid Hildenbrand
295724139c07SDavid Hildenbrand err = ksm_del_vmas(mm);
295824139c07SDavid Hildenbrand if (err) {
295924139c07SDavid Hildenbrand ksm_add_vmas(mm);
296024139c07SDavid Hildenbrand return err;
296124139c07SDavid Hildenbrand }
296224139c07SDavid Hildenbrand
296312e423baSLorenzo Stoakes mm_flags_clear(MMF_VM_MERGE_ANY, mm);
296424139c07SDavid Hildenbrand return 0;
296524139c07SDavid Hildenbrand }
296624139c07SDavid Hildenbrand
ksm_disable(struct mm_struct * mm)29672c281f54SDavid Hildenbrand int ksm_disable(struct mm_struct *mm)
29682c281f54SDavid Hildenbrand {
29692c281f54SDavid Hildenbrand mmap_assert_write_locked(mm);
29702c281f54SDavid Hildenbrand
297112e423baSLorenzo Stoakes if (!mm_flags_test(MMF_VM_MERGEABLE, mm))
29722c281f54SDavid Hildenbrand return 0;
297312e423baSLorenzo Stoakes if (mm_flags_test(MMF_VM_MERGE_ANY, mm))
29742c281f54SDavid Hildenbrand return ksm_disable_merge_any(mm);
29752c281f54SDavid Hildenbrand return ksm_del_vmas(mm);
29762c281f54SDavid Hildenbrand }
29772c281f54SDavid Hildenbrand
/*
 * Apply MADV_MERGEABLE / MADV_UNMERGEABLE advice to @vma.
 *
 * The resulting flags are written through @vm_flags; the VMA itself is
 * not modified here.  Any other @advice value is accepted and ignored.
 *
 * MADV_MERGEABLE:   a VMA that is already mergeable or is rejected by
 *                   vma_ksm_compatible() is silently left alone; otherwise
 *                   the mm is entered (if needed) and VM_MERGEABLE is set.
 * MADV_UNMERGEABLE: if VM_MERGEABLE is set in *@vm_flags, break_ksm() is
 *                   run over [@start, @end) for VMAs with an anon_vma and
 *                   the flag is cleared.
 *
 * Returns 0 on success, otherwise the error from __ksm_enter() or
 * break_ksm().
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, vm_flags_t *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int ret;

	if (advice == MADV_MERGEABLE) {
		if ((vma->vm_flags & VM_MERGEABLE) || !vma_ksm_compatible(vma))
			return 0;

		if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) {
			ret = __ksm_enter(mm);
			if (ret)
				return ret;
		}

		*vm_flags |= VM_MERGEABLE;
	} else if (advice == MADV_UNMERGEABLE) {
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma) {
			ret = break_ksm(vma, start, end, true);
			if (ret)
				return ret;
		}

		*vm_flags &= ~VM_MERGEABLE;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(ksm_madvise);
3017f8af4da3SHugh Dickins
__ksm_enter(struct mm_struct * mm)3018f8af4da3SHugh Dickins int __ksm_enter(struct mm_struct *mm)
3019f8af4da3SHugh Dickins {
302021fbd591SQi Zheng struct ksm_mm_slot *mm_slot;
302158730ab6SQi Zheng struct mm_slot *slot;
30226e158384SHugh Dickins int needs_wakeup;
30236e158384SHugh Dickins
302458730ab6SQi Zheng mm_slot = mm_slot_alloc(mm_slot_cache);
302531dbd01fSIzik Eidus if (!mm_slot)
302631dbd01fSIzik Eidus return -ENOMEM;
302731dbd01fSIzik Eidus
302858730ab6SQi Zheng slot = &mm_slot->slot;
302958730ab6SQi Zheng
30306e158384SHugh Dickins /* Check ksm_run too? Would need tighter locking */
303158730ab6SQi Zheng needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node);
30326e158384SHugh Dickins
303331dbd01fSIzik Eidus spin_lock(&ksm_mmlist_lock);
303458730ab6SQi Zheng mm_slot_insert(mm_slots_hash, mm, slot);
303531dbd01fSIzik Eidus /*
3036cbf86cfeSHugh Dickins * When KSM_RUN_MERGE (or KSM_RUN_STOP),
3037cbf86cfeSHugh Dickins * insert just behind the scanning cursor, to let the area settle
303831dbd01fSIzik Eidus * down a little; when fork is followed by immediate exec, we don't
303931dbd01fSIzik Eidus * want ksmd to waste time setting up and tearing down an rmap_list.
3040cbf86cfeSHugh Dickins *
3041cbf86cfeSHugh Dickins * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
3042cbf86cfeSHugh Dickins * scanning cursor, otherwise KSM pages in newly forked mms will be
3043cbf86cfeSHugh Dickins * missed: then we might as well insert at the end of the list.
304431dbd01fSIzik Eidus */
3045cbf86cfeSHugh Dickins if (ksm_run & KSM_RUN_UNMERGE)
304658730ab6SQi Zheng list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node);
3047cbf86cfeSHugh Dickins else
304858730ab6SQi Zheng list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
304931dbd01fSIzik Eidus spin_unlock(&ksm_mmlist_lock);
305031dbd01fSIzik Eidus
305112e423baSLorenzo Stoakes mm_flags_set(MMF_VM_MERGEABLE, mm);
3052f1f10076SVegard Nossum mmgrab(mm);
30536e158384SHugh Dickins
30546e158384SHugh Dickins if (needs_wakeup)
30556e158384SHugh Dickins wake_up_interruptible(&ksm_thread_wait);
30566e158384SHugh Dickins
3057739100c8SStefan Roesch trace_ksm_enter(mm);
3058f8af4da3SHugh Dickins return 0;
3059f8af4da3SHugh Dickins }
3060f8af4da3SHugh Dickins
__ksm_exit(struct mm_struct * mm)30611c2fb7a4SAndrea Arcangeli void __ksm_exit(struct mm_struct *mm)
3062f8af4da3SHugh Dickins {
3063df6879a7SWei Yang struct ksm_mm_slot *mm_slot = NULL;
306458730ab6SQi Zheng struct mm_slot *slot;
30659ba69294SHugh Dickins int easy_to_free = 0;
3066cd551f97SHugh Dickins
306731dbd01fSIzik Eidus /*
30689ba69294SHugh Dickins * This process is exiting: if it's straightforward (as is the
30699ba69294SHugh Dickins * case when ksmd was never running), free mm_slot immediately.
30709ba69294SHugh Dickins * But if it's at the cursor or has rmap_items linked to it, use
3071c1e8d7c6SMichel Lespinasse * mmap_lock to synchronize with any break_cows before pagetables
30729ba69294SHugh Dickins * are freed, and leave the mm_slot on the list for ksmd to free.
30739ba69294SHugh Dickins * Beware: ksm may already have noticed it exiting and freed the slot.
307431dbd01fSIzik Eidus */
30759ba69294SHugh Dickins
3076cd551f97SHugh Dickins spin_lock(&ksm_mmlist_lock);
307758730ab6SQi Zheng slot = mm_slot_lookup(mm_slots_hash, mm);
3078df6879a7SWei Yang if (!slot)
3079df6879a7SWei Yang goto unlock;
308058730ab6SQi Zheng mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
3081df6879a7SWei Yang if (ksm_scan.mm_slot == mm_slot)
3082df6879a7SWei Yang goto unlock;
30836514d511SHugh Dickins if (!mm_slot->rmap_list) {
308458730ab6SQi Zheng hash_del(&slot->hash);
308558730ab6SQi Zheng list_del(&slot->mm_node);
30869ba69294SHugh Dickins easy_to_free = 1;
30879ba69294SHugh Dickins } else {
308858730ab6SQi Zheng list_move(&slot->mm_node,
308958730ab6SQi Zheng &ksm_scan.mm_slot->slot.mm_node);
30909ba69294SHugh Dickins }
3091df6879a7SWei Yang unlock:
3092cd551f97SHugh Dickins spin_unlock(&ksm_mmlist_lock);
3093cd551f97SHugh Dickins
30949ba69294SHugh Dickins if (easy_to_free) {
309558730ab6SQi Zheng mm_slot_free(mm_slot_cache, mm_slot);
309612e423baSLorenzo Stoakes mm_flags_clear(MMF_VM_MERGE_ANY, mm);
309712e423baSLorenzo Stoakes mm_flags_clear(MMF_VM_MERGEABLE, mm);
30989ba69294SHugh Dickins mmdrop(mm);
30999ba69294SHugh Dickins } else if (mm_slot) {
3100d8ed45c5SMichel Lespinasse mmap_write_lock(mm);
3101d8ed45c5SMichel Lespinasse mmap_write_unlock(mm);
31029ba69294SHugh Dickins }
3103739100c8SStefan Roesch
3104739100c8SStefan Roesch trace_ksm_exit(mm);
3105f8af4da3SHugh Dickins }
310631dbd01fSIzik Eidus
ksm_might_need_to_copy(struct folio * folio,struct vm_area_struct * vma,unsigned long addr)310796db66d9SMatthew Wilcox (Oracle) struct folio *ksm_might_need_to_copy(struct folio *folio,
31081486fb50SKefeng Wang struct vm_area_struct *vma, unsigned long addr)
31095ad64688SHugh Dickins {
311096db66d9SMatthew Wilcox (Oracle) struct page *page = folio_page(folio, 0);
3111e05b3453SMatthew Wilcox (Oracle) struct anon_vma *anon_vma = folio_anon_vma(folio);
31121486fb50SKefeng Wang struct folio *new_folio;
31135ad64688SHugh Dickins
31141486fb50SKefeng Wang if (folio_test_large(folio))
311596db66d9SMatthew Wilcox (Oracle) return folio;
31161486fb50SKefeng Wang
31171486fb50SKefeng Wang if (folio_test_ksm(folio)) {
31181486fb50SKefeng Wang if (folio_stable_node(folio) &&
3119cbf86cfeSHugh Dickins !(ksm_run & KSM_RUN_UNMERGE))
312096db66d9SMatthew Wilcox (Oracle) return folio; /* no need to copy it */
3121cbf86cfeSHugh Dickins } else if (!anon_vma) {
312296db66d9SMatthew Wilcox (Oracle) return folio; /* no need to copy it */
31231486fb50SKefeng Wang } else if (folio->index == linear_page_index(vma, addr) &&
3124e1c63e11SNanyong Sun anon_vma->root == vma->anon_vma->root) {
312596db66d9SMatthew Wilcox (Oracle) return folio; /* still no need to copy it */
3126cbf86cfeSHugh Dickins }
3127f985fc32SMiaohe Lin if (PageHWPoison(page))
3128f985fc32SMiaohe Lin return ERR_PTR(-EHWPOISON);
31291486fb50SKefeng Wang if (!folio_test_uptodate(folio))
313096db66d9SMatthew Wilcox (Oracle) return folio; /* let do_swap_page report the error */
3131cbf86cfeSHugh Dickins
31326359c39cSKefeng Wang new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
31331486fb50SKefeng Wang if (new_folio &&
31341486fb50SKefeng Wang mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) {
31351486fb50SKefeng Wang folio_put(new_folio);
31361486fb50SKefeng Wang new_folio = NULL;
313762fdb163SHugh Dickins }
31381486fb50SKefeng Wang if (new_folio) {
313996db66d9SMatthew Wilcox (Oracle) if (copy_mc_user_highpage(folio_page(new_folio, 0), page,
314096db66d9SMatthew Wilcox (Oracle) addr, vma)) {
31411486fb50SKefeng Wang folio_put(new_folio);
31426b970599SKefeng Wang return ERR_PTR(-EHWPOISON);
31436b970599SKefeng Wang }
31441486fb50SKefeng Wang folio_set_dirty(new_folio);
31451486fb50SKefeng Wang __folio_mark_uptodate(new_folio);
31461486fb50SKefeng Wang __folio_set_locked(new_folio);
31474d45c3afSYang Yang #ifdef CONFIG_SWAP
31484d45c3afSYang Yang count_vm_event(KSM_SWPIN_COPY);
31494d45c3afSYang Yang #endif
31505ad64688SHugh Dickins }
31515ad64688SHugh Dickins
315296db66d9SMatthew Wilcox (Oracle) return new_folio;
31535ad64688SHugh Dickins }
31545ad64688SHugh Dickins
rmap_walk_ksm(struct folio * folio,struct rmap_walk_control * rwc)31556d4675e6SMinchan Kim void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
3156e9995ef9SHugh Dickins {
315721fbd591SQi Zheng struct ksm_stable_node *stable_node;
315821fbd591SQi Zheng struct ksm_rmap_item *rmap_item;
3159e9995ef9SHugh Dickins int search_new_forks = 0;
3160e9995ef9SHugh Dickins
31612f031c6fSMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);
31629f32624bSJoonsoo Kim
31639f32624bSJoonsoo Kim /*
31649f32624bSJoonsoo Kim * Rely on the page lock to protect against concurrent modifications
31659f32624bSJoonsoo Kim * to that page's node of the stable tree.
31669f32624bSJoonsoo Kim */
31672f031c6fSMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3168e9995ef9SHugh Dickins
31692f031c6fSMatthew Wilcox (Oracle) stable_node = folio_stable_node(folio);
3170e9995ef9SHugh Dickins if (!stable_node)
31711df631aeSMinchan Kim return;
3172e9995ef9SHugh Dickins again:
3173b67bfe0dSSasha Levin hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
3174318d87b8Sxu xin /* Ignore the stable/unstable/sqnr flags */
3175318d87b8Sxu xin const unsigned long addr = rmap_item->address & PAGE_MASK;
3176e9995ef9SHugh Dickins struct anon_vma *anon_vma = rmap_item->anon_vma;
31775beb4930SRik van Riel struct anon_vma_chain *vmac;
3178e9995ef9SHugh Dickins struct vm_area_struct *vma;
3179e9995ef9SHugh Dickins
3180ad12695fSAndrea Arcangeli cond_resched();
31816d4675e6SMinchan Kim if (!anon_vma_trylock_read(anon_vma)) {
31826d4675e6SMinchan Kim if (rwc->try_lock) {
31836d4675e6SMinchan Kim rwc->contended = true;
31846d4675e6SMinchan Kim return;
31856d4675e6SMinchan Kim }
3186b6b19f25SHugh Dickins anon_vma_lock_read(anon_vma);
31876d4675e6SMinchan Kim }
3188318d87b8Sxu xin
3189bf181b9fSMichel Lespinasse anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
3190bf181b9fSMichel Lespinasse 0, ULONG_MAX) {
31911105a2fcSJia He
3192ad12695fSAndrea Arcangeli cond_resched();
31935beb4930SRik van Riel vma = vmac->vma;
31941105a2fcSJia He
31951105a2fcSJia He if (addr < vma->vm_start || addr >= vma->vm_end)
3196e9995ef9SHugh Dickins continue;
3197e9995ef9SHugh Dickins /*
3198e9995ef9SHugh Dickins * Initially we examine only the vma which covers this
3199e9995ef9SHugh Dickins * rmap_item; but later, if there is still work to do,
3200e9995ef9SHugh Dickins * we examine covering vmas in other mms: in case they
3201e9995ef9SHugh Dickins * were forked from the original since ksmd passed.
3202e9995ef9SHugh Dickins */
3203e9995ef9SHugh Dickins if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
3204e9995ef9SHugh Dickins continue;
3205e9995ef9SHugh Dickins
32060dd1c7bbSJoonsoo Kim if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
32070dd1c7bbSJoonsoo Kim continue;
32080dd1c7bbSJoonsoo Kim
32092f031c6fSMatthew Wilcox (Oracle) if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
3210b6b19f25SHugh Dickins anon_vma_unlock_read(anon_vma);
32111df631aeSMinchan Kim return;
3212e9995ef9SHugh Dickins }
32132f031c6fSMatthew Wilcox (Oracle) if (rwc->done && rwc->done(folio)) {
32140dd1c7bbSJoonsoo Kim anon_vma_unlock_read(anon_vma);
32151df631aeSMinchan Kim return;
32160dd1c7bbSJoonsoo Kim }
3217e9995ef9SHugh Dickins }
3218b6b19f25SHugh Dickins anon_vma_unlock_read(anon_vma);
3219e9995ef9SHugh Dickins }
3220e9995ef9SHugh Dickins if (!search_new_forks++)
3221e9995ef9SHugh Dickins goto again;
3222e9995ef9SHugh Dickins }
3223e9995ef9SHugh Dickins
32244248d008SLonglong Xia #ifdef CONFIG_MEMORY_FAILURE
32254248d008SLonglong Xia /*
32264248d008SLonglong Xia * Collect processes when the error hit an ksm page.
32274248d008SLonglong Xia */
collect_procs_ksm(const struct folio * folio,const struct page * page,struct list_head * to_kill,int force_early)322868158bfaSMatthew Wilcox (Oracle) void collect_procs_ksm(const struct folio *folio, const struct page *page,
3229b650e1d2SMatthew Wilcox (Oracle) struct list_head *to_kill, int force_early)
32304248d008SLonglong Xia {
32314248d008SLonglong Xia struct ksm_stable_node *stable_node;
32324248d008SLonglong Xia struct ksm_rmap_item *rmap_item;
32334248d008SLonglong Xia struct vm_area_struct *vma;
32344248d008SLonglong Xia struct task_struct *tsk;
32354248d008SLonglong Xia
32364248d008SLonglong Xia stable_node = folio_stable_node(folio);
32374248d008SLonglong Xia if (!stable_node)
32384248d008SLonglong Xia return;
32394248d008SLonglong Xia hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
32404248d008SLonglong Xia struct anon_vma *av = rmap_item->anon_vma;
32414248d008SLonglong Xia
32424248d008SLonglong Xia anon_vma_lock_read(av);
3243d256d1cdSTong Tiangen rcu_read_lock();
32444248d008SLonglong Xia for_each_process(tsk) {
32454248d008SLonglong Xia struct anon_vma_chain *vmac;
32464248d008SLonglong Xia unsigned long addr;
32474248d008SLonglong Xia struct task_struct *t =
32484248d008SLonglong Xia task_early_kill(tsk, force_early);
32494248d008SLonglong Xia if (!t)
32504248d008SLonglong Xia continue;
32514248d008SLonglong Xia anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0,
32524248d008SLonglong Xia ULONG_MAX)
32534248d008SLonglong Xia {
32544248d008SLonglong Xia vma = vmac->vma;
32554248d008SLonglong Xia if (vma->vm_mm == t->mm) {
32564248d008SLonglong Xia addr = rmap_item->address & PAGE_MASK;
32574248d008SLonglong Xia add_to_kill_ksm(t, page, vma, to_kill,
32584248d008SLonglong Xia addr);
32594248d008SLonglong Xia }
32604248d008SLonglong Xia }
32614248d008SLonglong Xia }
3262d256d1cdSTong Tiangen rcu_read_unlock();
32634248d008SLonglong Xia anon_vma_unlock_read(av);
32644248d008SLonglong Xia }
32654248d008SLonglong Xia }
32664248d008SLonglong Xia #endif
32674248d008SLonglong Xia
326852629506SJoonsoo Kim #ifdef CONFIG_MIGRATION
folio_migrate_ksm(struct folio * newfolio,struct folio * folio)326919138349SMatthew Wilcox (Oracle) void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
3270e9995ef9SHugh Dickins {
327121fbd591SQi Zheng struct ksm_stable_node *stable_node;
3272e9995ef9SHugh Dickins
327319138349SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
327419138349SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
327519138349SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);
3276e9995ef9SHugh Dickins
327719138349SMatthew Wilcox (Oracle) stable_node = folio_stable_node(folio);
3278e9995ef9SHugh Dickins if (stable_node) {
327919138349SMatthew Wilcox (Oracle) VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
328019138349SMatthew Wilcox (Oracle) stable_node->kpfn = folio_pfn(newfolio);
3281c8d6553bSHugh Dickins /*
328219138349SMatthew Wilcox (Oracle) * newfolio->mapping was set in advance; now we need smp_wmb()
3283c8d6553bSHugh Dickins * to make sure that the new stable_node->kpfn is visible
328479899cceSAlex Shi (tencent) * to ksm_get_folio() before it can see that folio->mapping
328532f51eadSMatthew Wilcox (Oracle) * has gone stale (or that the swapcache flag has been cleared).
3286c8d6553bSHugh Dickins */
3287c8d6553bSHugh Dickins smp_wmb();
3288b8b0ff24SAlex Shi (tencent) folio_set_stable_node(folio, NULL);
3289e9995ef9SHugh Dickins }
3290e9995ef9SHugh Dickins }
3291e9995ef9SHugh Dickins #endif /* CONFIG_MIGRATION */
3292e9995ef9SHugh Dickins
329362b61f61SHugh Dickins #ifdef CONFIG_MEMORY_HOTREMOVE
wait_while_offlining(void)3294ef4d43a8SHugh Dickins static void wait_while_offlining(void)
3295ef4d43a8SHugh Dickins {
3296ef4d43a8SHugh Dickins while (ksm_run & KSM_RUN_OFFLINE) {
3297ef4d43a8SHugh Dickins mutex_unlock(&ksm_thread_mutex);
3298ef4d43a8SHugh Dickins wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
329974316201SNeilBrown TASK_UNINTERRUPTIBLE);
3300ef4d43a8SHugh Dickins mutex_lock(&ksm_thread_mutex);
3301ef4d43a8SHugh Dickins }
3302ef4d43a8SHugh Dickins }
3303ef4d43a8SHugh Dickins
stable_node_dup_remove_range(struct ksm_stable_node * stable_node,unsigned long start_pfn,unsigned long end_pfn)330421fbd591SQi Zheng static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
33052c653d0eSAndrea Arcangeli unsigned long start_pfn,
33062c653d0eSAndrea Arcangeli unsigned long end_pfn)
33072c653d0eSAndrea Arcangeli {
33082c653d0eSAndrea Arcangeli if (stable_node->kpfn >= start_pfn &&
33092c653d0eSAndrea Arcangeli stable_node->kpfn < end_pfn) {
33102c653d0eSAndrea Arcangeli /*
331179899cceSAlex Shi (tencent) * Don't ksm_get_folio, page has already gone:
33122c653d0eSAndrea Arcangeli * which is why we keep kpfn instead of page*
33132c653d0eSAndrea Arcangeli */
33142c653d0eSAndrea Arcangeli remove_node_from_stable_tree(stable_node);
33152c653d0eSAndrea Arcangeli return true;
33162c653d0eSAndrea Arcangeli }
33172c653d0eSAndrea Arcangeli return false;
33182c653d0eSAndrea Arcangeli }
33192c653d0eSAndrea Arcangeli
stable_node_chain_remove_range(struct ksm_stable_node * stable_node,unsigned long start_pfn,unsigned long end_pfn,struct rb_root * root)332021fbd591SQi Zheng static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node,
33212c653d0eSAndrea Arcangeli unsigned long start_pfn,
33222c653d0eSAndrea Arcangeli unsigned long end_pfn,
33232c653d0eSAndrea Arcangeli struct rb_root *root)
33242c653d0eSAndrea Arcangeli {
332521fbd591SQi Zheng struct ksm_stable_node *dup;
33262c653d0eSAndrea Arcangeli struct hlist_node *hlist_safe;
33272c653d0eSAndrea Arcangeli
33282c653d0eSAndrea Arcangeli if (!is_stable_node_chain(stable_node)) {
33292c653d0eSAndrea Arcangeli VM_BUG_ON(is_stable_node_dup(stable_node));
33302c653d0eSAndrea Arcangeli return stable_node_dup_remove_range(stable_node, start_pfn,
33312c653d0eSAndrea Arcangeli end_pfn);
33322c653d0eSAndrea Arcangeli }
33332c653d0eSAndrea Arcangeli
33342c653d0eSAndrea Arcangeli hlist_for_each_entry_safe(dup, hlist_safe,
33352c653d0eSAndrea Arcangeli &stable_node->hlist, hlist_dup) {
33362c653d0eSAndrea Arcangeli VM_BUG_ON(!is_stable_node_dup(dup));
33372c653d0eSAndrea Arcangeli stable_node_dup_remove_range(dup, start_pfn, end_pfn);
33382c653d0eSAndrea Arcangeli }
33392c653d0eSAndrea Arcangeli if (hlist_empty(&stable_node->hlist)) {
33402c653d0eSAndrea Arcangeli free_stable_node_chain(stable_node, root);
33412c653d0eSAndrea Arcangeli return true; /* notify caller that tree was rebalanced */
33422c653d0eSAndrea Arcangeli } else
33432c653d0eSAndrea Arcangeli return false;
33442c653d0eSAndrea Arcangeli }
33452c653d0eSAndrea Arcangeli
ksm_check_stable_tree(unsigned long start_pfn,unsigned long end_pfn)3346ee0ea59cSHugh Dickins static void ksm_check_stable_tree(unsigned long start_pfn,
334762b61f61SHugh Dickins unsigned long end_pfn)
334862b61f61SHugh Dickins {
334921fbd591SQi Zheng struct ksm_stable_node *stable_node, *next;
335062b61f61SHugh Dickins struct rb_node *node;
335190bd6fd3SPetr Holasek int nid;
335262b61f61SHugh Dickins
3353ef53d16cSHugh Dickins for (nid = 0; nid < ksm_nr_node_ids; nid++) {
3354ef53d16cSHugh Dickins node = rb_first(root_stable_tree + nid);
3355ee0ea59cSHugh Dickins while (node) {
335621fbd591SQi Zheng stable_node = rb_entry(node, struct ksm_stable_node, node);
33572c653d0eSAndrea Arcangeli if (stable_node_chain_remove_range(stable_node,
33582c653d0eSAndrea Arcangeli start_pfn, end_pfn,
33592c653d0eSAndrea Arcangeli root_stable_tree +
33602c653d0eSAndrea Arcangeli nid))
3361ef53d16cSHugh Dickins node = rb_first(root_stable_tree + nid);
33622c653d0eSAndrea Arcangeli else
3363ee0ea59cSHugh Dickins node = rb_next(node);
3364ee0ea59cSHugh Dickins cond_resched();
336562b61f61SHugh Dickins }
3366ee0ea59cSHugh Dickins }
336703640418SGeliang Tang list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
33684146d2d6SHugh Dickins if (stable_node->kpfn >= start_pfn &&
33694146d2d6SHugh Dickins stable_node->kpfn < end_pfn)
33704146d2d6SHugh Dickins remove_node_from_stable_tree(stable_node);
33714146d2d6SHugh Dickins cond_resched();
33724146d2d6SHugh Dickins }
337362b61f61SHugh Dickins }
337462b61f61SHugh Dickins
ksm_memory_callback(struct notifier_block * self,unsigned long action,void * arg)337562b61f61SHugh Dickins static int ksm_memory_callback(struct notifier_block *self,
337662b61f61SHugh Dickins unsigned long action, void *arg)
337762b61f61SHugh Dickins {
337862b61f61SHugh Dickins struct memory_notify *mn = arg;
337962b61f61SHugh Dickins
338062b61f61SHugh Dickins switch (action) {
338162b61f61SHugh Dickins case MEM_GOING_OFFLINE:
338262b61f61SHugh Dickins /*
3383ef4d43a8SHugh Dickins * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
3384ef4d43a8SHugh Dickins * and remove_all_stable_nodes() while memory is going offline:
3385ef4d43a8SHugh Dickins * it is unsafe for them to touch the stable tree at this time.
338605c3fa9cSPedro Demarchi Gomes * But break_ksm(), rmap lookups and other entry points
3387ef4d43a8SHugh Dickins * which do not need the ksm_thread_mutex are all safe.
338862b61f61SHugh Dickins */
3389ef4d43a8SHugh Dickins mutex_lock(&ksm_thread_mutex);
3390ef4d43a8SHugh Dickins ksm_run |= KSM_RUN_OFFLINE;
3391ef4d43a8SHugh Dickins mutex_unlock(&ksm_thread_mutex);
339262b61f61SHugh Dickins break;
339362b61f61SHugh Dickins
339462b61f61SHugh Dickins case MEM_OFFLINE:
339562b61f61SHugh Dickins /*
339662b61f61SHugh Dickins * Most of the work is done by page migration; but there might
339762b61f61SHugh Dickins * be a few stable_nodes left over, still pointing to struct
3398ee0ea59cSHugh Dickins * pages which have been offlined: prune those from the tree,
339979899cceSAlex Shi (tencent) * otherwise ksm_get_folio() might later try to access a
3400ee0ea59cSHugh Dickins * non-existent struct page.
340162b61f61SHugh Dickins */
3402ee0ea59cSHugh Dickins ksm_check_stable_tree(mn->start_pfn,
3403ee0ea59cSHugh Dickins mn->start_pfn + mn->nr_pages);
3404e4a9bc58SJoe Perches fallthrough;
340562b61f61SHugh Dickins case MEM_CANCEL_OFFLINE:
3406ef4d43a8SHugh Dickins mutex_lock(&ksm_thread_mutex);
3407ef4d43a8SHugh Dickins ksm_run &= ~KSM_RUN_OFFLINE;
340862b61f61SHugh Dickins mutex_unlock(&ksm_thread_mutex);
3409ef4d43a8SHugh Dickins
3410ef4d43a8SHugh Dickins smp_mb(); /* wake_up_bit advises this */
3411ef4d43a8SHugh Dickins wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
341262b61f61SHugh Dickins break;
341362b61f61SHugh Dickins }
341462b61f61SHugh Dickins return NOTIFY_OK;
341562b61f61SHugh Dickins }
3416ef4d43a8SHugh Dickins #else
/* Stub for builds without CONFIG_MEMORY_HOTREMOVE: does nothing. */
static void wait_while_offlining(void)
{
	/* intentionally empty */
}
342062b61f61SHugh Dickins #endif /* CONFIG_MEMORY_HOTREMOVE */
342162b61f61SHugh Dickins
3422d21077fbSStefan Roesch #ifdef CONFIG_PROC_FS
34233ab76c76Sxu xin /*
34243ab76c76Sxu xin * The process is mergeable only if any VMA is currently
34253ab76c76Sxu xin * applicable to KSM.
34263ab76c76Sxu xin *
34273ab76c76Sxu xin * The mmap lock must be held in read mode.
34283ab76c76Sxu xin */
ksm_process_mergeable(struct mm_struct * mm)34293ab76c76Sxu xin bool ksm_process_mergeable(struct mm_struct *mm)
34303ab76c76Sxu xin {
34313ab76c76Sxu xin struct vm_area_struct *vma;
34323ab76c76Sxu xin
34333ab76c76Sxu xin mmap_assert_locked(mm);
34343ab76c76Sxu xin VMA_ITERATOR(vmi, mm, 0);
34353ab76c76Sxu xin for_each_vma(vmi, vma)
34363ab76c76Sxu xin if (vma->vm_flags & VM_MERGEABLE)
34373ab76c76Sxu xin return true;
34383ab76c76Sxu xin
34393ab76c76Sxu xin return false;
34403ab76c76Sxu xin }
34413ab76c76Sxu xin
ksm_process_profit(struct mm_struct * mm)3442d21077fbSStefan Roesch long ksm_process_profit(struct mm_struct *mm)
3443d21077fbSStefan Roesch {
3444c2dc78b8SChengming Zhou return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE -
3445d21077fbSStefan Roesch mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
3446d21077fbSStefan Roesch }
3447d21077fbSStefan Roesch #endif /* CONFIG_PROC_FS */
3448d21077fbSStefan Roesch
34492ffd8679SHugh Dickins #ifdef CONFIG_SYSFS
34502ffd8679SHugh Dickins /*
34512ffd8679SHugh Dickins * This all compiles without CONFIG_SYSFS, but is a waste of space.
34522ffd8679SHugh Dickins */
34532ffd8679SHugh Dickins
/* Define a kobj_attribute called <_name>_attr, initialised by __ATTR_RO(). */
#define KSM_ATTR_RO(_name)						\
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/* Define a kobj_attribute called <_name>_attr, initialised by __ATTR_RW(). */
#define KSM_ATTR(_name)							\
	static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
345831dbd01fSIzik Eidus
sleep_millisecs_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)345931dbd01fSIzik Eidus static ssize_t sleep_millisecs_show(struct kobject *kobj,
346031dbd01fSIzik Eidus struct kobj_attribute *attr, char *buf)
346131dbd01fSIzik Eidus {
3462ae7a927dSJoe Perches return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
346331dbd01fSIzik Eidus }
346431dbd01fSIzik Eidus
sleep_millisecs_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)346531dbd01fSIzik Eidus static ssize_t sleep_millisecs_store(struct kobject *kobj,
346631dbd01fSIzik Eidus struct kobj_attribute *attr,
346731dbd01fSIzik Eidus const char *buf, size_t count)
346831dbd01fSIzik Eidus {
3469dfefd226SAlexey Dobriyan unsigned int msecs;
347031dbd01fSIzik Eidus int err;
347131dbd01fSIzik Eidus
3472dfefd226SAlexey Dobriyan err = kstrtouint(buf, 10, &msecs);
3473dfefd226SAlexey Dobriyan if (err)
347431dbd01fSIzik Eidus return -EINVAL;
347531dbd01fSIzik Eidus
347631dbd01fSIzik Eidus ksm_thread_sleep_millisecs = msecs;
3477fcf9a0efSKirill Tkhai wake_up_interruptible(&ksm_iter_wait);
347831dbd01fSIzik Eidus
347931dbd01fSIzik Eidus return count;
348031dbd01fSIzik Eidus }
348131dbd01fSIzik Eidus KSM_ATTR(sleep_millisecs);
348231dbd01fSIzik Eidus
pages_to_scan_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)348331dbd01fSIzik Eidus static ssize_t pages_to_scan_show(struct kobject *kobj,
348431dbd01fSIzik Eidus struct kobj_attribute *attr, char *buf)
348531dbd01fSIzik Eidus {
3486ae7a927dSJoe Perches return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
348731dbd01fSIzik Eidus }
348831dbd01fSIzik Eidus
pages_to_scan_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)348931dbd01fSIzik Eidus static ssize_t pages_to_scan_store(struct kobject *kobj,
349031dbd01fSIzik Eidus struct kobj_attribute *attr,
349131dbd01fSIzik Eidus const char *buf, size_t count)
349231dbd01fSIzik Eidus {
3493dfefd226SAlexey Dobriyan unsigned int nr_pages;
349431dbd01fSIzik Eidus int err;
349531dbd01fSIzik Eidus
34964e5fa4f5SStefan Roesch if (ksm_advisor != KSM_ADVISOR_NONE)
34974e5fa4f5SStefan Roesch return -EINVAL;
34984e5fa4f5SStefan Roesch
3499dfefd226SAlexey Dobriyan err = kstrtouint(buf, 10, &nr_pages);
3500dfefd226SAlexey Dobriyan if (err)
350131dbd01fSIzik Eidus return -EINVAL;
350231dbd01fSIzik Eidus
350331dbd01fSIzik Eidus ksm_thread_pages_to_scan = nr_pages;
350431dbd01fSIzik Eidus
350531dbd01fSIzik Eidus return count;
350631dbd01fSIzik Eidus }
350731dbd01fSIzik Eidus KSM_ATTR(pages_to_scan);
350831dbd01fSIzik Eidus
run_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)350931dbd01fSIzik Eidus static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
351031dbd01fSIzik Eidus char *buf)
351131dbd01fSIzik Eidus {
3512ae7a927dSJoe Perches return sysfs_emit(buf, "%lu\n", ksm_run);
351331dbd01fSIzik Eidus }
351431dbd01fSIzik Eidus
run_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)351531dbd01fSIzik Eidus static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
351631dbd01fSIzik Eidus const char *buf, size_t count)
351731dbd01fSIzik Eidus {
3518dfefd226SAlexey Dobriyan unsigned int flags;
351931dbd01fSIzik Eidus int err;
352031dbd01fSIzik Eidus
3521dfefd226SAlexey Dobriyan err = kstrtouint(buf, 10, &flags);
3522dfefd226SAlexey Dobriyan if (err)
352331dbd01fSIzik Eidus return -EINVAL;
352431dbd01fSIzik Eidus if (flags > KSM_RUN_UNMERGE)
352531dbd01fSIzik Eidus return -EINVAL;
352631dbd01fSIzik Eidus
352731dbd01fSIzik Eidus /*
352831dbd01fSIzik Eidus * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
352931dbd01fSIzik Eidus * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
3530d0f209f6SHugh Dickins * breaking COW to free the pages_shared (but leaves mm_slots
3531d0f209f6SHugh Dickins * on the list for when ksmd may be set running again).
353231dbd01fSIzik Eidus */
353331dbd01fSIzik Eidus
353431dbd01fSIzik Eidus mutex_lock(&ksm_thread_mutex);
3535ef4d43a8SHugh Dickins wait_while_offlining();
353631dbd01fSIzik Eidus if (ksm_run != flags) {
353731dbd01fSIzik Eidus ksm_run = flags;
3538d952b791SHugh Dickins if (flags & KSM_RUN_UNMERGE) {
3539e1e12d2fSDavid Rientjes set_current_oom_origin();
3540d952b791SHugh Dickins err = unmerge_and_remove_all_rmap_items();
3541e1e12d2fSDavid Rientjes clear_current_oom_origin();
3542d952b791SHugh Dickins if (err) {
3543d952b791SHugh Dickins ksm_run = KSM_RUN_STOP;
3544d952b791SHugh Dickins count = err;
3545d952b791SHugh Dickins }
3546d952b791SHugh Dickins }
354731dbd01fSIzik Eidus }
354831dbd01fSIzik Eidus mutex_unlock(&ksm_thread_mutex);
354931dbd01fSIzik Eidus
355031dbd01fSIzik Eidus if (flags & KSM_RUN_MERGE)
355131dbd01fSIzik Eidus wake_up_interruptible(&ksm_thread_wait);
355231dbd01fSIzik Eidus
355331dbd01fSIzik Eidus return count;
355431dbd01fSIzik Eidus }
355531dbd01fSIzik Eidus KSM_ATTR(run);
355631dbd01fSIzik Eidus
355790bd6fd3SPetr Holasek #ifdef CONFIG_NUMA
merge_across_nodes_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)355890bd6fd3SPetr Holasek static ssize_t merge_across_nodes_show(struct kobject *kobj,
355990bd6fd3SPetr Holasek struct kobj_attribute *attr, char *buf)
356090bd6fd3SPetr Holasek {
3561ae7a927dSJoe Perches return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
356290bd6fd3SPetr Holasek }
356390bd6fd3SPetr Holasek
merge_across_nodes_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)356490bd6fd3SPetr Holasek static ssize_t merge_across_nodes_store(struct kobject *kobj,
356590bd6fd3SPetr Holasek struct kobj_attribute *attr,
356690bd6fd3SPetr Holasek const char *buf, size_t count)
356790bd6fd3SPetr Holasek {
356890bd6fd3SPetr Holasek int err;
356990bd6fd3SPetr Holasek unsigned long knob;
357090bd6fd3SPetr Holasek
357190bd6fd3SPetr Holasek err = kstrtoul(buf, 10, &knob);
357290bd6fd3SPetr Holasek if (err)
357390bd6fd3SPetr Holasek return err;
357490bd6fd3SPetr Holasek if (knob > 1)
357590bd6fd3SPetr Holasek return -EINVAL;
357690bd6fd3SPetr Holasek
357790bd6fd3SPetr Holasek mutex_lock(&ksm_thread_mutex);
3578ef4d43a8SHugh Dickins wait_while_offlining();
357990bd6fd3SPetr Holasek if (ksm_merge_across_nodes != knob) {
3580cbf86cfeSHugh Dickins if (ksm_pages_shared || remove_all_stable_nodes())
358190bd6fd3SPetr Holasek err = -EBUSY;
3582ef53d16cSHugh Dickins else if (root_stable_tree == one_stable_tree) {
3583ef53d16cSHugh Dickins struct rb_root *buf;
3584ef53d16cSHugh Dickins /*
3585ef53d16cSHugh Dickins * This is the first time that we switch away from the
3586ef53d16cSHugh Dickins * default of merging across nodes: must now allocate
3587ef53d16cSHugh Dickins * a buffer to hold as many roots as may be needed.
3588ef53d16cSHugh Dickins * Allocate stable and unstable together:
3589ef53d16cSHugh Dickins * MAXSMP NODES_SHIFT 10 will use 16kB.
3590ef53d16cSHugh Dickins */
359132a92f8cSLinus Torvalds buf = kzalloc_objs(*buf, nr_node_ids + nr_node_ids);
3592ef53d16cSHugh Dickins /* Let us assume that RB_ROOT is NULL is zero */
3593ef53d16cSHugh Dickins if (!buf)
3594ef53d16cSHugh Dickins err = -ENOMEM;
3595ef53d16cSHugh Dickins else {
3596ef53d16cSHugh Dickins root_stable_tree = buf;
3597ef53d16cSHugh Dickins root_unstable_tree = buf + nr_node_ids;
3598ef53d16cSHugh Dickins /* Stable tree is empty but not the unstable */
3599ef53d16cSHugh Dickins root_unstable_tree[0] = one_unstable_tree[0];
3600ef53d16cSHugh Dickins }
3601ef53d16cSHugh Dickins }
3602ef53d16cSHugh Dickins if (!err) {
360390bd6fd3SPetr Holasek ksm_merge_across_nodes = knob;
3604ef53d16cSHugh Dickins ksm_nr_node_ids = knob ? 1 : nr_node_ids;
3605ef53d16cSHugh Dickins }
360690bd6fd3SPetr Holasek }
360790bd6fd3SPetr Holasek mutex_unlock(&ksm_thread_mutex);
360890bd6fd3SPetr Holasek
360990bd6fd3SPetr Holasek return err ? err : count;
361090bd6fd3SPetr Holasek }
361190bd6fd3SPetr Holasek KSM_ATTR(merge_across_nodes);
361290bd6fd3SPetr Holasek #endif
361390bd6fd3SPetr Holasek
use_zero_pages_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3614e86c59b1SClaudio Imbrenda static ssize_t use_zero_pages_show(struct kobject *kobj,
3615e86c59b1SClaudio Imbrenda struct kobj_attribute *attr, char *buf)
3616e86c59b1SClaudio Imbrenda {
3617ae7a927dSJoe Perches return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
3618e86c59b1SClaudio Imbrenda }
use_zero_pages_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3619e86c59b1SClaudio Imbrenda static ssize_t use_zero_pages_store(struct kobject *kobj,
3620e86c59b1SClaudio Imbrenda struct kobj_attribute *attr,
3621e86c59b1SClaudio Imbrenda const char *buf, size_t count)
3622e86c59b1SClaudio Imbrenda {
3623e86c59b1SClaudio Imbrenda int err;
3624e86c59b1SClaudio Imbrenda bool value;
3625e86c59b1SClaudio Imbrenda
3626e86c59b1SClaudio Imbrenda err = kstrtobool(buf, &value);
3627e86c59b1SClaudio Imbrenda if (err)
3628e86c59b1SClaudio Imbrenda return -EINVAL;
3629e86c59b1SClaudio Imbrenda
3630e86c59b1SClaudio Imbrenda ksm_use_zero_pages = value;
3631e86c59b1SClaudio Imbrenda
3632e86c59b1SClaudio Imbrenda return count;
3633e86c59b1SClaudio Imbrenda }
3634e86c59b1SClaudio Imbrenda KSM_ATTR(use_zero_pages);
3635e86c59b1SClaudio Imbrenda
max_page_sharing_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)36362c653d0eSAndrea Arcangeli static ssize_t max_page_sharing_show(struct kobject *kobj,
36372c653d0eSAndrea Arcangeli struct kobj_attribute *attr, char *buf)
36382c653d0eSAndrea Arcangeli {
3639ae7a927dSJoe Perches return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
36402c653d0eSAndrea Arcangeli }
36412c653d0eSAndrea Arcangeli
max_page_sharing_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)36422c653d0eSAndrea Arcangeli static ssize_t max_page_sharing_store(struct kobject *kobj,
36432c653d0eSAndrea Arcangeli struct kobj_attribute *attr,
36442c653d0eSAndrea Arcangeli const char *buf, size_t count)
36452c653d0eSAndrea Arcangeli {
36462c653d0eSAndrea Arcangeli int err;
36472c653d0eSAndrea Arcangeli int knob;
36482c653d0eSAndrea Arcangeli
36492c653d0eSAndrea Arcangeli err = kstrtoint(buf, 10, &knob);
36502c653d0eSAndrea Arcangeli if (err)
36512c653d0eSAndrea Arcangeli return err;
36522c653d0eSAndrea Arcangeli /*
36532c653d0eSAndrea Arcangeli * When a KSM page is created it is shared by 2 mappings. This
36542c653d0eSAndrea Arcangeli * being a signed comparison, it implicitly verifies it's not
36552c653d0eSAndrea Arcangeli * negative.
36562c653d0eSAndrea Arcangeli */
36572c653d0eSAndrea Arcangeli if (knob < 2)
36582c653d0eSAndrea Arcangeli return -EINVAL;
36592c653d0eSAndrea Arcangeli
36602c653d0eSAndrea Arcangeli if (READ_ONCE(ksm_max_page_sharing) == knob)
36612c653d0eSAndrea Arcangeli return count;
36622c653d0eSAndrea Arcangeli
36632c653d0eSAndrea Arcangeli mutex_lock(&ksm_thread_mutex);
36642c653d0eSAndrea Arcangeli wait_while_offlining();
36652c653d0eSAndrea Arcangeli if (ksm_max_page_sharing != knob) {
36662c653d0eSAndrea Arcangeli if (ksm_pages_shared || remove_all_stable_nodes())
36672c653d0eSAndrea Arcangeli err = -EBUSY;
36682c653d0eSAndrea Arcangeli else
36692c653d0eSAndrea Arcangeli ksm_max_page_sharing = knob;
36702c653d0eSAndrea Arcangeli }
36712c653d0eSAndrea Arcangeli mutex_unlock(&ksm_thread_mutex);
36722c653d0eSAndrea Arcangeli
36732c653d0eSAndrea Arcangeli return err ? err : count;
36742c653d0eSAndrea Arcangeli }
36752c653d0eSAndrea Arcangeli KSM_ATTR(max_page_sharing);
36762c653d0eSAndrea Arcangeli
pages_scanned_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3677b348b5feSStefan Roesch static ssize_t pages_scanned_show(struct kobject *kobj,
3678b348b5feSStefan Roesch struct kobj_attribute *attr, char *buf)
3679b348b5feSStefan Roesch {
3680b348b5feSStefan Roesch return sysfs_emit(buf, "%lu\n", ksm_pages_scanned);
3681b348b5feSStefan Roesch }
3682b348b5feSStefan Roesch KSM_ATTR_RO(pages_scanned);
3683b348b5feSStefan Roesch
pages_shared_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3684b4028260SHugh Dickins static ssize_t pages_shared_show(struct kobject *kobj,
3685b4028260SHugh Dickins struct kobj_attribute *attr, char *buf)
3686b4028260SHugh Dickins {
3687ae7a927dSJoe Perches return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
3688b4028260SHugh Dickins }
3689b4028260SHugh Dickins KSM_ATTR_RO(pages_shared);
3690b4028260SHugh Dickins
pages_sharing_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3691b4028260SHugh Dickins static ssize_t pages_sharing_show(struct kobject *kobj,
3692b4028260SHugh Dickins struct kobj_attribute *attr, char *buf)
3693b4028260SHugh Dickins {
3694ae7a927dSJoe Perches return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
3695b4028260SHugh Dickins }
3696b4028260SHugh Dickins KSM_ATTR_RO(pages_sharing);
3697b4028260SHugh Dickins
pages_unshared_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3698473b0ce4SHugh Dickins static ssize_t pages_unshared_show(struct kobject *kobj,
3699473b0ce4SHugh Dickins struct kobj_attribute *attr, char *buf)
3700473b0ce4SHugh Dickins {
3701ae7a927dSJoe Perches return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
3702473b0ce4SHugh Dickins }
3703473b0ce4SHugh Dickins KSM_ATTR_RO(pages_unshared);
3704473b0ce4SHugh Dickins
pages_volatile_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3705473b0ce4SHugh Dickins static ssize_t pages_volatile_show(struct kobject *kobj,
3706473b0ce4SHugh Dickins struct kobj_attribute *attr, char *buf)
3707473b0ce4SHugh Dickins {
3708473b0ce4SHugh Dickins long ksm_pages_volatile;
3709473b0ce4SHugh Dickins
3710473b0ce4SHugh Dickins ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
3711473b0ce4SHugh Dickins - ksm_pages_sharing - ksm_pages_unshared;
3712473b0ce4SHugh Dickins /*
3713473b0ce4SHugh Dickins * It was not worth any locking to calculate that statistic,
3714473b0ce4SHugh Dickins * but it might therefore sometimes be negative: conceal that.
3715473b0ce4SHugh Dickins */
3716473b0ce4SHugh Dickins if (ksm_pages_volatile < 0)
3717473b0ce4SHugh Dickins ksm_pages_volatile = 0;
3718ae7a927dSJoe Perches return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
3719473b0ce4SHugh Dickins }
3720473b0ce4SHugh Dickins KSM_ATTR_RO(pages_volatile);
3721473b0ce4SHugh Dickins
pages_skipped_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3722e5a68991SStefan Roesch static ssize_t pages_skipped_show(struct kobject *kobj,
3723e5a68991SStefan Roesch struct kobj_attribute *attr, char *buf)
3724e5a68991SStefan Roesch {
3725e5a68991SStefan Roesch return sysfs_emit(buf, "%lu\n", ksm_pages_skipped);
3726e5a68991SStefan Roesch }
3727e5a68991SStefan Roesch KSM_ATTR_RO(pages_skipped);
3728e5a68991SStefan Roesch
ksm_zero_pages_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3729e2942062Sxu xin static ssize_t ksm_zero_pages_show(struct kobject *kobj,
3730e2942062Sxu xin struct kobj_attribute *attr, char *buf)
3731e2942062Sxu xin {
3732c2dc78b8SChengming Zhou return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages));
3733e2942062Sxu xin }
3734e2942062Sxu xin KSM_ATTR_RO(ksm_zero_pages);
3735e2942062Sxu xin
general_profit_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3736d21077fbSStefan Roesch static ssize_t general_profit_show(struct kobject *kobj,
3737d21077fbSStefan Roesch struct kobj_attribute *attr, char *buf)
3738d21077fbSStefan Roesch {
3739d21077fbSStefan Roesch long general_profit;
3740d21077fbSStefan Roesch
3741c2dc78b8SChengming Zhou general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE -
3742d21077fbSStefan Roesch ksm_rmap_items * sizeof(struct ksm_rmap_item);
3743d21077fbSStefan Roesch
3744d21077fbSStefan Roesch return sysfs_emit(buf, "%ld\n", general_profit);
3745d21077fbSStefan Roesch }
3746d21077fbSStefan Roesch KSM_ATTR_RO(general_profit);
3747d21077fbSStefan Roesch
stable_node_dups_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)37482c653d0eSAndrea Arcangeli static ssize_t stable_node_dups_show(struct kobject *kobj,
37492c653d0eSAndrea Arcangeli struct kobj_attribute *attr, char *buf)
37502c653d0eSAndrea Arcangeli {
3751ae7a927dSJoe Perches return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
37522c653d0eSAndrea Arcangeli }
37532c653d0eSAndrea Arcangeli KSM_ATTR_RO(stable_node_dups);
37542c653d0eSAndrea Arcangeli
stable_node_chains_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)37552c653d0eSAndrea Arcangeli static ssize_t stable_node_chains_show(struct kobject *kobj,
37562c653d0eSAndrea Arcangeli struct kobj_attribute *attr, char *buf)
37572c653d0eSAndrea Arcangeli {
3758ae7a927dSJoe Perches return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
37592c653d0eSAndrea Arcangeli }
37602c653d0eSAndrea Arcangeli KSM_ATTR_RO(stable_node_chains);
37612c653d0eSAndrea Arcangeli
37622c653d0eSAndrea Arcangeli static ssize_t
stable_node_chains_prune_millisecs_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)37632c653d0eSAndrea Arcangeli stable_node_chains_prune_millisecs_show(struct kobject *kobj,
37642c653d0eSAndrea Arcangeli struct kobj_attribute *attr,
37652c653d0eSAndrea Arcangeli char *buf)
37662c653d0eSAndrea Arcangeli {
3767ae7a927dSJoe Perches return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
37682c653d0eSAndrea Arcangeli }
37692c653d0eSAndrea Arcangeli
37702c653d0eSAndrea Arcangeli static ssize_t
stable_node_chains_prune_millisecs_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)37712c653d0eSAndrea Arcangeli stable_node_chains_prune_millisecs_store(struct kobject *kobj,
37722c653d0eSAndrea Arcangeli struct kobj_attribute *attr,
37732c653d0eSAndrea Arcangeli const char *buf, size_t count)
37742c653d0eSAndrea Arcangeli {
3775584ff0dfSZhansaya Bagdauletkyzy unsigned int msecs;
37762c653d0eSAndrea Arcangeli int err;
37772c653d0eSAndrea Arcangeli
3778584ff0dfSZhansaya Bagdauletkyzy err = kstrtouint(buf, 10, &msecs);
3779584ff0dfSZhansaya Bagdauletkyzy if (err)
37802c653d0eSAndrea Arcangeli return -EINVAL;
37812c653d0eSAndrea Arcangeli
37822c653d0eSAndrea Arcangeli ksm_stable_node_chains_prune_millisecs = msecs;
37832c653d0eSAndrea Arcangeli
37842c653d0eSAndrea Arcangeli return count;
37852c653d0eSAndrea Arcangeli }
37862c653d0eSAndrea Arcangeli KSM_ATTR(stable_node_chains_prune_millisecs);
37872c653d0eSAndrea Arcangeli
full_scans_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3788473b0ce4SHugh Dickins static ssize_t full_scans_show(struct kobject *kobj,
3789473b0ce4SHugh Dickins struct kobj_attribute *attr, char *buf)
3790473b0ce4SHugh Dickins {
3791ae7a927dSJoe Perches return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
3792473b0ce4SHugh Dickins }
3793473b0ce4SHugh Dickins KSM_ATTR_RO(full_scans);
3794473b0ce4SHugh Dickins
smart_scan_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)37955e924ff5SStefan Roesch static ssize_t smart_scan_show(struct kobject *kobj,
37965e924ff5SStefan Roesch struct kobj_attribute *attr, char *buf)
37975e924ff5SStefan Roesch {
37985e924ff5SStefan Roesch return sysfs_emit(buf, "%u\n", ksm_smart_scan);
37995e924ff5SStefan Roesch }
38005e924ff5SStefan Roesch
smart_scan_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)38015e924ff5SStefan Roesch static ssize_t smart_scan_store(struct kobject *kobj,
38025e924ff5SStefan Roesch struct kobj_attribute *attr,
38035e924ff5SStefan Roesch const char *buf, size_t count)
38045e924ff5SStefan Roesch {
38055e924ff5SStefan Roesch int err;
38065e924ff5SStefan Roesch bool value;
38075e924ff5SStefan Roesch
38085e924ff5SStefan Roesch err = kstrtobool(buf, &value);
38095e924ff5SStefan Roesch if (err)
38105e924ff5SStefan Roesch return -EINVAL;
38115e924ff5SStefan Roesch
38125e924ff5SStefan Roesch ksm_smart_scan = value;
38135e924ff5SStefan Roesch return count;
38145e924ff5SStefan Roesch }
38155e924ff5SStefan Roesch KSM_ATTR(smart_scan);
38165e924ff5SStefan Roesch
advisor_mode_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)381766790e9aSStefan Roesch static ssize_t advisor_mode_show(struct kobject *kobj,
381866790e9aSStefan Roesch struct kobj_attribute *attr, char *buf)
381966790e9aSStefan Roesch {
382066790e9aSStefan Roesch const char *output;
382166790e9aSStefan Roesch
3822153ad566SNathan Chancellor if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
382366790e9aSStefan Roesch output = "none [scan-time]";
3824153ad566SNathan Chancellor else
3825153ad566SNathan Chancellor output = "[none] scan-time";
382666790e9aSStefan Roesch
382766790e9aSStefan Roesch return sysfs_emit(buf, "%s\n", output);
382866790e9aSStefan Roesch }
382966790e9aSStefan Roesch
advisor_mode_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)383066790e9aSStefan Roesch static ssize_t advisor_mode_store(struct kobject *kobj,
383166790e9aSStefan Roesch struct kobj_attribute *attr, const char *buf,
383266790e9aSStefan Roesch size_t count)
383366790e9aSStefan Roesch {
383466790e9aSStefan Roesch enum ksm_advisor_type curr_advisor = ksm_advisor;
383566790e9aSStefan Roesch
383666790e9aSStefan Roesch if (sysfs_streq("scan-time", buf))
383766790e9aSStefan Roesch ksm_advisor = KSM_ADVISOR_SCAN_TIME;
383866790e9aSStefan Roesch else if (sysfs_streq("none", buf))
383966790e9aSStefan Roesch ksm_advisor = KSM_ADVISOR_NONE;
384066790e9aSStefan Roesch else
384166790e9aSStefan Roesch return -EINVAL;
384266790e9aSStefan Roesch
384366790e9aSStefan Roesch /* Set advisor default values */
384466790e9aSStefan Roesch if (curr_advisor != ksm_advisor)
384566790e9aSStefan Roesch set_advisor_defaults();
384666790e9aSStefan Roesch
384766790e9aSStefan Roesch return count;
384866790e9aSStefan Roesch }
384966790e9aSStefan Roesch KSM_ATTR(advisor_mode);
385066790e9aSStefan Roesch
advisor_max_cpu_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)385166790e9aSStefan Roesch static ssize_t advisor_max_cpu_show(struct kobject *kobj,
385266790e9aSStefan Roesch struct kobj_attribute *attr, char *buf)
385366790e9aSStefan Roesch {
385466790e9aSStefan Roesch return sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu);
385566790e9aSStefan Roesch }
385666790e9aSStefan Roesch
advisor_max_cpu_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)385766790e9aSStefan Roesch static ssize_t advisor_max_cpu_store(struct kobject *kobj,
385866790e9aSStefan Roesch struct kobj_attribute *attr,
385966790e9aSStefan Roesch const char *buf, size_t count)
386066790e9aSStefan Roesch {
386166790e9aSStefan Roesch int err;
386266790e9aSStefan Roesch unsigned long value;
386366790e9aSStefan Roesch
386466790e9aSStefan Roesch err = kstrtoul(buf, 10, &value);
386566790e9aSStefan Roesch if (err)
386666790e9aSStefan Roesch return -EINVAL;
386766790e9aSStefan Roesch
386866790e9aSStefan Roesch ksm_advisor_max_cpu = value;
386966790e9aSStefan Roesch return count;
387066790e9aSStefan Roesch }
387166790e9aSStefan Roesch KSM_ATTR(advisor_max_cpu);
387266790e9aSStefan Roesch
advisor_min_pages_to_scan_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)387366790e9aSStefan Roesch static ssize_t advisor_min_pages_to_scan_show(struct kobject *kobj,
387466790e9aSStefan Roesch struct kobj_attribute *attr, char *buf)
387566790e9aSStefan Roesch {
387666790e9aSStefan Roesch return sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan);
387766790e9aSStefan Roesch }
387866790e9aSStefan Roesch
advisor_min_pages_to_scan_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)387966790e9aSStefan Roesch static ssize_t advisor_min_pages_to_scan_store(struct kobject *kobj,
388066790e9aSStefan Roesch struct kobj_attribute *attr,
388166790e9aSStefan Roesch const char *buf, size_t count)
388266790e9aSStefan Roesch {
388366790e9aSStefan Roesch int err;
388466790e9aSStefan Roesch unsigned long value;
388566790e9aSStefan Roesch
388666790e9aSStefan Roesch err = kstrtoul(buf, 10, &value);
388766790e9aSStefan Roesch if (err)
388866790e9aSStefan Roesch return -EINVAL;
388966790e9aSStefan Roesch
389066790e9aSStefan Roesch ksm_advisor_min_pages_to_scan = value;
389166790e9aSStefan Roesch return count;
389266790e9aSStefan Roesch }
389366790e9aSStefan Roesch KSM_ATTR(advisor_min_pages_to_scan);
389466790e9aSStefan Roesch
advisor_max_pages_to_scan_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)389566790e9aSStefan Roesch static ssize_t advisor_max_pages_to_scan_show(struct kobject *kobj,
389666790e9aSStefan Roesch struct kobj_attribute *attr, char *buf)
389766790e9aSStefan Roesch {
389866790e9aSStefan Roesch return sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan);
389966790e9aSStefan Roesch }
390066790e9aSStefan Roesch
advisor_max_pages_to_scan_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)390166790e9aSStefan Roesch static ssize_t advisor_max_pages_to_scan_store(struct kobject *kobj,
390266790e9aSStefan Roesch struct kobj_attribute *attr,
390366790e9aSStefan Roesch const char *buf, size_t count)
390466790e9aSStefan Roesch {
390566790e9aSStefan Roesch int err;
390666790e9aSStefan Roesch unsigned long value;
390766790e9aSStefan Roesch
390866790e9aSStefan Roesch err = kstrtoul(buf, 10, &value);
390966790e9aSStefan Roesch if (err)
391066790e9aSStefan Roesch return -EINVAL;
391166790e9aSStefan Roesch
391266790e9aSStefan Roesch ksm_advisor_max_pages_to_scan = value;
391366790e9aSStefan Roesch return count;
391466790e9aSStefan Roesch }
391566790e9aSStefan Roesch KSM_ATTR(advisor_max_pages_to_scan);
391666790e9aSStefan Roesch
advisor_target_scan_time_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)391766790e9aSStefan Roesch static ssize_t advisor_target_scan_time_show(struct kobject *kobj,
391866790e9aSStefan Roesch struct kobj_attribute *attr, char *buf)
391966790e9aSStefan Roesch {
392066790e9aSStefan Roesch return sysfs_emit(buf, "%lu\n", ksm_advisor_target_scan_time);
392166790e9aSStefan Roesch }
392266790e9aSStefan Roesch
advisor_target_scan_time_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)392366790e9aSStefan Roesch static ssize_t advisor_target_scan_time_store(struct kobject *kobj,
392466790e9aSStefan Roesch struct kobj_attribute *attr,
392566790e9aSStefan Roesch const char *buf, size_t count)
392666790e9aSStefan Roesch {
392766790e9aSStefan Roesch int err;
392866790e9aSStefan Roesch unsigned long value;
392966790e9aSStefan Roesch
393066790e9aSStefan Roesch err = kstrtoul(buf, 10, &value);
393166790e9aSStefan Roesch if (err)
393266790e9aSStefan Roesch return -EINVAL;
393366790e9aSStefan Roesch if (value < 1)
393466790e9aSStefan Roesch return -EINVAL;
393566790e9aSStefan Roesch
393666790e9aSStefan Roesch ksm_advisor_target_scan_time = value;
393766790e9aSStefan Roesch return count;
393866790e9aSStefan Roesch }
393966790e9aSStefan Roesch KSM_ATTR(advisor_target_scan_time);
394066790e9aSStefan Roesch
394131dbd01fSIzik Eidus static struct attribute *ksm_attrs[] = {
394231dbd01fSIzik Eidus &sleep_millisecs_attr.attr,
394331dbd01fSIzik Eidus &pages_to_scan_attr.attr,
394431dbd01fSIzik Eidus &run_attr.attr,
3945b348b5feSStefan Roesch &pages_scanned_attr.attr,
3946b4028260SHugh Dickins &pages_shared_attr.attr,
3947b4028260SHugh Dickins &pages_sharing_attr.attr,
3948473b0ce4SHugh Dickins &pages_unshared_attr.attr,
3949473b0ce4SHugh Dickins &pages_volatile_attr.attr,
3950e5a68991SStefan Roesch &pages_skipped_attr.attr,
3951e2942062Sxu xin &ksm_zero_pages_attr.attr,
3952473b0ce4SHugh Dickins &full_scans_attr.attr,
395390bd6fd3SPetr Holasek #ifdef CONFIG_NUMA
395490bd6fd3SPetr Holasek &merge_across_nodes_attr.attr,
395590bd6fd3SPetr Holasek #endif
39562c653d0eSAndrea Arcangeli &max_page_sharing_attr.attr,
39572c653d0eSAndrea Arcangeli &stable_node_chains_attr.attr,
39582c653d0eSAndrea Arcangeli &stable_node_dups_attr.attr,
39592c653d0eSAndrea Arcangeli &stable_node_chains_prune_millisecs_attr.attr,
3960e86c59b1SClaudio Imbrenda &use_zero_pages_attr.attr,
3961d21077fbSStefan Roesch &general_profit_attr.attr,
39625e924ff5SStefan Roesch &smart_scan_attr.attr,
396366790e9aSStefan Roesch &advisor_mode_attr.attr,
396466790e9aSStefan Roesch &advisor_max_cpu_attr.attr,
396566790e9aSStefan Roesch &advisor_min_pages_to_scan_attr.attr,
396666790e9aSStefan Roesch &advisor_max_pages_to_scan_attr.attr,
396766790e9aSStefan Roesch &advisor_target_scan_time_attr.attr,
396831dbd01fSIzik Eidus NULL,
396931dbd01fSIzik Eidus };
397031dbd01fSIzik Eidus
3971f907c26aSArvind Yadav static const struct attribute_group ksm_attr_group = {
397231dbd01fSIzik Eidus .attrs = ksm_attrs,
397331dbd01fSIzik Eidus .name = "ksm",
397431dbd01fSIzik Eidus };
39752ffd8679SHugh Dickins #endif /* CONFIG_SYSFS */
397631dbd01fSIzik Eidus
ksm_init(void)397731dbd01fSIzik Eidus static int __init ksm_init(void)
397831dbd01fSIzik Eidus {
397931dbd01fSIzik Eidus struct task_struct *ksm_thread;
398031dbd01fSIzik Eidus int err;
398131dbd01fSIzik Eidus
3982e86c59b1SClaudio Imbrenda /* The correct value depends on page size and endianness */
3983e86c59b1SClaudio Imbrenda zero_checksum = calc_checksum(ZERO_PAGE(0));
3984e86c59b1SClaudio Imbrenda /* Default to false for backwards compatibility */
3985e86c59b1SClaudio Imbrenda ksm_use_zero_pages = false;
3986e86c59b1SClaudio Imbrenda
398731dbd01fSIzik Eidus err = ksm_slab_init();
398831dbd01fSIzik Eidus if (err)
398931dbd01fSIzik Eidus goto out;
399031dbd01fSIzik Eidus
399131dbd01fSIzik Eidus ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
399231dbd01fSIzik Eidus if (IS_ERR(ksm_thread)) {
399325acde31SPaul McQuade pr_err("ksm: creating kthread failed\n");
399431dbd01fSIzik Eidus err = PTR_ERR(ksm_thread);
3995d9f8984cSLai Jiangshan goto out_free;
399631dbd01fSIzik Eidus }
399731dbd01fSIzik Eidus
39982ffd8679SHugh Dickins #ifdef CONFIG_SYSFS
399931dbd01fSIzik Eidus err = sysfs_create_group(mm_kobj, &ksm_attr_group);
400031dbd01fSIzik Eidus if (err) {
400125acde31SPaul McQuade pr_err("ksm: register sysfs failed\n");
40022ffd8679SHugh Dickins kthread_stop(ksm_thread);
4003d9f8984cSLai Jiangshan goto out_free;
400431dbd01fSIzik Eidus }
4005c73602adSHugh Dickins #else
4006c73602adSHugh Dickins ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
4007c73602adSHugh Dickins
40082ffd8679SHugh Dickins #endif /* CONFIG_SYSFS */
400931dbd01fSIzik Eidus
401062b61f61SHugh Dickins #ifdef CONFIG_MEMORY_HOTREMOVE
4011ef4d43a8SHugh Dickins /* There is no significance to this priority 100 */
40121eeaa4fdSLiu Shixin hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI);
401362b61f61SHugh Dickins #endif
401431dbd01fSIzik Eidus return 0;
401531dbd01fSIzik Eidus
4016d9f8984cSLai Jiangshan out_free:
401731dbd01fSIzik Eidus ksm_slab_free();
401831dbd01fSIzik Eidus out:
401931dbd01fSIzik Eidus return err;
402031dbd01fSIzik Eidus }
4021a64fb3cdSPaul Gortmaker subsys_initcall(ksm_init);
4022