linux/mm/workingset.c

b2441318SGreg Kroah-Hartman// SPDX-License-Identifier: GPL-2.0
a528910eSJohannes Weiner/*
a528910eSJohannes Weiner * Workingset detection
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
a528910eSJohannes Weiner */
a528910eSJohannes Weiner
a528910eSJohannes Weiner#include <linux/memcontrol.h>
170b04b7SJoonsoo Kim#include <linux/mm_inline.h>
a528910eSJohannes Weiner#include <linux/writeback.h>
3a4f8a0bSHugh Dickins#include <linux/shmem_fs.h>
a528910eSJohannes Weiner#include <linux/pagemap.h>
a528910eSJohannes Weiner#include <linux/atomic.h>
a528910eSJohannes Weiner#include <linux/module.h>
a528910eSJohannes Weiner#include <linux/swap.h>
14b46879SJohannes Weiner#include <linux/dax.h>
a528910eSJohannes Weiner#include <linux/fs.h>
a528910eSJohannes Weiner#include <linux/mm.h>
f3d652b0SKairui Song#include "swap_table.h"
b64e74e9SChristoph Hellwig#include "internal.h"
a528910eSJohannes Weiner
a528910eSJohannes Weiner/*
a528910eSJohannes Weiner *		Double CLOCK lists
a528910eSJohannes Weiner *
1e6b1085SMel Gorman * Per node, two clock lists are maintained for file pages: the
a528910eSJohannes Weiner * inactive and the active list.  Freshly faulted pages start out at
a528910eSJohannes Weiner * the head of the inactive list and page reclaim scans pages from the
a528910eSJohannes Weiner * tail.  Pages that are accessed multiple times on the inactive list
a528910eSJohannes Weiner * are promoted to the active list, to protect them from reclaim,
a528910eSJohannes Weiner * whereas active pages are demoted to the inactive list when the
a528910eSJohannes Weiner * active list grows too big.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner *   fault ------------------------+
a528910eSJohannes Weiner *                                 |
a528910eSJohannes Weiner *              +--------------+   |            +-------------+
a528910eSJohannes Weiner *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
a528910eSJohannes Weiner *              +--------------+                +-------------+    |
a528910eSJohannes Weiner *                     |                                           |
a528910eSJohannes Weiner *                     +-------------- promotion ------------------+
a528910eSJohannes Weiner *
a528910eSJohannes Weiner *
a528910eSJohannes Weiner *		Access frequency and refault distance
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * A workload is thrashing when its pages are frequently used but they
a528910eSJohannes Weiner * are evicted from the inactive list every time before another access
a528910eSJohannes Weiner * would have promoted them to the active list.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * In cases where the average access distance between thrashing pages
a528910eSJohannes Weiner * is bigger than the size of memory there is nothing that can be
a528910eSJohannes Weiner * done - the thrashing set could never fit into memory under any
a528910eSJohannes Weiner * circumstance.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * However, the average access distance could be bigger than the
a528910eSJohannes Weiner * inactive list, yet smaller than the size of memory.  In this case,
a528910eSJohannes Weiner * the set could fit into memory if it weren't for the currently
 * active pages - which may be used more, or hopefully less, frequently:
a528910eSJohannes Weiner *
a528910eSJohannes Weiner *      +-memory available to cache-+
a528910eSJohannes Weiner *      |                           |
a528910eSJohannes Weiner *      +-inactive------+-active----+
a528910eSJohannes Weiner *  a b | c d e f g h i | J K L M N |
a528910eSJohannes Weiner *      +---------------+-----------+
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * It is prohibitively expensive to accurately track access frequency
a528910eSJohannes Weiner * of pages.  But a reasonable approximation can be made to measure
a528910eSJohannes Weiner * thrashing on the inactive list, after which refaulting pages can be
a528910eSJohannes Weiner * activated optimistically to compete with the existing active pages.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * Approximating inactive page access frequency - Observations:
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * 1. When a page is accessed for the first time, it is added to the
a528910eSJohannes Weiner *    head of the inactive list, slides every existing inactive page
a528910eSJohannes Weiner *    towards the tail by one slot, and pushes the current tail page
a528910eSJohannes Weiner *    out of memory.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * 2. When a page is accessed for the second time, it is promoted to
a528910eSJohannes Weiner *    the active list, shrinking the inactive list by one slot.  This
a528910eSJohannes Weiner *    also slides all inactive pages that were faulted into the cache
a528910eSJohannes Weiner *    more recently than the activated page towards the tail of the
a528910eSJohannes Weiner *    inactive list.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * Thus:
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * 1. The sum of evictions and activations between any two points in
 *    time indicates the minimum number of inactive pages accessed in
a528910eSJohannes Weiner *    between.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * 2. Moving one inactive page N page slots towards the tail of the
a528910eSJohannes Weiner *    list requires at least N inactive page accesses.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * Combining these:
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * 1. When a page is finally evicted from memory, the number of
a528910eSJohannes Weiner *    inactive pages accessed while the page was in cache is at least
a528910eSJohannes Weiner *    the number of page slots on the inactive list.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * 2. In addition, measuring the sum of evictions and activations (E)
a528910eSJohannes Weiner *    at the time of a page's eviction, and comparing it to another
a528910eSJohannes Weiner *    reading (R) at the time the page faults back into memory tells
a528910eSJohannes Weiner *    the minimum number of accesses while the page was not cached.
a528910eSJohannes Weiner *    This is called the refault distance.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * Because the first access of the page was the fault and the second
a528910eSJohannes Weiner * access the refault, we combine the in-cache distance with the
a528910eSJohannes Weiner * out-of-cache distance to get the complete minimum access distance
a528910eSJohannes Weiner * of this page:
a528910eSJohannes Weiner *
a528910eSJohannes Weiner *      NR_inactive + (R - E)
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * And knowing the minimum access distance of a page, we can easily
a528910eSJohannes Weiner * tell if the page would be able to stay in cache assuming all page
a528910eSJohannes Weiner * slots in the cache were available:
a528910eSJohannes Weiner *
a528910eSJohannes Weiner *   NR_inactive + (R - E) <= NR_inactive + NR_active
a528910eSJohannes Weiner *
 * If we have swap we should also take NR_inactive_anon and
 * NR_active_anon into account, so for page cache and anonymous memory
 * respectively:
a528910eSJohannes Weiner *
ed8f3f99SYang Yang *   NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
ed8f3f99SYang Yang *   + NR_inactive_anon + NR_active_anon
ed8f3f99SYang Yang *
ed8f3f99SYang Yang *   NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
ed8f3f99SYang Yang *   + NR_inactive_file + NR_active_file
ed8f3f99SYang Yang *
ed8f3f99SYang Yang * Which can be further simplified to:
ed8f3f99SYang Yang *
ed8f3f99SYang Yang *   (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
ed8f3f99SYang Yang *
ed8f3f99SYang Yang *   (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * Put into words, the refault distance (out-of-cache) can be seen as
a528910eSJohannes Weiner * a deficit in inactive list space (in-cache).  If the inactive list
a528910eSJohannes Weiner * had (R - E) more page slots, the page would not have been evicted
a528910eSJohannes Weiner * in between accesses, but activated instead.  And on a full system,
a528910eSJohannes Weiner * the only thing eating into inactive list space is active pages.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner *
1899ad18SJohannes Weiner *		Refaulting inactive pages
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * All that is known about the active list is that the pages have been
a528910eSJohannes Weiner * accessed more than once in the past.  This means that at any given
a528910eSJohannes Weiner * time there is actually a good chance that pages on the active list
a528910eSJohannes Weiner * are no longer in active use.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * So when a refault distance of (R - E) is observed and there are at
ed8f3f99SYang Yang * least (R - E) pages in the userspace workingset, the refaulting page
ed8f3f99SYang Yang * is activated optimistically in the hope that (R - E) pages are actually
a528910eSJohannes Weiner * used less frequently than the refaulting page - or even not used at
a528910eSJohannes Weiner * all anymore.
a528910eSJohannes Weiner *
1899ad18SJohannes Weiner * That means if inactive cache is refaulting with a suitable refault
1899ad18SJohannes Weiner * distance, we assume the cache workingset is transitioning and put
ed8f3f99SYang Yang * pressure on the current workingset.
1899ad18SJohannes Weiner *
a528910eSJohannes Weiner * If this is wrong and demotion kicks in, the pages which are truly
a528910eSJohannes Weiner * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * But if this is right, the stale pages will be pushed out of memory
a528910eSJohannes Weiner * and the used pages get to stay in cache.
a528910eSJohannes Weiner *
1899ad18SJohannes Weiner *		Refaulting active pages
1899ad18SJohannes Weiner *
1899ad18SJohannes Weiner * If on the other hand the refaulting pages have recently been
1899ad18SJohannes Weiner * deactivated, it means that the active list is no longer protecting
1899ad18SJohannes Weiner * actively used cache from reclaim. The cache is NOT transitioning to
1899ad18SJohannes Weiner * a different workingset; the existing workingset is thrashing in the
1899ad18SJohannes Weiner * space allocated to the page cache.
1899ad18SJohannes Weiner *
a528910eSJohannes Weiner *
a528910eSJohannes Weiner *		Implementation
a528910eSJohannes Weiner *
31d8fcacSJohannes Weiner * For each node's LRU lists, a counter for inactive evictions and
31d8fcacSJohannes Weiner * activations is maintained (node->nonresident_age).
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * On eviction, a snapshot of this counter (along with some bits to
a97e7904SMatthew Wilcox * identify the node) is stored in the now empty page cache
a528910eSJohannes Weiner * slot of the evicted page.  This is called a shadow entry.
a528910eSJohannes Weiner *
a528910eSJohannes Weiner * On cache misses for which there are shadow entries, an eligible
a528910eSJohannes Weiner * refault distance will immediately activate the refaulting page.
a528910eSJohannes Weiner */
a528910eSJohannes Weiner
/*
 * Shadow entry layout, as produced by pack_shadow(), from the least
 * significant bit upwards:
 *
 *   [ workingset flag | node id | memcg id | eviction timestamp ]
 *
 * EVICTION_SHIFT is the number of bits of an unsigned long that are NOT
 * available to the timestamp: the bits an xarray value cannot hold plus
 * the three identifier fields above.  The *_ANON variants give up a
 * further SWAP_COUNT_SHIFT bits (presumably reserved for the swap
 * table, see swap_table.h - confirm).
 *
 * The EVICTION_MASK* values select the timestamp bits that survive.
 */
#define WORKINGSET_SHIFT	1
#define EVICTION_SHIFT							\
	((BITS_PER_LONG - BITS_PER_XA_VALUE) +				\
	 MEM_CGROUP_ID_SHIFT + NODES_SHIFT + WORKINGSET_SHIFT)
#define EVICTION_SHIFT_ANON	(SWAP_COUNT_SHIFT + EVICTION_SHIFT)

#define EVICTION_MASK		(~0UL >> EVICTION_SHIFT)
#define EVICTION_MASK_ANON	(~0UL >> EVICTION_SHIFT_ANON)
689c94f0SJohannes Weiner
612e4493SJohannes Weiner/*
612e4493SJohannes Weiner * Eviction timestamps need to be able to cover the full range of
a97e7904SMatthew Wilcox * actionable refaults. However, bits are tight in the xarray
612e4493SJohannes Weiner * entry, and after storing the identifier for the lruvec there might
612e4493SJohannes Weiner * not be enough left to represent every single actionable refault. In
612e4493SJohannes Weiner * that case, we have to sacrifice granularity for distance, and group
612e4493SJohannes Weiner * evictions into coarser buckets by shaving off lower timestamp bits.
612e4493SJohannes Weiner */
f3d652b0SKairui Songstatic unsigned int bucket_order[ANON_AND_FILE] __read_mostly;
612e4493SJohannes Weiner
1899ad18SJohannes Weinerstatic void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
f3d652b0SKairui Song			 bool workingset, bool file)
a528910eSJohannes Weiner{
f3d652b0SKairui Song	eviction &= file ? EVICTION_MASK : EVICTION_MASK_ANON;
23047a96SJohannes Weiner	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
1e6b1085SMel Gorman	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
3ebc57f4SMiaohe Lin	eviction = (eviction << WORKINGSET_SHIFT) | workingset;
a528910eSJohannes Weiner
3159f943SMatthew Wilcox	return xa_mk_value(eviction);
a528910eSJohannes Weiner}
a528910eSJohannes Weiner
1e6b1085SMel Gormanstatic void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
1899ad18SJohannes Weiner			  unsigned long *evictionp, bool *workingsetp)
a528910eSJohannes Weiner{
3159f943SMatthew Wilcox	unsigned long entry = xa_to_value(shadow);
1e6b1085SMel Gorman	int memcgid, nid;
1899ad18SJohannes Weiner	bool workingset;
a528910eSJohannes Weiner
3ebc57f4SMiaohe Lin	workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
3ebc57f4SMiaohe Lin	entry >>= WORKINGSET_SHIFT;
a528910eSJohannes Weiner	nid = entry & ((1UL << NODES_SHIFT) - 1);
a528910eSJohannes Weiner	entry >>= NODES_SHIFT;
23047a96SJohannes Weiner	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
23047a96SJohannes Weiner	entry >>= MEM_CGROUP_ID_SHIFT;
a528910eSJohannes Weiner
23047a96SJohannes Weiner	*memcgidp = memcgid;
1e6b1085SMel Gorman	*pgdat = NODE_DATA(nid);
ac35a490SYu Zhao	*evictionp = entry;
1899ad18SJohannes Weiner	*workingsetp = workingset;
a528910eSJohannes Weiner}
a528910eSJohannes Weiner
ac35a490SYu Zhao#ifdef CONFIG_LRU_GEN
ac35a490SYu Zhao
ac35a490SYu Zhaostatic void *lru_gen_eviction(struct folio *folio)
ac35a490SYu Zhao{
ac35a490SYu Zhao	int hist;
ac35a490SYu Zhao	unsigned long token;
ac35a490SYu Zhao	unsigned long min_seq;
ac35a490SYu Zhao	struct lruvec *lruvec;
391655feSYu Zhao	struct lru_gen_folio *lrugen;
ac35a490SYu Zhao	int type = folio_is_file_lru(folio);
ac35a490SYu Zhao	int delta = folio_nr_pages(folio);
ac35a490SYu Zhao	int refs = folio_lru_refs(folio);
4d5d14a0SYu Zhao	bool workingset = folio_test_workingset(folio);
4d5d14a0SYu Zhao	int tier = lru_tier_from_refs(refs, workingset);
b3ca9829SMuchun Song	struct mem_cgroup *memcg;
ac35a490SYu Zhao	struct pglist_data *pgdat = folio_pgdat(folio);
b3ca9829SMuchun Song	unsigned short memcg_id;
ac35a490SYu Zhao
f3d652b0SKairui Song	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH >
f3d652b0SKairui Song		     BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));
ac35a490SYu Zhao
b3ca9829SMuchun Song	rcu_read_lock();
b3ca9829SMuchun Song	memcg = folio_memcg(folio);
ac35a490SYu Zhao	lruvec = mem_cgroup_lruvec(memcg, pgdat);
ac35a490SYu Zhao	lrugen = &lruvec->lrugen;
ac35a490SYu Zhao	min_seq = READ_ONCE(lrugen->min_seq[type]);
ac35a490SYu Zhao	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
ac35a490SYu Zhao
ac35a490SYu Zhao	hist = lru_hist_from_seq(min_seq);
ac35a490SYu Zhao	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
b3ca9829SMuchun Song	memcg_id = mem_cgroup_private_id(memcg);
b3ca9829SMuchun Song	rcu_read_unlock();
ac35a490SYu Zhao
b3ca9829SMuchun Song	return pack_shadow(memcg_id, pgdat, token, workingset, type);
ac35a490SYu Zhao}
ac35a490SYu Zhao
ffcb5f52SNhat Pham/*
ffcb5f52SNhat Pham * Tests if the shadow entry is for a folio that was recently evicted.
d7f1afd0ST.J. Alumbaugh * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
ffcb5f52SNhat Pham */
b1a71694SYu Zhaostatic bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
f3d652b0SKairui Song				unsigned long *token, bool *workingset, bool file)
ffcb5f52SNhat Pham{
d7f1afd0ST.J. Alumbaugh	int memcg_id;
b1a71694SYu Zhao	unsigned long max_seq;
d7f1afd0ST.J. Alumbaugh	struct mem_cgroup *memcg;
d7f1afd0ST.J. Alumbaugh	struct pglist_data *pgdat;
ffcb5f52SNhat Pham
d7f1afd0ST.J. Alumbaugh	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
ffcb5f52SNhat Pham
e77786b4SShakeel Butt	memcg = mem_cgroup_from_private_id(memcg_id);
d7f1afd0ST.J. Alumbaugh	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
ffcb5f52SNhat Pham
b1a71694SYu Zhao	max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
f3d652b0SKairui Song	max_seq &= (file ? EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH;
b1a71694SYu Zhao
b1a71694SYu Zhao	return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
ffcb5f52SNhat Pham}
ffcb5f52SNhat Pham
ac35a490SYu Zhaostatic void lru_gen_refault(struct folio *folio, void *shadow)
ac35a490SYu Zhao{
3af0191aSKalesh Singh	bool recent;
ac35a490SYu Zhao	int hist, tier, refs;
ac35a490SYu Zhao	bool workingset;
ac35a490SYu Zhao	unsigned long token;
ac35a490SYu Zhao	struct lruvec *lruvec;
391655feSYu Zhao	struct lru_gen_folio *lrugen;
ac35a490SYu Zhao	int type = folio_is_file_lru(folio);
ac35a490SYu Zhao	int delta = folio_nr_pages(folio);
ac35a490SYu Zhao
ac35a490SYu Zhao	rcu_read_lock();
ac35a490SYu Zhao
f3d652b0SKairui Song	recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type);
3af0191aSKalesh Singh	if (lruvec != folio_lruvec(folio))
ffcb5f52SNhat Pham		goto unlock;
ffcb5f52SNhat Pham
3af0191aSKalesh Singh	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
3af0191aSKalesh Singh
3af0191aSKalesh Singh	if (!recent)
ac35a490SYu Zhao		goto unlock;
ac35a490SYu Zhao
ac35a490SYu Zhao	lrugen = &lruvec->lrugen;
ac35a490SYu Zhao
d7f1afd0ST.J. Alumbaugh	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
4d5d14a0SYu Zhao	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
4d5d14a0SYu Zhao	tier = lru_tier_from_refs(refs, workingset);
ac35a490SYu Zhao
ac35a490SYu Zhao	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
4d5d14a0SYu Zhao
4d5d14a0SYu Zhao	/* see folio_add_lru() where folio_set_active() will be called */
4d5d14a0SYu Zhao	if (lru_gen_in_fault())
3af0191aSKalesh Singh		mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
ac35a490SYu Zhao
4d5d14a0SYu Zhao	if (workingset) {
4d5d14a0SYu Zhao		folio_set_workingset(folio);
ac35a490SYu Zhao		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
4d5d14a0SYu Zhao	} else
53fbef56SMatthew Wilcox (Oracle)		set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
ac35a490SYu Zhaounlock:
ac35a490SYu Zhao	rcu_read_unlock();
ac35a490SYu Zhao}
ac35a490SYu Zhao
ac35a490SYu Zhao#else /* !CONFIG_LRU_GEN */
ac35a490SYu Zhao
ac35a490SYu Zhaostatic void *lru_gen_eviction(struct folio *folio)
ac35a490SYu Zhao{
ac35a490SYu Zhao	return NULL;
ac35a490SYu Zhao}
ac35a490SYu Zhao
/*
 * !CONFIG_LRU_GEN stub: never reports an entry as recent and leaves
 * @lruvec, @token and @workingset untouched.
 */
static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
				unsigned long *token, bool *workingset, bool file)
{
	return false;
}
ffcb5f52SNhat Pham
/* !CONFIG_LRU_GEN stub: nothing to account. */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}
ac35a490SYu Zhao
ac35a490SYu Zhao#endif /* CONFIG_LRU_GEN */
ac35a490SYu Zhao
31d8fcacSJohannes Weiner/**
31d8fcacSJohannes Weiner * workingset_age_nonresident - age non-resident entries as LRU ages
e755f4afSXiaofei Tan * @lruvec: the lruvec that was aged
31d8fcacSJohannes Weiner * @nr_pages: the number of pages to count
31d8fcacSJohannes Weiner *
31d8fcacSJohannes Weiner * As in-memory pages are aged, non-resident pages need to be aged as
31d8fcacSJohannes Weiner * well, in order for the refault distances later on to be comparable
31d8fcacSJohannes Weiner * to the in-memory dimensions. This function allows reclaim and LRU
31d8fcacSJohannes Weiner * operations to drive the non-resident aging along in parallel.
31d8fcacSJohannes Weiner */
31d8fcacSJohannes Weinervoid workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
b910718aSJohannes Weiner{
b910718aSJohannes Weiner	/*
b910718aSJohannes Weiner	 * Reclaiming a cgroup means reclaiming all its children in a
b910718aSJohannes Weiner	 * round-robin fashion. That means that each cgroup has an LRU
b910718aSJohannes Weiner	 * order that is composed of the LRU orders of its child
b910718aSJohannes Weiner	 * cgroups; and every page has an LRU position not just in the
b910718aSJohannes Weiner	 * cgroup that owns it, but in all of that group's ancestors.
b910718aSJohannes Weiner	 *
b910718aSJohannes Weiner	 * So when the physical inactive list of a leaf cgroup ages,
b910718aSJohannes Weiner	 * the virtual inactive lists of all its parents, including
b910718aSJohannes Weiner	 * the root cgroup's, age as well.
b910718aSJohannes Weiner	 */
b910718aSJohannes Weiner	do {
31d8fcacSJohannes Weiner		atomic_long_add(nr_pages, &lruvec->nonresident_age);
31d8fcacSJohannes Weiner	} while ((lruvec = parent_lruvec(lruvec)));
b910718aSJohannes Weiner}
b910718aSJohannes Weiner
a528910eSJohannes Weiner/**
8927f647SMatthew Wilcox (Oracle) * workingset_eviction - note the eviction of a folio from memory
b910718aSJohannes Weiner * @target_memcg: the cgroup that is causing the reclaim
8927f647SMatthew Wilcox (Oracle) * @folio: the folio being evicted
a528910eSJohannes Weiner *
8927f647SMatthew Wilcox (Oracle) * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
8927f647SMatthew Wilcox (Oracle) * of the evicted @folio so that a later refault can be detected.
a528910eSJohannes Weiner */
8927f647SMatthew Wilcox (Oracle)void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
a528910eSJohannes Weiner{
8927f647SMatthew Wilcox (Oracle)	struct pglist_data *pgdat = folio_pgdat(folio);
f3d652b0SKairui Song	int file = folio_is_file_lru(folio);
a528910eSJohannes Weiner	unsigned long eviction;
23047a96SJohannes Weiner	struct lruvec *lruvec;
b910718aSJohannes Weiner	int memcgid;
a528910eSJohannes Weiner
8927f647SMatthew Wilcox (Oracle)	/* Folio is fully exclusive and pins folio's memory cgroup pointer */
8927f647SMatthew Wilcox (Oracle)	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
8927f647SMatthew Wilcox (Oracle)	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
8927f647SMatthew Wilcox (Oracle)	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
23047a96SJohannes Weiner
ac35a490SYu Zhao	if (lru_gen_enabled())
ac35a490SYu Zhao		return lru_gen_eviction(folio);
ac35a490SYu Zhao
b910718aSJohannes Weiner	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
b910718aSJohannes Weiner	/* XXX: target_memcg can be NULL, go through lruvec */
e77786b4SShakeel Butt	memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec));
31d8fcacSJohannes Weiner	eviction = atomic_long_read(&lruvec->nonresident_age);
f3d652b0SKairui Song	eviction >>= bucket_order[file];
8927f647SMatthew Wilcox (Oracle)	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
8927f647SMatthew Wilcox (Oracle)	return pack_shadow(memcgid, pgdat, eviction,
f3d652b0SKairui Song			   folio_test_workingset(folio), file);
a528910eSJohannes Weiner}
a528910eSJohannes Weiner
a528910eSJohannes Weiner/**
ffcb5f52SNhat Pham * workingset_test_recent - tests if the shadow entry is for a folio that was
ffcb5f52SNhat Pham * recently evicted. Also fills in @workingset with the value unpacked from
ffcb5f52SNhat Pham * shadow.
ffcb5f52SNhat Pham * @shadow: the shadow entry to be tested.
ffcb5f52SNhat Pham * @file: whether the corresponding folio is from the file lru.
ffcb5f52SNhat Pham * @workingset: where the workingset value unpacked from shadow should
ffcb5f52SNhat Pham * be stored.
5a4d8944SNhat Pham * @flush: whether to flush cgroup rstat.
a528910eSJohannes Weiner *
ffcb5f52SNhat Pham * Return: true if the shadow is for a recently evicted folio; false otherwise.
a528910eSJohannes Weiner */
5a4d8944SNhat Phambool workingset_test_recent(void *shadow, bool file, bool *workingset,
5a4d8944SNhat Pham				bool flush)
a528910eSJohannes Weiner{
b910718aSJohannes Weiner	struct mem_cgroup *eviction_memcg;
b910718aSJohannes Weiner	struct lruvec *eviction_lruvec;
a528910eSJohannes Weiner	unsigned long refault_distance;
34e58cacSJohannes Weiner	unsigned long workingset_size;
162453bfSJohannes Weiner	unsigned long refault;
23047a96SJohannes Weiner	int memcgid;
ffcb5f52SNhat Pham	struct pglist_data *pgdat;
ffcb5f52SNhat Pham	unsigned long eviction;
a528910eSJohannes Weiner
b0068472SYosry Ahmed	if (lru_gen_enabled()) {
9cbfd1c3SYu Zhao		bool recent;
b0068472SYosry Ahmed
9cbfd1c3SYu Zhao		rcu_read_lock();
f3d652b0SKairui Song		recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction,
f3d652b0SKairui Song					     workingset, file);
b0068472SYosry Ahmed		rcu_read_unlock();
b0068472SYosry Ahmed		return recent;
b0068472SYosry Ahmed	}
b0068472SYosry Ahmed
9cbfd1c3SYu Zhao	rcu_read_lock();
ffcb5f52SNhat Pham	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
f3d652b0SKairui Song	eviction <<= bucket_order[file];
162453bfSJohannes Weiner
23047a96SJohannes Weiner	/*
23047a96SJohannes Weiner	 * Look up the memcg associated with the stored ID. It might
0995d7e5SMatthew Wilcox (Oracle)	 * have been deleted since the folio's eviction.
23047a96SJohannes Weiner	 *
23047a96SJohannes Weiner	 * Note that in rare events the ID could have been recycled
0995d7e5SMatthew Wilcox (Oracle)	 * for a new cgroup that refaults a shared folio. This is
23047a96SJohannes Weiner	 * impossible to tell from the available data. However, this
23047a96SJohannes Weiner	 * should be a rare and limited disturbance, and activations
23047a96SJohannes Weiner	 * are always speculative anyway. Ultimately, it's the aging
23047a96SJohannes Weiner	 * algorithm's job to shake out the minimum access frequency
23047a96SJohannes Weiner	 * for the active cache.
23047a96SJohannes Weiner	 *
23047a96SJohannes Weiner	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
23047a96SJohannes Weiner	 * would be better if the root_mem_cgroup existed in all
23047a96SJohannes Weiner	 * configurations instead.
23047a96SJohannes Weiner	 */
e77786b4SShakeel Butt	eviction_memcg = mem_cgroup_from_private_id(memcgid);
9cbfd1c3SYu Zhao	if (!mem_cgroup_tryget(eviction_memcg))
9cbfd1c3SYu Zhao		eviction_memcg = NULL;
b0068472SYosry Ahmed	rcu_read_unlock();
9cbfd1c3SYu Zhao
9cbfd1c3SYu Zhao	if (!mem_cgroup_disabled() && !eviction_memcg)
ffcb5f52SNhat Pham		return false;
7d7ef0a4SYosry Ahmed	/*
7d7ef0a4SYosry Ahmed	 * Flush stats (and potentially sleep) outside the RCU read section.
5a4d8944SNhat Pham	 *
5a4d8944SNhat Pham	 * Note that workingset_test_recent() itself might be called in RCU read
5a4d8944SNhat Pham	 * section (for e.g, in cachestat) - these callers need to skip flushing
5a4d8944SNhat Pham	 * stats (via the flush argument).
5a4d8944SNhat Pham	 *
7d7ef0a4SYosry Ahmed	 * XXX: With per-memcg flushing and thresholding, is ratelimiting
7d7ef0a4SYosry Ahmed	 * still needed here?
7d7ef0a4SYosry Ahmed	 */
5a4d8944SNhat Pham	if (flush)
7d7ef0a4SYosry Ahmed		mem_cgroup_flush_stats_ratelimited(eviction_memcg);
ffcb5f52SNhat Pham
b910718aSJohannes Weiner	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
31d8fcacSJohannes Weiner	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
162453bfSJohannes Weiner
162453bfSJohannes Weiner	/*
1899ad18SJohannes Weiner	 * Calculate the refault distance
162453bfSJohannes Weiner	 *
1899ad18SJohannes Weiner	 * The unsigned subtraction here gives an accurate distance
31d8fcacSJohannes Weiner	 * across nonresident_age overflows in most cases. There is a
1899ad18SJohannes Weiner	 * special case: usually, shadow entries have a short lifetime
1899ad18SJohannes Weiner	 * and are either refaulted or reclaimed along with the inode
1899ad18SJohannes Weiner	 * before they get too old.  But it is not impossible for the
31d8fcacSJohannes Weiner	 * nonresident_age to lap a shadow entry in the field, which
31d8fcacSJohannes Weiner	 * can then result in a false small refault distance, leading
31d8fcacSJohannes Weiner	 * to a false activation should this old entry actually
31d8fcacSJohannes Weiner	 * refault again.  However, earlier kernels used to deactivate
1899ad18SJohannes Weiner	 * unconditionally with *every* reclaim invocation for the
1899ad18SJohannes Weiner	 * longest time, so the occasional inappropriate activation
1899ad18SJohannes Weiner	 * leading to pressure on the active list is not a problem.
162453bfSJohannes Weiner	 */
f3d652b0SKairui Song	refault_distance = ((refault - eviction) &
f3d652b0SKairui Song			    (file ? EVICTION_MASK : EVICTION_MASK_ANON));
162453bfSJohannes Weiner
b910718aSJohannes Weiner	/*
1899ad18SJohannes Weiner	 * Compare the distance to the existing workingset size. We
34e58cacSJohannes Weiner	 * don't activate pages that couldn't stay resident even if
aae466b0SJoonsoo Kim	 * all the memory was available to the workingset. Whether
aae466b0SJoonsoo Kim	 * workingset competition needs to consider anon or not depends
ed8f3f99SYang Yang	 * on having free swap space.
1899ad18SJohannes Weiner	 */
34e58cacSJohannes Weiner	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
aae466b0SJoonsoo Kim	if (!file) {
aae466b0SJoonsoo Kim		workingset_size += lruvec_page_state(eviction_lruvec,
aae466b0SJoonsoo Kim						     NR_INACTIVE_FILE);
aae466b0SJoonsoo Kim	}
f78dfc7bSJohannes Weiner	if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
34e58cacSJohannes Weiner		workingset_size += lruvec_page_state(eviction_lruvec,
34e58cacSJohannes Weiner						     NR_ACTIVE_ANON);
aae466b0SJoonsoo Kim		if (file) {
aae466b0SJoonsoo Kim			workingset_size += lruvec_page_state(eviction_lruvec,
aae466b0SJoonsoo Kim						     NR_INACTIVE_ANON);
aae466b0SJoonsoo Kim		}
34e58cacSJohannes Weiner	}
ffcb5f52SNhat Pham
b0068472SYosry Ahmed	mem_cgroup_put(eviction_memcg);
ffcb5f52SNhat Pham	return refault_distance <= workingset_size;
ffcb5f52SNhat Pham}
ffcb5f52SNhat Pham
ffcb5f52SNhat Pham/**
ffcb5f52SNhat Pham * workingset_refault - Evaluate the refault of a previously evicted folio.
ffcb5f52SNhat Pham * @folio: The freshly allocated replacement folio.
ffcb5f52SNhat Pham * @shadow: Shadow entry of the evicted folio.
ffcb5f52SNhat Pham *
ffcb5f52SNhat Pham * Calculates and evaluates the refault distance of the previously
ffcb5f52SNhat Pham * evicted folio in the context of the node and the memcg whose memory
ffcb5f52SNhat Pham * pressure caused the eviction.
ffcb5f52SNhat Pham */
ffcb5f52SNhat Phamvoid workingset_refault(struct folio *folio, void *shadow)
ffcb5f52SNhat Pham{
ffcb5f52SNhat Pham	bool file = folio_is_file_lru(folio);
fe132152SMuchun Song	struct mem_cgroup *memcg;
ffcb5f52SNhat Pham	struct lruvec *lruvec;
ffcb5f52SNhat Pham	bool workingset;
ffcb5f52SNhat Pham	long nr;
ffcb5f52SNhat Pham
9cbfd1c3SYu Zhao	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
9cbfd1c3SYu Zhao
ffcb5f52SNhat Pham	if (lru_gen_enabled()) {
ffcb5f52SNhat Pham		lru_gen_refault(folio, shadow);
ffcb5f52SNhat Pham		return;
ffcb5f52SNhat Pham	}
ffcb5f52SNhat Pham
ffcb5f52SNhat Pham	/*
ffcb5f52SNhat Pham	 * The activation decision for this folio is made at the level
ffcb5f52SNhat Pham	 * where the eviction occurred, as that is where the LRU order
ffcb5f52SNhat Pham	 * during folio reclaim is being determined.
ffcb5f52SNhat Pham	 *
ffcb5f52SNhat Pham	 * However, the cgroup that will own the folio is the one that
b0068472SYosry Ahmed	 * is actually experiencing the refault event. Make sure the folio is
b0068472SYosry Ahmed	 * locked to guarantee folio_memcg() stability throughout.
ffcb5f52SNhat Pham	 */
ffcb5f52SNhat Pham	nr = folio_nr_pages(folio);
fe132152SMuchun Song	memcg = get_mem_cgroup_from_folio(folio);
fe132152SMuchun Song	lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio));
ffcb5f52SNhat Pham	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
ffcb5f52SNhat Pham
5a4d8944SNhat Pham	if (!workingset_test_recent(shadow, file, &workingset, true))
fe132152SMuchun Song		goto out;
1899ad18SJohannes Weiner
0995d7e5SMatthew Wilcox (Oracle)	folio_set_active(folio);
0995d7e5SMatthew Wilcox (Oracle)	workingset_age_nonresident(lruvec, nr);
0995d7e5SMatthew Wilcox (Oracle)	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
1899ad18SJohannes Weiner
0995d7e5SMatthew Wilcox (Oracle)	/* Folio was active prior to eviction */
1899ad18SJohannes Weiner	if (workingset) {
0995d7e5SMatthew Wilcox (Oracle)		folio_set_workingset(folio);
6e1ca48dSVishal Moola (Oracle)		/*
6e1ca48dSVishal Moola (Oracle)		 * XXX: Move to folio_add_lru() when it supports new vs
6e1ca48dSVishal Moola (Oracle)		 * putback
6e1ca48dSVishal Moola (Oracle)		 */
0538a82cSJohannes Weiner		lru_note_cost_refault(folio);
0995d7e5SMatthew Wilcox (Oracle)		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
a528910eSJohannes Weiner	}
fe132152SMuchun Songout:
fe132152SMuchun Song	mem_cgroup_put(memcg);
a528910eSJohannes Weiner}
a528910eSJohannes Weiner
/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 *
 * Ages the nonresident state of the folio's lruvec by the number of
 * pages in @folio. Does nothing for folios that are not charged to a
 * memcg while memcg is enabled.
 */
void workingset_activation(struct folio *folio)
{
	struct lruvec *lruvec;

	/*
	 * Non-memcg pages are filtered out here: unmap, for example,
	 * can call mark_page_accessed() on VDSO pages.
	 */
	if (!mem_cgroup_disabled() && !folio_memcg_charged(folio))
		return;

	rcu_read_lock();
	lruvec = folio_lruvec(folio);
	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
	rcu_read_unlock();
}
449dd698SJohannes Weiner
449dd698SJohannes Weiner/*
449dd698SJohannes Weiner * Shadow entries reflect the share of the working set that does not
449dd698SJohannes Weiner * fit into memory, so their number depends on the access pattern of
449dd698SJohannes Weiner * the workload.  In most cases, they will refault or get reclaimed
449dd698SJohannes Weiner * along with the inode, but a (malicious) workload that streams
449dd698SJohannes Weiner * through files with a total size several times that of available
449dd698SJohannes Weiner * memory, while preventing the inodes from being reclaimed, can
449dd698SJohannes Weiner * create excessive amounts of shadow nodes.  To keep a lid on this,
449dd698SJohannes Weiner * track shadow nodes and reclaim them when they grow way past the
449dd698SJohannes Weiner * point where they would still be useful.
449dd698SJohannes Weiner */
449dd698SJohannes Weiner
9bbdc0f3SMuchun Songstruct list_lru shadow_nodes;
14b46879SJohannes Weiner
a97e7904SMatthew Wilcoxvoid workingset_update_node(struct xa_node *node)
14b46879SJohannes Weiner{
4715c6a7SShakeel Butt	struct page *page = virt_to_page(node);
2386eef2SSebastian Andrzej Siewior
14b46879SJohannes Weiner	/*
14b46879SJohannes Weiner	 * Track non-empty nodes that contain only shadow entries;
14b46879SJohannes Weiner	 * unlink those that contain pages or are being freed.
14b46879SJohannes Weiner	 *
14b46879SJohannes Weiner	 * Avoid acquiring the list_lru lock when the nodes are
14b46879SJohannes Weiner	 * already where they should be. The list_empty() test is safe
b93b0163SMatthew Wilcox	 * as node->private_list is protected by the i_pages lock.
14b46879SJohannes Weiner	 */
551c643fSPedro Falcato	lockdep_assert_held(&node->array->xa_lock);
68d48e6aSJohannes Weiner
01959dfeSMatthew Wilcox	if (node->count && node->count == node->nr_values) {
68d48e6aSJohannes Weiner		if (list_empty(&node->private_list)) {
0a97c01cSNhat Pham			list_lru_add_obj(&shadow_nodes, &node->private_list);
4715c6a7SShakeel Butt			__inc_node_page_state(page, WORKINGSET_NODES);
68d48e6aSJohannes Weiner		}
14b46879SJohannes Weiner	} else {
68d48e6aSJohannes Weiner		if (!list_empty(&node->private_list)) {
0a97c01cSNhat Pham			list_lru_del_obj(&shadow_nodes, &node->private_list);
4715c6a7SShakeel Butt			__dec_node_page_state(page, WORKINGSET_NODES);
68d48e6aSJohannes Weiner		}
14b46879SJohannes Weiner	}
14b46879SJohannes Weiner}
449dd698SJohannes Weiner
449dd698SJohannes Weinerstatic unsigned long count_shadow_nodes(struct shrinker *shrinker,
449dd698SJohannes Weiner					struct shrink_control *sc)
449dd698SJohannes Weiner{
449dd698SJohannes Weiner	unsigned long max_nodes;
14b46879SJohannes Weiner	unsigned long nodes;
95f9ab2dSJohannes Weiner	unsigned long pages;
449dd698SJohannes Weiner
14b46879SJohannes Weiner	nodes = list_lru_shrink_count(&shadow_nodes, sc);
725cac1cSMiaohe Lin	if (!nodes)
725cac1cSMiaohe Lin		return SHRINK_EMPTY;
449dd698SJohannes Weiner
449dd698SJohannes Weiner	/*
a97e7904SMatthew Wilcox	 * Approximate a reasonable limit for the nodes
b5388998SJohannes Weiner	 * containing shadow entries. We don't need to keep more
b5388998SJohannes Weiner	 * shadow entries than possible pages on the active list,
b5388998SJohannes Weiner	 * since refault distances bigger than that are dismissed.
b5388998SJohannes Weiner	 *
b5388998SJohannes Weiner	 * The size of the active list converges toward 100% of
b5388998SJohannes Weiner	 * overall page cache as memory grows, with only a tiny
b5388998SJohannes Weiner	 * inactive list. Assume the total cache size for that.
b5388998SJohannes Weiner	 *
b5388998SJohannes Weiner	 * Nodes might be sparsely populated, with only one shadow
b5388998SJohannes Weiner	 * entry in the extreme case. Obviously, we cannot keep one
b5388998SJohannes Weiner	 * node for every eligible shadow entry, so compromise on a
b5388998SJohannes Weiner	 * worst-case density of 1/8th. Below that, not all eligible
b5388998SJohannes Weiner	 * refaults can be detected anymore.
449dd698SJohannes Weiner	 *
a97e7904SMatthew Wilcox	 * On 64-bit with 7 xa_nodes per page and 64 slots
449dd698SJohannes Weiner	 * each, this will reclaim shadow entries when they consume
b5388998SJohannes Weiner	 * ~1.8% of available memory:
449dd698SJohannes Weiner	 *
a97e7904SMatthew Wilcox	 * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
449dd698SJohannes Weiner	 */
95f9ab2dSJohannes Weiner#ifdef CONFIG_MEMCG
b5388998SJohannes Weiner	if (sc->memcg) {
95f9ab2dSJohannes Weiner		struct lruvec *lruvec;
2b487e59SJohannes Weiner		int i;
95f9ab2dSJohannes Weiner
d4a5b369SShakeel Butt		mem_cgroup_flush_stats_ratelimited(sc->memcg);
867e5e1dSJohannes Weiner		lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
*7404bd37SQi Zheng
2b487e59SJohannes Weiner		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
*7404bd37SQi Zheng			pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1);
*7404bd37SQi Zheng
d42f3245SRoman Gushchin		pages += lruvec_page_state_local(
d42f3245SRoman Gushchin			lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
d42f3245SRoman Gushchin		pages += lruvec_page_state_local(
d42f3245SRoman Gushchin			lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
95f9ab2dSJohannes Weiner	} else
95f9ab2dSJohannes Weiner#endif
95f9ab2dSJohannes Weiner		pages = node_present_pages(sc->nid);
95f9ab2dSJohannes Weiner
dad4f140SLinus Torvalds	max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
449dd698SJohannes Weiner
14b46879SJohannes Weiner	if (nodes <= max_nodes)
449dd698SJohannes Weiner		return 0;
14b46879SJohannes Weiner	return nodes - max_nodes;
449dd698SJohannes Weiner}
449dd698SJohannes Weiner
/*
 * shadow_lru_isolate - list_lru walk callback that reclaims one shadow node.
 * @item: the node's private_list linkage on the shadow_nodes LRU.
 * @lru: the list being walked; lru->lock is held on entry.
 * @arg: unused (scan_shadow_nodes() passes NULL).
 *
 * Returns LRU_RETRY when the i_pages lock or the host's i_lock cannot be
 * taken with a trylock, and LRU_REMOVED_RETRY once the node has been taken
 * off the list. lru->lock has been released on every return path.
 */
449dd698SJohannes Weinerstatic enum lru_status shadow_lru_isolate(struct list_head *item,
3f97b163SVladimir Davydov					  struct list_lru_one *lru,
da0c0251SKairui Song					  void *arg) __must_hold(lru->lock)
449dd698SJohannes Weiner{
a97e7904SMatthew Wilcox	struct xa_node *node = container_of(item, struct xa_node, private_list);
449dd698SJohannes Weiner	struct address_space *mapping;
449dd698SJohannes Weiner	int ret;
449dd698SJohannes Weiner
449dd698SJohannes Weiner	/*
f82cd2f0SMatthew Wilcox (Oracle)	 * Page cache insertions and deletions synchronously maintain
b93b0163SMatthew Wilcox	 * the shadow node LRU under the i_pages lock and the
da0c0251SKairui Song	 * &lru->lock. Because the page cache tree is emptied before
da0c0251SKairui Song	 * the inode can be destroyed, holding the &lru->lock pins any
a97e7904SMatthew Wilcox	 * address_space that has nodes on the LRU.
449dd698SJohannes Weiner	 *
b93b0163SMatthew Wilcox	 * We can then safely transition to the i_pages lock to
449dd698SJohannes Weiner	 * pin only the address_space of the particular node we want
da0c0251SKairui Song	 * to reclaim, take the node off-LRU, and drop the &lru->lock.
449dd698SJohannes Weiner	 */
449dd698SJohannes Weiner
01959dfeSMatthew Wilcox	mapping = container_of(node->array, struct address_space, i_pages);
449dd698SJohannes Weiner
449dd698SJohannes Weiner	/* Coming from the list, invert the lock order */
b93b0163SMatthew Wilcox	if (!xa_trylock(&mapping->i_pages)) {
da0c0251SKairui Song		spin_unlock_irq(&lru->lock);
449dd698SJohannes Weiner		ret = LRU_RETRY;
449dd698SJohannes Weiner		goto out;
449dd698SJohannes Weiner	}
449dd698SJohannes Weiner
5649d113SYang Yang	/* For page cache we need to hold i_lock */
5649d113SYang Yang	if (mapping->host != NULL) {
51b8c1feSJohannes Weiner		if (!spin_trylock(&mapping->host->i_lock)) {
			/* Back out in reverse order of acquisition */
51b8c1feSJohannes Weiner			xa_unlock(&mapping->i_pages);
da0c0251SKairui Song			spin_unlock_irq(&lru->lock);
51b8c1feSJohannes Weiner			ret = LRU_RETRY;
51b8c1feSJohannes Weiner			goto out;
51b8c1feSJohannes Weiner		}
5649d113SYang Yang	}
51b8c1feSJohannes Weiner
	/* Both locks held: take the node off the LRU and fix up the counter */
3f97b163SVladimir Davydov	list_lru_isolate(lru, item);
4715c6a7SShakeel Butt	__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
68d48e6aSJohannes Weiner
	/*
	 * Plain unlock here; the _irq variant is used further down when
	 * the i_pages lock is released with xa_unlock_irq().
	 */
da0c0251SKairui Song	spin_unlock(&lru->lock);
449dd698SJohannes Weiner
449dd698SJohannes Weiner	/*
449dd698SJohannes Weiner	 * The nodes should only contain one or more shadow entries,
449dd698SJohannes Weiner	 * no pages, so we expect to be able to remove them all and
449dd698SJohannes Weiner	 * delete and free the empty node afterwards.
449dd698SJohannes Weiner	 */
01959dfeSMatthew Wilcox	if (WARN_ON_ONCE(!node->nr_values))
b936887eSJohannes Weiner		goto out_invalid;
01959dfeSMatthew Wilcox	if (WARN_ON_ONCE(node->count != node->nr_values))
b936887eSJohannes Weiner		goto out_invalid;
f82cd2f0SMatthew Wilcox (Oracle)	xa_delete_node(node, workingset_update_node);
f3b566d7SChen Ridong	mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
449dd698SJohannes Weiner
b936887eSJohannes Weinerout_invalid:
6ca342d0SSebastian Andrzej Siewior	xa_unlock_irq(&mapping->i_pages);
5649d113SYang Yang	if (mapping->host != NULL) {
		/* If the mapping is now shrinkable, put its inode on the inode LRU */
51b8c1feSJohannes Weiner		if (mapping_shrinkable(mapping))
4c6b4087SMateusz Guzik			inode_lru_list_add(mapping->host);
51b8c1feSJohannes Weiner		spin_unlock(&mapping->host->i_lock);
5649d113SYang Yang	}
	/* Reported as removed whether or not the node itself was deleted */
449dd698SJohannes Weiner	ret = LRU_REMOVED_RETRY;
449dd698SJohannes Weinerout:
	/* Every lock taken or inherited above has been dropped by now */
449dd698SJohannes Weiner	cond_resched();
449dd698SJohannes Weiner	return ret;
449dd698SJohannes Weiner}
449dd698SJohannes Weiner
449dd698SJohannes Weinerstatic unsigned long scan_shadow_nodes(struct shrinker *shrinker,
449dd698SJohannes Weiner				       struct shrink_control *sc)
449dd698SJohannes Weiner{
b93b0163SMatthew Wilcox	/* list_lru lock nests inside the IRQ-safe i_pages lock */
6b51e881SSebastian Andrzej Siewior	return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
6b51e881SSebastian Andrzej Siewior					NULL);
449dd698SJohannes Weiner}
449dd698SJohannes Weiner
449dd698SJohannes Weiner/*
449dd698SJohannes Weiner * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
b93b0163SMatthew Wilcox * i_pages lock.
449dd698SJohannes Weiner */
449dd698SJohannes Weinerstatic struct lock_class_key shadow_nodes_key;
449dd698SJohannes Weiner
449dd698SJohannes Weinerstatic int __init workingset_init(void)
449dd698SJohannes Weiner{
f3d652b0SKairui Song	unsigned int timestamp_bits, timestamp_bits_anon;
219c666eSQi Zheng	struct shrinker *workingset_shadow_shrinker;
612e4493SJohannes Weiner	unsigned int max_order;
219c666eSQi Zheng	int ret = -ENOMEM;
449dd698SJohannes Weiner
612e4493SJohannes Weiner	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
612e4493SJohannes Weiner	/*
612e4493SJohannes Weiner	 * Calculate the eviction bucket size to cover the longest
612e4493SJohannes Weiner	 * actionable refault distance, which is currently half of
612e4493SJohannes Weiner	 * memory (totalram_pages/2). However, memory hotplug may add
612e4493SJohannes Weiner	 * some more pages at runtime, so keep working with up to
612e4493SJohannes Weiner	 * double the initial memory by using totalram_pages as-is.
612e4493SJohannes Weiner	 */
612e4493SJohannes Weiner	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
f3d652b0SKairui Song	timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON;
ca79b0c2SArun KS	max_order = fls_long(totalram_pages() - 1);
f3d652b0SKairui Song	if (max_order > (BITS_PER_LONG - EVICTION_SHIFT))
f3d652b0SKairui Song		bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits;
f3d652b0SKairui Song	if (max_order > timestamp_bits_anon)
f3d652b0SKairui Song		bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon;
f3d652b0SKairui Song	pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n",
f3d652b0SKairui Song		timestamp_bits, timestamp_bits_anon, max_order,
f3d652b0SKairui Song		bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]);
612e4493SJohannes Weiner
219c666eSQi Zheng	workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
219c666eSQi Zheng						    SHRINKER_MEMCG_AWARE,
219c666eSQi Zheng						    "mm-shadow");
219c666eSQi Zheng	if (!workingset_shadow_shrinker)
449dd698SJohannes Weiner		goto err;
219c666eSQi Zheng
3f28bbe5SKairui Song	ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
3f28bbe5SKairui Song				      &shadow_nodes_key);
449dd698SJohannes Weiner	if (ret)
449dd698SJohannes Weiner		goto err_list_lru;
219c666eSQi Zheng
219c666eSQi Zheng	workingset_shadow_shrinker->count_objects = count_shadow_nodes;
219c666eSQi Zheng	workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
219c666eSQi Zheng	/* ->count reports only fully expendable nodes */
219c666eSQi Zheng	workingset_shadow_shrinker->seeks = 0;
219c666eSQi Zheng
219c666eSQi Zheng	shrinker_register(workingset_shadow_shrinker);
449dd698SJohannes Weiner	return 0;
449dd698SJohannes Weinererr_list_lru:
219c666eSQi Zheng	shrinker_free(workingset_shadow_shrinker);
449dd698SJohannes Weinererr:
449dd698SJohannes Weiner	return ret;
449dd698SJohannes Weiner}
449dd698SJohannes Weinermodule_init(workingset_init);