xref: /linux/mm/page_counter.c (revision bbfd5594756011167b8f8de9a00e0c946afda1e6)
// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */
73e32cb2eSJohannes Weiner 
83e32cb2eSJohannes Weiner #include <linux/page_counter.h>
93e32cb2eSJohannes Weiner #include <linux/atomic.h>
103e32cb2eSJohannes Weiner #include <linux/kernel.h>
113e32cb2eSJohannes Weiner #include <linux/string.h>
123e32cb2eSJohannes Weiner #include <linux/sched.h>
133e32cb2eSJohannes Weiner #include <linux/bug.h>
143e32cb2eSJohannes Weiner #include <asm/page.h>
153e32cb2eSJohannes Weiner 
track_protection(struct page_counter * c)16f77bd4b1SRoman Gushchin static bool track_protection(struct page_counter *c)
17f77bd4b1SRoman Gushchin {
18f77bd4b1SRoman Gushchin 	return c->protection_support;
19f77bd4b1SRoman Gushchin }
20f77bd4b1SRoman Gushchin 
propagate_protected_usage(struct page_counter * c,unsigned long usage)21bf8d5d52SRoman Gushchin static void propagate_protected_usage(struct page_counter *c,
22bf8d5d52SRoman Gushchin 				      unsigned long usage)
2323067153SRoman Gushchin {
24bf8d5d52SRoman Gushchin 	unsigned long protected, old_protected;
2523067153SRoman Gushchin 	long delta;
2623067153SRoman Gushchin 
2723067153SRoman Gushchin 	if (!c->parent)
2823067153SRoman Gushchin 		return;
2923067153SRoman Gushchin 
30cfdab60bSShakeel Butt 	protected = min(usage, READ_ONCE(c->min));
31cfdab60bSShakeel Butt 	old_protected = atomic_long_read(&c->min_usage);
32cfdab60bSShakeel Butt 	if (protected != old_protected) {
33bf8d5d52SRoman Gushchin 		old_protected = atomic_long_xchg(&c->min_usage, protected);
34bf8d5d52SRoman Gushchin 		delta = protected - old_protected;
35bf8d5d52SRoman Gushchin 		if (delta)
36bf8d5d52SRoman Gushchin 			atomic_long_add(delta, &c->parent->children_min_usage);
37bf8d5d52SRoman Gushchin 	}
38bf8d5d52SRoman Gushchin 
39cfdab60bSShakeel Butt 	protected = min(usage, READ_ONCE(c->low));
40cfdab60bSShakeel Butt 	old_protected = atomic_long_read(&c->low_usage);
41cfdab60bSShakeel Butt 	if (protected != old_protected) {
42bf8d5d52SRoman Gushchin 		old_protected = atomic_long_xchg(&c->low_usage, protected);
43bf8d5d52SRoman Gushchin 		delta = protected - old_protected;
4423067153SRoman Gushchin 		if (delta)
4523067153SRoman Gushchin 			atomic_long_add(delta, &c->parent->children_low_usage);
4623067153SRoman Gushchin 	}
47bf8d5d52SRoman Gushchin }
4823067153SRoman Gushchin 
493e32cb2eSJohannes Weiner /**
503e32cb2eSJohannes Weiner  * page_counter_cancel - take pages out of the local counter
513e32cb2eSJohannes Weiner  * @counter: counter
523e32cb2eSJohannes Weiner  * @nr_pages: number of pages to cancel
533e32cb2eSJohannes Weiner  */
page_counter_cancel(struct page_counter * counter,unsigned long nr_pages)5464f21993SJohannes Weiner void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
553e32cb2eSJohannes Weiner {
563e32cb2eSJohannes Weiner 	long new;
573e32cb2eSJohannes Weiner 
58bbec2e15SRoman Gushchin 	new = atomic_long_sub_return(nr_pages, &counter->usage);
593e32cb2eSJohannes Weiner 	/* More uncharges than charges? */
609317d0ffSJohannes Weiner 	if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
619317d0ffSJohannes Weiner 		      new, nr_pages)) {
629317d0ffSJohannes Weiner 		new = 0;
639317d0ffSJohannes Weiner 		atomic_long_set(&counter->usage, new);
649317d0ffSJohannes Weiner 	}
65f77bd4b1SRoman Gushchin 	if (track_protection(counter))
669317d0ffSJohannes Weiner 		propagate_protected_usage(counter, new);
673e32cb2eSJohannes Weiner }
683e32cb2eSJohannes Weiner 
693e32cb2eSJohannes Weiner /**
703e32cb2eSJohannes Weiner  * page_counter_charge - hierarchically charge pages
713e32cb2eSJohannes Weiner  * @counter: counter
723e32cb2eSJohannes Weiner  * @nr_pages: number of pages to charge
733e32cb2eSJohannes Weiner  *
743e32cb2eSJohannes Weiner  * NOTE: This does not consider any configured counter limits.
753e32cb2eSJohannes Weiner  */
page_counter_charge(struct page_counter * counter,unsigned long nr_pages)763e32cb2eSJohannes Weiner void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
773e32cb2eSJohannes Weiner {
783e32cb2eSJohannes Weiner 	struct page_counter *c;
79f77bd4b1SRoman Gushchin 	bool protection = track_protection(counter);
803e32cb2eSJohannes Weiner 
813e32cb2eSJohannes Weiner 	for (c = counter; c; c = c->parent) {
823e32cb2eSJohannes Weiner 		long new;
833e32cb2eSJohannes Weiner 
84bbec2e15SRoman Gushchin 		new = atomic_long_add_return(nr_pages, &c->usage);
85f77bd4b1SRoman Gushchin 		if (protection)
86a6f23d14SMichal Koutný 			propagate_protected_usage(c, new);
873e32cb2eSJohannes Weiner 		/*
883e32cb2eSJohannes Weiner 		 * This is indeed racy, but we can live with some
893e32cb2eSJohannes Weiner 		 * inaccuracy in the watermark.
90c6f53ed8SDavid Finkel 		 *
91c6f53ed8SDavid Finkel 		 * Notably, we have two watermarks to allow for both a globally
92c6f53ed8SDavid Finkel 		 * visible peak and one that can be reset at a smaller scope.
93c6f53ed8SDavid Finkel 		 *
94c6f53ed8SDavid Finkel 		 * Since we reset both watermarks when the global reset occurs,
95c6f53ed8SDavid Finkel 		 * we can guarantee that watermark >= local_watermark, so we
96c6f53ed8SDavid Finkel 		 * don't need to do both comparisons every time.
97c6f53ed8SDavid Finkel 		 *
98c6f53ed8SDavid Finkel 		 * On systems with branch predictors, the inner condition should
99c6f53ed8SDavid Finkel 		 * be almost free.
1003e32cb2eSJohannes Weiner 		 */
101c6f53ed8SDavid Finkel 		if (new > READ_ONCE(c->local_watermark)) {
102c6f53ed8SDavid Finkel 			WRITE_ONCE(c->local_watermark, new);
1036e4bd50fSQian Cai 			if (new > READ_ONCE(c->watermark))
1046e4bd50fSQian Cai 				WRITE_ONCE(c->watermark, new);
1053e32cb2eSJohannes Weiner 		}
1063e32cb2eSJohannes Weiner 	}
107c6f53ed8SDavid Finkel }
1083e32cb2eSJohannes Weiner 
1093e32cb2eSJohannes Weiner /**
1103e32cb2eSJohannes Weiner  * page_counter_try_charge - try to hierarchically charge pages
1113e32cb2eSJohannes Weiner  * @counter: counter
1123e32cb2eSJohannes Weiner  * @nr_pages: number of pages to charge
1133e32cb2eSJohannes Weiner  * @fail: points first counter to hit its limit, if any
1143e32cb2eSJohannes Weiner  *
1156071ca52SJohannes Weiner  * Returns %true on success, or %false and @fail if the counter or one
1166071ca52SJohannes Weiner  * of its ancestors has hit its configured limit.
1173e32cb2eSJohannes Weiner  */
page_counter_try_charge(struct page_counter * counter,unsigned long nr_pages,struct page_counter ** fail)1186071ca52SJohannes Weiner bool page_counter_try_charge(struct page_counter *counter,
1193e32cb2eSJohannes Weiner 			     unsigned long nr_pages,
1203e32cb2eSJohannes Weiner 			     struct page_counter **fail)
1213e32cb2eSJohannes Weiner {
1223e32cb2eSJohannes Weiner 	struct page_counter *c;
123f77bd4b1SRoman Gushchin 	bool protection = track_protection(counter);
124*0e2759afSShakeel Butt 	bool track_failcnt = counter->track_failcnt;
1253e32cb2eSJohannes Weiner 
1263e32cb2eSJohannes Weiner 	for (c = counter; c; c = c->parent) {
1273e32cb2eSJohannes Weiner 		long new;
1283e32cb2eSJohannes Weiner 		/*
1293e32cb2eSJohannes Weiner 		 * Charge speculatively to avoid an expensive CAS.  If
1303e32cb2eSJohannes Weiner 		 * a bigger charge fails, it might falsely lock out a
1313e32cb2eSJohannes Weiner 		 * racing smaller charge and send it into reclaim
1323e32cb2eSJohannes Weiner 		 * early, but the error is limited to the difference
1333e32cb2eSJohannes Weiner 		 * between the two sizes, which is less than 2M/4M in
1343e32cb2eSJohannes Weiner 		 * case of a THP locking out a regular page charge.
1353e32cb2eSJohannes Weiner 		 *
1363e32cb2eSJohannes Weiner 		 * The atomic_long_add_return() implies a full memory
1373e32cb2eSJohannes Weiner 		 * barrier between incrementing the count and reading
138d437024eSMiaohe Lin 		 * the limit.  When racing with page_counter_set_max(),
1393e32cb2eSJohannes Weiner 		 * we either see the new limit or the setter sees the
1403e32cb2eSJohannes Weiner 		 * counter has changed and retries.
1413e32cb2eSJohannes Weiner 		 */
142bbec2e15SRoman Gushchin 		new = atomic_long_add_return(nr_pages, &c->usage);
143bbec2e15SRoman Gushchin 		if (new > c->max) {
144bbec2e15SRoman Gushchin 			atomic_long_sub(nr_pages, &c->usage);
1453e32cb2eSJohannes Weiner 			/*
1463e32cb2eSJohannes Weiner 			 * This is racy, but we can live with some
1476e4bd50fSQian Cai 			 * inaccuracy in the failcnt which is only used
1486e4bd50fSQian Cai 			 * to report stats.
1493e32cb2eSJohannes Weiner 			 */
150*0e2759afSShakeel Butt 			if (track_failcnt)
1516e4bd50fSQian Cai 				data_race(c->failcnt++);
1523e32cb2eSJohannes Weiner 			*fail = c;
1533e32cb2eSJohannes Weiner 			goto failed;
1543e32cb2eSJohannes Weiner 		}
155f77bd4b1SRoman Gushchin 		if (protection)
156a6f23d14SMichal Koutný 			propagate_protected_usage(c, new);
157f77bd4b1SRoman Gushchin 
158c6f53ed8SDavid Finkel 		/* see comment on page_counter_charge */
159c6f53ed8SDavid Finkel 		if (new > READ_ONCE(c->local_watermark)) {
160c6f53ed8SDavid Finkel 			WRITE_ONCE(c->local_watermark, new);
1616e4bd50fSQian Cai 			if (new > READ_ONCE(c->watermark))
1626e4bd50fSQian Cai 				WRITE_ONCE(c->watermark, new);
1633e32cb2eSJohannes Weiner 		}
164c6f53ed8SDavid Finkel 	}
1656071ca52SJohannes Weiner 	return true;
1663e32cb2eSJohannes Weiner 
1673e32cb2eSJohannes Weiner failed:
1683e32cb2eSJohannes Weiner 	for (c = counter; c != *fail; c = c->parent)
1693e32cb2eSJohannes Weiner 		page_counter_cancel(c, nr_pages);
1703e32cb2eSJohannes Weiner 
1716071ca52SJohannes Weiner 	return false;
1723e32cb2eSJohannes Weiner }
1733e32cb2eSJohannes Weiner 
1743e32cb2eSJohannes Weiner /**
1753e32cb2eSJohannes Weiner  * page_counter_uncharge - hierarchically uncharge pages
1763e32cb2eSJohannes Weiner  * @counter: counter
1773e32cb2eSJohannes Weiner  * @nr_pages: number of pages to uncharge
1783e32cb2eSJohannes Weiner  */
page_counter_uncharge(struct page_counter * counter,unsigned long nr_pages)17964f21993SJohannes Weiner void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
1803e32cb2eSJohannes Weiner {
1813e32cb2eSJohannes Weiner 	struct page_counter *c;
1823e32cb2eSJohannes Weiner 
18364f21993SJohannes Weiner 	for (c = counter; c; c = c->parent)
18464f21993SJohannes Weiner 		page_counter_cancel(c, nr_pages);
1853e32cb2eSJohannes Weiner }
1863e32cb2eSJohannes Weiner 
1873e32cb2eSJohannes Weiner /**
188bbec2e15SRoman Gushchin  * page_counter_set_max - set the maximum number of pages allowed
1893e32cb2eSJohannes Weiner  * @counter: counter
190bbec2e15SRoman Gushchin  * @nr_pages: limit to set
1913e32cb2eSJohannes Weiner  *
1923e32cb2eSJohannes Weiner  * Returns 0 on success, -EBUSY if the current number of pages on the
1933e32cb2eSJohannes Weiner  * counter already exceeds the specified limit.
1943e32cb2eSJohannes Weiner  *
1953e32cb2eSJohannes Weiner  * The caller must serialize invocations on the same counter.
1963e32cb2eSJohannes Weiner  */
page_counter_set_max(struct page_counter * counter,unsigned long nr_pages)197bbec2e15SRoman Gushchin int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
1983e32cb2eSJohannes Weiner {
1993e32cb2eSJohannes Weiner 	for (;;) {
2003e32cb2eSJohannes Weiner 		unsigned long old;
201bbec2e15SRoman Gushchin 		long usage;
2023e32cb2eSJohannes Weiner 
2033e32cb2eSJohannes Weiner 		/*
2043e32cb2eSJohannes Weiner 		 * Update the limit while making sure that it's not
2053e32cb2eSJohannes Weiner 		 * below the concurrently-changing counter value.
2063e32cb2eSJohannes Weiner 		 *
2073e32cb2eSJohannes Weiner 		 * The xchg implies two full memory barriers before
2083e32cb2eSJohannes Weiner 		 * and after, so the read-swap-read is ordered and
2093e32cb2eSJohannes Weiner 		 * ensures coherency with page_counter_try_charge():
2103e32cb2eSJohannes Weiner 		 * that function modifies the count before checking
2113e32cb2eSJohannes Weiner 		 * the limit, so if it sees the old limit, we see the
2123e32cb2eSJohannes Weiner 		 * modified counter and retry.
2133e32cb2eSJohannes Weiner 		 */
21413064781SHui Su 		usage = page_counter_read(counter);
2153e32cb2eSJohannes Weiner 
216bbec2e15SRoman Gushchin 		if (usage > nr_pages)
2173e32cb2eSJohannes Weiner 			return -EBUSY;
2183e32cb2eSJohannes Weiner 
219bbec2e15SRoman Gushchin 		old = xchg(&counter->max, nr_pages);
2203e32cb2eSJohannes Weiner 
22132d77270SBui Quang Minh 		if (page_counter_read(counter) <= usage || nr_pages >= old)
2223e32cb2eSJohannes Weiner 			return 0;
2233e32cb2eSJohannes Weiner 
224bbec2e15SRoman Gushchin 		counter->max = old;
2253e32cb2eSJohannes Weiner 		cond_resched();
2263e32cb2eSJohannes Weiner 	}
2273e32cb2eSJohannes Weiner }
2283e32cb2eSJohannes Weiner 
2293e32cb2eSJohannes Weiner /**
230bf8d5d52SRoman Gushchin  * page_counter_set_min - set the amount of protected memory
231bf8d5d52SRoman Gushchin  * @counter: counter
232bf8d5d52SRoman Gushchin  * @nr_pages: value to set
233bf8d5d52SRoman Gushchin  *
234bf8d5d52SRoman Gushchin  * The caller must serialize invocations on the same counter.
235bf8d5d52SRoman Gushchin  */
page_counter_set_min(struct page_counter * counter,unsigned long nr_pages)236bf8d5d52SRoman Gushchin void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
237bf8d5d52SRoman Gushchin {
238bf8d5d52SRoman Gushchin 	struct page_counter *c;
239bf8d5d52SRoman Gushchin 
240c3d53200SChris Down 	WRITE_ONCE(counter->min, nr_pages);
241bf8d5d52SRoman Gushchin 
242bf8d5d52SRoman Gushchin 	for (c = counter; c; c = c->parent)
243bf8d5d52SRoman Gushchin 		propagate_protected_usage(c, atomic_long_read(&c->usage));
244bf8d5d52SRoman Gushchin }
245bf8d5d52SRoman Gushchin 
246bf8d5d52SRoman Gushchin /**
24723067153SRoman Gushchin  * page_counter_set_low - set the amount of protected memory
24823067153SRoman Gushchin  * @counter: counter
24923067153SRoman Gushchin  * @nr_pages: value to set
25023067153SRoman Gushchin  *
25123067153SRoman Gushchin  * The caller must serialize invocations on the same counter.
25223067153SRoman Gushchin  */
page_counter_set_low(struct page_counter * counter,unsigned long nr_pages)25323067153SRoman Gushchin void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
25423067153SRoman Gushchin {
25523067153SRoman Gushchin 	struct page_counter *c;
25623067153SRoman Gushchin 
257f86b810cSChris Down 	WRITE_ONCE(counter->low, nr_pages);
25823067153SRoman Gushchin 
25923067153SRoman Gushchin 	for (c = counter; c; c = c->parent)
260bf8d5d52SRoman Gushchin 		propagate_protected_usage(c, atomic_long_read(&c->usage));
26123067153SRoman Gushchin }
26223067153SRoman Gushchin 
26323067153SRoman Gushchin /**
2643e32cb2eSJohannes Weiner  * page_counter_memparse - memparse() for page counter limits
2653e32cb2eSJohannes Weiner  * @buf: string to parse
266650c5e56SJohannes Weiner  * @max: string meaning maximum possible value
2673e32cb2eSJohannes Weiner  * @nr_pages: returns the result in number of pages
2683e32cb2eSJohannes Weiner  *
2693e32cb2eSJohannes Weiner  * Returns -EINVAL, or 0 and @nr_pages on success.  @nr_pages will be
2703e32cb2eSJohannes Weiner  * limited to %PAGE_COUNTER_MAX.
2713e32cb2eSJohannes Weiner  */
page_counter_memparse(const char * buf,const char * max,unsigned long * nr_pages)272650c5e56SJohannes Weiner int page_counter_memparse(const char *buf, const char *max,
273650c5e56SJohannes Weiner 			  unsigned long *nr_pages)
2743e32cb2eSJohannes Weiner {
2753e32cb2eSJohannes Weiner 	char *end;
2763e32cb2eSJohannes Weiner 	u64 bytes;
2773e32cb2eSJohannes Weiner 
278650c5e56SJohannes Weiner 	if (!strcmp(buf, max)) {
2793e32cb2eSJohannes Weiner 		*nr_pages = PAGE_COUNTER_MAX;
2803e32cb2eSJohannes Weiner 		return 0;
2813e32cb2eSJohannes Weiner 	}
2823e32cb2eSJohannes Weiner 
2833e32cb2eSJohannes Weiner 	bytes = memparse(buf, &end);
2843e32cb2eSJohannes Weiner 	if (*end != '\0')
2853e32cb2eSJohannes Weiner 		return -EINVAL;
2863e32cb2eSJohannes Weiner 
2873e32cb2eSJohannes Weiner 	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
2883e32cb2eSJohannes Weiner 
2893e32cb2eSJohannes Weiner 	return 0;
2903e32cb2eSJohannes Weiner }
291a8585ac6SMaarten Lankhorst 
292a8585ac6SMaarten Lankhorst 
293b168ed45SMaarten Lankhorst #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
/*
 * Return floor(a * b / c), saturating at the largest unsigned long when
 * the quotient does not fit.  @c must be non-zero.
 *
 * The plain expression a * b / c silently wraps as soon as the product
 * exceeds an unsigned long, which yields an arbitrary protection value.
 * When the product fits, this is a single multiply and divide, exactly
 * like the plain expression.  Otherwise the product is built as a
 * double-word value and divided with binary long division.
 *
 * NOTE(review): open-coded to keep this helper free of further
 * dependencies; a kernel-provided 64x64/64 helper may be a suitable
 * replacement if its overflow semantics match - confirm before swapping.
 */
static unsigned long protection_mul_div(unsigned long a, unsigned long b,
					unsigned long c)
{
	const unsigned int bits = 8 * sizeof(unsigned long);
	const unsigned int half = bits / 2;
	const unsigned long mask = (1UL << half) - 1;
	unsigned long a_lo, a_hi, b_lo, b_hi;
	unsigned long p0, p1, p2, p3, mid;
	unsigned long hi, lo, quot, rem;
	unsigned int i;

	/* Fast path: the product fits into one word. */
	if (!a || b <= ~0UL / a)
		return a * b / c;

	/* Schoolbook multiplication on half-words: (hi:lo) = a * b. */
	a_lo = a & mask;
	a_hi = a >> half;
	b_lo = b & mask;
	b_hi = b >> half;

	p0 = a_lo * b_lo;
	p1 = a_lo * b_hi;
	p2 = a_hi * b_lo;
	p3 = a_hi * b_hi;

	mid = (p0 >> half) + (p1 & mask) + (p2 & mask);
	lo = (mid << half) | (p0 & mask);
	hi = p3 + (p1 >> half) + (p2 >> half) + (mid >> half);

	/* The quotient itself would need more than one word: saturate. */
	if (hi >= c)
		return ~0UL;

	/*
	 * Binary long division of (hi:lo) by c.  rem < c holds on entry to
	 * every step, so after shifting in the next bit the true remainder
	 * is below 2 * c; @carry records the bit shifted out of the word.
	 */
	quot = 0;
	rem = hi;
	for (i = bits; i-- > 0; ) {
		unsigned long carry = rem >> (bits - 1);

		rem = (rem << 1) | ((lo >> i) & 1);
		quot <<= 1;
		if (carry || rem >= c) {
			rem -= c;
			quot |= 1;
		}
	}

	return quot;
}

/*
 * This function calculates an individual page counter's effective
 * protection which is derived from its own memory.min/low, its
 * parent's and siblings' settings, as well as the actual memory
 * distribution in the tree.
 *
 * The following rules apply to the effective protection values:
 *
 * 1. At the first level of reclaim, effective protection is equal to
 *    the declared protection in memory.min and memory.low.
 *
 * 2. To enable safe delegation of the protection configuration, at
 *    subsequent levels the effective protection is capped to the
 *    parent's effective protection.
 *
 * 3. To make complex and dynamic subtrees easier to configure, the
 *    user is allowed to overcommit the declared protection at a given
 *    level. If that is the case, the parent's effective protection is
 *    distributed to the children in proportion to how much protection
 *    they have declared and how much of it they are utilizing.
 *
 *    This makes distribution proportional, but also work-conserving:
 *    if one counter claims much more protection than it uses memory,
 *    the unused remainder is available to its siblings.
 *
 * 4. Conversely, when the declared protection is undercommitted at a
 *    given level, the distribution of the larger parental protection
 *    budget is NOT proportional. A counter's protection from a sibling
 *    is capped to its own memory.min/low setting.
 *
 * 5. However, to allow protecting recursive subtrees from each other
 *    without having to declare each individual counter's fixed share
 *    of the ancestor's claim to protection, any unutilized -
 *    "floating" - protection from up the tree is distributed in
 *    proportion to each counter's *usage*. This makes the protection
 *    neutral wrt sibling cgroups and lets them compete freely over
 *    the shared parental protection budget, but it protects the
 *    subtree as a whole from neighboring subtrees.
 *
 * Note that 4. and 5. are not in conflict: 4. is about protecting
 * against immediate siblings whereas 5. is about protecting against
 * neighboring subtrees.
 *
 * @usage: current usage of the counter being evaluated
 * @parent_usage: current usage of its parent
 * @setting: the counter's declared protection (min or low)
 * @parent_effective: the parent's effective protection (emin or elow)
 * @siblings_protected: the parent's children_{min,low}_usage sum
 * @recursive_protection: whether rule 5. is applied
 *
 * All intermediate arithmetic is overflow-safe: products are computed
 * without wrapping and the result saturates at the largest unsigned
 * long instead of wrapping around to a small value.
 */
static unsigned long effective_protection(unsigned long usage,
					  unsigned long parent_usage,
					  unsigned long setting,
					  unsigned long parent_effective,
					  unsigned long siblings_protected,
					  bool recursive_protection)
{
	unsigned long protected;
	unsigned long ep;

	protected = usage < setting ? usage : setting;
	/*
	 * If all cgroups at this level combined claim and use more
	 * protection than what the parent affords them, distribute
	 * shares in proportion to utilization.
	 *
	 * We are using actual utilization rather than the statically
	 * claimed protection in order to be work-conserving: claimed
	 * but unused protection is available to siblings that would
	 * otherwise get a smaller chunk than what they claimed.
	 *
	 * siblings_protected is strictly positive here, so the division
	 * is well defined.
	 */
	if (siblings_protected > parent_effective)
		return protection_mul_div(protected, parent_effective,
					  siblings_protected);

	/*
	 * Ok, utilized protection of all children is within what the
	 * parent affords them, so we know whatever this child claims
	 * and utilizes is effectively protected.
	 *
	 * If there is unprotected usage beyond this value, reclaim
	 * will apply pressure in proportion to that amount.
	 *
	 * If there is unutilized protection, the cgroup will be fully
	 * shielded from reclaim, but we do return a smaller value for
	 * protection than what the group could enjoy in theory. This
	 * is okay. With the overcommit distribution above, effective
	 * protection is always dependent on how memory is actually
	 * consumed among the siblings anyway.
	 */
	ep = protected;

	/*
	 * If the children aren't claiming (all of) the protection
	 * afforded to them by the parent, distribute the remainder in
	 * proportion to the (unprotected) memory of each cgroup. That
	 * way, cgroups that aren't explicitly prioritized wrt each
	 * other compete freely over the allowance, but they are
	 * collectively protected from neighboring trees.
	 *
	 * We're using unprotected memory for the weight so that if
	 * some cgroups DO claim explicit protection, we don't protect
	 * the same bytes twice.
	 *
	 * Check both usage and parent_usage against the respective
	 * protected values. One should imply the other, but they
	 * aren't read atomically - make sure the division is sane.
	 */
	if (!recursive_protection)
		return ep;

	if (parent_effective > siblings_protected &&
	    parent_usage > siblings_protected &&
	    usage > protected) {
		unsigned long unclaimed;

		unclaimed = protection_mul_div(parent_effective - siblings_protected,
					       usage - protected,
					       parent_usage - siblings_protected);

		/* Saturating add: never let the sum wrap to a small value. */
		if (unclaimed > ~0UL - ep)
			ep = ~0UL;
		else
			ep += unclaimed;
	}

	return ep;
}
411a8585ac6SMaarten Lankhorst 
412a8585ac6SMaarten Lankhorst 
413a8585ac6SMaarten Lankhorst /**
414a8585ac6SMaarten Lankhorst  * page_counter_calculate_protection - check if memory consumption is in the normal range
415a8585ac6SMaarten Lankhorst  * @root: the top ancestor of the sub-tree being checked
416a8585ac6SMaarten Lankhorst  * @counter: the page_counter the counter to update
417a8585ac6SMaarten Lankhorst  * @recursive_protection: Whether to use memory_recursiveprot behavior.
418a8585ac6SMaarten Lankhorst  *
419a8585ac6SMaarten Lankhorst  * Calculates elow/emin thresholds for given page_counter.
420a8585ac6SMaarten Lankhorst  *
421a8585ac6SMaarten Lankhorst  * WARNING: This function is not stateless! It can only be used as part
422a8585ac6SMaarten Lankhorst  *          of a top-down tree iteration, not for isolated queries.
423a8585ac6SMaarten Lankhorst  */
page_counter_calculate_protection(struct page_counter * root,struct page_counter * counter,bool recursive_protection)424a8585ac6SMaarten Lankhorst void page_counter_calculate_protection(struct page_counter *root,
425a8585ac6SMaarten Lankhorst 				       struct page_counter *counter,
426a8585ac6SMaarten Lankhorst 				       bool recursive_protection)
427a8585ac6SMaarten Lankhorst {
428a8585ac6SMaarten Lankhorst 	unsigned long usage, parent_usage;
429a8585ac6SMaarten Lankhorst 	struct page_counter *parent = counter->parent;
430a8585ac6SMaarten Lankhorst 
431a8585ac6SMaarten Lankhorst 	/*
432a8585ac6SMaarten Lankhorst 	 * Effective values of the reclaim targets are ignored so they
433a8585ac6SMaarten Lankhorst 	 * can be stale. Have a look at mem_cgroup_protection for more
434a8585ac6SMaarten Lankhorst 	 * details.
435a8585ac6SMaarten Lankhorst 	 * TODO: calculation should be more robust so that we do not need
436a8585ac6SMaarten Lankhorst 	 * that special casing.
437a8585ac6SMaarten Lankhorst 	 */
438a8585ac6SMaarten Lankhorst 	if (root == counter)
439a8585ac6SMaarten Lankhorst 		return;
440a8585ac6SMaarten Lankhorst 
441a8585ac6SMaarten Lankhorst 	usage = page_counter_read(counter);
442a8585ac6SMaarten Lankhorst 	if (!usage)
443a8585ac6SMaarten Lankhorst 		return;
444a8585ac6SMaarten Lankhorst 
445a8585ac6SMaarten Lankhorst 	if (parent == root) {
446a8585ac6SMaarten Lankhorst 		counter->emin = READ_ONCE(counter->min);
447a8585ac6SMaarten Lankhorst 		counter->elow = READ_ONCE(counter->low);
448a8585ac6SMaarten Lankhorst 		return;
449a8585ac6SMaarten Lankhorst 	}
450a8585ac6SMaarten Lankhorst 
451a8585ac6SMaarten Lankhorst 	parent_usage = page_counter_read(parent);
452a8585ac6SMaarten Lankhorst 
453a8585ac6SMaarten Lankhorst 	WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
454a8585ac6SMaarten Lankhorst 			READ_ONCE(counter->min),
455a8585ac6SMaarten Lankhorst 			READ_ONCE(parent->emin),
456a8585ac6SMaarten Lankhorst 			atomic_long_read(&parent->children_min_usage),
457a8585ac6SMaarten Lankhorst 			recursive_protection));
458a8585ac6SMaarten Lankhorst 
459a8585ac6SMaarten Lankhorst 	WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
460a8585ac6SMaarten Lankhorst 			READ_ONCE(counter->low),
461a8585ac6SMaarten Lankhorst 			READ_ONCE(parent->elow),
462a8585ac6SMaarten Lankhorst 			atomic_long_read(&parent->children_low_usage),
463a8585ac6SMaarten Lankhorst 			recursive_protection));
464a8585ac6SMaarten Lankhorst }
465b168ed45SMaarten Lankhorst #endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */
466