// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */
73e32cb2eSJohannes Weiner
83e32cb2eSJohannes Weiner #include <linux/page_counter.h>
93e32cb2eSJohannes Weiner #include <linux/atomic.h>
103e32cb2eSJohannes Weiner #include <linux/kernel.h>
113e32cb2eSJohannes Weiner #include <linux/string.h>
123e32cb2eSJohannes Weiner #include <linux/sched.h>
133e32cb2eSJohannes Weiner #include <linux/bug.h>
143e32cb2eSJohannes Weiner #include <asm/page.h>
153e32cb2eSJohannes Weiner
track_protection(struct page_counter * c)16f77bd4b1SRoman Gushchin static bool track_protection(struct page_counter *c)
17f77bd4b1SRoman Gushchin {
18f77bd4b1SRoman Gushchin return c->protection_support;
19f77bd4b1SRoman Gushchin }
20f77bd4b1SRoman Gushchin
propagate_protected_usage(struct page_counter * c,unsigned long usage)21bf8d5d52SRoman Gushchin static void propagate_protected_usage(struct page_counter *c,
22bf8d5d52SRoman Gushchin unsigned long usage)
2323067153SRoman Gushchin {
24bf8d5d52SRoman Gushchin unsigned long protected, old_protected;
2523067153SRoman Gushchin long delta;
2623067153SRoman Gushchin
2723067153SRoman Gushchin if (!c->parent)
2823067153SRoman Gushchin return;
2923067153SRoman Gushchin
30cfdab60bSShakeel Butt protected = min(usage, READ_ONCE(c->min));
31cfdab60bSShakeel Butt old_protected = atomic_long_read(&c->min_usage);
32cfdab60bSShakeel Butt if (protected != old_protected) {
33bf8d5d52SRoman Gushchin old_protected = atomic_long_xchg(&c->min_usage, protected);
34bf8d5d52SRoman Gushchin delta = protected - old_protected;
35bf8d5d52SRoman Gushchin if (delta)
36bf8d5d52SRoman Gushchin atomic_long_add(delta, &c->parent->children_min_usage);
37bf8d5d52SRoman Gushchin }
38bf8d5d52SRoman Gushchin
39cfdab60bSShakeel Butt protected = min(usage, READ_ONCE(c->low));
40cfdab60bSShakeel Butt old_protected = atomic_long_read(&c->low_usage);
41cfdab60bSShakeel Butt if (protected != old_protected) {
42bf8d5d52SRoman Gushchin old_protected = atomic_long_xchg(&c->low_usage, protected);
43bf8d5d52SRoman Gushchin delta = protected - old_protected;
4423067153SRoman Gushchin if (delta)
4523067153SRoman Gushchin atomic_long_add(delta, &c->parent->children_low_usage);
4623067153SRoman Gushchin }
47bf8d5d52SRoman Gushchin }
4823067153SRoman Gushchin
493e32cb2eSJohannes Weiner /**
503e32cb2eSJohannes Weiner * page_counter_cancel - take pages out of the local counter
513e32cb2eSJohannes Weiner * @counter: counter
523e32cb2eSJohannes Weiner * @nr_pages: number of pages to cancel
533e32cb2eSJohannes Weiner */
page_counter_cancel(struct page_counter * counter,unsigned long nr_pages)5464f21993SJohannes Weiner void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
553e32cb2eSJohannes Weiner {
563e32cb2eSJohannes Weiner long new;
573e32cb2eSJohannes Weiner
58bbec2e15SRoman Gushchin new = atomic_long_sub_return(nr_pages, &counter->usage);
593e32cb2eSJohannes Weiner /* More uncharges than charges? */
609317d0ffSJohannes Weiner if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
619317d0ffSJohannes Weiner new, nr_pages)) {
629317d0ffSJohannes Weiner new = 0;
639317d0ffSJohannes Weiner atomic_long_set(&counter->usage, new);
649317d0ffSJohannes Weiner }
65f77bd4b1SRoman Gushchin if (track_protection(counter))
669317d0ffSJohannes Weiner propagate_protected_usage(counter, new);
673e32cb2eSJohannes Weiner }
683e32cb2eSJohannes Weiner
693e32cb2eSJohannes Weiner /**
703e32cb2eSJohannes Weiner * page_counter_charge - hierarchically charge pages
713e32cb2eSJohannes Weiner * @counter: counter
723e32cb2eSJohannes Weiner * @nr_pages: number of pages to charge
733e32cb2eSJohannes Weiner *
743e32cb2eSJohannes Weiner * NOTE: This does not consider any configured counter limits.
753e32cb2eSJohannes Weiner */
page_counter_charge(struct page_counter * counter,unsigned long nr_pages)763e32cb2eSJohannes Weiner void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
773e32cb2eSJohannes Weiner {
783e32cb2eSJohannes Weiner struct page_counter *c;
79f77bd4b1SRoman Gushchin bool protection = track_protection(counter);
803e32cb2eSJohannes Weiner
813e32cb2eSJohannes Weiner for (c = counter; c; c = c->parent) {
823e32cb2eSJohannes Weiner long new;
833e32cb2eSJohannes Weiner
84bbec2e15SRoman Gushchin new = atomic_long_add_return(nr_pages, &c->usage);
85f77bd4b1SRoman Gushchin if (protection)
86a6f23d14SMichal Koutný propagate_protected_usage(c, new);
873e32cb2eSJohannes Weiner /*
883e32cb2eSJohannes Weiner * This is indeed racy, but we can live with some
893e32cb2eSJohannes Weiner * inaccuracy in the watermark.
90c6f53ed8SDavid Finkel *
91c6f53ed8SDavid Finkel * Notably, we have two watermarks to allow for both a globally
92c6f53ed8SDavid Finkel * visible peak and one that can be reset at a smaller scope.
93c6f53ed8SDavid Finkel *
94c6f53ed8SDavid Finkel * Since we reset both watermarks when the global reset occurs,
95c6f53ed8SDavid Finkel * we can guarantee that watermark >= local_watermark, so we
96c6f53ed8SDavid Finkel * don't need to do both comparisons every time.
97c6f53ed8SDavid Finkel *
98c6f53ed8SDavid Finkel * On systems with branch predictors, the inner condition should
99c6f53ed8SDavid Finkel * be almost free.
1003e32cb2eSJohannes Weiner */
101c6f53ed8SDavid Finkel if (new > READ_ONCE(c->local_watermark)) {
102c6f53ed8SDavid Finkel WRITE_ONCE(c->local_watermark, new);
1036e4bd50fSQian Cai if (new > READ_ONCE(c->watermark))
1046e4bd50fSQian Cai WRITE_ONCE(c->watermark, new);
1053e32cb2eSJohannes Weiner }
1063e32cb2eSJohannes Weiner }
107c6f53ed8SDavid Finkel }
1083e32cb2eSJohannes Weiner
1093e32cb2eSJohannes Weiner /**
1103e32cb2eSJohannes Weiner * page_counter_try_charge - try to hierarchically charge pages
1113e32cb2eSJohannes Weiner * @counter: counter
1123e32cb2eSJohannes Weiner * @nr_pages: number of pages to charge
1133e32cb2eSJohannes Weiner * @fail: points first counter to hit its limit, if any
1143e32cb2eSJohannes Weiner *
1156071ca52SJohannes Weiner * Returns %true on success, or %false and @fail if the counter or one
1166071ca52SJohannes Weiner * of its ancestors has hit its configured limit.
1173e32cb2eSJohannes Weiner */
page_counter_try_charge(struct page_counter * counter,unsigned long nr_pages,struct page_counter ** fail)1186071ca52SJohannes Weiner bool page_counter_try_charge(struct page_counter *counter,
1193e32cb2eSJohannes Weiner unsigned long nr_pages,
1203e32cb2eSJohannes Weiner struct page_counter **fail)
1213e32cb2eSJohannes Weiner {
1223e32cb2eSJohannes Weiner struct page_counter *c;
123f77bd4b1SRoman Gushchin bool protection = track_protection(counter);
124*0e2759afSShakeel Butt bool track_failcnt = counter->track_failcnt;
1253e32cb2eSJohannes Weiner
1263e32cb2eSJohannes Weiner for (c = counter; c; c = c->parent) {
1273e32cb2eSJohannes Weiner long new;
1283e32cb2eSJohannes Weiner /*
1293e32cb2eSJohannes Weiner * Charge speculatively to avoid an expensive CAS. If
1303e32cb2eSJohannes Weiner * a bigger charge fails, it might falsely lock out a
1313e32cb2eSJohannes Weiner * racing smaller charge and send it into reclaim
1323e32cb2eSJohannes Weiner * early, but the error is limited to the difference
1333e32cb2eSJohannes Weiner * between the two sizes, which is less than 2M/4M in
1343e32cb2eSJohannes Weiner * case of a THP locking out a regular page charge.
1353e32cb2eSJohannes Weiner *
1363e32cb2eSJohannes Weiner * The atomic_long_add_return() implies a full memory
1373e32cb2eSJohannes Weiner * barrier between incrementing the count and reading
138d437024eSMiaohe Lin * the limit. When racing with page_counter_set_max(),
1393e32cb2eSJohannes Weiner * we either see the new limit or the setter sees the
1403e32cb2eSJohannes Weiner * counter has changed and retries.
1413e32cb2eSJohannes Weiner */
142bbec2e15SRoman Gushchin new = atomic_long_add_return(nr_pages, &c->usage);
143bbec2e15SRoman Gushchin if (new > c->max) {
144bbec2e15SRoman Gushchin atomic_long_sub(nr_pages, &c->usage);
1453e32cb2eSJohannes Weiner /*
1463e32cb2eSJohannes Weiner * This is racy, but we can live with some
1476e4bd50fSQian Cai * inaccuracy in the failcnt which is only used
1486e4bd50fSQian Cai * to report stats.
1493e32cb2eSJohannes Weiner */
150*0e2759afSShakeel Butt if (track_failcnt)
1516e4bd50fSQian Cai data_race(c->failcnt++);
1523e32cb2eSJohannes Weiner *fail = c;
1533e32cb2eSJohannes Weiner goto failed;
1543e32cb2eSJohannes Weiner }
155f77bd4b1SRoman Gushchin if (protection)
156a6f23d14SMichal Koutný propagate_protected_usage(c, new);
157f77bd4b1SRoman Gushchin
158c6f53ed8SDavid Finkel /* see comment on page_counter_charge */
159c6f53ed8SDavid Finkel if (new > READ_ONCE(c->local_watermark)) {
160c6f53ed8SDavid Finkel WRITE_ONCE(c->local_watermark, new);
1616e4bd50fSQian Cai if (new > READ_ONCE(c->watermark))
1626e4bd50fSQian Cai WRITE_ONCE(c->watermark, new);
1633e32cb2eSJohannes Weiner }
164c6f53ed8SDavid Finkel }
1656071ca52SJohannes Weiner return true;
1663e32cb2eSJohannes Weiner
1673e32cb2eSJohannes Weiner failed:
1683e32cb2eSJohannes Weiner for (c = counter; c != *fail; c = c->parent)
1693e32cb2eSJohannes Weiner page_counter_cancel(c, nr_pages);
1703e32cb2eSJohannes Weiner
1716071ca52SJohannes Weiner return false;
1723e32cb2eSJohannes Weiner }
1733e32cb2eSJohannes Weiner
1743e32cb2eSJohannes Weiner /**
1753e32cb2eSJohannes Weiner * page_counter_uncharge - hierarchically uncharge pages
1763e32cb2eSJohannes Weiner * @counter: counter
1773e32cb2eSJohannes Weiner * @nr_pages: number of pages to uncharge
1783e32cb2eSJohannes Weiner */
page_counter_uncharge(struct page_counter * counter,unsigned long nr_pages)17964f21993SJohannes Weiner void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
1803e32cb2eSJohannes Weiner {
1813e32cb2eSJohannes Weiner struct page_counter *c;
1823e32cb2eSJohannes Weiner
18364f21993SJohannes Weiner for (c = counter; c; c = c->parent)
18464f21993SJohannes Weiner page_counter_cancel(c, nr_pages);
1853e32cb2eSJohannes Weiner }
1863e32cb2eSJohannes Weiner
1873e32cb2eSJohannes Weiner /**
188bbec2e15SRoman Gushchin * page_counter_set_max - set the maximum number of pages allowed
1893e32cb2eSJohannes Weiner * @counter: counter
190bbec2e15SRoman Gushchin * @nr_pages: limit to set
1913e32cb2eSJohannes Weiner *
1923e32cb2eSJohannes Weiner * Returns 0 on success, -EBUSY if the current number of pages on the
1933e32cb2eSJohannes Weiner * counter already exceeds the specified limit.
1943e32cb2eSJohannes Weiner *
1953e32cb2eSJohannes Weiner * The caller must serialize invocations on the same counter.
1963e32cb2eSJohannes Weiner */
page_counter_set_max(struct page_counter * counter,unsigned long nr_pages)197bbec2e15SRoman Gushchin int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
1983e32cb2eSJohannes Weiner {
1993e32cb2eSJohannes Weiner for (;;) {
2003e32cb2eSJohannes Weiner unsigned long old;
201bbec2e15SRoman Gushchin long usage;
2023e32cb2eSJohannes Weiner
2033e32cb2eSJohannes Weiner /*
2043e32cb2eSJohannes Weiner * Update the limit while making sure that it's not
2053e32cb2eSJohannes Weiner * below the concurrently-changing counter value.
2063e32cb2eSJohannes Weiner *
2073e32cb2eSJohannes Weiner * The xchg implies two full memory barriers before
2083e32cb2eSJohannes Weiner * and after, so the read-swap-read is ordered and
2093e32cb2eSJohannes Weiner * ensures coherency with page_counter_try_charge():
2103e32cb2eSJohannes Weiner * that function modifies the count before checking
2113e32cb2eSJohannes Weiner * the limit, so if it sees the old limit, we see the
2123e32cb2eSJohannes Weiner * modified counter and retry.
2133e32cb2eSJohannes Weiner */
21413064781SHui Su usage = page_counter_read(counter);
2153e32cb2eSJohannes Weiner
216bbec2e15SRoman Gushchin if (usage > nr_pages)
2173e32cb2eSJohannes Weiner return -EBUSY;
2183e32cb2eSJohannes Weiner
219bbec2e15SRoman Gushchin old = xchg(&counter->max, nr_pages);
2203e32cb2eSJohannes Weiner
22132d77270SBui Quang Minh if (page_counter_read(counter) <= usage || nr_pages >= old)
2223e32cb2eSJohannes Weiner return 0;
2233e32cb2eSJohannes Weiner
224bbec2e15SRoman Gushchin counter->max = old;
2253e32cb2eSJohannes Weiner cond_resched();
2263e32cb2eSJohannes Weiner }
2273e32cb2eSJohannes Weiner }
2283e32cb2eSJohannes Weiner
2293e32cb2eSJohannes Weiner /**
230bf8d5d52SRoman Gushchin * page_counter_set_min - set the amount of protected memory
231bf8d5d52SRoman Gushchin * @counter: counter
232bf8d5d52SRoman Gushchin * @nr_pages: value to set
233bf8d5d52SRoman Gushchin *
234bf8d5d52SRoman Gushchin * The caller must serialize invocations on the same counter.
235bf8d5d52SRoman Gushchin */
page_counter_set_min(struct page_counter * counter,unsigned long nr_pages)236bf8d5d52SRoman Gushchin void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
237bf8d5d52SRoman Gushchin {
238bf8d5d52SRoman Gushchin struct page_counter *c;
239bf8d5d52SRoman Gushchin
240c3d53200SChris Down WRITE_ONCE(counter->min, nr_pages);
241bf8d5d52SRoman Gushchin
242bf8d5d52SRoman Gushchin for (c = counter; c; c = c->parent)
243bf8d5d52SRoman Gushchin propagate_protected_usage(c, atomic_long_read(&c->usage));
244bf8d5d52SRoman Gushchin }
245bf8d5d52SRoman Gushchin
246bf8d5d52SRoman Gushchin /**
24723067153SRoman Gushchin * page_counter_set_low - set the amount of protected memory
24823067153SRoman Gushchin * @counter: counter
24923067153SRoman Gushchin * @nr_pages: value to set
25023067153SRoman Gushchin *
25123067153SRoman Gushchin * The caller must serialize invocations on the same counter.
25223067153SRoman Gushchin */
page_counter_set_low(struct page_counter * counter,unsigned long nr_pages)25323067153SRoman Gushchin void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
25423067153SRoman Gushchin {
25523067153SRoman Gushchin struct page_counter *c;
25623067153SRoman Gushchin
257f86b810cSChris Down WRITE_ONCE(counter->low, nr_pages);
25823067153SRoman Gushchin
25923067153SRoman Gushchin for (c = counter; c; c = c->parent)
260bf8d5d52SRoman Gushchin propagate_protected_usage(c, atomic_long_read(&c->usage));
26123067153SRoman Gushchin }
26223067153SRoman Gushchin
26323067153SRoman Gushchin /**
2643e32cb2eSJohannes Weiner * page_counter_memparse - memparse() for page counter limits
2653e32cb2eSJohannes Weiner * @buf: string to parse
266650c5e56SJohannes Weiner * @max: string meaning maximum possible value
2673e32cb2eSJohannes Weiner * @nr_pages: returns the result in number of pages
2683e32cb2eSJohannes Weiner *
2693e32cb2eSJohannes Weiner * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
2703e32cb2eSJohannes Weiner * limited to %PAGE_COUNTER_MAX.
2713e32cb2eSJohannes Weiner */
page_counter_memparse(const char * buf,const char * max,unsigned long * nr_pages)272650c5e56SJohannes Weiner int page_counter_memparse(const char *buf, const char *max,
273650c5e56SJohannes Weiner unsigned long *nr_pages)
2743e32cb2eSJohannes Weiner {
2753e32cb2eSJohannes Weiner char *end;
2763e32cb2eSJohannes Weiner u64 bytes;
2773e32cb2eSJohannes Weiner
278650c5e56SJohannes Weiner if (!strcmp(buf, max)) {
2793e32cb2eSJohannes Weiner *nr_pages = PAGE_COUNTER_MAX;
2803e32cb2eSJohannes Weiner return 0;
2813e32cb2eSJohannes Weiner }
2823e32cb2eSJohannes Weiner
2833e32cb2eSJohannes Weiner bytes = memparse(buf, &end);
2843e32cb2eSJohannes Weiner if (*end != '\0')
2853e32cb2eSJohannes Weiner return -EINVAL;
2863e32cb2eSJohannes Weiner
2873e32cb2eSJohannes Weiner *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
2883e32cb2eSJohannes Weiner
2893e32cb2eSJohannes Weiner return 0;
2903e32cb2eSJohannes Weiner }
291a8585ac6SMaarten Lankhorst
292a8585ac6SMaarten Lankhorst
293b168ed45SMaarten Lankhorst #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
/*
 * This function calculates an individual page counter's effective
 * protection which is derived from its own memory.min/low, its
 * parent's and siblings' settings, as well as the actual memory
 * distribution in the tree.
 *
 * The following rules apply to the effective protection values:
 *
 * 1. At the first level of reclaim, effective protection is equal to
 *    the declared protection in memory.min and memory.low.
 *
 * 2. To enable safe delegation of the protection configuration, at
 *    subsequent levels the effective protection is capped to the
 *    parent's effective protection.
 *
 * 3. To make complex and dynamic subtrees easier to configure, the
 *    user is allowed to overcommit the declared protection at a given
 *    level. If that is the case, the parent's effective protection is
 *    distributed to the children in proportion to how much protection
 *    they have declared and how much of it they are utilizing.
 *
 *    This makes distribution proportional, but also work-conserving:
 *    if one counter claims much more protection than it uses memory,
 *    the unused remainder is available to its siblings.
 *
 * 4. Conversely, when the declared protection is undercommitted at a
 *    given level, the distribution of the larger parental protection
 *    budget is NOT proportional. A counter's protection from a sibling
 *    is capped to its own memory.min/low setting.
 *
 * 5. However, to allow protecting recursive subtrees from each other
 *    without having to declare each individual counter's fixed share
 *    of the ancestor's claim to protection, any unutilized -
 *    "floating" - protection from up the tree is distributed in
 *    proportion to each counter's *usage*. This makes the protection
 *    neutral wrt sibling cgroups and lets them compete freely over
 *    the shared parental protection budget, but it protects the
 *    subtree as a whole from neighboring subtrees.
 *
 * Note that 4. and 5. are not in conflict: 4. is about protecting
 * against immediate siblings whereas 5. is about protecting against
 * neighboring subtrees.
 */
static unsigned long effective_protection(unsigned long usage,
					  unsigned long parent_usage,
					  unsigned long setting,
					  unsigned long parent_effective,
					  unsigned long siblings_protected,
					  bool recursive_protection)
{
	/* The part of this counter's usage covered by its own setting. */
	const unsigned long protected = min(usage, setting);
	unsigned long floating;

	/*
	 * If all cgroups at this level combined claim and use more
	 * protection than what the parent affords them, distribute
	 * shares in proportion to utilization.
	 *
	 * Actual utilization is used rather than the statically
	 * claimed protection in order to be work-conserving: claimed
	 * but unused protection is available to siblings that would
	 * otherwise get a smaller chunk than what they claimed.
	 */
	if (siblings_protected > parent_effective)
		return protected * parent_effective / siblings_protected;

	/*
	 * Utilized protection of all children is within what the
	 * parent affords them, so whatever this child claims and
	 * utilizes is effectively protected.
	 *
	 * If there is unprotected usage beyond this value, reclaim
	 * will apply pressure in proportion to that amount.
	 *
	 * If there is unutilized protection, the cgroup will be fully
	 * shielded from reclaim, but a smaller value is returned for
	 * protection than what the group could enjoy in theory. This
	 * is okay. With the overcommit distribution above, effective
	 * protection is always dependent on how memory is actually
	 * consumed among the siblings anyway.
	 */
	if (!recursive_protection)
		return protected;

	/*
	 * If the children aren't claiming (all of) the protection
	 * afforded to them by the parent, distribute the remainder in
	 * proportion to the (unprotected) memory of each cgroup. That
	 * way, cgroups that aren't explicitly prioritized wrt each
	 * other compete freely over the allowance, but they are
	 * collectively protected from neighboring trees.
	 *
	 * Unprotected memory is used for the weight so that if some
	 * cgroups DO claim explicit protection, the same bytes are
	 * not protected twice.
	 *
	 * Check both usage and parent_usage against the respective
	 * protected values. One should imply the other, but they
	 * aren't read atomically - make sure the division is sane.
	 */
	if (parent_effective <= siblings_protected ||
	    parent_usage <= siblings_protected ||
	    usage <= protected)
		return protected;

	floating = parent_effective - siblings_protected;
	floating *= usage - protected;
	floating /= parent_usage - siblings_protected;

	return protected + floating;
}
411a8585ac6SMaarten Lankhorst
412a8585ac6SMaarten Lankhorst
413a8585ac6SMaarten Lankhorst /**
414a8585ac6SMaarten Lankhorst * page_counter_calculate_protection - check if memory consumption is in the normal range
415a8585ac6SMaarten Lankhorst * @root: the top ancestor of the sub-tree being checked
416a8585ac6SMaarten Lankhorst * @counter: the page_counter the counter to update
417a8585ac6SMaarten Lankhorst * @recursive_protection: Whether to use memory_recursiveprot behavior.
418a8585ac6SMaarten Lankhorst *
419a8585ac6SMaarten Lankhorst * Calculates elow/emin thresholds for given page_counter.
420a8585ac6SMaarten Lankhorst *
421a8585ac6SMaarten Lankhorst * WARNING: This function is not stateless! It can only be used as part
422a8585ac6SMaarten Lankhorst * of a top-down tree iteration, not for isolated queries.
423a8585ac6SMaarten Lankhorst */
page_counter_calculate_protection(struct page_counter * root,struct page_counter * counter,bool recursive_protection)424a8585ac6SMaarten Lankhorst void page_counter_calculate_protection(struct page_counter *root,
425a8585ac6SMaarten Lankhorst struct page_counter *counter,
426a8585ac6SMaarten Lankhorst bool recursive_protection)
427a8585ac6SMaarten Lankhorst {
428a8585ac6SMaarten Lankhorst unsigned long usage, parent_usage;
429a8585ac6SMaarten Lankhorst struct page_counter *parent = counter->parent;
430a8585ac6SMaarten Lankhorst
431a8585ac6SMaarten Lankhorst /*
432a8585ac6SMaarten Lankhorst * Effective values of the reclaim targets are ignored so they
433a8585ac6SMaarten Lankhorst * can be stale. Have a look at mem_cgroup_protection for more
434a8585ac6SMaarten Lankhorst * details.
435a8585ac6SMaarten Lankhorst * TODO: calculation should be more robust so that we do not need
436a8585ac6SMaarten Lankhorst * that special casing.
437a8585ac6SMaarten Lankhorst */
438a8585ac6SMaarten Lankhorst if (root == counter)
439a8585ac6SMaarten Lankhorst return;
440a8585ac6SMaarten Lankhorst
441a8585ac6SMaarten Lankhorst usage = page_counter_read(counter);
442a8585ac6SMaarten Lankhorst if (!usage)
443a8585ac6SMaarten Lankhorst return;
444a8585ac6SMaarten Lankhorst
445a8585ac6SMaarten Lankhorst if (parent == root) {
446a8585ac6SMaarten Lankhorst counter->emin = READ_ONCE(counter->min);
447a8585ac6SMaarten Lankhorst counter->elow = READ_ONCE(counter->low);
448a8585ac6SMaarten Lankhorst return;
449a8585ac6SMaarten Lankhorst }
450a8585ac6SMaarten Lankhorst
451a8585ac6SMaarten Lankhorst parent_usage = page_counter_read(parent);
452a8585ac6SMaarten Lankhorst
453a8585ac6SMaarten Lankhorst WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
454a8585ac6SMaarten Lankhorst READ_ONCE(counter->min),
455a8585ac6SMaarten Lankhorst READ_ONCE(parent->emin),
456a8585ac6SMaarten Lankhorst atomic_long_read(&parent->children_min_usage),
457a8585ac6SMaarten Lankhorst recursive_protection));
458a8585ac6SMaarten Lankhorst
459a8585ac6SMaarten Lankhorst WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
460a8585ac6SMaarten Lankhorst READ_ONCE(counter->low),
461a8585ac6SMaarten Lankhorst READ_ONCE(parent->elow),
462a8585ac6SMaarten Lankhorst atomic_long_read(&parent->children_low_usage),
463a8585ac6SMaarten Lankhorst recursive_protection));
464a8585ac6SMaarten Lankhorst }
465b168ed45SMaarten Lankhorst #endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */
466