11b1e1344SRoman Gushchin // SPDX-License-Identifier: GPL-2.0-or-later
21b1e1344SRoman Gushchin
3d12f6d22SRoman Gushchin #include <linux/memcontrol.h>
4d12f6d22SRoman Gushchin #include <linux/swap.h>
5d12f6d22SRoman Gushchin #include <linux/mm_inline.h>
6e548ad4aSRoman Gushchin #include <linux/pagewalk.h>
7e548ad4aSRoman Gushchin #include <linux/backing-dev.h>
8e548ad4aSRoman Gushchin #include <linux/swap_cgroup.h>
966d60c42SRoman Gushchin #include <linux/eventfd.h>
1066d60c42SRoman Gushchin #include <linux/poll.h>
1166d60c42SRoman Gushchin #include <linux/sort.h>
1266d60c42SRoman Gushchin #include <linux/file.h>
13ea1e8796SRoman Gushchin #include <linux/seq_buf.h>
14d12f6d22SRoman Gushchin
15e548ad4aSRoman Gushchin #include "internal.h"
16e548ad4aSRoman Gushchin #include "swap.h"
171b1e1344SRoman Gushchin #include "memcontrol-v1.h"
18d12f6d22SRoman Gushchin
19d12f6d22SRoman Gushchin /*
20d12f6d22SRoman Gushchin * Cgroups above their limits are maintained in a RB-Tree, independent of
21d12f6d22SRoman Gushchin * their hierarchy representation
22d12f6d22SRoman Gushchin */
23d12f6d22SRoman Gushchin
24d12f6d22SRoman Gushchin struct mem_cgroup_tree_per_node {
25d12f6d22SRoman Gushchin struct rb_root rb_root;
26d12f6d22SRoman Gushchin struct rb_node *rb_rightmost;
27d12f6d22SRoman Gushchin spinlock_t lock;
28d12f6d22SRoman Gushchin };
29d12f6d22SRoman Gushchin
30d12f6d22SRoman Gushchin struct mem_cgroup_tree {
31d12f6d22SRoman Gushchin struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
32d12f6d22SRoman Gushchin };
33d12f6d22SRoman Gushchin
34d12f6d22SRoman Gushchin static struct mem_cgroup_tree soft_limit_tree __read_mostly;
35d12f6d22SRoman Gushchin
/*
 * Loop bounds for soft limit reclaim, so that it cannot spin forever:
 * the first caps the hierarchy walks in mem_cgroup_soft_reclaim(), the
 * second caps the fruitless rounds in memcg1_soft_limit_reclaim().
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
42d12f6d22SRoman Gushchin
4366d60c42SRoman Gushchin /* for OOM */
4466d60c42SRoman Gushchin struct mem_cgroup_eventfd_list {
4566d60c42SRoman Gushchin struct list_head list;
4666d60c42SRoman Gushchin struct eventfd_ctx *eventfd;
4766d60c42SRoman Gushchin };
4866d60c42SRoman Gushchin
/*
 * mem_cgroup_event represents an event which userspace wants to receive.
 */
5266d60c42SRoman Gushchin struct mem_cgroup_event {
5366d60c42SRoman Gushchin /*
5466d60c42SRoman Gushchin * memcg which the event belongs to.
5566d60c42SRoman Gushchin */
5666d60c42SRoman Gushchin struct mem_cgroup *memcg;
5766d60c42SRoman Gushchin /*
5866d60c42SRoman Gushchin * eventfd to signal userspace about the event.
5966d60c42SRoman Gushchin */
6066d60c42SRoman Gushchin struct eventfd_ctx *eventfd;
6166d60c42SRoman Gushchin /*
6266d60c42SRoman Gushchin * Each of these stored in a list by the cgroup.
6366d60c42SRoman Gushchin */
6466d60c42SRoman Gushchin struct list_head list;
6566d60c42SRoman Gushchin /*
6666d60c42SRoman Gushchin * register_event() callback will be used to add new userspace
6766d60c42SRoman Gushchin * waiter for changes related to this event. Use eventfd_signal()
6866d60c42SRoman Gushchin * on eventfd to send notification to userspace.
6966d60c42SRoman Gushchin */
7066d60c42SRoman Gushchin int (*register_event)(struct mem_cgroup *memcg,
7166d60c42SRoman Gushchin struct eventfd_ctx *eventfd, const char *args);
7266d60c42SRoman Gushchin /*
7366d60c42SRoman Gushchin * unregister_event() callback will be called when userspace closes
7466d60c42SRoman Gushchin * the eventfd or on cgroup removing. This callback must be set,
7566d60c42SRoman Gushchin * if you want provide notification functionality.
7666d60c42SRoman Gushchin */
7766d60c42SRoman Gushchin void (*unregister_event)(struct mem_cgroup *memcg,
7866d60c42SRoman Gushchin struct eventfd_ctx *eventfd);
7966d60c42SRoman Gushchin /*
8066d60c42SRoman Gushchin * All fields below needed to unregister event when
8166d60c42SRoman Gushchin * userspace closes eventfd.
8266d60c42SRoman Gushchin */
8366d60c42SRoman Gushchin poll_table pt;
8466d60c42SRoman Gushchin wait_queue_head_t *wqh;
8566d60c42SRoman Gushchin wait_queue_entry_t wait;
8666d60c42SRoman Gushchin struct work_struct remove;
8766d60c42SRoman Gushchin };
8866d60c42SRoman Gushchin
/*
 * Pack a (type, attribute) pair into one integer and unpack it again:
 * the type lives in bits 16..31, the attribute in bits 0..15.
 */
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
92ea1e8796SRoman Gushchin
/* Resource attribute selectors; values are consecutive starting at 0. */
enum {
	RES_USAGE = 0,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
	RES_SOFT_LIMIT,
};
100ea1e8796SRoman Gushchin
#ifdef CONFIG_LOCKDEP
/* Lockdep map named "memcg_oom_lock"; only built with CONFIG_LOCKDEP. */
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

/* Not static: its users are outside the part of the file visible here. */
DEFINE_SPINLOCK(memcg_oom_lock);
10866d60c42SRoman Gushchin
__mem_cgroup_insert_exceeded(struct mem_cgroup_per_node * mz,struct mem_cgroup_tree_per_node * mctz,unsigned long new_usage_in_excess)109d12f6d22SRoman Gushchin static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
110d12f6d22SRoman Gushchin struct mem_cgroup_tree_per_node *mctz,
111d12f6d22SRoman Gushchin unsigned long new_usage_in_excess)
112d12f6d22SRoman Gushchin {
113d12f6d22SRoman Gushchin struct rb_node **p = &mctz->rb_root.rb_node;
114d12f6d22SRoman Gushchin struct rb_node *parent = NULL;
115d12f6d22SRoman Gushchin struct mem_cgroup_per_node *mz_node;
116d12f6d22SRoman Gushchin bool rightmost = true;
117d12f6d22SRoman Gushchin
118d12f6d22SRoman Gushchin if (mz->on_tree)
119d12f6d22SRoman Gushchin return;
120d12f6d22SRoman Gushchin
121d12f6d22SRoman Gushchin mz->usage_in_excess = new_usage_in_excess;
122d12f6d22SRoman Gushchin if (!mz->usage_in_excess)
123d12f6d22SRoman Gushchin return;
124d12f6d22SRoman Gushchin while (*p) {
125d12f6d22SRoman Gushchin parent = *p;
126d12f6d22SRoman Gushchin mz_node = rb_entry(parent, struct mem_cgroup_per_node,
127d12f6d22SRoman Gushchin tree_node);
128d12f6d22SRoman Gushchin if (mz->usage_in_excess < mz_node->usage_in_excess) {
129d12f6d22SRoman Gushchin p = &(*p)->rb_left;
130d12f6d22SRoman Gushchin rightmost = false;
131d12f6d22SRoman Gushchin } else {
132d12f6d22SRoman Gushchin p = &(*p)->rb_right;
133d12f6d22SRoman Gushchin }
134d12f6d22SRoman Gushchin }
135d12f6d22SRoman Gushchin
136d12f6d22SRoman Gushchin if (rightmost)
137d12f6d22SRoman Gushchin mctz->rb_rightmost = &mz->tree_node;
138d12f6d22SRoman Gushchin
139d12f6d22SRoman Gushchin rb_link_node(&mz->tree_node, parent, p);
140d12f6d22SRoman Gushchin rb_insert_color(&mz->tree_node, &mctz->rb_root);
141d12f6d22SRoman Gushchin mz->on_tree = true;
142d12f6d22SRoman Gushchin }
143d12f6d22SRoman Gushchin
__mem_cgroup_remove_exceeded(struct mem_cgroup_per_node * mz,struct mem_cgroup_tree_per_node * mctz)144d12f6d22SRoman Gushchin static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
145d12f6d22SRoman Gushchin struct mem_cgroup_tree_per_node *mctz)
146d12f6d22SRoman Gushchin {
147d12f6d22SRoman Gushchin if (!mz->on_tree)
148d12f6d22SRoman Gushchin return;
149d12f6d22SRoman Gushchin
150d12f6d22SRoman Gushchin if (&mz->tree_node == mctz->rb_rightmost)
151d12f6d22SRoman Gushchin mctz->rb_rightmost = rb_prev(&mz->tree_node);
152d12f6d22SRoman Gushchin
153d12f6d22SRoman Gushchin rb_erase(&mz->tree_node, &mctz->rb_root);
154d12f6d22SRoman Gushchin mz->on_tree = false;
155d12f6d22SRoman Gushchin }
156d12f6d22SRoman Gushchin
mem_cgroup_remove_exceeded(struct mem_cgroup_per_node * mz,struct mem_cgroup_tree_per_node * mctz)157d12f6d22SRoman Gushchin static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
158d12f6d22SRoman Gushchin struct mem_cgroup_tree_per_node *mctz)
159d12f6d22SRoman Gushchin {
160d12f6d22SRoman Gushchin unsigned long flags;
161d12f6d22SRoman Gushchin
162d12f6d22SRoman Gushchin spin_lock_irqsave(&mctz->lock, flags);
163d12f6d22SRoman Gushchin __mem_cgroup_remove_exceeded(mz, mctz);
164d12f6d22SRoman Gushchin spin_unlock_irqrestore(&mctz->lock, flags);
165d12f6d22SRoman Gushchin }
166d12f6d22SRoman Gushchin
soft_limit_excess(struct mem_cgroup * memcg)167d12f6d22SRoman Gushchin static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
168d12f6d22SRoman Gushchin {
169d12f6d22SRoman Gushchin unsigned long nr_pages = page_counter_read(&memcg->memory);
170d12f6d22SRoman Gushchin unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
171d12f6d22SRoman Gushchin unsigned long excess = 0;
172d12f6d22SRoman Gushchin
173d12f6d22SRoman Gushchin if (nr_pages > soft_limit)
174d12f6d22SRoman Gushchin excess = nr_pages - soft_limit;
175d12f6d22SRoman Gushchin
176d12f6d22SRoman Gushchin return excess;
177d12f6d22SRoman Gushchin }
178d12f6d22SRoman Gushchin
memcg1_update_tree(struct mem_cgroup * memcg,int nid)17934926e10SRoman Gushchin static void memcg1_update_tree(struct mem_cgroup *memcg, int nid)
180d12f6d22SRoman Gushchin {
181d12f6d22SRoman Gushchin unsigned long excess;
182d12f6d22SRoman Gushchin struct mem_cgroup_per_node *mz;
183d12f6d22SRoman Gushchin struct mem_cgroup_tree_per_node *mctz;
184d12f6d22SRoman Gushchin
185d12f6d22SRoman Gushchin if (lru_gen_enabled()) {
186d12f6d22SRoman Gushchin if (soft_limit_excess(memcg))
187d12f6d22SRoman Gushchin lru_gen_soft_reclaim(memcg, nid);
188d12f6d22SRoman Gushchin return;
189d12f6d22SRoman Gushchin }
190d12f6d22SRoman Gushchin
191d12f6d22SRoman Gushchin mctz = soft_limit_tree.rb_tree_per_node[nid];
192d12f6d22SRoman Gushchin if (!mctz)
193d12f6d22SRoman Gushchin return;
194d12f6d22SRoman Gushchin /*
195d12f6d22SRoman Gushchin * Necessary to update all ancestors when hierarchy is used.
196d12f6d22SRoman Gushchin * because their event counter is not touched.
197d12f6d22SRoman Gushchin */
198d12f6d22SRoman Gushchin for (; memcg; memcg = parent_mem_cgroup(memcg)) {
199d12f6d22SRoman Gushchin mz = memcg->nodeinfo[nid];
200d12f6d22SRoman Gushchin excess = soft_limit_excess(memcg);
201d12f6d22SRoman Gushchin /*
202d12f6d22SRoman Gushchin * We have to update the tree if mz is on RB-tree or
203d12f6d22SRoman Gushchin * mem is over its softlimit.
204d12f6d22SRoman Gushchin */
205d12f6d22SRoman Gushchin if (excess || mz->on_tree) {
206d12f6d22SRoman Gushchin unsigned long flags;
207d12f6d22SRoman Gushchin
208d12f6d22SRoman Gushchin spin_lock_irqsave(&mctz->lock, flags);
209d12f6d22SRoman Gushchin /* if on-tree, remove it */
210d12f6d22SRoman Gushchin if (mz->on_tree)
211d12f6d22SRoman Gushchin __mem_cgroup_remove_exceeded(mz, mctz);
212d12f6d22SRoman Gushchin /*
213d12f6d22SRoman Gushchin * Insert again. mz->usage_in_excess will be updated.
214d12f6d22SRoman Gushchin * If excess is 0, no tree ops.
215d12f6d22SRoman Gushchin */
216d12f6d22SRoman Gushchin __mem_cgroup_insert_exceeded(mz, mctz, excess);
217d12f6d22SRoman Gushchin spin_unlock_irqrestore(&mctz->lock, flags);
218d12f6d22SRoman Gushchin }
219d12f6d22SRoman Gushchin }
220d12f6d22SRoman Gushchin }
221d12f6d22SRoman Gushchin
memcg1_remove_from_trees(struct mem_cgroup * memcg)22287024f58SRoman Gushchin void memcg1_remove_from_trees(struct mem_cgroup *memcg)
223d12f6d22SRoman Gushchin {
224d12f6d22SRoman Gushchin struct mem_cgroup_tree_per_node *mctz;
225d12f6d22SRoman Gushchin struct mem_cgroup_per_node *mz;
226d12f6d22SRoman Gushchin int nid;
227d12f6d22SRoman Gushchin
228d12f6d22SRoman Gushchin for_each_node(nid) {
229d12f6d22SRoman Gushchin mz = memcg->nodeinfo[nid];
230d12f6d22SRoman Gushchin mctz = soft_limit_tree.rb_tree_per_node[nid];
231d12f6d22SRoman Gushchin if (mctz)
232d12f6d22SRoman Gushchin mem_cgroup_remove_exceeded(mz, mctz);
233d12f6d22SRoman Gushchin }
234d12f6d22SRoman Gushchin }
235d12f6d22SRoman Gushchin
236d12f6d22SRoman Gushchin static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node * mctz)237d12f6d22SRoman Gushchin __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
238d12f6d22SRoman Gushchin {
239d12f6d22SRoman Gushchin struct mem_cgroup_per_node *mz;
240d12f6d22SRoman Gushchin
241d12f6d22SRoman Gushchin retry:
242d12f6d22SRoman Gushchin mz = NULL;
243d12f6d22SRoman Gushchin if (!mctz->rb_rightmost)
244d12f6d22SRoman Gushchin goto done; /* Nothing to reclaim from */
245d12f6d22SRoman Gushchin
246d12f6d22SRoman Gushchin mz = rb_entry(mctz->rb_rightmost,
247d12f6d22SRoman Gushchin struct mem_cgroup_per_node, tree_node);
248d12f6d22SRoman Gushchin /*
249d12f6d22SRoman Gushchin * Remove the node now but someone else can add it back,
250d12f6d22SRoman Gushchin * we will to add it back at the end of reclaim to its correct
251d12f6d22SRoman Gushchin * position in the tree.
252d12f6d22SRoman Gushchin */
253d12f6d22SRoman Gushchin __mem_cgroup_remove_exceeded(mz, mctz);
254d12f6d22SRoman Gushchin if (!soft_limit_excess(mz->memcg) ||
255d12f6d22SRoman Gushchin !css_tryget(&mz->memcg->css))
256d12f6d22SRoman Gushchin goto retry;
257d12f6d22SRoman Gushchin done:
258d12f6d22SRoman Gushchin return mz;
259d12f6d22SRoman Gushchin }
260d12f6d22SRoman Gushchin
261d12f6d22SRoman Gushchin static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node * mctz)262d12f6d22SRoman Gushchin mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
263d12f6d22SRoman Gushchin {
264d12f6d22SRoman Gushchin struct mem_cgroup_per_node *mz;
265d12f6d22SRoman Gushchin
266d12f6d22SRoman Gushchin spin_lock_irq(&mctz->lock);
267d12f6d22SRoman Gushchin mz = __mem_cgroup_largest_soft_limit_node(mctz);
268d12f6d22SRoman Gushchin spin_unlock_irq(&mctz->lock);
269d12f6d22SRoman Gushchin return mz;
270d12f6d22SRoman Gushchin }
271d12f6d22SRoman Gushchin
/*
 * Reclaim from the memcgs iterated under @root_memcg on node @pgdat
 * until @root_memcg is back within its soft limit or one of the
 * give-up conditions below is met.
 *
 * The scan count reported by each mem_cgroup_shrink_node() call is added
 * to *@total_scanned. Returns the accumulated return values of
 * mem_cgroup_shrink_node().
 */
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};
	struct mem_cgroup *victim = NULL;
	unsigned long nr_scanned;
	unsigned long excess;
	int total = 0;
	int loop = 0;

	excess = soft_limit_excess(root_memcg);

	for (;;) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (victim) {
			total += mem_cgroup_shrink_node(victim, gfp_mask, false,
							pgdat, &nr_scanned);
			*total_scanned += nr_scanned;
			if (!soft_limit_excess(root_memcg))
				break;
			continue;
		}

		/*
		 * mem_cgroup_iter() returned NULL.
		 * NOTE(review): presumably one walk of the hierarchy has
		 * finished - confirm against mem_cgroup_iter().
		 */
		loop++;
		if (loop < 2)
			continue;

		/*
		 * If nothing could be reclaimed at all, it might be
		 * because there are no reclaimable pages under this
		 * hierarchy.
		 */
		if (!total)
			break;

		/*
		 * Aim for targeted reclaim: excess >> 2 is neither so
		 * much that we reclaim excessively, nor so little that we
		 * keep coming back to reclaim from this cgroup.
		 */
		if (total >= (excess >> 2) ||
		    (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}
321d12f6d22SRoman Gushchin
/*
 * Soft limit reclaim for node @pgdat.
 *
 * Repeatedly picks the memcg with the largest soft limit excess on this
 * node and reclaims from it, until something was reclaimed or the
 * give-up conditions are met. Does nothing and returns 0 when lru_gen is
 * enabled, when @order is greater than 0, or when the node has no
 * (or an empty) soft limit tree.
 *
 * Scan counts are accumulated into *@total_scanned by
 * mem_cgroup_soft_reclaim(). Returns the sum of its return values.
 */
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
					gfp_t gfp_mask,
					unsigned long *total_scanned)
{
	struct mem_cgroup_per_node *mz, *next_mz = NULL;
	struct mem_cgroup_tree_per_node *mctz;
	unsigned long nr_reclaimed = 0;
	int rounds = 0;

	if (lru_gen_enabled() || order > 0)
		return 0;

	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];

	/*
	 * Do not even bother looking for the largest node if the tree is
	 * empty. This check is lockless to prevent lock bouncing; races
	 * are acceptable as soft limit is best effort anyway.
	 */
	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
		return 0;

	/*
	 * This loop can run for a while, especially if mem_cgroups
	 * continuously keep exceeding their soft limit and putting the
	 * system under pressure.
	 */
	do {
		unsigned long reclaimed;
		unsigned long excess;

		mz = next_mz ? next_mz :
			       mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
						    gfp_mask, total_scanned);
		nr_reclaimed += reclaimed;

		spin_lock_irq(&mctz->lock);

		/*
		 * If nothing could be reclaimed from this memory cgroup,
		 * it is time to move on to the next one.
		 */
		next_mz = reclaimed ? NULL :
				      __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says the node should not be put
		 * back on the tree if reclaim returned 0. But reclaim can
		 * return 0 simply because, due to priority, only a
		 * smaller subset of memory was exposed to reclaim.
		 * Consider this a longer term TODO.
		 *
		 * With an excess of 0 no tree operation takes place.
		 */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		css_put(&mz->memcg->css);
		rounds++;

		/*
		 * Nothing was reclaimed and either there are no more
		 * mem cgroups to try, or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
		    (!next_mz ||
		     rounds > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);

	/* Drop the reference held on a candidate that was never used. */
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}
403d12f6d22SRoman Gushchin
mem_cgroup_move_charge_read(struct cgroup_subsys_state * css,struct cftype * cft)404ea1e8796SRoman Gushchin static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
405e548ad4aSRoman Gushchin struct cftype *cft)
406e548ad4aSRoman Gushchin {
407aa6b4fdfSShakeel Butt return 0;
408e548ad4aSRoman Gushchin }
409e548ad4aSRoman Gushchin
410e548ad4aSRoman Gushchin #ifdef CONFIG_MMU
mem_cgroup_move_charge_write(struct cgroup_subsys_state * css,struct cftype * cft,u64 val)411ea1e8796SRoman Gushchin static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
412e548ad4aSRoman Gushchin struct cftype *cft, u64 val)
413e548ad4aSRoman Gushchin {
414e548ad4aSRoman Gushchin pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
415e548ad4aSRoman Gushchin "Please report your usecase to linux-mm@kvack.org if you "
416e548ad4aSRoman Gushchin "depend on this functionality.\n");
417e548ad4aSRoman Gushchin
418aa6b4fdfSShakeel Butt if (val != 0)
419e548ad4aSRoman Gushchin return -EINVAL;
420e548ad4aSRoman Gushchin return 0;
421e548ad4aSRoman Gushchin }
422e548ad4aSRoman Gushchin #else
mem_cgroup_move_charge_write(struct cgroup_subsys_state * css,struct cftype * cft,u64 val)423ea1e8796SRoman Gushchin static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
424e548ad4aSRoman Gushchin struct cftype *cft, u64 val)
425e548ad4aSRoman Gushchin {
426e548ad4aSRoman Gushchin return -ENOSYS;
427e548ad4aSRoman Gushchin }
428e548ad4aSRoman Gushchin #endif
429e548ad4aSRoman Gushchin
/*
 * Return the usage of @memcg: the value of the memsw counter when @swap
 * is true, otherwise that of the memory counter. For the root memcg the
 * value is approximated from global state instead.
 */
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	unsigned long val;

	if (!mem_cgroup_is_root(memcg))
		return page_counter_read(swap ? &memcg->memsw :
						&memcg->memory);

	/*
	 * Approximate root's usage from global state. This isn't
	 * perfect, but the root usage was always an approximation.
	 */
	val = global_node_page_state(NR_FILE_PAGES) +
	      global_node_page_state(NR_ANON_MAPPED);
	if (swap)
		val += total_swap_pages - get_nr_swap_pages();

	return val;
}
451558605a5SChen Ridong
/*
 * Signal the eventfd of every threshold of @memcg that the usage has
 * crossed since the previous call, and update the stored position in
 * the threshold array. @swap selects the memsw thresholds and usage
 * instead of the memory ones. Runs inside an RCU read-side section.
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	unsigned long usage;
	int i;

	rcu_read_lock();
	if (swap)
		t = rcu_dereference(memcg->memsw_thresholds.primary);
	else
		t = rcu_dereference(memcg->thresholds.primary);

	if (t) {
		usage = mem_cgroup_usage(memcg, swap);

		/*
		 * current_threshold points to the threshold just below
		 * or equal to usage. If that no longer holds, a threshold
		 * was crossed after the last call of this function.
		 */
		i = t->current_threshold;

		/*
		 * Walk backward from current_threshold, signalling each
		 * threshold that now lies above usage. If none was
		 * crossed, only one array element is read here.
		 */
		while (i >= 0 && unlikely(t->entries[i].threshold > usage)) {
			eventfd_signal(t->entries[i].eventfd);
			i--;
		}

		/* i = current_threshold + 1 */
		i++;

		/*
		 * Walk forward from current_threshold + 1, signalling
		 * each threshold that usage has reached. If none was
		 * crossed, only one array element is read here.
		 */
		while (i < t->size &&
		       unlikely(t->entries[i].threshold <= usage)) {
			eventfd_signal(t->entries[i].eventfd);
			i++;
		}

		/* Update current_threshold */
		t->current_threshold = i - 1;
	}
	rcu_read_unlock();
}
50266d60c42SRoman Gushchin
mem_cgroup_threshold(struct mem_cgroup * memcg)50366d60c42SRoman Gushchin static void mem_cgroup_threshold(struct mem_cgroup *memcg)
50466d60c42SRoman Gushchin {
50566d60c42SRoman Gushchin while (memcg) {
50666d60c42SRoman Gushchin __mem_cgroup_threshold(memcg, false);
50766d60c42SRoman Gushchin if (do_memsw_account())
50866d60c42SRoman Gushchin __mem_cgroup_threshold(memcg, true);
50966d60c42SRoman Gushchin
51066d60c42SRoman Gushchin memcg = parent_mem_cgroup(memcg);
51166d60c42SRoman Gushchin }
51266d60c42SRoman Gushchin }
51366d60c42SRoman Gushchin
5140ccaf421SShakeel Butt /* Cgroup1: threshold notifications & softlimit tree updates */
5150d892bbbSJohannes Weiner
5160d892bbbSJohannes Weiner /*
5170d892bbbSJohannes Weiner * Per memcg event counter is incremented at every pagein/pageout. With THP,
5180d892bbbSJohannes Weiner * it will be incremented by the number of pages. This counter is used
5190d892bbbSJohannes Weiner * to trigger some periodic events. This is straightforward and better
5200d892bbbSJohannes Weiner * than using jiffies etc. to handle periodic memcg event.
5210d892bbbSJohannes Weiner */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH = 0,	/* threshold notifications */
	MEM_CGROUP_TARGET_SOFTLIMIT,	/* soft limit tree updates */
	MEM_CGROUP_NTARGETS,		/* number of targets, keep last */
};
5270d892bbbSJohannes Weiner
5280ccaf421SShakeel Butt struct memcg1_events_percpu {
5290ccaf421SShakeel Butt unsigned long nr_page_events;
5300ccaf421SShakeel Butt unsigned long targets[MEM_CGROUP_NTARGETS];
5310ccaf421SShakeel Butt };
5320ccaf421SShakeel Butt
memcg1_charge_statistics(struct mem_cgroup * memcg,int nr_pages)533a5ebe6bbSShakeel Butt static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
5347d7602b4SShakeel Butt {
5357d7602b4SShakeel Butt /* pagein of a big page is an event. So, ignore page size */
5367d7602b4SShakeel Butt if (nr_pages > 0)
537e52401e7SShakeel Butt count_memcg_events(memcg, PGPGIN, 1);
5387d7602b4SShakeel Butt else {
539e52401e7SShakeel Butt count_memcg_events(memcg, PGPGOUT, 1);
5407d7602b4SShakeel Butt nr_pages = -nr_pages; /* for event */
5417d7602b4SShakeel Butt }
5427d7602b4SShakeel Butt
5437d7602b4SShakeel Butt __this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
5447d7602b4SShakeel Butt }
5457d7602b4SShakeel Butt
/* Page events between two firings of the respective target. */
#define THRESHOLDS_EVENTS_TARGET	128
#define SOFTLIMIT_EVENTS_TARGET		1024
54841213dd0SShakeel Butt
memcg1_event_ratelimit(struct mem_cgroup * memcg,enum mem_cgroup_events_target target)54941213dd0SShakeel Butt static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
55041213dd0SShakeel Butt enum mem_cgroup_events_target target)
55141213dd0SShakeel Butt {
55241213dd0SShakeel Butt unsigned long val, next;
55341213dd0SShakeel Butt
55441213dd0SShakeel Butt val = __this_cpu_read(memcg->events_percpu->nr_page_events);
55541213dd0SShakeel Butt next = __this_cpu_read(memcg->events_percpu->targets[target]);
55641213dd0SShakeel Butt /* from time_after() in jiffies.h */
55741213dd0SShakeel Butt if ((long)(next - val) < 0) {
55841213dd0SShakeel Butt switch (target) {
55941213dd0SShakeel Butt case MEM_CGROUP_TARGET_THRESH:
56041213dd0SShakeel Butt next = val + THRESHOLDS_EVENTS_TARGET;
56141213dd0SShakeel Butt break;
56241213dd0SShakeel Butt case MEM_CGROUP_TARGET_SOFTLIMIT:
56341213dd0SShakeel Butt next = val + SOFTLIMIT_EVENTS_TARGET;
56441213dd0SShakeel Butt break;
56541213dd0SShakeel Butt default:
56641213dd0SShakeel Butt break;
56741213dd0SShakeel Butt }
56841213dd0SShakeel Butt __this_cpu_write(memcg->events_percpu->targets[target], next);
56941213dd0SShakeel Butt return true;
57041213dd0SShakeel Butt }
57141213dd0SShakeel Butt return false;
57241213dd0SShakeel Butt }
57341213dd0SShakeel Butt
57466d60c42SRoman Gushchin /*
57566d60c42SRoman Gushchin * Check events in order.
57666d60c42SRoman Gushchin *
57766d60c42SRoman Gushchin */
memcg1_check_events(struct mem_cgroup * memcg,int nid)578a5ebe6bbSShakeel Butt static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
57966d60c42SRoman Gushchin {
58066d60c42SRoman Gushchin if (IS_ENABLED(CONFIG_PREEMPT_RT))
58166d60c42SRoman Gushchin return;
58266d60c42SRoman Gushchin
58366d60c42SRoman Gushchin /* threshold event is triggered in finer grain than soft limit */
58441213dd0SShakeel Butt if (unlikely(memcg1_event_ratelimit(memcg,
58566d60c42SRoman Gushchin MEM_CGROUP_TARGET_THRESH))) {
58666d60c42SRoman Gushchin bool do_softlimit;
58766d60c42SRoman Gushchin
58841213dd0SShakeel Butt do_softlimit = memcg1_event_ratelimit(memcg,
58966d60c42SRoman Gushchin MEM_CGROUP_TARGET_SOFTLIMIT);
59066d60c42SRoman Gushchin mem_cgroup_threshold(memcg);
59166d60c42SRoman Gushchin if (unlikely(do_softlimit))
59266d60c42SRoman Gushchin memcg1_update_tree(memcg, nid);
59366d60c42SRoman Gushchin }
59466d60c42SRoman Gushchin }
59566d60c42SRoman Gushchin
/*
 * Account the charge of @folio to @memcg and run the periodic event
 * checks for the folio's node, with local interrupts disabled around
 * both steps.
 */
void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	unsigned long irq_flags;

	local_irq_save(irq_flags);
	memcg1_charge_statistics(memcg, folio_nr_pages(folio));
	memcg1_check_events(memcg, folio_nid(folio));
	local_irq_restore(irq_flags);
}
605f7d49ba0SShakeel Butt
60689ce924fSJohannes Weiner /**
60789ce924fSJohannes Weiner * memcg1_swapout - transfer a memsw charge to swap
60889ce924fSJohannes Weiner * @folio: folio whose memsw charge to transfer
60989ce924fSJohannes Weiner * @entry: swap entry to move the charge to
61089ce924fSJohannes Weiner *
61189ce924fSJohannes Weiner * Transfer the memsw charge of @folio to @entry.
61289ce924fSJohannes Weiner */
memcg1_swapout(struct folio * folio,swp_entry_t entry)61389ce924fSJohannes Weiner void memcg1_swapout(struct folio *folio, swp_entry_t entry)
614f7d49ba0SShakeel Butt {
61589ce924fSJohannes Weiner struct mem_cgroup *memcg, *swap_memcg;
616*f1cf8d2fSMuchun Song struct obj_cgroup *objcg;
61789ce924fSJohannes Weiner unsigned int nr_entries;
61889ce924fSJohannes Weiner
61989ce924fSJohannes Weiner VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
62089ce924fSJohannes Weiner VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
62189ce924fSJohannes Weiner
62289ce924fSJohannes Weiner if (mem_cgroup_disabled())
62389ce924fSJohannes Weiner return;
62489ce924fSJohannes Weiner
62589ce924fSJohannes Weiner if (!do_memsw_account())
62689ce924fSJohannes Weiner return;
62789ce924fSJohannes Weiner
628*f1cf8d2fSMuchun Song objcg = folio_objcg(folio);
629*f1cf8d2fSMuchun Song VM_WARN_ON_ONCE_FOLIO(!objcg, folio);
630*f1cf8d2fSMuchun Song if (!objcg)
63189ce924fSJohannes Weiner return;
63289ce924fSJohannes Weiner
633*f1cf8d2fSMuchun Song rcu_read_lock();
634*f1cf8d2fSMuchun Song memcg = obj_cgroup_memcg(objcg);
63589ce924fSJohannes Weiner /*
63689ce924fSJohannes Weiner * In case the memcg owning these pages has been offlined and doesn't
63789ce924fSJohannes Weiner * have an ID allocated to it anymore, charge the closest online
63889ce924fSJohannes Weiner * ancestor for the swap instead and transfer the memory+swap charge.
63989ce924fSJohannes Weiner */
64089ce924fSJohannes Weiner nr_entries = folio_nr_pages(folio);
64137cb8cd0SKairui Song swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries);
64289ce924fSJohannes Weiner mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
64389ce924fSJohannes Weiner
644e77786b4SShakeel Butt swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry);
64589ce924fSJohannes Weiner
64689ce924fSJohannes Weiner folio_unqueue_deferred_split(folio);
64789ce924fSJohannes Weiner folio->memcg_data = 0;
64889ce924fSJohannes Weiner
649*f1cf8d2fSMuchun Song if (!obj_cgroup_is_root(objcg))
65089ce924fSJohannes Weiner page_counter_uncharge(&memcg->memory, nr_entries);
65189ce924fSJohannes Weiner
65289ce924fSJohannes Weiner if (memcg != swap_memcg) {
65389ce924fSJohannes Weiner if (!mem_cgroup_is_root(swap_memcg))
65489ce924fSJohannes Weiner page_counter_charge(&swap_memcg->memsw, nr_entries);
65589ce924fSJohannes Weiner page_counter_uncharge(&memcg->memsw, nr_entries);
65689ce924fSJohannes Weiner }
65789ce924fSJohannes Weiner
658f7d49ba0SShakeel Butt /*
659f7d49ba0SShakeel Butt * Interrupts should be disabled here because the caller holds the
660f7d49ba0SShakeel Butt * i_pages lock which is taken with interrupts-off. It is
661f7d49ba0SShakeel Butt * important here to have the interrupts disabled because it is the
662f7d49ba0SShakeel Butt * only synchronisation we have for updating the per-CPU variables.
663f7d49ba0SShakeel Butt */
664f7d49ba0SShakeel Butt preempt_disable_nested();
665f7d49ba0SShakeel Butt VM_WARN_ON_IRQS_ENABLED();
666f7d49ba0SShakeel Butt memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
667f7d49ba0SShakeel Butt preempt_enable_nested();
668f7d49ba0SShakeel Butt memcg1_check_events(memcg, folio_nid(folio));
66989ce924fSJohannes Weiner
670*f1cf8d2fSMuchun Song rcu_read_unlock();
671*f1cf8d2fSMuchun Song obj_cgroup_put(objcg);
67289ce924fSJohannes Weiner }
67389ce924fSJohannes Weiner
67489ce924fSJohannes Weiner /*
67589ce924fSJohannes Weiner * memcg1_swapin - uncharge swap slot
67689ce924fSJohannes Weiner * @entry: the first swap entry for which the pages are charged
67789ce924fSJohannes Weiner * @nr_pages: number of pages which will be uncharged
67889ce924fSJohannes Weiner *
67989ce924fSJohannes Weiner * Call this function after successfully adding the charged page to swapcache.
68089ce924fSJohannes Weiner *
68189ce924fSJohannes Weiner * Note: This function assumes the page for which swap slot is being uncharged
68289ce924fSJohannes Weiner * is order 0 page.
68389ce924fSJohannes Weiner */
memcg1_swapin(swp_entry_t entry,unsigned int nr_pages)68489ce924fSJohannes Weiner void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages)
68589ce924fSJohannes Weiner {
68689ce924fSJohannes Weiner /*
68789ce924fSJohannes Weiner * Cgroup1's unified memory+swap counter has been charged with the
68889ce924fSJohannes Weiner * new swapcache page, finish the transfer by uncharging the swap
68989ce924fSJohannes Weiner * slot. The swap slot would also get uncharged when it dies, but
69089ce924fSJohannes Weiner * it can stick around indefinitely and we'd count the page twice
69189ce924fSJohannes Weiner * the entire time.
69289ce924fSJohannes Weiner *
69389ce924fSJohannes Weiner * Cgroup2 has separate resource counters for memory and swap,
69489ce924fSJohannes Weiner * so this is a non-issue here. Memory and swap charge lifetimes
69589ce924fSJohannes Weiner * correspond 1:1 to page and swap slot lifetimes: we charge the
69689ce924fSJohannes Weiner * page to memory here, and uncharge swap when the slot is freed.
69789ce924fSJohannes Weiner */
69889ce924fSJohannes Weiner if (do_memsw_account()) {
69989ce924fSJohannes Weiner /*
70089ce924fSJohannes Weiner * The swap entry might not get freed for a long time,
70189ce924fSJohannes Weiner * let's not wait for it. The page already received a
70289ce924fSJohannes Weiner * memory+swap charge, drop the swap entry duplicate.
70389ce924fSJohannes Weiner */
70489ce924fSJohannes Weiner mem_cgroup_uncharge_swap(entry, nr_pages);
70589ce924fSJohannes Weiner }
706f7d49ba0SShakeel Butt }
707f7d49ba0SShakeel Butt
memcg1_uncharge_batch(struct mem_cgroup * memcg,unsigned long pgpgout,unsigned long nr_memory,int nid)708f7d49ba0SShakeel Butt void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
709f7d49ba0SShakeel Butt unsigned long nr_memory, int nid)
710f7d49ba0SShakeel Butt {
711f7d49ba0SShakeel Butt unsigned long flags;
712f7d49ba0SShakeel Butt
713f7d49ba0SShakeel Butt local_irq_save(flags);
714e52401e7SShakeel Butt count_memcg_events(memcg, PGPGOUT, pgpgout);
715f7d49ba0SShakeel Butt __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
716f7d49ba0SShakeel Butt memcg1_check_events(memcg, nid);
717f7d49ba0SShakeel Butt local_irq_restore(flags);
718f7d49ba0SShakeel Butt }
719f7d49ba0SShakeel Butt
compare_thresholds(const void * a,const void * b)72066d60c42SRoman Gushchin static int compare_thresholds(const void *a, const void *b)
72166d60c42SRoman Gushchin {
72266d60c42SRoman Gushchin const struct mem_cgroup_threshold *_a = a;
72366d60c42SRoman Gushchin const struct mem_cgroup_threshold *_b = b;
72466d60c42SRoman Gushchin
72566d60c42SRoman Gushchin if (_a->threshold > _b->threshold)
72666d60c42SRoman Gushchin return 1;
72766d60c42SRoman Gushchin
72866d60c42SRoman Gushchin if (_a->threshold < _b->threshold)
72966d60c42SRoman Gushchin return -1;
73066d60c42SRoman Gushchin
73166d60c42SRoman Gushchin return 0;
73266d60c42SRoman Gushchin }
73366d60c42SRoman Gushchin
mem_cgroup_oom_notify_cb(struct mem_cgroup * memcg)73466d60c42SRoman Gushchin static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
73566d60c42SRoman Gushchin {
73666d60c42SRoman Gushchin struct mem_cgroup_eventfd_list *ev;
73766d60c42SRoman Gushchin
73866d60c42SRoman Gushchin spin_lock(&memcg_oom_lock);
73966d60c42SRoman Gushchin
74066d60c42SRoman Gushchin list_for_each_entry(ev, &memcg->oom_notify, list)
74166d60c42SRoman Gushchin eventfd_signal(ev->eventfd);
74266d60c42SRoman Gushchin
74366d60c42SRoman Gushchin spin_unlock(&memcg_oom_lock);
74466d60c42SRoman Gushchin return 0;
74566d60c42SRoman Gushchin }
74666d60c42SRoman Gushchin
mem_cgroup_oom_notify(struct mem_cgroup * memcg)747292fc2e0SRoman Gushchin static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
74866d60c42SRoman Gushchin {
74966d60c42SRoman Gushchin struct mem_cgroup *iter;
75066d60c42SRoman Gushchin
75166d60c42SRoman Gushchin for_each_mem_cgroup_tree(iter, memcg)
75266d60c42SRoman Gushchin mem_cgroup_oom_notify_cb(iter);
75366d60c42SRoman Gushchin }
75466d60c42SRoman Gushchin
__mem_cgroup_usage_register_event(struct mem_cgroup * memcg,struct eventfd_ctx * eventfd,const char * args,enum res_type type)75566d60c42SRoman Gushchin static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
75666d60c42SRoman Gushchin struct eventfd_ctx *eventfd, const char *args, enum res_type type)
75766d60c42SRoman Gushchin {
75866d60c42SRoman Gushchin struct mem_cgroup_thresholds *thresholds;
75966d60c42SRoman Gushchin struct mem_cgroup_threshold_ary *new;
76066d60c42SRoman Gushchin unsigned long threshold;
76166d60c42SRoman Gushchin unsigned long usage;
76266d60c42SRoman Gushchin int i, size, ret;
76366d60c42SRoman Gushchin
76466d60c42SRoman Gushchin ret = page_counter_memparse(args, "-1", &threshold);
76566d60c42SRoman Gushchin if (ret)
76666d60c42SRoman Gushchin return ret;
76766d60c42SRoman Gushchin
76866d60c42SRoman Gushchin mutex_lock(&memcg->thresholds_lock);
76966d60c42SRoman Gushchin
77066d60c42SRoman Gushchin if (type == _MEM) {
77166d60c42SRoman Gushchin thresholds = &memcg->thresholds;
77266d60c42SRoman Gushchin usage = mem_cgroup_usage(memcg, false);
77366d60c42SRoman Gushchin } else if (type == _MEMSWAP) {
77466d60c42SRoman Gushchin thresholds = &memcg->memsw_thresholds;
77566d60c42SRoman Gushchin usage = mem_cgroup_usage(memcg, true);
77666d60c42SRoman Gushchin } else
77766d60c42SRoman Gushchin BUG();
77866d60c42SRoman Gushchin
77966d60c42SRoman Gushchin /* Check if a threshold crossed before adding a new one */
78066d60c42SRoman Gushchin if (thresholds->primary)
78166d60c42SRoman Gushchin __mem_cgroup_threshold(memcg, type == _MEMSWAP);
78266d60c42SRoman Gushchin
78366d60c42SRoman Gushchin size = thresholds->primary ? thresholds->primary->size + 1 : 1;
78466d60c42SRoman Gushchin
78566d60c42SRoman Gushchin /* Allocate memory for new array of thresholds */
78669050f8dSKees Cook new = kmalloc_flex(*new, entries, size, GFP_KERNEL_ACCOUNT);
78766d60c42SRoman Gushchin if (!new) {
78866d60c42SRoman Gushchin ret = -ENOMEM;
78966d60c42SRoman Gushchin goto unlock;
79066d60c42SRoman Gushchin }
79166d60c42SRoman Gushchin new->size = size;
79266d60c42SRoman Gushchin
79366d60c42SRoman Gushchin /* Copy thresholds (if any) to new array */
79466d60c42SRoman Gushchin if (thresholds->primary)
79566d60c42SRoman Gushchin memcpy(new->entries, thresholds->primary->entries,
79666d60c42SRoman Gushchin flex_array_size(new, entries, size - 1));
79766d60c42SRoman Gushchin
79866d60c42SRoman Gushchin /* Add new threshold */
79966d60c42SRoman Gushchin new->entries[size - 1].eventfd = eventfd;
80066d60c42SRoman Gushchin new->entries[size - 1].threshold = threshold;
80166d60c42SRoman Gushchin
80266d60c42SRoman Gushchin /* Sort thresholds. Registering of new threshold isn't time-critical */
80366d60c42SRoman Gushchin sort(new->entries, size, sizeof(*new->entries),
80466d60c42SRoman Gushchin compare_thresholds, NULL);
80566d60c42SRoman Gushchin
80666d60c42SRoman Gushchin /* Find current threshold */
80766d60c42SRoman Gushchin new->current_threshold = -1;
80866d60c42SRoman Gushchin for (i = 0; i < size; i++) {
80966d60c42SRoman Gushchin if (new->entries[i].threshold <= usage) {
81066d60c42SRoman Gushchin /*
81166d60c42SRoman Gushchin * new->current_threshold will not be used until
81266d60c42SRoman Gushchin * rcu_assign_pointer(), so it's safe to increment
81366d60c42SRoman Gushchin * it here.
81466d60c42SRoman Gushchin */
81566d60c42SRoman Gushchin ++new->current_threshold;
81666d60c42SRoman Gushchin } else
81766d60c42SRoman Gushchin break;
81866d60c42SRoman Gushchin }
81966d60c42SRoman Gushchin
82066d60c42SRoman Gushchin /* Free old spare buffer and save old primary buffer as spare */
82166d60c42SRoman Gushchin kfree(thresholds->spare);
82266d60c42SRoman Gushchin thresholds->spare = thresholds->primary;
82366d60c42SRoman Gushchin
82466d60c42SRoman Gushchin rcu_assign_pointer(thresholds->primary, new);
82566d60c42SRoman Gushchin
82666d60c42SRoman Gushchin /* To be sure that nobody uses thresholds */
82766d60c42SRoman Gushchin synchronize_rcu();
82866d60c42SRoman Gushchin
82966d60c42SRoman Gushchin unlock:
83066d60c42SRoman Gushchin mutex_unlock(&memcg->thresholds_lock);
83166d60c42SRoman Gushchin
83266d60c42SRoman Gushchin return ret;
83366d60c42SRoman Gushchin }
83466d60c42SRoman Gushchin
mem_cgroup_usage_register_event(struct mem_cgroup * memcg,struct eventfd_ctx * eventfd,const char * args)83566d60c42SRoman Gushchin static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
83666d60c42SRoman Gushchin struct eventfd_ctx *eventfd, const char *args)
83766d60c42SRoman Gushchin {
83866d60c42SRoman Gushchin return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
83966d60c42SRoman Gushchin }
84066d60c42SRoman Gushchin
memsw_cgroup_usage_register_event(struct mem_cgroup * memcg,struct eventfd_ctx * eventfd,const char * args)84166d60c42SRoman Gushchin static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
84266d60c42SRoman Gushchin struct eventfd_ctx *eventfd, const char *args)
84366d60c42SRoman Gushchin {
84466d60c42SRoman Gushchin return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
84566d60c42SRoman Gushchin }
84666d60c42SRoman Gushchin
__mem_cgroup_usage_unregister_event(struct mem_cgroup * memcg,struct eventfd_ctx * eventfd,enum res_type type)84766d60c42SRoman Gushchin static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
84866d60c42SRoman Gushchin struct eventfd_ctx *eventfd, enum res_type type)
84966d60c42SRoman Gushchin {
85066d60c42SRoman Gushchin struct mem_cgroup_thresholds *thresholds;
85166d60c42SRoman Gushchin struct mem_cgroup_threshold_ary *new;
85266d60c42SRoman Gushchin unsigned long usage;
85366d60c42SRoman Gushchin int i, j, size, entries;
85466d60c42SRoman Gushchin
85566d60c42SRoman Gushchin mutex_lock(&memcg->thresholds_lock);
85666d60c42SRoman Gushchin
85766d60c42SRoman Gushchin if (type == _MEM) {
85866d60c42SRoman Gushchin thresholds = &memcg->thresholds;
85966d60c42SRoman Gushchin usage = mem_cgroup_usage(memcg, false);
86066d60c42SRoman Gushchin } else if (type == _MEMSWAP) {
86166d60c42SRoman Gushchin thresholds = &memcg->memsw_thresholds;
86266d60c42SRoman Gushchin usage = mem_cgroup_usage(memcg, true);
86366d60c42SRoman Gushchin } else
86466d60c42SRoman Gushchin BUG();
86566d60c42SRoman Gushchin
86666d60c42SRoman Gushchin if (!thresholds->primary)
86766d60c42SRoman Gushchin goto unlock;
86866d60c42SRoman Gushchin
86966d60c42SRoman Gushchin /* Check if a threshold crossed before removing */
87066d60c42SRoman Gushchin __mem_cgroup_threshold(memcg, type == _MEMSWAP);
87166d60c42SRoman Gushchin
87266d60c42SRoman Gushchin /* Calculate new number of threshold */
87366d60c42SRoman Gushchin size = entries = 0;
87466d60c42SRoman Gushchin for (i = 0; i < thresholds->primary->size; i++) {
87566d60c42SRoman Gushchin if (thresholds->primary->entries[i].eventfd != eventfd)
87666d60c42SRoman Gushchin size++;
87766d60c42SRoman Gushchin else
87866d60c42SRoman Gushchin entries++;
87966d60c42SRoman Gushchin }
88066d60c42SRoman Gushchin
88166d60c42SRoman Gushchin new = thresholds->spare;
88266d60c42SRoman Gushchin
88366d60c42SRoman Gushchin /* If no items related to eventfd have been cleared, nothing to do */
88466d60c42SRoman Gushchin if (!entries)
88566d60c42SRoman Gushchin goto unlock;
88666d60c42SRoman Gushchin
88766d60c42SRoman Gushchin /* Set thresholds array to NULL if we don't have thresholds */
88866d60c42SRoman Gushchin if (!size) {
88966d60c42SRoman Gushchin kfree(new);
89066d60c42SRoman Gushchin new = NULL;
89166d60c42SRoman Gushchin goto swap_buffers;
89266d60c42SRoman Gushchin }
89366d60c42SRoman Gushchin
89466d60c42SRoman Gushchin new->size = size;
89566d60c42SRoman Gushchin
89666d60c42SRoman Gushchin /* Copy thresholds and find current threshold */
89766d60c42SRoman Gushchin new->current_threshold = -1;
89866d60c42SRoman Gushchin for (i = 0, j = 0; i < thresholds->primary->size; i++) {
89966d60c42SRoman Gushchin if (thresholds->primary->entries[i].eventfd == eventfd)
90066d60c42SRoman Gushchin continue;
90166d60c42SRoman Gushchin
90266d60c42SRoman Gushchin new->entries[j] = thresholds->primary->entries[i];
90366d60c42SRoman Gushchin if (new->entries[j].threshold <= usage) {
90466d60c42SRoman Gushchin /*
90566d60c42SRoman Gushchin * new->current_threshold will not be used
90666d60c42SRoman Gushchin * until rcu_assign_pointer(), so it's safe to increment
90766d60c42SRoman Gushchin * it here.
90866d60c42SRoman Gushchin */
90966d60c42SRoman Gushchin ++new->current_threshold;
91066d60c42SRoman Gushchin }
91166d60c42SRoman Gushchin j++;
91266d60c42SRoman Gushchin }
91366d60c42SRoman Gushchin
91466d60c42SRoman Gushchin swap_buffers:
91566d60c42SRoman Gushchin /* Swap primary and spare array */
91666d60c42SRoman Gushchin thresholds->spare = thresholds->primary;
91766d60c42SRoman Gushchin
91866d60c42SRoman Gushchin rcu_assign_pointer(thresholds->primary, new);
91966d60c42SRoman Gushchin
92066d60c42SRoman Gushchin /* To be sure that nobody uses thresholds */
92166d60c42SRoman Gushchin synchronize_rcu();
92266d60c42SRoman Gushchin
92366d60c42SRoman Gushchin /* If all events are unregistered, free the spare array */
92466d60c42SRoman Gushchin if (!new) {
92566d60c42SRoman Gushchin kfree(thresholds->spare);
92666d60c42SRoman Gushchin thresholds->spare = NULL;
92766d60c42SRoman Gushchin }
92866d60c42SRoman Gushchin unlock:
92966d60c42SRoman Gushchin mutex_unlock(&memcg->thresholds_lock);
93066d60c42SRoman Gushchin }
93166d60c42SRoman Gushchin
mem_cgroup_usage_unregister_event(struct mem_cgroup * memcg,struct eventfd_ctx * eventfd)93266d60c42SRoman Gushchin static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
93366d60c42SRoman Gushchin struct eventfd_ctx *eventfd)
93466d60c42SRoman Gushchin {
93566d60c42SRoman Gushchin return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
93666d60c42SRoman Gushchin }
93766d60c42SRoman Gushchin
memsw_cgroup_usage_unregister_event(struct mem_cgroup * memcg,struct eventfd_ctx * eventfd)93866d60c42SRoman Gushchin static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
93966d60c42SRoman Gushchin struct eventfd_ctx *eventfd)
94066d60c42SRoman Gushchin {
94166d60c42SRoman Gushchin return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
94266d60c42SRoman Gushchin }
94366d60c42SRoman Gushchin
mem_cgroup_oom_register_event(struct mem_cgroup * memcg,struct eventfd_ctx * eventfd,const char * args)94466d60c42SRoman Gushchin static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
94566d60c42SRoman Gushchin struct eventfd_ctx *eventfd, const char *args)
94666d60c42SRoman Gushchin {
94766d60c42SRoman Gushchin struct mem_cgroup_eventfd_list *event;
94866d60c42SRoman Gushchin
94969050f8dSKees Cook event = kmalloc_obj(*event, GFP_KERNEL_ACCOUNT);
95066d60c42SRoman Gushchin if (!event)
95166d60c42SRoman Gushchin return -ENOMEM;
95266d60c42SRoman Gushchin
95366d60c42SRoman Gushchin spin_lock(&memcg_oom_lock);
95466d60c42SRoman Gushchin
95566d60c42SRoman Gushchin event->eventfd = eventfd;
95666d60c42SRoman Gushchin list_add(&event->list, &memcg->oom_notify);
95766d60c42SRoman Gushchin
95866d60c42SRoman Gushchin /* already in OOM ? */
95966d60c42SRoman Gushchin if (memcg->under_oom)
96066d60c42SRoman Gushchin eventfd_signal(eventfd);
96166d60c42SRoman Gushchin spin_unlock(&memcg_oom_lock);
96266d60c42SRoman Gushchin
96366d60c42SRoman Gushchin return 0;
96466d60c42SRoman Gushchin }
96566d60c42SRoman Gushchin
mem_cgroup_oom_unregister_event(struct mem_cgroup * memcg,struct eventfd_ctx * eventfd)96666d60c42SRoman Gushchin static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
96766d60c42SRoman Gushchin struct eventfd_ctx *eventfd)
96866d60c42SRoman Gushchin {
96966d60c42SRoman Gushchin struct mem_cgroup_eventfd_list *ev, *tmp;
97066d60c42SRoman Gushchin
97166d60c42SRoman Gushchin spin_lock(&memcg_oom_lock);
97266d60c42SRoman Gushchin
97366d60c42SRoman Gushchin list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
97466d60c42SRoman Gushchin if (ev->eventfd == eventfd) {
97566d60c42SRoman Gushchin list_del(&ev->list);
97666d60c42SRoman Gushchin kfree(ev);
97766d60c42SRoman Gushchin }
97866d60c42SRoman Gushchin }
97966d60c42SRoman Gushchin
98066d60c42SRoman Gushchin spin_unlock(&memcg_oom_lock);
98166d60c42SRoman Gushchin }
98266d60c42SRoman Gushchin
/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation.
 *
 * This is way over-engineered.  It tries to support fully configurable
 * events for each user.  Such a level of flexibility is completely
 * unnecessary, especially in the light of the planned unified hierarchy.
 *
 * Please deprecate this and replace it with something simpler if at all
 * possible.
 */
99566d60c42SRoman Gushchin
99666d60c42SRoman Gushchin /*
99766d60c42SRoman Gushchin * Unregister event and free resources.
99866d60c42SRoman Gushchin *
99966d60c42SRoman Gushchin * Gets called from workqueue.
100066d60c42SRoman Gushchin */
memcg_event_remove(struct work_struct * work)100166d60c42SRoman Gushchin static void memcg_event_remove(struct work_struct *work)
100266d60c42SRoman Gushchin {
100366d60c42SRoman Gushchin struct mem_cgroup_event *event =
100466d60c42SRoman Gushchin container_of(work, struct mem_cgroup_event, remove);
100566d60c42SRoman Gushchin struct mem_cgroup *memcg = event->memcg;
100666d60c42SRoman Gushchin
100766d60c42SRoman Gushchin remove_wait_queue(event->wqh, &event->wait);
100866d60c42SRoman Gushchin
100966d60c42SRoman Gushchin event->unregister_event(memcg, event->eventfd);
101066d60c42SRoman Gushchin
101166d60c42SRoman Gushchin /* Notify userspace the event is going away. */
101266d60c42SRoman Gushchin eventfd_signal(event->eventfd);
101366d60c42SRoman Gushchin
101466d60c42SRoman Gushchin eventfd_ctx_put(event->eventfd);
101566d60c42SRoman Gushchin kfree(event);
101666d60c42SRoman Gushchin css_put(&memcg->css);
101766d60c42SRoman Gushchin }
101866d60c42SRoman Gushchin
101966d60c42SRoman Gushchin /*
102066d60c42SRoman Gushchin * Gets called on EPOLLHUP on eventfd when user closes it.
102166d60c42SRoman Gushchin *
102266d60c42SRoman Gushchin * Called with wqh->lock held and interrupts disabled.
102366d60c42SRoman Gushchin */
memcg_event_wake(wait_queue_entry_t * wait,unsigned int mode,int sync,void * key)102455c1d6a4SKeren Sun static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode,
102566d60c42SRoman Gushchin int sync, void *key)
102666d60c42SRoman Gushchin {
102766d60c42SRoman Gushchin struct mem_cgroup_event *event =
102866d60c42SRoman Gushchin container_of(wait, struct mem_cgroup_event, wait);
102966d60c42SRoman Gushchin struct mem_cgroup *memcg = event->memcg;
103066d60c42SRoman Gushchin __poll_t flags = key_to_poll(key);
103166d60c42SRoman Gushchin
103266d60c42SRoman Gushchin if (flags & EPOLLHUP) {
103366d60c42SRoman Gushchin /*
103466d60c42SRoman Gushchin * If the event has been detached at cgroup removal, we
103566d60c42SRoman Gushchin * can simply return knowing the other side will cleanup
103666d60c42SRoman Gushchin * for us.
103766d60c42SRoman Gushchin *
103866d60c42SRoman Gushchin * We can't race against event freeing since the other
103966d60c42SRoman Gushchin * side will require wqh->lock via remove_wait_queue(),
104066d60c42SRoman Gushchin * which we hold.
104166d60c42SRoman Gushchin */
104266d60c42SRoman Gushchin spin_lock(&memcg->event_list_lock);
104366d60c42SRoman Gushchin if (!list_empty(&event->list)) {
104466d60c42SRoman Gushchin list_del_init(&event->list);
104566d60c42SRoman Gushchin /*
104666d60c42SRoman Gushchin * We are in atomic context, but cgroup_event_remove()
104766d60c42SRoman Gushchin * may sleep, so we have to call it in workqueue.
104866d60c42SRoman Gushchin */
104966d60c42SRoman Gushchin schedule_work(&event->remove);
105066d60c42SRoman Gushchin }
105166d60c42SRoman Gushchin spin_unlock(&memcg->event_list_lock);
105266d60c42SRoman Gushchin }
105366d60c42SRoman Gushchin
105466d60c42SRoman Gushchin return 0;
105566d60c42SRoman Gushchin }
105666d60c42SRoman Gushchin
/*
 * poll_table queueing callback: remember the wait queue head in the event
 * that embeds @pt and add the event's wait entry to it.
 */
static void memcg_event_ptable_queue_proc(struct file *file,
					  wait_queue_head_t *wqh,
					  poll_table *pt)
{
	struct mem_cgroup_event *ev;

	ev = container_of(pt, struct mem_cgroup_event, pt);
	ev->wqh = wqh;
	add_wait_queue(wqh, &ev->wait);
}
106666d60c42SRoman Gushchin
106766d60c42SRoman Gushchin /*
106866d60c42SRoman Gushchin * DO NOT USE IN NEW FILES.
106966d60c42SRoman Gushchin *
107066d60c42SRoman Gushchin * Parse input and register new cgroup event handler.
107166d60c42SRoman Gushchin *
107266d60c42SRoman Gushchin * Input must be in format '<event_fd> <control_fd> <args>'.
107366d60c42SRoman Gushchin * Interpretation of args is defined by control file implementation.
107466d60c42SRoman Gushchin */
memcg_write_event_control(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)1075ea1e8796SRoman Gushchin static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
107666d60c42SRoman Gushchin char *buf, size_t nbytes, loff_t off)
107766d60c42SRoman Gushchin {
107866d60c42SRoman Gushchin struct cgroup_subsys_state *css = of_css(of);
107966d60c42SRoman Gushchin struct mem_cgroup *memcg = mem_cgroup_from_css(css);
108066d60c42SRoman Gushchin struct mem_cgroup_event *event;
108166d60c42SRoman Gushchin struct cgroup_subsys_state *cfile_css;
108266d60c42SRoman Gushchin unsigned int efd, cfd;
108366d60c42SRoman Gushchin struct dentry *cdentry;
108466d60c42SRoman Gushchin const char *name;
108566d60c42SRoman Gushchin char *endp;
108666d60c42SRoman Gushchin int ret;
108766d60c42SRoman Gushchin
108866d60c42SRoman Gushchin if (IS_ENABLED(CONFIG_PREEMPT_RT))
108966d60c42SRoman Gushchin return -EOPNOTSUPP;
109066d60c42SRoman Gushchin
109166d60c42SRoman Gushchin buf = strstrip(buf);
109266d60c42SRoman Gushchin
109366d60c42SRoman Gushchin efd = simple_strtoul(buf, &endp, 10);
109466d60c42SRoman Gushchin if (*endp != ' ')
109566d60c42SRoman Gushchin return -EINVAL;
109666d60c42SRoman Gushchin buf = endp + 1;
109766d60c42SRoman Gushchin
109866d60c42SRoman Gushchin cfd = simple_strtoul(buf, &endp, 10);
1099046667c4SAl Viro if (*endp == '\0')
1100046667c4SAl Viro buf = endp;
1101046667c4SAl Viro else if (*endp == ' ')
110266d60c42SRoman Gushchin buf = endp + 1;
1103046667c4SAl Viro else
1104046667c4SAl Viro return -EINVAL;
110566d60c42SRoman Gushchin
11067133dd5aSAl Viro CLASS(fd, efile)(efd);
11077133dd5aSAl Viro if (fd_empty(efile))
11087133dd5aSAl Viro return -EBADF;
11097133dd5aSAl Viro
11107133dd5aSAl Viro CLASS(fd, cfile)(cfd);
11117133dd5aSAl Viro
111269050f8dSKees Cook event = kzalloc_obj(*event, GFP_KERNEL_ACCOUNT);
111366d60c42SRoman Gushchin if (!event)
111466d60c42SRoman Gushchin return -ENOMEM;
111566d60c42SRoman Gushchin
111666d60c42SRoman Gushchin event->memcg = memcg;
111766d60c42SRoman Gushchin INIT_LIST_HEAD(&event->list);
111866d60c42SRoman Gushchin init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
111966d60c42SRoman Gushchin init_waitqueue_func_entry(&event->wait, memcg_event_wake);
112066d60c42SRoman Gushchin INIT_WORK(&event->remove, memcg_event_remove);
112166d60c42SRoman Gushchin
11221da91ea8SAl Viro event->eventfd = eventfd_ctx_fileget(fd_file(efile));
112366d60c42SRoman Gushchin if (IS_ERR(event->eventfd)) {
112466d60c42SRoman Gushchin ret = PTR_ERR(event->eventfd);
11257133dd5aSAl Viro goto out_kfree;
112666d60c42SRoman Gushchin }
112766d60c42SRoman Gushchin
11287133dd5aSAl Viro if (fd_empty(cfile)) {
112966d60c42SRoman Gushchin ret = -EBADF;
113066d60c42SRoman Gushchin goto out_put_eventfd;
113166d60c42SRoman Gushchin }
113266d60c42SRoman Gushchin
113366d60c42SRoman Gushchin /* the process need read permission on control file */
113466d60c42SRoman Gushchin /* AV: shouldn't we check that it's been opened for read instead? */
11351da91ea8SAl Viro ret = file_permission(fd_file(cfile), MAY_READ);
113666d60c42SRoman Gushchin if (ret < 0)
11377133dd5aSAl Viro goto out_put_eventfd;
113866d60c42SRoman Gushchin
113966d60c42SRoman Gushchin /*
114066d60c42SRoman Gushchin * The control file must be a regular cgroup1 file. As a regular cgroup
114166d60c42SRoman Gushchin * file can't be renamed, it's safe to access its name afterwards.
114266d60c42SRoman Gushchin */
11431da91ea8SAl Viro cdentry = fd_file(cfile)->f_path.dentry;
114466d60c42SRoman Gushchin if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
114566d60c42SRoman Gushchin ret = -EINVAL;
11467133dd5aSAl Viro goto out_put_eventfd;
114766d60c42SRoman Gushchin }
114866d60c42SRoman Gushchin
114966d60c42SRoman Gushchin /*
115066d60c42SRoman Gushchin * Determine the event callbacks and set them in @event. This used
115166d60c42SRoman Gushchin * to be done via struct cftype but cgroup core no longer knows
115266d60c42SRoman Gushchin * about these events. The following is crude but the whole thing
115366d60c42SRoman Gushchin * is for compatibility anyway.
115466d60c42SRoman Gushchin *
115566d60c42SRoman Gushchin * DO NOT ADD NEW FILES.
115666d60c42SRoman Gushchin */
115766d60c42SRoman Gushchin name = cdentry->d_name.name;
115866d60c42SRoman Gushchin
115966d60c42SRoman Gushchin if (!strcmp(name, "memory.usage_in_bytes")) {
116066d60c42SRoman Gushchin event->register_event = mem_cgroup_usage_register_event;
116166d60c42SRoman Gushchin event->unregister_event = mem_cgroup_usage_unregister_event;
116266d60c42SRoman Gushchin } else if (!strcmp(name, "memory.oom_control")) {
11636df4ad70SShakeel Butt pr_warn_once("oom_control is deprecated and will be removed. "
11646df4ad70SShakeel Butt "Please report your usecase to linux-mm-@kvack.org"
11656df4ad70SShakeel Butt " if you depend on this functionality.\n");
116666d60c42SRoman Gushchin event->register_event = mem_cgroup_oom_register_event;
116766d60c42SRoman Gushchin event->unregister_event = mem_cgroup_oom_unregister_event;
116866d60c42SRoman Gushchin } else if (!strcmp(name, "memory.pressure_level")) {
1169340afb80SShakeel Butt pr_warn_once("pressure_level is deprecated and will be removed. "
1170340afb80SShakeel Butt "Please report your usecase to linux-mm-@kvack.org "
1171340afb80SShakeel Butt "if you depend on this functionality.\n");
117266d60c42SRoman Gushchin event->register_event = vmpressure_register_event;
117366d60c42SRoman Gushchin event->unregister_event = vmpressure_unregister_event;
117466d60c42SRoman Gushchin } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
117566d60c42SRoman Gushchin event->register_event = memsw_cgroup_usage_register_event;
117666d60c42SRoman Gushchin event->unregister_event = memsw_cgroup_usage_unregister_event;
117766d60c42SRoman Gushchin } else {
117866d60c42SRoman Gushchin ret = -EINVAL;
11797133dd5aSAl Viro goto out_put_eventfd;
118066d60c42SRoman Gushchin }
118166d60c42SRoman Gushchin
118266d60c42SRoman Gushchin /*
118366d60c42SRoman Gushchin * Verify @cfile should belong to @css. Also, remaining events are
118466d60c42SRoman Gushchin * automatically removed on cgroup destruction but the removal is
118566d60c42SRoman Gushchin * asynchronous, so take an extra ref on @css.
118666d60c42SRoman Gushchin */
118766d60c42SRoman Gushchin cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
118866d60c42SRoman Gushchin &memory_cgrp_subsys);
118966d60c42SRoman Gushchin ret = -EINVAL;
119066d60c42SRoman Gushchin if (IS_ERR(cfile_css))
11917133dd5aSAl Viro goto out_put_eventfd;
11927133dd5aSAl Viro if (cfile_css != css)
11937133dd5aSAl Viro goto out_put_css;
119466d60c42SRoman Gushchin
119566d60c42SRoman Gushchin ret = event->register_event(memcg, event->eventfd, buf);
119666d60c42SRoman Gushchin if (ret)
119766d60c42SRoman Gushchin goto out_put_css;
119866d60c42SRoman Gushchin
11991da91ea8SAl Viro vfs_poll(fd_file(efile), &event->pt);
120066d60c42SRoman Gushchin
120166d60c42SRoman Gushchin spin_lock_irq(&memcg->event_list_lock);
120266d60c42SRoman Gushchin list_add(&event->list, &memcg->event_list);
120366d60c42SRoman Gushchin spin_unlock_irq(&memcg->event_list_lock);
120466d60c42SRoman Gushchin return nbytes;
120566d60c42SRoman Gushchin
120666d60c42SRoman Gushchin out_put_css:
12077133dd5aSAl Viro css_put(cfile_css);
120866d60c42SRoman Gushchin out_put_eventfd:
120966d60c42SRoman Gushchin eventfd_ctx_put(event->eventfd);
121066d60c42SRoman Gushchin out_kfree:
121166d60c42SRoman Gushchin kfree(event);
121266d60c42SRoman Gushchin return ret;
121366d60c42SRoman Gushchin }
121466d60c42SRoman Gushchin
memcg1_memcg_init(struct mem_cgroup * memcg)1215b5855a26SRoman Gushchin void memcg1_memcg_init(struct mem_cgroup *memcg)
1216b5855a26SRoman Gushchin {
1217b5855a26SRoman Gushchin INIT_LIST_HEAD(&memcg->oom_notify);
1218b5855a26SRoman Gushchin mutex_init(&memcg->thresholds_lock);
1219b5855a26SRoman Gushchin INIT_LIST_HEAD(&memcg->event_list);
1220b5855a26SRoman Gushchin spin_lock_init(&memcg->event_list_lock);
1221b5855a26SRoman Gushchin }
1222b5855a26SRoman Gushchin
memcg1_css_offline(struct mem_cgroup * memcg)122366d60c42SRoman Gushchin void memcg1_css_offline(struct mem_cgroup *memcg)
122466d60c42SRoman Gushchin {
122566d60c42SRoman Gushchin struct mem_cgroup_event *event, *tmp;
122666d60c42SRoman Gushchin
122766d60c42SRoman Gushchin /*
122866d60c42SRoman Gushchin * Unregister events and notify userspace.
122966d60c42SRoman Gushchin * Notify userspace about cgroup removing only after rmdir of cgroup
123066d60c42SRoman Gushchin * directory to avoid race between userspace and kernelspace.
123166d60c42SRoman Gushchin */
123266d60c42SRoman Gushchin spin_lock_irq(&memcg->event_list_lock);
123366d60c42SRoman Gushchin list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
123466d60c42SRoman Gushchin list_del_init(&event->list);
123566d60c42SRoman Gushchin schedule_work(&event->remove);
123666d60c42SRoman Gushchin }
123766d60c42SRoman Gushchin spin_unlock_irq(&memcg->event_list_lock);
123866d60c42SRoman Gushchin }
123966d60c42SRoman Gushchin
1240292fc2e0SRoman Gushchin /*
1241292fc2e0SRoman Gushchin * Check OOM-Killer is already running under our hierarchy.
1242292fc2e0SRoman Gushchin * If someone is running, return false.
1243292fc2e0SRoman Gushchin */
mem_cgroup_oom_trylock(struct mem_cgroup * memcg)1244292fc2e0SRoman Gushchin static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1245292fc2e0SRoman Gushchin {
1246292fc2e0SRoman Gushchin struct mem_cgroup *iter, *failed = NULL;
1247292fc2e0SRoman Gushchin
1248292fc2e0SRoman Gushchin spin_lock(&memcg_oom_lock);
1249292fc2e0SRoman Gushchin
1250292fc2e0SRoman Gushchin for_each_mem_cgroup_tree(iter, memcg) {
1251292fc2e0SRoman Gushchin if (iter->oom_lock) {
1252292fc2e0SRoman Gushchin /*
1253292fc2e0SRoman Gushchin * this subtree of our hierarchy is already locked
1254292fc2e0SRoman Gushchin * so we cannot give a lock.
1255292fc2e0SRoman Gushchin */
1256292fc2e0SRoman Gushchin failed = iter;
1257292fc2e0SRoman Gushchin mem_cgroup_iter_break(memcg, iter);
1258292fc2e0SRoman Gushchin break;
12593472f639SKeren Sun }
1260292fc2e0SRoman Gushchin iter->oom_lock = true;
1261292fc2e0SRoman Gushchin }
1262292fc2e0SRoman Gushchin
1263292fc2e0SRoman Gushchin if (failed) {
1264292fc2e0SRoman Gushchin /*
1265292fc2e0SRoman Gushchin * OK, we failed to lock the whole subtree so we have
1266292fc2e0SRoman Gushchin * to clean up what we set up to the failing subtree
1267292fc2e0SRoman Gushchin */
1268292fc2e0SRoman Gushchin for_each_mem_cgroup_tree(iter, memcg) {
1269292fc2e0SRoman Gushchin if (iter == failed) {
1270292fc2e0SRoman Gushchin mem_cgroup_iter_break(memcg, iter);
1271292fc2e0SRoman Gushchin break;
1272292fc2e0SRoman Gushchin }
1273292fc2e0SRoman Gushchin iter->oom_lock = false;
1274292fc2e0SRoman Gushchin }
1275292fc2e0SRoman Gushchin } else
1276292fc2e0SRoman Gushchin mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1277292fc2e0SRoman Gushchin
1278292fc2e0SRoman Gushchin spin_unlock(&memcg_oom_lock);
1279292fc2e0SRoman Gushchin
1280292fc2e0SRoman Gushchin return !failed;
1281292fc2e0SRoman Gushchin }
1282292fc2e0SRoman Gushchin
mem_cgroup_oom_unlock(struct mem_cgroup * memcg)1283292fc2e0SRoman Gushchin static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1284292fc2e0SRoman Gushchin {
1285292fc2e0SRoman Gushchin struct mem_cgroup *iter;
1286292fc2e0SRoman Gushchin
1287292fc2e0SRoman Gushchin spin_lock(&memcg_oom_lock);
1288292fc2e0SRoman Gushchin mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1289292fc2e0SRoman Gushchin for_each_mem_cgroup_tree(iter, memcg)
1290292fc2e0SRoman Gushchin iter->oom_lock = false;
1291292fc2e0SRoman Gushchin spin_unlock(&memcg_oom_lock);
1292292fc2e0SRoman Gushchin }
1293292fc2e0SRoman Gushchin
mem_cgroup_mark_under_oom(struct mem_cgroup * memcg)1294292fc2e0SRoman Gushchin static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1295292fc2e0SRoman Gushchin {
1296292fc2e0SRoman Gushchin struct mem_cgroup *iter;
1297292fc2e0SRoman Gushchin
1298292fc2e0SRoman Gushchin spin_lock(&memcg_oom_lock);
1299292fc2e0SRoman Gushchin for_each_mem_cgroup_tree(iter, memcg)
1300292fc2e0SRoman Gushchin iter->under_oom++;
1301292fc2e0SRoman Gushchin spin_unlock(&memcg_oom_lock);
1302292fc2e0SRoman Gushchin }
1303292fc2e0SRoman Gushchin
mem_cgroup_unmark_under_oom(struct mem_cgroup * memcg)1304292fc2e0SRoman Gushchin static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1305292fc2e0SRoman Gushchin {
1306292fc2e0SRoman Gushchin struct mem_cgroup *iter;
1307292fc2e0SRoman Gushchin
1308292fc2e0SRoman Gushchin /*
1309292fc2e0SRoman Gushchin * Be careful about under_oom underflows because a child memcg
1310292fc2e0SRoman Gushchin * could have been added after mem_cgroup_mark_under_oom.
1311292fc2e0SRoman Gushchin */
1312292fc2e0SRoman Gushchin spin_lock(&memcg_oom_lock);
1313292fc2e0SRoman Gushchin for_each_mem_cgroup_tree(iter, memcg)
1314292fc2e0SRoman Gushchin if (iter->under_oom > 0)
1315292fc2e0SRoman Gushchin iter->under_oom--;
1316292fc2e0SRoman Gushchin spin_unlock(&memcg_oom_lock);
1317292fc2e0SRoman Gushchin }
1318292fc2e0SRoman Gushchin
1319292fc2e0SRoman Gushchin static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1320292fc2e0SRoman Gushchin
1321292fc2e0SRoman Gushchin struct oom_wait_info {
1322292fc2e0SRoman Gushchin struct mem_cgroup *memcg;
1323292fc2e0SRoman Gushchin wait_queue_entry_t wait;
1324292fc2e0SRoman Gushchin };
1325292fc2e0SRoman Gushchin
memcg_oom_wake_function(wait_queue_entry_t * wait,unsigned int mode,int sync,void * arg)1326292fc2e0SRoman Gushchin static int memcg_oom_wake_function(wait_queue_entry_t *wait,
132755c1d6a4SKeren Sun unsigned int mode, int sync, void *arg)
1328292fc2e0SRoman Gushchin {
1329292fc2e0SRoman Gushchin struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1330292fc2e0SRoman Gushchin struct mem_cgroup *oom_wait_memcg;
1331292fc2e0SRoman Gushchin struct oom_wait_info *oom_wait_info;
1332292fc2e0SRoman Gushchin
1333292fc2e0SRoman Gushchin oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1334292fc2e0SRoman Gushchin oom_wait_memcg = oom_wait_info->memcg;
1335292fc2e0SRoman Gushchin
1336292fc2e0SRoman Gushchin if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1337292fc2e0SRoman Gushchin !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1338292fc2e0SRoman Gushchin return 0;
1339292fc2e0SRoman Gushchin return autoremove_wake_function(wait, mode, sync, arg);
1340292fc2e0SRoman Gushchin }
1341292fc2e0SRoman Gushchin
memcg1_oom_recover(struct mem_cgroup * memcg)13428d49b699SRoman Gushchin void memcg1_oom_recover(struct mem_cgroup *memcg)
1343292fc2e0SRoman Gushchin {
1344292fc2e0SRoman Gushchin /*
1345292fc2e0SRoman Gushchin * For the following lockless ->under_oom test, the only required
1346292fc2e0SRoman Gushchin * guarantee is that it must see the state asserted by an OOM when
1347292fc2e0SRoman Gushchin * this function is called as a result of userland actions
1348292fc2e0SRoman Gushchin * triggered by the notification of the OOM. This is trivially
1349292fc2e0SRoman Gushchin * achieved by invoking mem_cgroup_mark_under_oom() before
1350292fc2e0SRoman Gushchin * triggering notification.
1351292fc2e0SRoman Gushchin */
1352292fc2e0SRoman Gushchin if (memcg && memcg->under_oom)
1353292fc2e0SRoman Gushchin __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1354292fc2e0SRoman Gushchin }
1355292fc2e0SRoman Gushchin
1356292fc2e0SRoman Gushchin /**
1357292fc2e0SRoman Gushchin * mem_cgroup_oom_synchronize - complete memcg OOM handling
1358292fc2e0SRoman Gushchin * @handle: actually kill/wait or just clean up the OOM state
1359292fc2e0SRoman Gushchin *
1360292fc2e0SRoman Gushchin * This has to be called at the end of a page fault if the memcg OOM
1361292fc2e0SRoman Gushchin * handler was enabled.
1362292fc2e0SRoman Gushchin *
1363292fc2e0SRoman Gushchin * Memcg supports userspace OOM handling where failed allocations must
1364292fc2e0SRoman Gushchin * sleep on a waitqueue until the userspace task resolves the
1365292fc2e0SRoman Gushchin * situation. Sleeping directly in the charge context with all kinds
1366292fc2e0SRoman Gushchin * of locks held is not a good idea, instead we remember an OOM state
1367292fc2e0SRoman Gushchin * in the task and mem_cgroup_oom_synchronize() has to be called at
1368292fc2e0SRoman Gushchin * the end of the page fault to complete the OOM handling.
1369292fc2e0SRoman Gushchin *
1370292fc2e0SRoman Gushchin * Returns %true if an ongoing memcg OOM situation was detected and
1371292fc2e0SRoman Gushchin * completed, %false otherwise.
1372292fc2e0SRoman Gushchin */
mem_cgroup_oom_synchronize(bool handle)1373292fc2e0SRoman Gushchin bool mem_cgroup_oom_synchronize(bool handle)
1374292fc2e0SRoman Gushchin {
1375292fc2e0SRoman Gushchin struct mem_cgroup *memcg = current->memcg_in_oom;
1376292fc2e0SRoman Gushchin struct oom_wait_info owait;
1377292fc2e0SRoman Gushchin bool locked;
1378292fc2e0SRoman Gushchin
1379292fc2e0SRoman Gushchin /* OOM is global, do not handle */
1380292fc2e0SRoman Gushchin if (!memcg)
1381292fc2e0SRoman Gushchin return false;
1382292fc2e0SRoman Gushchin
1383292fc2e0SRoman Gushchin if (!handle)
1384292fc2e0SRoman Gushchin goto cleanup;
1385292fc2e0SRoman Gushchin
1386292fc2e0SRoman Gushchin owait.memcg = memcg;
1387292fc2e0SRoman Gushchin owait.wait.flags = 0;
1388292fc2e0SRoman Gushchin owait.wait.func = memcg_oom_wake_function;
1389292fc2e0SRoman Gushchin owait.wait.private = current;
1390292fc2e0SRoman Gushchin INIT_LIST_HEAD(&owait.wait.entry);
1391292fc2e0SRoman Gushchin
1392292fc2e0SRoman Gushchin prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1393292fc2e0SRoman Gushchin mem_cgroup_mark_under_oom(memcg);
1394292fc2e0SRoman Gushchin
1395292fc2e0SRoman Gushchin locked = mem_cgroup_oom_trylock(memcg);
1396292fc2e0SRoman Gushchin
1397292fc2e0SRoman Gushchin if (locked)
1398292fc2e0SRoman Gushchin mem_cgroup_oom_notify(memcg);
1399292fc2e0SRoman Gushchin
1400292fc2e0SRoman Gushchin schedule();
1401292fc2e0SRoman Gushchin mem_cgroup_unmark_under_oom(memcg);
1402292fc2e0SRoman Gushchin finish_wait(&memcg_oom_waitq, &owait.wait);
1403292fc2e0SRoman Gushchin
1404292fc2e0SRoman Gushchin if (locked)
1405292fc2e0SRoman Gushchin mem_cgroup_oom_unlock(memcg);
1406292fc2e0SRoman Gushchin cleanup:
1407292fc2e0SRoman Gushchin current->memcg_in_oom = NULL;
1408292fc2e0SRoman Gushchin css_put(&memcg->css);
1409292fc2e0SRoman Gushchin return true;
1410292fc2e0SRoman Gushchin }
1411292fc2e0SRoman Gushchin
1412292fc2e0SRoman Gushchin
/*
 * Prepare cgroup v1 OOM handling for @memcg from the charge path.
 *
 * Returns false when the OOM killer is disabled for @memcg; @locked is not
 * written in that case.  Otherwise returns true with *@locked telling
 * whether the hierarchy OOM lock was taken (to be passed on to
 * memcg1_oom_finish()).
 *
 * We are in the middle of the charge context here, so we don't want to
 * block while potentially sitting on a callstack that holds all kinds of
 * filesystem and mm locks.
 *
 * cgroup1 allows disabling the OOM killer and waiting for outside handling
 * until the charge can succeed; in that case the context is remembered and
 * the task is put to sleep at the end of the page fault, once all locks
 * are released.
 *
 * The in-kernel OOM killer, on the other hand, allows for an async victim
 * memory reclaim (oom_reaper), so we do not rely solely on the OOM victim
 * to make forward progress and the OOM killer can be invoked here.
 *
 * Note that mem_cgroup_out_of_memory might fail to find a victim, in which
 * case we have to bail out from the charge path.
 */
bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
{
	bool got_lock;

	if (READ_ONCE(memcg->oom_kill_disable)) {
		if (!current->in_user_fault)
			return false;

		/* Pin the memcg and remember it for the end of the fault. */
		css_get(&memcg->css);
		current->memcg_in_oom = memcg;
		return false;
	}

	mem_cgroup_mark_under_oom(memcg);

	got_lock = mem_cgroup_oom_trylock(memcg);
	*locked = got_lock;
	if (got_lock)
		mem_cgroup_oom_notify(memcg);

	mem_cgroup_unmark_under_oom(memcg);

	return true;
}
1452292fc2e0SRoman Gushchin
/*
 * Counterpart of memcg1_oom_prepare(): release the hierarchy OOM lock of
 * @memcg if @locked says it was taken.
 */
void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
{
	if (!locked)
		return;

	mem_cgroup_oom_unlock(memcg);
}
1458292fc2e0SRoman Gushchin
1459ea1e8796SRoman Gushchin static DEFINE_MUTEX(memcg_max_mutex);
1460ea1e8796SRoman Gushchin
mem_cgroup_resize_max(struct mem_cgroup * memcg,unsigned long max,bool memsw)1461ea1e8796SRoman Gushchin static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
1462ea1e8796SRoman Gushchin unsigned long max, bool memsw)
1463ea1e8796SRoman Gushchin {
1464ea1e8796SRoman Gushchin bool enlarge = false;
1465ea1e8796SRoman Gushchin bool drained = false;
1466ea1e8796SRoman Gushchin int ret;
1467ea1e8796SRoman Gushchin bool limits_invariant;
1468ea1e8796SRoman Gushchin struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
1469ea1e8796SRoman Gushchin
1470ea1e8796SRoman Gushchin do {
1471ea1e8796SRoman Gushchin if (signal_pending(current)) {
1472ea1e8796SRoman Gushchin ret = -EINTR;
1473ea1e8796SRoman Gushchin break;
1474ea1e8796SRoman Gushchin }
1475ea1e8796SRoman Gushchin
1476ea1e8796SRoman Gushchin mutex_lock(&memcg_max_mutex);
1477ea1e8796SRoman Gushchin /*
1478ea1e8796SRoman Gushchin * Make sure that the new limit (memsw or memory limit) doesn't
1479ea1e8796SRoman Gushchin * break our basic invariant rule memory.max <= memsw.max.
1480ea1e8796SRoman Gushchin */
1481ea1e8796SRoman Gushchin limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
1482ea1e8796SRoman Gushchin max <= memcg->memsw.max;
1483ea1e8796SRoman Gushchin if (!limits_invariant) {
1484ea1e8796SRoman Gushchin mutex_unlock(&memcg_max_mutex);
1485ea1e8796SRoman Gushchin ret = -EINVAL;
1486ea1e8796SRoman Gushchin break;
1487ea1e8796SRoman Gushchin }
1488ea1e8796SRoman Gushchin if (max > counter->max)
1489ea1e8796SRoman Gushchin enlarge = true;
1490ea1e8796SRoman Gushchin ret = page_counter_set_max(counter, max);
1491ea1e8796SRoman Gushchin mutex_unlock(&memcg_max_mutex);
1492ea1e8796SRoman Gushchin
1493ea1e8796SRoman Gushchin if (!ret)
1494ea1e8796SRoman Gushchin break;
1495ea1e8796SRoman Gushchin
1496ea1e8796SRoman Gushchin if (!drained) {
1497ea1e8796SRoman Gushchin drain_all_stock(memcg);
1498ea1e8796SRoman Gushchin drained = true;
1499ea1e8796SRoman Gushchin continue;
1500ea1e8796SRoman Gushchin }
1501ea1e8796SRoman Gushchin
1502ea1e8796SRoman Gushchin if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
150368cd9050SDan Schatzberg memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
1504ea1e8796SRoman Gushchin ret = -EBUSY;
1505ea1e8796SRoman Gushchin break;
1506ea1e8796SRoman Gushchin }
1507ea1e8796SRoman Gushchin } while (true);
1508ea1e8796SRoman Gushchin
1509ea1e8796SRoman Gushchin if (!ret && enlarge)
1510ea1e8796SRoman Gushchin memcg1_oom_recover(memcg);
1511ea1e8796SRoman Gushchin
1512ea1e8796SRoman Gushchin return ret;
1513ea1e8796SRoman Gushchin }
1514ea1e8796SRoman Gushchin
1515ea1e8796SRoman Gushchin /*
1516ea1e8796SRoman Gushchin * Reclaims as many pages from the given memcg as possible.
1517ea1e8796SRoman Gushchin *
1518ea1e8796SRoman Gushchin * Caller is responsible for holding css reference for memcg.
1519ea1e8796SRoman Gushchin */
mem_cgroup_force_empty(struct mem_cgroup * memcg)1520ea1e8796SRoman Gushchin static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
1521ea1e8796SRoman Gushchin {
1522ea1e8796SRoman Gushchin int nr_retries = MAX_RECLAIM_RETRIES;
1523ea1e8796SRoman Gushchin
1524ea1e8796SRoman Gushchin /* we call try-to-free pages for make this cgroup empty */
1525ea1e8796SRoman Gushchin lru_add_drain_all();
1526ea1e8796SRoman Gushchin
1527ea1e8796SRoman Gushchin drain_all_stock(memcg);
1528ea1e8796SRoman Gushchin
1529ea1e8796SRoman Gushchin /* try to free all pages in this cgroup */
1530ea1e8796SRoman Gushchin while (nr_retries && page_counter_read(&memcg->memory)) {
1531ea1e8796SRoman Gushchin if (signal_pending(current))
1532ea1e8796SRoman Gushchin return -EINTR;
1533ea1e8796SRoman Gushchin
1534ea1e8796SRoman Gushchin if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
153568cd9050SDan Schatzberg MEMCG_RECLAIM_MAY_SWAP, NULL))
1536ea1e8796SRoman Gushchin nr_retries--;
1537ea1e8796SRoman Gushchin }
1538ea1e8796SRoman Gushchin
1539ea1e8796SRoman Gushchin return 0;
1540ea1e8796SRoman Gushchin }
1541ea1e8796SRoman Gushchin
mem_cgroup_force_empty_write(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)1542ea1e8796SRoman Gushchin static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
1543ea1e8796SRoman Gushchin char *buf, size_t nbytes,
1544ea1e8796SRoman Gushchin loff_t off)
1545ea1e8796SRoman Gushchin {
1546ea1e8796SRoman Gushchin struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
1547ea1e8796SRoman Gushchin
1548ea1e8796SRoman Gushchin if (mem_cgroup_is_root(memcg))
1549ea1e8796SRoman Gushchin return -EINVAL;
1550ea1e8796SRoman Gushchin return mem_cgroup_force_empty(memcg) ?: nbytes;
1551ea1e8796SRoman Gushchin }
1552ea1e8796SRoman Gushchin
mem_cgroup_hierarchy_read(struct cgroup_subsys_state * css,struct cftype * cft)1553ea1e8796SRoman Gushchin static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
1554ea1e8796SRoman Gushchin struct cftype *cft)
1555ea1e8796SRoman Gushchin {
1556ea1e8796SRoman Gushchin return 1;
1557ea1e8796SRoman Gushchin }
1558ea1e8796SRoman Gushchin
/*
 * Write handler for the hierarchy setting.  Writing 1 is accepted as a
 * no-op; any other value logs a one-time deprecation warning and fails
 * with -EINVAL.
 */
static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
				      struct cftype *cft, u64 val)
{
	if (val != 1) {
		pr_warn_once("Non-hierarchical mode is deprecated. "
			     "Please report your usecase to linux-mm@kvack.org if you "
			     "depend on this functionality.\n");
		return -EINVAL;
	}

	return 0;
}
1571ea1e8796SRoman Gushchin
mem_cgroup_read_u64(struct cgroup_subsys_state * css,struct cftype * cft)1572ea1e8796SRoman Gushchin static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
1573ea1e8796SRoman Gushchin struct cftype *cft)
1574ea1e8796SRoman Gushchin {
1575ea1e8796SRoman Gushchin struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1576ea1e8796SRoman Gushchin struct page_counter *counter;
1577ea1e8796SRoman Gushchin
1578ea1e8796SRoman Gushchin switch (MEMFILE_TYPE(cft->private)) {
1579ea1e8796SRoman Gushchin case _MEM:
1580ea1e8796SRoman Gushchin counter = &memcg->memory;
1581ea1e8796SRoman Gushchin break;
1582ea1e8796SRoman Gushchin case _MEMSWAP:
1583ea1e8796SRoman Gushchin counter = &memcg->memsw;
1584ea1e8796SRoman Gushchin break;
1585ea1e8796SRoman Gushchin case _KMEM:
1586ea1e8796SRoman Gushchin counter = &memcg->kmem;
1587ea1e8796SRoman Gushchin break;
1588ea1e8796SRoman Gushchin case _TCP:
1589ea1e8796SRoman Gushchin counter = &memcg->tcpmem;
1590ea1e8796SRoman Gushchin break;
1591ea1e8796SRoman Gushchin default:
1592ea1e8796SRoman Gushchin BUG();
1593ea1e8796SRoman Gushchin }
1594ea1e8796SRoman Gushchin
1595ea1e8796SRoman Gushchin switch (MEMFILE_ATTR(cft->private)) {
1596ea1e8796SRoman Gushchin case RES_USAGE:
1597ea1e8796SRoman Gushchin if (counter == &memcg->memory)
1598ea1e8796SRoman Gushchin return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
1599ea1e8796SRoman Gushchin if (counter == &memcg->memsw)
1600ea1e8796SRoman Gushchin return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
1601ea1e8796SRoman Gushchin return (u64)page_counter_read(counter) * PAGE_SIZE;
1602ea1e8796SRoman Gushchin case RES_LIMIT:
1603ea1e8796SRoman Gushchin return (u64)counter->max * PAGE_SIZE;
1604ea1e8796SRoman Gushchin case RES_MAX_USAGE:
1605ea1e8796SRoman Gushchin return (u64)counter->watermark * PAGE_SIZE;
1606ea1e8796SRoman Gushchin case RES_FAILCNT:
1607ea1e8796SRoman Gushchin return counter->failcnt;
1608ea1e8796SRoman Gushchin case RES_SOFT_LIMIT:
1609ea1e8796SRoman Gushchin return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
1610ea1e8796SRoman Gushchin default:
1611ea1e8796SRoman Gushchin BUG();
1612ea1e8796SRoman Gushchin }
1613ea1e8796SRoman Gushchin }
1614ea1e8796SRoman Gushchin
1615ea1e8796SRoman Gushchin /*
1616ea1e8796SRoman Gushchin * This function doesn't do anything useful. Its only job is to provide a read
1617ea1e8796SRoman Gushchin * handler for a file so that cgroup_file_mode() will add read permissions.
1618ea1e8796SRoman Gushchin */
mem_cgroup_dummy_seq_show(__always_unused struct seq_file * m,__always_unused void * v)1619ea1e8796SRoman Gushchin static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
1620ea1e8796SRoman Gushchin __always_unused void *v)
1621ea1e8796SRoman Gushchin {
1622ea1e8796SRoman Gushchin return -EINVAL;
1623ea1e8796SRoman Gushchin }
1624ea1e8796SRoman Gushchin
memcg_update_tcp_max(struct mem_cgroup * memcg,unsigned long max)1625ea1e8796SRoman Gushchin static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
1626ea1e8796SRoman Gushchin {
1627ea1e8796SRoman Gushchin int ret;
1628ea1e8796SRoman Gushchin
1629ea1e8796SRoman Gushchin mutex_lock(&memcg_max_mutex);
1630ea1e8796SRoman Gushchin
1631ea1e8796SRoman Gushchin ret = page_counter_set_max(&memcg->tcpmem, max);
1632ea1e8796SRoman Gushchin if (ret)
1633ea1e8796SRoman Gushchin goto out;
1634ea1e8796SRoman Gushchin
1635ea1e8796SRoman Gushchin if (!memcg->tcpmem_active) {
1636ea1e8796SRoman Gushchin /*
1637ea1e8796SRoman Gushchin * The active flag needs to be written after the static_key
1638ea1e8796SRoman Gushchin * update. This is what guarantees that the socket activation
1639ea1e8796SRoman Gushchin * function is the last one to run. See mem_cgroup_sk_alloc()
1640ea1e8796SRoman Gushchin * for details, and note that we don't mark any socket as
1641ea1e8796SRoman Gushchin * belonging to this memcg until that flag is up.
1642ea1e8796SRoman Gushchin *
1643ea1e8796SRoman Gushchin * We need to do this, because static_keys will span multiple
1644ea1e8796SRoman Gushchin * sites, but we can't control their order. If we mark a socket
1645ea1e8796SRoman Gushchin * as accounted, but the accounting functions are not patched in
1646ea1e8796SRoman Gushchin * yet, we'll lose accounting.
1647ea1e8796SRoman Gushchin *
1648ea1e8796SRoman Gushchin * We never race with the readers in mem_cgroup_sk_alloc(),
1649ea1e8796SRoman Gushchin * because when this value change, the code to process it is not
1650ea1e8796SRoman Gushchin * patched in yet.
1651ea1e8796SRoman Gushchin */
1652ea1e8796SRoman Gushchin static_branch_inc(&memcg_sockets_enabled_key);
1653ea1e8796SRoman Gushchin memcg->tcpmem_active = true;
1654ea1e8796SRoman Gushchin }
1655ea1e8796SRoman Gushchin out:
1656ea1e8796SRoman Gushchin mutex_unlock(&memcg_max_mutex);
1657ea1e8796SRoman Gushchin return ret;
1658ea1e8796SRoman Gushchin }
1659ea1e8796SRoman Gushchin
1660ea1e8796SRoman Gushchin /*
1661ea1e8796SRoman Gushchin * The user of this function is...
1662ea1e8796SRoman Gushchin * RES_LIMIT.
1663ea1e8796SRoman Gushchin */
mem_cgroup_write(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)1664ea1e8796SRoman Gushchin static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
1665ea1e8796SRoman Gushchin char *buf, size_t nbytes, loff_t off)
1666ea1e8796SRoman Gushchin {
1667ea1e8796SRoman Gushchin struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
1668ea1e8796SRoman Gushchin unsigned long nr_pages;
1669ea1e8796SRoman Gushchin int ret;
1670ea1e8796SRoman Gushchin
1671ea1e8796SRoman Gushchin buf = strstrip(buf);
1672ea1e8796SRoman Gushchin ret = page_counter_memparse(buf, "-1", &nr_pages);
1673ea1e8796SRoman Gushchin if (ret)
1674ea1e8796SRoman Gushchin return ret;
1675ea1e8796SRoman Gushchin
1676ea1e8796SRoman Gushchin switch (MEMFILE_ATTR(of_cft(of)->private)) {
1677ea1e8796SRoman Gushchin case RES_LIMIT:
1678ea1e8796SRoman Gushchin if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
1679ea1e8796SRoman Gushchin ret = -EINVAL;
1680ea1e8796SRoman Gushchin break;
1681ea1e8796SRoman Gushchin }
1682ea1e8796SRoman Gushchin switch (MEMFILE_TYPE(of_cft(of)->private)) {
1683ea1e8796SRoman Gushchin case _MEM:
1684ea1e8796SRoman Gushchin ret = mem_cgroup_resize_max(memcg, nr_pages, false);
1685ea1e8796SRoman Gushchin break;
1686ea1e8796SRoman Gushchin case _MEMSWAP:
1687ea1e8796SRoman Gushchin ret = mem_cgroup_resize_max(memcg, nr_pages, true);
1688ea1e8796SRoman Gushchin break;
1689ea1e8796SRoman Gushchin case _KMEM:
1690ea1e8796SRoman Gushchin pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
1691ea1e8796SRoman Gushchin "Writing any value to this file has no effect. "
1692ea1e8796SRoman Gushchin "Please report your usecase to linux-mm@kvack.org if you "
1693ea1e8796SRoman Gushchin "depend on this functionality.\n");
1694ea1e8796SRoman Gushchin ret = 0;
1695ea1e8796SRoman Gushchin break;
1696ea1e8796SRoman Gushchin case _TCP:
1697d046ff46SShakeel Butt pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
1698d046ff46SShakeel Butt "Please report your usecase to linux-mm@kvack.org if you "
1699d046ff46SShakeel Butt "depend on this functionality.\n");
1700ea1e8796SRoman Gushchin ret = memcg_update_tcp_max(memcg, nr_pages);
1701ea1e8796SRoman Gushchin break;
1702ea1e8796SRoman Gushchin }
1703ea1e8796SRoman Gushchin break;
1704ea1e8796SRoman Gushchin case RES_SOFT_LIMIT:
1705ea1e8796SRoman Gushchin if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
1706ea1e8796SRoman Gushchin ret = -EOPNOTSUPP;
1707ea1e8796SRoman Gushchin } else {
1708569c4f62SShakeel Butt pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
1709569c4f62SShakeel Butt "Please report your usecase to linux-mm@kvack.org if you "
1710569c4f62SShakeel Butt "depend on this functionality.\n");
1711ea1e8796SRoman Gushchin WRITE_ONCE(memcg->soft_limit, nr_pages);
1712ea1e8796SRoman Gushchin ret = 0;
1713ea1e8796SRoman Gushchin }
1714ea1e8796SRoman Gushchin break;
1715ea1e8796SRoman Gushchin }
1716ea1e8796SRoman Gushchin return ret ?: nbytes;
1717ea1e8796SRoman Gushchin }
1718ea1e8796SRoman Gushchin
mem_cgroup_reset(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)1719ea1e8796SRoman Gushchin static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
1720ea1e8796SRoman Gushchin size_t nbytes, loff_t off)
1721ea1e8796SRoman Gushchin {
1722ea1e8796SRoman Gushchin struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
1723ea1e8796SRoman Gushchin struct page_counter *counter;
1724ea1e8796SRoman Gushchin
1725ea1e8796SRoman Gushchin switch (MEMFILE_TYPE(of_cft(of)->private)) {
1726ea1e8796SRoman Gushchin case _MEM:
1727ea1e8796SRoman Gushchin counter = &memcg->memory;
1728ea1e8796SRoman Gushchin break;
1729ea1e8796SRoman Gushchin case _MEMSWAP:
1730ea1e8796SRoman Gushchin counter = &memcg->memsw;
1731ea1e8796SRoman Gushchin break;
1732ea1e8796SRoman Gushchin case _KMEM:
1733ea1e8796SRoman Gushchin counter = &memcg->kmem;
1734ea1e8796SRoman Gushchin break;
1735ea1e8796SRoman Gushchin case _TCP:
1736ea1e8796SRoman Gushchin counter = &memcg->tcpmem;
1737ea1e8796SRoman Gushchin break;
1738ea1e8796SRoman Gushchin default:
1739ea1e8796SRoman Gushchin BUG();
1740ea1e8796SRoman Gushchin }
1741ea1e8796SRoman Gushchin
1742ea1e8796SRoman Gushchin switch (MEMFILE_ATTR(of_cft(of)->private)) {
1743ea1e8796SRoman Gushchin case RES_MAX_USAGE:
1744ea1e8796SRoman Gushchin page_counter_reset_watermark(counter);
1745ea1e8796SRoman Gushchin break;
1746ea1e8796SRoman Gushchin case RES_FAILCNT:
1747ea1e8796SRoman Gushchin counter->failcnt = 0;
1748ea1e8796SRoman Gushchin break;
1749ea1e8796SRoman Gushchin default:
1750ea1e8796SRoman Gushchin BUG();
1751ea1e8796SRoman Gushchin }
1752ea1e8796SRoman Gushchin
1753ea1e8796SRoman Gushchin return nbytes;
1754ea1e8796SRoman Gushchin }
1755ea1e8796SRoman Gushchin
1756ea1e8796SRoman Gushchin #ifdef CONFIG_NUMA
1757ea1e8796SRoman Gushchin
1758ea1e8796SRoman Gushchin #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
1759ea1e8796SRoman Gushchin #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
1760ea1e8796SRoman Gushchin #define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
1761ea1e8796SRoman Gushchin
mem_cgroup_node_nr_lru_pages(struct mem_cgroup * memcg,int nid,unsigned int lru_mask,bool tree)1762ea1e8796SRoman Gushchin static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1763ea1e8796SRoman Gushchin int nid, unsigned int lru_mask, bool tree)
1764ea1e8796SRoman Gushchin {
1765ea1e8796SRoman Gushchin struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
1766ea1e8796SRoman Gushchin unsigned long nr = 0;
1767ea1e8796SRoman Gushchin enum lru_list lru;
1768ea1e8796SRoman Gushchin
176955c1d6a4SKeren Sun VM_BUG_ON((unsigned int)nid >= nr_node_ids);
1770ea1e8796SRoman Gushchin
1771ea1e8796SRoman Gushchin for_each_lru(lru) {
1772ea1e8796SRoman Gushchin if (!(BIT(lru) & lru_mask))
1773ea1e8796SRoman Gushchin continue;
1774ea1e8796SRoman Gushchin if (tree)
1775ea1e8796SRoman Gushchin nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
1776ea1e8796SRoman Gushchin else
1777ea1e8796SRoman Gushchin nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
1778ea1e8796SRoman Gushchin }
1779ea1e8796SRoman Gushchin return nr;
1780ea1e8796SRoman Gushchin }
1781ea1e8796SRoman Gushchin
/*
 * Sum the memcg-wide LRU page counters of @memcg for every list whose
 * bit is set in @lru_mask.  @tree picks memcg_page_state() over
 * memcg_page_state_local() as the source of each counter.
 */
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
					     unsigned int lru_mask,
					     bool tree)
{
	unsigned long total = 0;
	enum lru_list l;

	for_each_lru(l) {
		unsigned long pages;

		/* Skip lists the caller did not ask for. */
		if (!(lru_mask & BIT(l)))
			continue;

		if (tree)
			pages = memcg_page_state(memcg, NR_LRU_BASE + l);
		else
			pages = memcg_page_state_local(memcg, NR_LRU_BASE + l);

		total += pages;
	}

	return total;
}
1799ea1e8796SRoman Gushchin
memcg_numa_stat_show(struct seq_file * m,void * v)1800ea1e8796SRoman Gushchin static int memcg_numa_stat_show(struct seq_file *m, void *v)
1801ea1e8796SRoman Gushchin {
1802ea1e8796SRoman Gushchin struct numa_stat {
1803ea1e8796SRoman Gushchin const char *name;
1804ea1e8796SRoman Gushchin unsigned int lru_mask;
1805ea1e8796SRoman Gushchin };
1806ea1e8796SRoman Gushchin
1807ea1e8796SRoman Gushchin static const struct numa_stat stats[] = {
1808ea1e8796SRoman Gushchin { "total", LRU_ALL },
1809ea1e8796SRoman Gushchin { "file", LRU_ALL_FILE },
1810ea1e8796SRoman Gushchin { "anon", LRU_ALL_ANON },
1811ea1e8796SRoman Gushchin { "unevictable", BIT(LRU_UNEVICTABLE) },
1812ea1e8796SRoman Gushchin };
1813ea1e8796SRoman Gushchin const struct numa_stat *stat;
1814ea1e8796SRoman Gushchin int nid;
1815ea1e8796SRoman Gushchin struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
1816ea1e8796SRoman Gushchin
1817ea1e8796SRoman Gushchin mem_cgroup_flush_stats(memcg);
1818ea1e8796SRoman Gushchin
181961e9210eSAlejandro Colomar for (stat = stats; stat < ARRAY_END(stats); stat++) {
1820ea1e8796SRoman Gushchin seq_printf(m, "%s=%lu", stat->name,
1821ea1e8796SRoman Gushchin mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
1822ea1e8796SRoman Gushchin false));
1823ea1e8796SRoman Gushchin for_each_node_state(nid, N_MEMORY)
1824ea1e8796SRoman Gushchin seq_printf(m, " N%d=%lu", nid,
1825ea1e8796SRoman Gushchin mem_cgroup_node_nr_lru_pages(memcg, nid,
1826ea1e8796SRoman Gushchin stat->lru_mask, false));
1827ea1e8796SRoman Gushchin seq_putc(m, '\n');
1828ea1e8796SRoman Gushchin }
1829ea1e8796SRoman Gushchin
183061e9210eSAlejandro Colomar for (stat = stats; stat < ARRAY_END(stats); stat++) {
1831ea1e8796SRoman Gushchin
1832ea1e8796SRoman Gushchin seq_printf(m, "hierarchical_%s=%lu", stat->name,
1833ea1e8796SRoman Gushchin mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
1834ea1e8796SRoman Gushchin true));
1835ea1e8796SRoman Gushchin for_each_node_state(nid, N_MEMORY)
1836ea1e8796SRoman Gushchin seq_printf(m, " N%d=%lu", nid,
1837ea1e8796SRoman Gushchin mem_cgroup_node_nr_lru_pages(memcg, nid,
1838ea1e8796SRoman Gushchin stat->lru_mask, true));
1839ea1e8796SRoman Gushchin seq_putc(m, '\n');
1840ea1e8796SRoman Gushchin }
1841ea1e8796SRoman Gushchin
1842ea1e8796SRoman Gushchin return 0;
1843ea1e8796SRoman Gushchin }
1844ea1e8796SRoman Gushchin #endif /* CONFIG_NUMA */
1845ea1e8796SRoman Gushchin
1846ea1e8796SRoman Gushchin static const unsigned int memcg1_stats[] = {
1847ea1e8796SRoman Gushchin NR_FILE_PAGES,
1848ea1e8796SRoman Gushchin NR_ANON_MAPPED,
1849ea1e8796SRoman Gushchin #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1850ea1e8796SRoman Gushchin NR_ANON_THPS,
1851ea1e8796SRoman Gushchin #endif
1852ea1e8796SRoman Gushchin NR_SHMEM,
1853ea1e8796SRoman Gushchin NR_FILE_MAPPED,
1854ea1e8796SRoman Gushchin NR_FILE_DIRTY,
1855ea1e8796SRoman Gushchin NR_WRITEBACK,
1856ea1e8796SRoman Gushchin WORKINGSET_REFAULT_ANON,
1857ea1e8796SRoman Gushchin WORKINGSET_REFAULT_FILE,
1858ea1e8796SRoman Gushchin #ifdef CONFIG_SWAP
1859ea1e8796SRoman Gushchin MEMCG_SWAP,
1860ea1e8796SRoman Gushchin NR_SWAPCACHE,
1861ea1e8796SRoman Gushchin #endif
1862ea1e8796SRoman Gushchin };
1863ea1e8796SRoman Gushchin
/*
 * Output names for the items in memcg1_stats[], in the same order and
 * under the same config conditionals.  memcg1_stat_format() asserts at
 * build time that both tables have the same number of entries.
 */
static const char *const memcg1_stat_names[] = {
	"cache",
	"rss",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"rss_huge",
#endif
	"shmem",
	"mapped_file",
	"dirty",
	"writeback",
	"workingset_refault_anon",
	"workingset_refault_file",
#ifdef CONFIG_SWAP
	"swap",
	"swapcached",
#endif
};
1881ea1e8796SRoman Gushchin
1882ea1e8796SRoman Gushchin /* Universal VM events cgroup1 shows, original sort order */
1883ea1e8796SRoman Gushchin static const unsigned int memcg1_events[] = {
1884ea1e8796SRoman Gushchin PGPGIN,
1885ea1e8796SRoman Gushchin PGPGOUT,
1886ea1e8796SRoman Gushchin PGFAULT,
1887ea1e8796SRoman Gushchin PGMAJFAULT,
1888ea1e8796SRoman Gushchin };
1889ea1e8796SRoman Gushchin
reparent_memcg1_state_local(struct mem_cgroup * memcg,struct mem_cgroup * parent)18908285917dSQi Zheng void reparent_memcg1_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
18918285917dSQi Zheng {
18928285917dSQi Zheng int i;
18938285917dSQi Zheng
18948285917dSQi Zheng for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++)
18958285917dSQi Zheng reparent_memcg_state_local(memcg, parent, memcg1_stats[i]);
18968285917dSQi Zheng }
18978285917dSQi Zheng
reparent_memcg1_lruvec_state_local(struct mem_cgroup * memcg,struct mem_cgroup * parent)18988285917dSQi Zheng void reparent_memcg1_lruvec_state_local(struct mem_cgroup *memcg, struct mem_cgroup *parent)
18998285917dSQi Zheng {
19008285917dSQi Zheng int i;
19018285917dSQi Zheng
19028285917dSQi Zheng for (i = 0; i < NR_LRU_LISTS; i++)
19038285917dSQi Zheng reparent_memcg_lruvec_state_local(memcg, parent, i);
19048285917dSQi Zheng }
19058285917dSQi Zheng
memcg1_stat_format(struct mem_cgroup * memcg,struct seq_buf * s)1906ea1e8796SRoman Gushchin void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
1907ea1e8796SRoman Gushchin {
1908ea1e8796SRoman Gushchin unsigned long memory, memsw;
1909ea1e8796SRoman Gushchin struct mem_cgroup *mi;
1910ea1e8796SRoman Gushchin unsigned int i;
1911ea1e8796SRoman Gushchin
1912ea1e8796SRoman Gushchin BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
1913ea1e8796SRoman Gushchin
1914ea1e8796SRoman Gushchin mem_cgroup_flush_stats(memcg);
1915ea1e8796SRoman Gushchin
1916ea1e8796SRoman Gushchin for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1917ea1e8796SRoman Gushchin unsigned long nr;
1918ea1e8796SRoman Gushchin
1919ea1e8796SRoman Gushchin nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
1920ea1e8796SRoman Gushchin seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
1921ea1e8796SRoman Gushchin }
1922ea1e8796SRoman Gushchin
1923ea1e8796SRoman Gushchin for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
1924ea1e8796SRoman Gushchin seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
1925ea1e8796SRoman Gushchin memcg_events_local(memcg, memcg1_events[i]));
1926ea1e8796SRoman Gushchin
1927ea1e8796SRoman Gushchin for (i = 0; i < NR_LRU_LISTS; i++)
1928ea1e8796SRoman Gushchin seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
1929ea1e8796SRoman Gushchin memcg_page_state_local(memcg, NR_LRU_BASE + i) *
1930ea1e8796SRoman Gushchin PAGE_SIZE);
1931ea1e8796SRoman Gushchin
1932ea1e8796SRoman Gushchin /* Hierarchical information */
1933ea1e8796SRoman Gushchin memory = memsw = PAGE_COUNTER_MAX;
1934ea1e8796SRoman Gushchin for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
1935ea1e8796SRoman Gushchin memory = min(memory, READ_ONCE(mi->memory.max));
1936ea1e8796SRoman Gushchin memsw = min(memsw, READ_ONCE(mi->memsw.max));
1937ea1e8796SRoman Gushchin }
1938ea1e8796SRoman Gushchin seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
1939ea1e8796SRoman Gushchin (u64)memory * PAGE_SIZE);
1940ea1e8796SRoman Gushchin seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
1941ea1e8796SRoman Gushchin (u64)memsw * PAGE_SIZE);
1942ea1e8796SRoman Gushchin
1943ea1e8796SRoman Gushchin for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1944ea1e8796SRoman Gushchin unsigned long nr;
1945ea1e8796SRoman Gushchin
1946ea1e8796SRoman Gushchin nr = memcg_page_state_output(memcg, memcg1_stats[i]);
1947ea1e8796SRoman Gushchin seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
1948ea1e8796SRoman Gushchin (u64)nr);
1949ea1e8796SRoman Gushchin }
1950ea1e8796SRoman Gushchin
1951ea1e8796SRoman Gushchin for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
1952ea1e8796SRoman Gushchin seq_buf_printf(s, "total_%s %llu\n",
1953ea1e8796SRoman Gushchin vm_event_name(memcg1_events[i]),
1954ea1e8796SRoman Gushchin (u64)memcg_events(memcg, memcg1_events[i]));
1955ea1e8796SRoman Gushchin
1956ea1e8796SRoman Gushchin for (i = 0; i < NR_LRU_LISTS; i++)
1957ea1e8796SRoman Gushchin seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
1958ea1e8796SRoman Gushchin (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
1959ea1e8796SRoman Gushchin PAGE_SIZE);
1960ea1e8796SRoman Gushchin
1961ea1e8796SRoman Gushchin #ifdef CONFIG_DEBUG_VM
1962ea1e8796SRoman Gushchin {
1963ea1e8796SRoman Gushchin pg_data_t *pgdat;
1964ea1e8796SRoman Gushchin struct mem_cgroup_per_node *mz;
1965ea1e8796SRoman Gushchin unsigned long anon_cost = 0;
1966ea1e8796SRoman Gushchin unsigned long file_cost = 0;
1967ea1e8796SRoman Gushchin
1968ea1e8796SRoman Gushchin for_each_online_pgdat(pgdat) {
1969ea1e8796SRoman Gushchin mz = memcg->nodeinfo[pgdat->node_id];
1970ea1e8796SRoman Gushchin
1971ea1e8796SRoman Gushchin anon_cost += mz->lruvec.anon_cost;
1972ea1e8796SRoman Gushchin file_cost += mz->lruvec.file_cost;
1973ea1e8796SRoman Gushchin }
1974ea1e8796SRoman Gushchin seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
1975ea1e8796SRoman Gushchin seq_buf_printf(s, "file_cost %lu\n", file_cost);
1976ea1e8796SRoman Gushchin }
1977ea1e8796SRoman Gushchin #endif
1978ea1e8796SRoman Gushchin }
1979ea1e8796SRoman Gushchin
mem_cgroup_swappiness_read(struct cgroup_subsys_state * css,struct cftype * cft)1980ea1e8796SRoman Gushchin static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
1981ea1e8796SRoman Gushchin struct cftype *cft)
1982ea1e8796SRoman Gushchin {
1983ea1e8796SRoman Gushchin struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1984ea1e8796SRoman Gushchin
1985ea1e8796SRoman Gushchin return mem_cgroup_swappiness(memcg);
1986ea1e8796SRoman Gushchin }
1987ea1e8796SRoman Gushchin
/*
 * write_u64 handler for the "swappiness" file.
 *
 * Values above MAX_SWAPPINESS are rejected with -EINVAL.  A write to the
 * root memcg updates the global vm_swappiness; a write to any other memcg
 * updates that memcg's own swappiness and logs, once, a hint that the
 * knob does not exist in cgroup v2.
 *
 * Returns 0 on success.
 */
static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val > MAX_SWAPPINESS)
		return -EINVAL;

	if (!mem_cgroup_is_root(memcg)) {
		/* The message must end at the newline: no trailing blank. */
		pr_info_once("Per memcg swappiness does not exist in cgroup v2. "
			     "See memory.reclaim or memory.swap.max there\n");
		WRITE_ONCE(memcg->swappiness, val);
	} else {
		WRITE_ONCE(vm_swappiness, val);
	}

	return 0;
}
2005ea1e8796SRoman Gushchin
mem_cgroup_oom_control_read(struct seq_file * sf,void * v)2006ea1e8796SRoman Gushchin static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
2007ea1e8796SRoman Gushchin {
2008ea1e8796SRoman Gushchin struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
2009ea1e8796SRoman Gushchin
2010ea1e8796SRoman Gushchin seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
2011ea1e8796SRoman Gushchin seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
2012ea1e8796SRoman Gushchin seq_printf(sf, "oom_kill %lu\n",
2013ea1e8796SRoman Gushchin atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
2014ea1e8796SRoman Gushchin return 0;
2015ea1e8796SRoman Gushchin }
2016ea1e8796SRoman Gushchin
/*
 * write_u64 handler for the "oom_control" file.
 *
 * Every write logs a one-time deprecation warning.  The value must be 0
 * or 1 and the target must not be the root memcg, otherwise -EINVAL is
 * returned.  The accepted value is stored in oom_kill_disable; writing 0
 * additionally calls memcg1_oom_recover().
 *
 * Returns 0 on success.
 */
static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	pr_warn_once("oom_control is deprecated and will be removed. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	/* cannot set to root cgroup and only 0 and 1 are allowed */
	if (mem_cgroup_is_root(memcg) || val > 1)
		return -EINVAL;

	WRITE_ONCE(memcg->oom_kill_disable, val);
	if (!val)
		memcg1_oom_recover(memcg);

	return 0;
}
2036ea1e8796SRoman Gushchin
20373a3b7fecSJohannes Weiner #ifdef CONFIG_SLUB_DEBUG
/*
 * seq_file show handler for "kmem.slabinfo".  The file is deprecated and
 * intentionally prints nothing; tools/cgroup/memcg_slabinfo.py is the
 * suggested replacement.
 */
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
	return 0;
}
2046ea1e8796SRoman Gushchin #endif
2047ea1e8796SRoman Gushchin
2048ea1e8796SRoman Gushchin struct cftype mem_cgroup_legacy_files[] = {
2049ea1e8796SRoman Gushchin {
2050ea1e8796SRoman Gushchin .name = "usage_in_bytes",
2051ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2052ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2053ea1e8796SRoman Gushchin },
2054ea1e8796SRoman Gushchin {
2055ea1e8796SRoman Gushchin .name = "max_usage_in_bytes",
2056ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
2057ea1e8796SRoman Gushchin .write = mem_cgroup_reset,
2058ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2059ea1e8796SRoman Gushchin },
2060ea1e8796SRoman Gushchin {
2061ea1e8796SRoman Gushchin .name = "limit_in_bytes",
2062ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
2063ea1e8796SRoman Gushchin .write = mem_cgroup_write,
2064ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2065ea1e8796SRoman Gushchin },
2066ea1e8796SRoman Gushchin {
2067ea1e8796SRoman Gushchin .name = "soft_limit_in_bytes",
2068ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2069ea1e8796SRoman Gushchin .write = mem_cgroup_write,
2070ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2071ea1e8796SRoman Gushchin },
2072ea1e8796SRoman Gushchin {
2073ea1e8796SRoman Gushchin .name = "failcnt",
2074ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2075ea1e8796SRoman Gushchin .write = mem_cgroup_reset,
2076ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2077ea1e8796SRoman Gushchin },
2078ea1e8796SRoman Gushchin {
2079ea1e8796SRoman Gushchin .name = "stat",
2080ea1e8796SRoman Gushchin .seq_show = memory_stat_show,
2081ea1e8796SRoman Gushchin },
2082ea1e8796SRoman Gushchin {
2083ea1e8796SRoman Gushchin .name = "force_empty",
2084ea1e8796SRoman Gushchin .write = mem_cgroup_force_empty_write,
2085ea1e8796SRoman Gushchin },
2086ea1e8796SRoman Gushchin {
2087ea1e8796SRoman Gushchin .name = "use_hierarchy",
2088ea1e8796SRoman Gushchin .write_u64 = mem_cgroup_hierarchy_write,
2089ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_hierarchy_read,
2090ea1e8796SRoman Gushchin },
2091ea1e8796SRoman Gushchin {
2092ea1e8796SRoman Gushchin .name = "cgroup.event_control", /* XXX: for compat */
2093ea1e8796SRoman Gushchin .write = memcg_write_event_control,
209472797d21SStanislav Fort .flags = CFTYPE_NO_PREFIX,
2095ea1e8796SRoman Gushchin },
2096ea1e8796SRoman Gushchin {
2097ea1e8796SRoman Gushchin .name = "swappiness",
2098ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_swappiness_read,
2099ea1e8796SRoman Gushchin .write_u64 = mem_cgroup_swappiness_write,
2100ea1e8796SRoman Gushchin },
2101ea1e8796SRoman Gushchin {
2102ea1e8796SRoman Gushchin .name = "move_charge_at_immigrate",
2103ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_move_charge_read,
2104ea1e8796SRoman Gushchin .write_u64 = mem_cgroup_move_charge_write,
2105ea1e8796SRoman Gushchin },
2106ea1e8796SRoman Gushchin {
2107ea1e8796SRoman Gushchin .name = "oom_control",
2108ea1e8796SRoman Gushchin .seq_show = mem_cgroup_oom_control_read,
2109ea1e8796SRoman Gushchin .write_u64 = mem_cgroup_oom_control_write,
2110ea1e8796SRoman Gushchin },
2111ea1e8796SRoman Gushchin {
2112ea1e8796SRoman Gushchin .name = "pressure_level",
2113ea1e8796SRoman Gushchin .seq_show = mem_cgroup_dummy_seq_show,
2114ea1e8796SRoman Gushchin },
2115ea1e8796SRoman Gushchin #ifdef CONFIG_NUMA
2116ea1e8796SRoman Gushchin {
2117ea1e8796SRoman Gushchin .name = "numa_stat",
2118ea1e8796SRoman Gushchin .seq_show = memcg_numa_stat_show,
2119ea1e8796SRoman Gushchin },
2120ea1e8796SRoman Gushchin #endif
2121ea1e8796SRoman Gushchin {
2122ea1e8796SRoman Gushchin .name = "kmem.limit_in_bytes",
2123ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
2124ea1e8796SRoman Gushchin .write = mem_cgroup_write,
2125ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2126ea1e8796SRoman Gushchin },
2127ea1e8796SRoman Gushchin {
2128ea1e8796SRoman Gushchin .name = "kmem.usage_in_bytes",
2129ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
2130ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2131ea1e8796SRoman Gushchin },
2132ea1e8796SRoman Gushchin {
2133ea1e8796SRoman Gushchin .name = "kmem.failcnt",
2134ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
2135ea1e8796SRoman Gushchin .write = mem_cgroup_reset,
2136ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2137ea1e8796SRoman Gushchin },
2138ea1e8796SRoman Gushchin {
2139ea1e8796SRoman Gushchin .name = "kmem.max_usage_in_bytes",
2140ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
2141ea1e8796SRoman Gushchin .write = mem_cgroup_reset,
2142ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2143ea1e8796SRoman Gushchin },
21443a3b7fecSJohannes Weiner #ifdef CONFIG_SLUB_DEBUG
2145ea1e8796SRoman Gushchin {
2146ea1e8796SRoman Gushchin .name = "kmem.slabinfo",
2147ea1e8796SRoman Gushchin .seq_show = mem_cgroup_slab_show,
2148ea1e8796SRoman Gushchin },
2149ea1e8796SRoman Gushchin #endif
2150ea1e8796SRoman Gushchin {
2151ea1e8796SRoman Gushchin .name = "kmem.tcp.limit_in_bytes",
2152ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
2153ea1e8796SRoman Gushchin .write = mem_cgroup_write,
2154ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2155ea1e8796SRoman Gushchin },
2156ea1e8796SRoman Gushchin {
2157ea1e8796SRoman Gushchin .name = "kmem.tcp.usage_in_bytes",
2158ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
2159ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2160ea1e8796SRoman Gushchin },
2161ea1e8796SRoman Gushchin {
2162ea1e8796SRoman Gushchin .name = "kmem.tcp.failcnt",
2163ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
2164ea1e8796SRoman Gushchin .write = mem_cgroup_reset,
2165ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2166ea1e8796SRoman Gushchin },
2167ea1e8796SRoman Gushchin {
2168ea1e8796SRoman Gushchin .name = "kmem.tcp.max_usage_in_bytes",
2169ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
2170ea1e8796SRoman Gushchin .write = mem_cgroup_reset,
2171ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2172ea1e8796SRoman Gushchin },
2173ea1e8796SRoman Gushchin { }, /* terminate */
2174ea1e8796SRoman Gushchin };
2175ea1e8796SRoman Gushchin
2176ea1e8796SRoman Gushchin struct cftype memsw_files[] = {
2177ea1e8796SRoman Gushchin {
2178ea1e8796SRoman Gushchin .name = "memsw.usage_in_bytes",
2179ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
2180ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2181ea1e8796SRoman Gushchin },
2182ea1e8796SRoman Gushchin {
2183ea1e8796SRoman Gushchin .name = "memsw.max_usage_in_bytes",
2184ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
2185ea1e8796SRoman Gushchin .write = mem_cgroup_reset,
2186ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2187ea1e8796SRoman Gushchin },
2188ea1e8796SRoman Gushchin {
2189ea1e8796SRoman Gushchin .name = "memsw.limit_in_bytes",
2190ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
2191ea1e8796SRoman Gushchin .write = mem_cgroup_write,
2192ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2193ea1e8796SRoman Gushchin },
2194ea1e8796SRoman Gushchin {
2195ea1e8796SRoman Gushchin .name = "memsw.failcnt",
2196ea1e8796SRoman Gushchin .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
2197ea1e8796SRoman Gushchin .write = mem_cgroup_reset,
2198ea1e8796SRoman Gushchin .read_u64 = mem_cgroup_read_u64,
2199ea1e8796SRoman Gushchin },
2200ea1e8796SRoman Gushchin { }, /* terminate */
2201ea1e8796SRoman Gushchin };
2202ea1e8796SRoman Gushchin
memcg1_account_kmem(struct mem_cgroup * memcg,int nr_pages)220304fbe921SRoman Gushchin void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages)
220404fbe921SRoman Gushchin {
220504fbe921SRoman Gushchin if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
220604fbe921SRoman Gushchin if (nr_pages > 0)
220704fbe921SRoman Gushchin page_counter_charge(&memcg->kmem, nr_pages);
220804fbe921SRoman Gushchin else
220904fbe921SRoman Gushchin page_counter_uncharge(&memcg->kmem, -nr_pages);
221004fbe921SRoman Gushchin }
221104fbe921SRoman Gushchin }
221204fbe921SRoman Gushchin
memcg1_charge_skmem(struct mem_cgroup * memcg,unsigned int nr_pages,gfp_t gfp_mask)2213773e9ae7SRoman Gushchin bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
2214773e9ae7SRoman Gushchin gfp_t gfp_mask)
2215773e9ae7SRoman Gushchin {
2216773e9ae7SRoman Gushchin struct page_counter *fail;
2217773e9ae7SRoman Gushchin
2218773e9ae7SRoman Gushchin if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
2219773e9ae7SRoman Gushchin memcg->tcpmem_pressure = 0;
2220773e9ae7SRoman Gushchin return true;
2221773e9ae7SRoman Gushchin }
2222773e9ae7SRoman Gushchin memcg->tcpmem_pressure = 1;
2223773e9ae7SRoman Gushchin if (gfp_mask & __GFP_NOFAIL) {
2224773e9ae7SRoman Gushchin page_counter_charge(&memcg->tcpmem, nr_pages);
2225773e9ae7SRoman Gushchin return true;
2226773e9ae7SRoman Gushchin }
2227773e9ae7SRoman Gushchin return false;
2228773e9ae7SRoman Gushchin }
2229773e9ae7SRoman Gushchin
memcg1_alloc_events(struct mem_cgroup * memcg)22300ccaf421SShakeel Butt bool memcg1_alloc_events(struct mem_cgroup *memcg)
22310ccaf421SShakeel Butt {
22320ccaf421SShakeel Butt memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu,
22330ccaf421SShakeel Butt GFP_KERNEL_ACCOUNT);
22340ccaf421SShakeel Butt return !!memcg->events_percpu;
22350ccaf421SShakeel Butt }
22360ccaf421SShakeel Butt
memcg1_free_events(struct mem_cgroup * memcg)22370ccaf421SShakeel Butt void memcg1_free_events(struct mem_cgroup *memcg)
22380ccaf421SShakeel Butt {
22390ccaf421SShakeel Butt free_percpu(memcg->events_percpu);
22400ccaf421SShakeel Butt }
22410ccaf421SShakeel Butt
memcg1_init(void)2242d12f6d22SRoman Gushchin static int __init memcg1_init(void)
2243d12f6d22SRoman Gushchin {
2244d12f6d22SRoman Gushchin int node;
2245d12f6d22SRoman Gushchin
2246d12f6d22SRoman Gushchin for_each_node(node) {
2247d12f6d22SRoman Gushchin struct mem_cgroup_tree_per_node *rtpn;
2248d12f6d22SRoman Gushchin
2249d12f6d22SRoman Gushchin rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
2250d12f6d22SRoman Gushchin
2251d12f6d22SRoman Gushchin rtpn->rb_root = RB_ROOT;
2252d12f6d22SRoman Gushchin rtpn->rb_rightmost = NULL;
2253d12f6d22SRoman Gushchin spin_lock_init(&rtpn->lock);
2254d12f6d22SRoman Gushchin soft_limit_tree.rb_tree_per_node[node] = rtpn;
2255d12f6d22SRoman Gushchin }
2256d12f6d22SRoman Gushchin
2257d12f6d22SRoman Gushchin return 0;
2258d12f6d22SRoman Gushchin }
2259d12f6d22SRoman Gushchin subsys_initcall(memcg1_init);
2260