146aeb7e6SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails.  Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
651da177e4SLinus Torvalds
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful with that.
*/
771da177e4SLinus Torvalds
78b1de0d13SMitchel Humpherys #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79b1de0d13SMitchel Humpherys
#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/memory-tiers.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/leafops.h>
#include <linux/gcd.h>
#include <linux/math64.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
#include <linux/memory.h>

#include "internal.h"
12262695a84SNick Piggin
/*
 * Internal flags, carved out of the MPOL_MF_INTERNAL bit range.
 */
/* Skip checks for contiguous vmas */
#define MPOL_MF_DISCONTIG_OK	(MPOL_MF_INTERNAL << 0)
/* Invert check for nodemask */
#define MPOL_MF_INVERT		(MPOL_MF_INTERNAL << 1)
/* Write-lock walked vmas */
#define MPOL_MF_WRLOCK		(MPOL_MF_INTERNAL << 2)
127dc9aa5b9SChristoph Lameter
128fcc234f8SPekka Enberg static struct kmem_cache *policy_cache;
129fcc234f8SPekka Enberg static struct kmem_cache *sn_cache;
1301da177e4SLinus Torvalds
1311da177e4SLinus Torvalds /* Highest zone. An specific allocation for a zone below that is not
1321da177e4SLinus Torvalds policied. */
1336267276fSChristoph Lameter enum zone_type policy_zone = 0;
1341da177e4SLinus Torvalds
135bea904d5SLee Schermerhorn /*
136bea904d5SLee Schermerhorn * run-time system-wide default policy => local allocation
137bea904d5SLee Schermerhorn */
138e754d79dSH Hartley Sweeten static struct mempolicy default_policy = {
1391da177e4SLinus Torvalds .refcnt = ATOMIC_INIT(1), /* never free it */
1407858d7bcSFeng Tang .mode = MPOL_LOCAL,
1411da177e4SLinus Torvalds };
1421da177e4SLinus Torvalds
1435606e387SMel Gorman static struct mempolicy preferred_node_policy[MAX_NUMNODES];
1445606e387SMel Gorman
145dce41f5aSRakie Kim /*
146e341f9c3SJoshua Hahn * weightiness balances the tradeoff between small weights (cycles through nodes
147e341f9c3SJoshua Hahn * faster, more fair/even distribution) and large weights (smaller errors
148e341f9c3SJoshua Hahn * between actual bandwidth ratios and weight ratios). 32 is a number that has
149e341f9c3SJoshua Hahn * been found to perform at a reasonable compromise between the two goals.
150dce41f5aSRakie Kim */
151e341f9c3SJoshua Hahn static const int weightiness = 32;
152e341f9c3SJoshua Hahn
153e341f9c3SJoshua Hahn /*
154e341f9c3SJoshua Hahn * A null weighted_interleave_state is interpreted as having .mode="auto",
155e341f9c3SJoshua Hahn * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
156e341f9c3SJoshua Hahn */
157e341f9c3SJoshua Hahn struct weighted_interleave_state {
158e341f9c3SJoshua Hahn bool mode_auto;
159e341f9c3SJoshua Hahn u8 iw_table[];
160e341f9c3SJoshua Hahn };
161e341f9c3SJoshua Hahn static struct weighted_interleave_state __rcu *wi_state;
162e341f9c3SJoshua Hahn static unsigned int *node_bw_table;
163e341f9c3SJoshua Hahn
164e341f9c3SJoshua Hahn /*
165e341f9c3SJoshua Hahn * wi_state_lock protects both wi_state and node_bw_table.
166e341f9c3SJoshua Hahn * node_bw_table is only used by writers to update wi_state.
167e341f9c3SJoshua Hahn */
168e341f9c3SJoshua Hahn static DEFINE_MUTEX(wi_state_lock);
169dce41f5aSRakie Kim
get_il_weight(int node)170dce41f5aSRakie Kim static u8 get_il_weight(int node)
171dce41f5aSRakie Kim {
172e341f9c3SJoshua Hahn struct weighted_interleave_state *state;
173e341f9c3SJoshua Hahn u8 weight = 1;
174dce41f5aSRakie Kim
175dce41f5aSRakie Kim rcu_read_lock();
176e341f9c3SJoshua Hahn state = rcu_dereference(wi_state);
177e341f9c3SJoshua Hahn if (state)
178e341f9c3SJoshua Hahn weight = state->iw_table[node];
179dce41f5aSRakie Kim rcu_read_unlock();
180dce41f5aSRakie Kim return weight;
181dce41f5aSRakie Kim }
182dce41f5aSRakie Kim
183e341f9c3SJoshua Hahn /*
184e341f9c3SJoshua Hahn * Convert bandwidth values into weighted interleave weights.
185e341f9c3SJoshua Hahn * Call with wi_state_lock.
186e341f9c3SJoshua Hahn */
reduce_interleave_weights(unsigned int * bw,u8 * new_iw)187e341f9c3SJoshua Hahn static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
188e341f9c3SJoshua Hahn {
189e341f9c3SJoshua Hahn u64 sum_bw = 0;
190e341f9c3SJoshua Hahn unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
191e341f9c3SJoshua Hahn int nid;
192e341f9c3SJoshua Hahn
193e341f9c3SJoshua Hahn for_each_node_state(nid, N_MEMORY)
194e341f9c3SJoshua Hahn sum_bw += bw[nid];
195e341f9c3SJoshua Hahn
196e341f9c3SJoshua Hahn /* Scale bandwidths to whole numbers in the range [1, weightiness] */
197e341f9c3SJoshua Hahn for_each_node_state(nid, N_MEMORY) {
198e341f9c3SJoshua Hahn /*
199e341f9c3SJoshua Hahn * Try not to perform 64-bit division.
200e341f9c3SJoshua Hahn * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
201e341f9c3SJoshua Hahn * If sum_bw > scaling_factor, then round the weight up to 1.
202e341f9c3SJoshua Hahn */
203e341f9c3SJoshua Hahn scaling_factor = weightiness * bw[nid];
204e341f9c3SJoshua Hahn if (bw[nid] && sum_bw < scaling_factor) {
205e341f9c3SJoshua Hahn cast_sum_bw = (unsigned int)sum_bw;
206e341f9c3SJoshua Hahn new_iw[nid] = scaling_factor / cast_sum_bw;
207e341f9c3SJoshua Hahn } else {
208e341f9c3SJoshua Hahn new_iw[nid] = 1;
209e341f9c3SJoshua Hahn }
210e341f9c3SJoshua Hahn if (!iw_gcd)
211e341f9c3SJoshua Hahn iw_gcd = new_iw[nid];
212e341f9c3SJoshua Hahn iw_gcd = gcd(iw_gcd, new_iw[nid]);
213e341f9c3SJoshua Hahn }
214e341f9c3SJoshua Hahn
215e341f9c3SJoshua Hahn /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
216e341f9c3SJoshua Hahn for_each_node_state(nid, N_MEMORY)
217e341f9c3SJoshua Hahn new_iw[nid] /= iw_gcd;
218e341f9c3SJoshua Hahn }
219e341f9c3SJoshua Hahn
mempolicy_set_node_perf(unsigned int node,struct access_coordinate * coords)220e341f9c3SJoshua Hahn int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
221e341f9c3SJoshua Hahn {
222e341f9c3SJoshua Hahn struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
223e341f9c3SJoshua Hahn unsigned int *old_bw, *new_bw;
224e341f9c3SJoshua Hahn unsigned int bw_val;
225e341f9c3SJoshua Hahn int i;
226e341f9c3SJoshua Hahn
227e341f9c3SJoshua Hahn bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
228e341f9c3SJoshua Hahn new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
229e341f9c3SJoshua Hahn if (!new_bw)
230e341f9c3SJoshua Hahn return -ENOMEM;
231e341f9c3SJoshua Hahn
23232a92f8cSLinus Torvalds new_wi_state = kmalloc_flex(*new_wi_state, iw_table, nr_node_ids);
233e341f9c3SJoshua Hahn if (!new_wi_state) {
234e341f9c3SJoshua Hahn kfree(new_bw);
235e341f9c3SJoshua Hahn return -ENOMEM;
236e341f9c3SJoshua Hahn }
237e341f9c3SJoshua Hahn new_wi_state->mode_auto = true;
238e341f9c3SJoshua Hahn for (i = 0; i < nr_node_ids; i++)
239e341f9c3SJoshua Hahn new_wi_state->iw_table[i] = 1;
240e341f9c3SJoshua Hahn
241e341f9c3SJoshua Hahn /*
242e341f9c3SJoshua Hahn * Update bandwidth info, even in manual mode. That way, when switching
243e341f9c3SJoshua Hahn * to auto mode in the future, iw_table can be overwritten using
244e341f9c3SJoshua Hahn * accurate bw data.
245e341f9c3SJoshua Hahn */
246e341f9c3SJoshua Hahn mutex_lock(&wi_state_lock);
247e341f9c3SJoshua Hahn
248e341f9c3SJoshua Hahn old_bw = node_bw_table;
249e341f9c3SJoshua Hahn if (old_bw)
250e341f9c3SJoshua Hahn memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
251e341f9c3SJoshua Hahn new_bw[node] = bw_val;
252e341f9c3SJoshua Hahn node_bw_table = new_bw;
253e341f9c3SJoshua Hahn
254e341f9c3SJoshua Hahn old_wi_state = rcu_dereference_protected(wi_state,
255e341f9c3SJoshua Hahn lockdep_is_held(&wi_state_lock));
256e341f9c3SJoshua Hahn if (old_wi_state && !old_wi_state->mode_auto) {
257e341f9c3SJoshua Hahn /* Manual mode; skip reducing weights and updating wi_state */
258e341f9c3SJoshua Hahn mutex_unlock(&wi_state_lock);
259e341f9c3SJoshua Hahn kfree(new_wi_state);
260e341f9c3SJoshua Hahn goto out;
261e341f9c3SJoshua Hahn }
262e341f9c3SJoshua Hahn
263e341f9c3SJoshua Hahn /* NULL wi_state assumes auto=true; reduce weights and update wi_state*/
264e341f9c3SJoshua Hahn reduce_interleave_weights(new_bw, new_wi_state->iw_table);
265e341f9c3SJoshua Hahn rcu_assign_pointer(wi_state, new_wi_state);
266e341f9c3SJoshua Hahn
267e341f9c3SJoshua Hahn mutex_unlock(&wi_state_lock);
268e341f9c3SJoshua Hahn if (old_wi_state) {
269e341f9c3SJoshua Hahn synchronize_rcu();
270e341f9c3SJoshua Hahn kfree(old_wi_state);
271e341f9c3SJoshua Hahn }
272e341f9c3SJoshua Hahn out:
273e341f9c3SJoshua Hahn kfree(old_bw);
274e341f9c3SJoshua Hahn return 0;
275e341f9c3SJoshua Hahn }
276e341f9c3SJoshua Hahn
277b2ca916cSDan Williams /**
278b1f099b1SYury Norov * numa_nearest_node - Find nearest node by state
279f6e92f40SKrzysztof Kozlowski * @node: Node id to start the search
280b1f099b1SYury Norov * @state: State to filter the search
281b2ca916cSDan Williams *
282b1f099b1SYury Norov * Lookup the closest node by distance if @nid is not in state.
283dad5b023SRandy Dunlap *
284b1f099b1SYury Norov * Return: this @node if it is in state, otherwise the closest node by distance
285b2ca916cSDan Williams */
numa_nearest_node(int node,unsigned int state)286b1f099b1SYury Norov int numa_nearest_node(int node, unsigned int state)
287b2ca916cSDan Williams {
2884fcbe96eSDan Williams int min_dist = INT_MAX, dist, n, min_node;
289b2ca916cSDan Williams
290b1f099b1SYury Norov if (state >= NR_NODE_STATES)
291b1f099b1SYury Norov return -EINVAL;
292b1f099b1SYury Norov
293b1f099b1SYury Norov if (node == NUMA_NO_NODE || node_state(node, state))
2944fcbe96eSDan Williams return node;
295b2ca916cSDan Williams
296b2ca916cSDan Williams min_node = node;
297b1f099b1SYury Norov for_each_node_state(n, state) {
298b2ca916cSDan Williams dist = node_distance(node, n);
299b2ca916cSDan Williams if (dist < min_dist) {
300b2ca916cSDan Williams min_dist = dist;
301b2ca916cSDan Williams min_node = n;
302b2ca916cSDan Williams }
303b2ca916cSDan Williams }
304b2ca916cSDan Williams
305b2ca916cSDan Williams return min_node;
306b2ca916cSDan Williams }
307b1f099b1SYury Norov EXPORT_SYMBOL_GPL(numa_nearest_node);
308b2ca916cSDan Williams
30916d79f2aSAndrea Righi /**
31016d79f2aSAndrea Righi * nearest_node_nodemask - Find the node in @mask at the nearest distance
31116d79f2aSAndrea Righi * from @node.
31216d79f2aSAndrea Righi *
31316d79f2aSAndrea Righi * @node: a valid node ID to start the search from.
31416d79f2aSAndrea Righi * @mask: a pointer to a nodemask representing the allowed nodes.
31516d79f2aSAndrea Righi *
31616d79f2aSAndrea Righi * This function iterates over all nodes in @mask and calculates the
31716d79f2aSAndrea Righi * distance from the starting @node, then it returns the node ID that is
31816d79f2aSAndrea Righi * the closest to @node, or MAX_NUMNODES if no node is found.
31916d79f2aSAndrea Righi *
32016d79f2aSAndrea Righi * Note that @node must be a valid node ID usable with node_distance(),
32116d79f2aSAndrea Righi * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
32216d79f2aSAndrea Righi * or unexpected behavior.
32316d79f2aSAndrea Righi */
nearest_node_nodemask(int node,nodemask_t * mask)32416d79f2aSAndrea Righi int nearest_node_nodemask(int node, nodemask_t *mask)
32516d79f2aSAndrea Righi {
32616d79f2aSAndrea Righi int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
32716d79f2aSAndrea Righi
32816d79f2aSAndrea Righi for_each_node_mask(n, *mask) {
32916d79f2aSAndrea Righi dist = node_distance(node, n);
33016d79f2aSAndrea Righi if (dist < min_dist) {
33116d79f2aSAndrea Righi min_dist = dist;
33216d79f2aSAndrea Righi min_node = n;
33316d79f2aSAndrea Righi }
33416d79f2aSAndrea Righi }
33516d79f2aSAndrea Righi
33616d79f2aSAndrea Righi return min_node;
33716d79f2aSAndrea Righi }
33816d79f2aSAndrea Righi EXPORT_SYMBOL_GPL(nearest_node_nodemask);
33916d79f2aSAndrea Righi
get_task_policy(struct task_struct * p)34074d2c3a0SOleg Nesterov struct mempolicy *get_task_policy(struct task_struct *p)
3415606e387SMel Gorman {
3425606e387SMel Gorman struct mempolicy *pol = p->mempolicy;
343f15ca78eSOleg Nesterov int node;
3445606e387SMel Gorman
345f15ca78eSOleg Nesterov if (pol)
346f15ca78eSOleg Nesterov return pol;
3475606e387SMel Gorman
348f15ca78eSOleg Nesterov node = numa_node_id();
3491da6f0e1SJianguo Wu if (node != NUMA_NO_NODE) {
3501da6f0e1SJianguo Wu pol = &preferred_node_policy[node];
351f15ca78eSOleg Nesterov /* preferred_node_policy is not initialised early in boot */
352f15ca78eSOleg Nesterov if (pol->mode)
353f15ca78eSOleg Nesterov return pol;
3541da6f0e1SJianguo Wu }
3555606e387SMel Gorman
356f15ca78eSOleg Nesterov return &default_policy;
3575606e387SMel Gorman }
358f634f108SShivank Garg EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm");
3595606e387SMel Gorman
36037012946SDavid Rientjes static const struct mempolicy_operations {
36137012946SDavid Rientjes int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
362213980c0SVlastimil Babka void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
36337012946SDavid Rientjes } mpol_ops[MPOL_MAX];
36437012946SDavid Rientjes
mpol_store_user_nodemask(const struct mempolicy * pol)365f5b087b5SDavid Rientjes static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
366f5b087b5SDavid Rientjes {
3673d702678SJinjiang Tu return pol->flags & MPOL_USER_NODEMASK_FLAGS;
3684c50bc01SDavid Rientjes }
3694c50bc01SDavid Rientjes
/*
 * Compute into @ret the nodemask @orig taken relative to @rel: @orig is
 * first folded to the number of nodes set in @rel with nodes_fold(), and
 * the folded mask is then mapped onto @rel with nodes_onto().
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t folded;

	nodes_fold(folded, *orig, nodes_weight(*rel));
	nodes_onto(*ret, folded, *rel);
}
377f5b087b5SDavid Rientjes
mpol_new_nodemask(struct mempolicy * pol,const nodemask_t * nodes)378be897d48SFeng Tang static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
37937012946SDavid Rientjes {
38037012946SDavid Rientjes if (nodes_empty(*nodes))
38137012946SDavid Rientjes return -EINVAL;
382269fbe72SBen Widawsky pol->nodes = *nodes;
38337012946SDavid Rientjes return 0;
38437012946SDavid Rientjes }
38537012946SDavid Rientjes
mpol_new_preferred(struct mempolicy * pol,const nodemask_t * nodes)38637012946SDavid Rientjes static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
38737012946SDavid Rientjes {
3887858d7bcSFeng Tang if (nodes_empty(*nodes))
3897858d7bcSFeng Tang return -EINVAL;
390269fbe72SBen Widawsky
391269fbe72SBen Widawsky nodes_clear(pol->nodes);
392269fbe72SBen Widawsky node_set(first_node(*nodes), pol->nodes);
39337012946SDavid Rientjes return 0;
39437012946SDavid Rientjes }
39537012946SDavid Rientjes
39658568d2aSMiao Xie /*
39758568d2aSMiao Xie * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
39858568d2aSMiao Xie * any, for the new policy. mpol_new() has already validated the nodes
3997858d7bcSFeng Tang * parameter with respect to the policy mode and flags.
40058568d2aSMiao Xie *
40158568d2aSMiao Xie * Must be called holding task's alloc_lock to protect task's mems_allowed
402c1e8d7c6SMichel Lespinasse * and mempolicy. May also be called holding the mmap_lock for write.
40358568d2aSMiao Xie */
mpol_set_nodemask(struct mempolicy * pol,const nodemask_t * nodes,struct nodemask_scratch * nsc)4044bfc4495SKAMEZAWA Hiroyuki static int mpol_set_nodemask(struct mempolicy *pol,
4054bfc4495SKAMEZAWA Hiroyuki const nodemask_t *nodes, struct nodemask_scratch *nsc)
40658568d2aSMiao Xie {
40758568d2aSMiao Xie int ret;
40858568d2aSMiao Xie
4097858d7bcSFeng Tang /*
4107858d7bcSFeng Tang * Default (pol==NULL) resp. local memory policies are not a
4117858d7bcSFeng Tang * subject of any remapping. They also do not need any special
4127858d7bcSFeng Tang * constructor.
4137858d7bcSFeng Tang */
4147858d7bcSFeng Tang if (!pol || pol->mode == MPOL_LOCAL)
41558568d2aSMiao Xie return 0;
4167858d7bcSFeng Tang
41701f13bd6SLai Jiangshan /* Check N_MEMORY */
4184bfc4495SKAMEZAWA Hiroyuki nodes_and(nsc->mask1,
41901f13bd6SLai Jiangshan cpuset_current_mems_allowed, node_states[N_MEMORY]);
42058568d2aSMiao Xie
42158568d2aSMiao Xie VM_BUG_ON(!nodes);
4227858d7bcSFeng Tang
42358568d2aSMiao Xie if (pol->flags & MPOL_F_RELATIVE_NODES)
4244bfc4495SKAMEZAWA Hiroyuki mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
42558568d2aSMiao Xie else
4264bfc4495SKAMEZAWA Hiroyuki nodes_and(nsc->mask2, *nodes, nsc->mask1);
4274bfc4495SKAMEZAWA Hiroyuki
42858568d2aSMiao Xie if (mpol_store_user_nodemask(pol))
42958568d2aSMiao Xie pol->w.user_nodemask = *nodes;
43058568d2aSMiao Xie else
4317858d7bcSFeng Tang pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
43258568d2aSMiao Xie
4334bfc4495SKAMEZAWA Hiroyuki ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
43458568d2aSMiao Xie return ret;
43558568d2aSMiao Xie }
43658568d2aSMiao Xie
43758568d2aSMiao Xie /*
43858568d2aSMiao Xie * This function just creates a new policy, does some check and simple
43958568d2aSMiao Xie * initialization. You must invoke mpol_set_nodemask() to set nodes.
44058568d2aSMiao Xie */
mpol_new(unsigned short mode,unsigned short flags,nodemask_t * nodes)441028fec41SDavid Rientjes static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
442028fec41SDavid Rientjes nodemask_t *nodes)
4431da177e4SLinus Torvalds {
4441da177e4SLinus Torvalds struct mempolicy *policy;
4451da177e4SLinus Torvalds
4463e1f0645SDavid Rientjes if (mode == MPOL_DEFAULT) {
4473e1f0645SDavid Rientjes if (nodes && !nodes_empty(*nodes))
44837012946SDavid Rientjes return ERR_PTR(-EINVAL);
449d3a71033SLee Schermerhorn return NULL;
45037012946SDavid Rientjes }
4513e1f0645SDavid Rientjes VM_BUG_ON(!nodes);
4523e1f0645SDavid Rientjes
4533e1f0645SDavid Rientjes /*
4543e1f0645SDavid Rientjes * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
4553e1f0645SDavid Rientjes * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
4563e1f0645SDavid Rientjes * All other modes require a valid pointer to a non-empty nodemask.
4573e1f0645SDavid Rientjes */
4583e1f0645SDavid Rientjes if (mode == MPOL_PREFERRED) {
4593e1f0645SDavid Rientjes if (nodes_empty(*nodes)) {
4603e1f0645SDavid Rientjes if (((flags & MPOL_F_STATIC_NODES) ||
4613e1f0645SDavid Rientjes (flags & MPOL_F_RELATIVE_NODES)))
4623e1f0645SDavid Rientjes return ERR_PTR(-EINVAL);
4637858d7bcSFeng Tang
4647858d7bcSFeng Tang mode = MPOL_LOCAL;
4653e1f0645SDavid Rientjes }
466479e2802SPeter Zijlstra } else if (mode == MPOL_LOCAL) {
4678d303e44SPiotr Kwapulinski if (!nodes_empty(*nodes) ||
4688d303e44SPiotr Kwapulinski (flags & MPOL_F_STATIC_NODES) ||
4698d303e44SPiotr Kwapulinski (flags & MPOL_F_RELATIVE_NODES))
470479e2802SPeter Zijlstra return ERR_PTR(-EINVAL);
4713e1f0645SDavid Rientjes } else if (nodes_empty(*nodes))
4723e1f0645SDavid Rientjes return ERR_PTR(-EINVAL);
473c36f6e6dSHugh Dickins
4741da177e4SLinus Torvalds policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
4751da177e4SLinus Torvalds if (!policy)
4761da177e4SLinus Torvalds return ERR_PTR(-ENOMEM);
4771da177e4SLinus Torvalds atomic_set(&policy->refcnt, 1);
47845c4745aSLee Schermerhorn policy->mode = mode;
47937012946SDavid Rientjes policy->flags = flags;
480c6018b4bSAneesh Kumar K.V policy->home_node = NUMA_NO_NODE;
4813e1f0645SDavid Rientjes
48237012946SDavid Rientjes return policy;
48337012946SDavid Rientjes }
48437012946SDavid Rientjes
48552cd3b07SLee Schermerhorn /* Slow path of a mpol destructor. */
__mpol_put(struct mempolicy * pol)486c36f6e6dSHugh Dickins void __mpol_put(struct mempolicy *pol)
48752cd3b07SLee Schermerhorn {
488c36f6e6dSHugh Dickins if (!atomic_dec_and_test(&pol->refcnt))
48952cd3b07SLee Schermerhorn return;
490190a8c48SHao-Yu Yang /*
491190a8c48SHao-Yu Yang * Required to allow mmap_lock_speculative*() access, see for example
492190a8c48SHao-Yu Yang * futex_key_to_node_opt(). All accesses are serialized by mmap_lock,
493190a8c48SHao-Yu Yang * however the speculative lock section unbound by the normal lock
494190a8c48SHao-Yu Yang * boundaries, requiring RCU freeing.
495190a8c48SHao-Yu Yang */
496190a8c48SHao-Yu Yang kfree_rcu(pol, rcu);
49752cd3b07SLee Schermerhorn }
498f634f108SShivank Garg EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm");
49952cd3b07SLee Schermerhorn
/* ->rebind hook for modes without a nodemask to adjust: does nothing. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
	/* Intentionally empty. */
}
50337012946SDavid Rientjes
mpol_rebind_nodemask(struct mempolicy * pol,const nodemask_t * nodes)504213980c0SVlastimil Babka static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
5051d0d2680SDavid Rientjes {
5061d0d2680SDavid Rientjes nodemask_t tmp;
5071d0d2680SDavid Rientjes
50837012946SDavid Rientjes if (pol->flags & MPOL_F_STATIC_NODES)
50937012946SDavid Rientjes nodes_and(tmp, pol->w.user_nodemask, *nodes);
51037012946SDavid Rientjes else if (pol->flags & MPOL_F_RELATIVE_NODES)
51137012946SDavid Rientjes mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
5121d0d2680SDavid Rientjes else {
513269fbe72SBen Widawsky nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
514213980c0SVlastimil Babka *nodes);
51529b190faSzhong jiang pol->w.cpuset_mems_allowed = *nodes;
5161d0d2680SDavid Rientjes }
51737012946SDavid Rientjes
518708c1bbcSMiao Xie if (nodes_empty(tmp))
519708c1bbcSMiao Xie tmp = *nodes;
520708c1bbcSMiao Xie
521269fbe72SBen Widawsky pol->nodes = tmp;
52237012946SDavid Rientjes }
52337012946SDavid Rientjes
mpol_rebind_preferred(struct mempolicy * pol,const nodemask_t * nodes)52437012946SDavid Rientjes static void mpol_rebind_preferred(struct mempolicy *pol,
525213980c0SVlastimil Babka const nodemask_t *nodes)
52637012946SDavid Rientjes {
52737012946SDavid Rientjes pol->w.cpuset_mems_allowed = *nodes;
5281d0d2680SDavid Rientjes }
52937012946SDavid Rientjes
530708c1bbcSMiao Xie /*
531708c1bbcSMiao Xie * mpol_rebind_policy - Migrate a policy to a different set of nodes
532708c1bbcSMiao Xie *
533c1e8d7c6SMichel Lespinasse * Per-vma policies are protected by mmap_lock. Allocations using per-task
534213980c0SVlastimil Babka * policies are protected by task->mems_allowed_seq to prevent a premature
535213980c0SVlastimil Babka * OOM/allocation failure due to parallel nodemask modification.
536708c1bbcSMiao Xie */
mpol_rebind_policy(struct mempolicy * pol,const nodemask_t * newmask)537213980c0SVlastimil Babka static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
53837012946SDavid Rientjes {
539018160adSWang Cheng if (!pol || pol->mode == MPOL_LOCAL)
54037012946SDavid Rientjes return;
5417858d7bcSFeng Tang if (!mpol_store_user_nodemask(pol) &&
54237012946SDavid Rientjes nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
54337012946SDavid Rientjes return;
544708c1bbcSMiao Xie
545213980c0SVlastimil Babka mpol_ops[pol->mode].rebind(pol, newmask);
5461d0d2680SDavid Rientjes }
5471d0d2680SDavid Rientjes
5481d0d2680SDavid Rientjes /*
5491d0d2680SDavid Rientjes * Wrapper for mpol_rebind_policy() that just requires task
5501d0d2680SDavid Rientjes * pointer, and updates task mempolicy.
55158568d2aSMiao Xie *
55258568d2aSMiao Xie * Called with task's alloc_lock held.
5531d0d2680SDavid Rientjes */
mpol_rebind_task(struct task_struct * tsk,const nodemask_t * new)554213980c0SVlastimil Babka void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
5551d0d2680SDavid Rientjes {
556213980c0SVlastimil Babka mpol_rebind_policy(tsk->mempolicy, new);
5571d0d2680SDavid Rientjes }
5581d0d2680SDavid Rientjes
5591d0d2680SDavid Rientjes /*
5601d0d2680SDavid Rientjes * Rebind each vma in mm to new nodemask.
5611d0d2680SDavid Rientjes *
562c1e8d7c6SMichel Lespinasse * Call holding a reference to mm. Takes mm->mmap_lock during call.
5631d0d2680SDavid Rientjes */
mpol_rebind_mm(struct mm_struct * mm,nodemask_t * new)5641d0d2680SDavid Rientjes void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
5651d0d2680SDavid Rientjes {
5661d0d2680SDavid Rientjes struct vm_area_struct *vma;
56766850be5SLiam R. Howlett VMA_ITERATOR(vmi, mm, 0);
5681d0d2680SDavid Rientjes
569d8ed45c5SMichel Lespinasse mmap_write_lock(mm);
5706c21e066SJann Horn for_each_vma(vmi, vma) {
5716c21e066SJann Horn vma_start_write(vma);
572213980c0SVlastimil Babka mpol_rebind_policy(vma->vm_policy, new);
5736c21e066SJann Horn }
574d8ed45c5SMichel Lespinasse mmap_write_unlock(mm);
5751d0d2680SDavid Rientjes }
5761d0d2680SDavid Rientjes
57737012946SDavid Rientjes static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
57837012946SDavid Rientjes [MPOL_DEFAULT] = {
57937012946SDavid Rientjes .rebind = mpol_rebind_default,
58037012946SDavid Rientjes },
58137012946SDavid Rientjes [MPOL_INTERLEAVE] = {
582be897d48SFeng Tang .create = mpol_new_nodemask,
58337012946SDavid Rientjes .rebind = mpol_rebind_nodemask,
58437012946SDavid Rientjes },
58537012946SDavid Rientjes [MPOL_PREFERRED] = {
58637012946SDavid Rientjes .create = mpol_new_preferred,
58737012946SDavid Rientjes .rebind = mpol_rebind_preferred,
58837012946SDavid Rientjes },
58937012946SDavid Rientjes [MPOL_BIND] = {
590be897d48SFeng Tang .create = mpol_new_nodemask,
59137012946SDavid Rientjes .rebind = mpol_rebind_nodemask,
59237012946SDavid Rientjes },
5937858d7bcSFeng Tang [MPOL_LOCAL] = {
5947858d7bcSFeng Tang .rebind = mpol_rebind_default,
5957858d7bcSFeng Tang },
596b27abaccSDave Hansen [MPOL_PREFERRED_MANY] = {
597be897d48SFeng Tang .create = mpol_new_nodemask,
598b27abaccSDave Hansen .rebind = mpol_rebind_preferred,
599b27abaccSDave Hansen },
600fa3bea4eSGregory Price [MPOL_WEIGHTED_INTERLEAVE] = {
601fa3bea4eSGregory Price .create = mpol_new_nodemask,
602fa3bea4eSGregory Price .rebind = mpol_rebind_nodemask,
603fa3bea4eSGregory Price },
60437012946SDavid Rientjes };
60537012946SDavid Rientjes
6061cb5d11aSHugh Dickins static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
607fc301289SChristoph Lameter unsigned long flags);
60872e315f7SHugh Dickins static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
60972e315f7SHugh Dickins pgoff_t ilx, int *nid);
6101a75a6c8SChristoph Lameter
strictly_unmovable(unsigned long flags)6111cb5d11aSHugh Dickins static bool strictly_unmovable(unsigned long flags)
6121cb5d11aSHugh Dickins {
6131cb5d11aSHugh Dickins /*
6141cb5d11aSHugh Dickins * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
6151cb5d11aSHugh Dickins * if any misplaced page is found.
6161cb5d11aSHugh Dickins */
6171cb5d11aSHugh Dickins return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
6181cb5d11aSHugh Dickins MPOL_MF_STRICT;
6191cb5d11aSHugh Dickins }
6201cb5d11aSHugh Dickins
62188c91dc5SHugh Dickins struct migration_mpol { /* for alloc_migration_target_by_mpol() */
62288c91dc5SHugh Dickins struct mempolicy *pol;
62388c91dc5SHugh Dickins pgoff_t ilx;
62488c91dc5SHugh Dickins };
625dc9aa5b9SChristoph Lameter
6266f4576e3SNaoya Horiguchi struct queue_pages {
6276f4576e3SNaoya Horiguchi struct list_head *pagelist;
6286f4576e3SNaoya Horiguchi unsigned long flags;
6296f4576e3SNaoya Horiguchi nodemask_t *nmask;
630f18da660SLi Xinhai unsigned long start;
631f18da660SLi Xinhai unsigned long end;
632f18da660SLi Xinhai struct vm_area_struct *first;
6331cb5d11aSHugh Dickins struct folio *large; /* note last large folio encountered */
6341cb5d11aSHugh Dickins long nr_failed; /* could not be isolated at this time */
6356f4576e3SNaoya Horiguchi };
6366f4576e3SNaoya Horiguchi
63798094945SNaoya Horiguchi /*
638d451b89dSVishal Moola (Oracle) * Check if the folio's nid is in qp->nmask.
63988aaa2a1SNaoya Horiguchi *
64088aaa2a1SNaoya Horiguchi * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
64188aaa2a1SNaoya Horiguchi * in the invert of qp->nmask.
64288aaa2a1SNaoya Horiguchi */
queue_folio_required(struct folio * folio,struct queue_pages * qp)643d451b89dSVishal Moola (Oracle) static inline bool queue_folio_required(struct folio *folio,
64488aaa2a1SNaoya Horiguchi struct queue_pages *qp)
64588aaa2a1SNaoya Horiguchi {
646d451b89dSVishal Moola (Oracle) int nid = folio_nid(folio);
64788aaa2a1SNaoya Horiguchi unsigned long flags = qp->flags;
64888aaa2a1SNaoya Horiguchi
64988aaa2a1SNaoya Horiguchi return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
65088aaa2a1SNaoya Horiguchi }
65188aaa2a1SNaoya Horiguchi
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
	struct queue_pages *qp = walk->private;
	unsigned long move_flags;
	struct folio *folio;

	/* A PMD holding a migration entry cannot be queued: count a failure. */
	if (unlikely(pmd_is_migration_entry(*pmd))) {
		qp->nr_failed++;
		return;
	}

	folio = pmd_folio(*pmd);
	if (is_huge_zero_folio(folio)) {
		walk->action = ACTION_CONTINUE;
		return;
	}

	if (!queue_folio_required(folio, qp))
		return;

	/*
	 * The folio is one the walk is looking for.  It only avoids being
	 * counted as failed if moving was requested, the vma is migratable
	 * and the folio could actually be added to the pagelist.
	 */
	move_flags = qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL);
	if (move_flags && vma_migratable(walk->vma) &&
	    migrate_folio_add(folio, qp->pagelist, qp->flags))
		return;

	qp->nr_failed++;
}
673c8633798SNaoya Horiguchi
67488aaa2a1SNaoya Horiguchi /*
6751cb5d11aSHugh Dickins * Scan through folios, checking if they satisfy the required conditions,
6761cb5d11aSHugh Dickins * moving them from LRU to local pagelist for migration if they do (or not).
677d8835445SYang Shi *
6781cb5d11aSHugh Dickins * queue_folios_pte_range() has two possible return values:
6791cb5d11aSHugh Dickins * 0 - continue walking to scan for more, even if an existing folio on the
6801cb5d11aSHugh Dickins * wrong node could not be isolated and queued for migration.
6811cb5d11aSHugh Dickins * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
6821cb5d11aSHugh Dickins * and an existing folio was on a node that does not follow the policy.
68398094945SNaoya Horiguchi */
queue_folios_pte_range(pmd_t * pmd,unsigned long addr,unsigned long end,struct mm_walk * walk)6843dae02bbSVishal Moola (Oracle) static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
6856f4576e3SNaoya Horiguchi unsigned long end, struct mm_walk *walk)
6861da177e4SLinus Torvalds {
6876f4576e3SNaoya Horiguchi struct vm_area_struct *vma = walk->vma;
6883dae02bbSVishal Moola (Oracle) struct folio *folio;
6896f4576e3SNaoya Horiguchi struct queue_pages *qp = walk->private;
6906f4576e3SNaoya Horiguchi unsigned long flags = qp->flags;
6913f088420SShijie Luo pte_t *pte, *mapped_pte;
692c33c7948SRyan Roberts pte_t ptent;
693705e87c0SHugh Dickins spinlock_t *ptl;
6944a34c584SDev Jain int max_nr, nr;
695941150a3SHugh Dickins
696c8633798SNaoya Horiguchi ptl = pmd_trans_huge_lock(pmd, vma);
6971cb5d11aSHugh Dickins if (ptl) {
6981cb5d11aSHugh Dickins queue_folios_pmd(pmd, walk);
6991cb5d11aSHugh Dickins spin_unlock(ptl);
7001cb5d11aSHugh Dickins goto out;
7011cb5d11aSHugh Dickins }
70291612e0dSHugh Dickins
7033f088420SShijie Luo mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
7047780d040SHugh Dickins if (!pte) {
7057780d040SHugh Dickins walk->action = ACTION_AGAIN;
7067780d040SHugh Dickins return 0;
7077780d040SHugh Dickins }
7084a34c584SDev Jain for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
7094a34c584SDev Jain max_nr = (end - addr) >> PAGE_SHIFT;
7104a34c584SDev Jain nr = 1;
711c33c7948SRyan Roberts ptent = ptep_get(pte);
7121cb5d11aSHugh Dickins if (pte_none(ptent))
71391612e0dSHugh Dickins continue;
7141cb5d11aSHugh Dickins if (!pte_present(ptent)) {
71593976a20SLorenzo Stoakes const softleaf_t entry = softleaf_from_pte(ptent);
71693976a20SLorenzo Stoakes
71793976a20SLorenzo Stoakes if (softleaf_is_migration(entry))
7181cb5d11aSHugh Dickins qp->nr_failed++;
7191cb5d11aSHugh Dickins continue;
7201cb5d11aSHugh Dickins }
721c33c7948SRyan Roberts folio = vm_normal_folio(vma, addr, ptent);
7223dae02bbSVishal Moola (Oracle) if (!folio || folio_is_zone_device(folio))
72391612e0dSHugh Dickins continue;
7244a34c584SDev Jain if (folio_test_large(folio) && max_nr != 1)
725dd80cfd4SDavid Hildenbrand nr = folio_pte_batch(folio, pte, ptent, max_nr);
726053837fcSNick Piggin /*
7273dae02bbSVishal Moola (Oracle) * vm_normal_folio() filters out zero pages, but there might
7283dae02bbSVishal Moola (Oracle) * still be reserved folios to skip, perhaps in a VDSO.
729053837fcSNick Piggin */
7303dae02bbSVishal Moola (Oracle) if (folio_test_reserved(folio))
731f4598c8bSChristoph Lameter continue;
732d451b89dSVishal Moola (Oracle) if (!queue_folio_required(folio, qp))
73338e35860SChristoph Lameter continue;
7341cb5d11aSHugh Dickins if (folio_test_large(folio)) {
73524526268SYang Shi /*
7361cb5d11aSHugh Dickins * A large folio can only be isolated from LRU once,
7371cb5d11aSHugh Dickins * but may be mapped by many PTEs (and Copy-On-Write may
7381cb5d11aSHugh Dickins * intersperse PTEs of other, order 0, folios). This is
7391cb5d11aSHugh Dickins * a common case, so don't mistake it for failure (but
7401cb5d11aSHugh Dickins * there can be other cases of multi-mapped pages which
7411cb5d11aSHugh Dickins * this quick check does not help to filter out - and a
7421cb5d11aSHugh Dickins * search of the pagelist might grow to be prohibitive).
7431cb5d11aSHugh Dickins *
7441cb5d11aSHugh Dickins * migrate_pages(&pagelist) returns nr_failed folios, so
7451cb5d11aSHugh Dickins * check "large" now so that queue_pages_range() returns
7461cb5d11aSHugh Dickins * a comparable nr_failed folios. This does imply that
7471cb5d11aSHugh Dickins * if folio could not be isolated for some racy reason
7481cb5d11aSHugh Dickins * at its first PTE, later PTEs will not give it another
7491cb5d11aSHugh Dickins * chance of isolation; but keeps the accounting simple.
75024526268SYang Shi */
7511cb5d11aSHugh Dickins if (folio == qp->large)
7521cb5d11aSHugh Dickins continue;
7531cb5d11aSHugh Dickins qp->large = folio;
7541cb5d11aSHugh Dickins }
7551cb5d11aSHugh Dickins if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
7561cb5d11aSHugh Dickins !vma_migratable(vma) ||
7571cb5d11aSHugh Dickins !migrate_folio_add(folio, qp->pagelist, flags)) {
7584a34c584SDev Jain qp->nr_failed += nr;
7591cb5d11aSHugh Dickins if (strictly_unmovable(flags))
760a7f40cfeSYang Shi break;
7616f4576e3SNaoya Horiguchi }
7621cb5d11aSHugh Dickins }
7633f088420SShijie Luo pte_unmap_unlock(mapped_pte, ptl);
7646f4576e3SNaoya Horiguchi cond_resched();
7651cb5d11aSHugh Dickins out:
7661cb5d11aSHugh Dickins if (qp->nr_failed && strictly_unmovable(flags))
7671cb5d11aSHugh Dickins return -EIO;
7681cb5d11aSHugh Dickins return 0;
76991612e0dSHugh Dickins }
77091612e0dSHugh Dickins
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	const unsigned long flags = qp->flags;
	struct folio *folio;
	spinlock_t *ptl;
	pte_t ptent;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	ptent = huge_ptep_get(walk->mm, addr, pte);
	if (!pte_present(ptent)) {
		/* Of the non-present entries only migration ones count. */
		if (!huge_pte_none(ptent) &&
		    unlikely(softleaf_is_migration(softleaf_from_pte(ptent))))
			qp->nr_failed++;
		goto unlock;
	}

	folio = pfn_folio(pte_pfn(ptent));
	if (!queue_folio_required(folio, qp))
		goto unlock;

	/* Wanted folio, but moving was not requested or is not possible. */
	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
	    !vma_migratable(walk->vma)) {
		qp->nr_failed++;
		goto unlock;
	}

	/*
	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
	 * Choosing not to migrate a shared folio is not counted as a failure;
	 * only a failed isolation attempt is.
	 *
	 * See folio_maybe_mapped_shared() on possible imprecision when we
	 * cannot easily detect if a folio is shared.
	 */
	if (((flags & MPOL_MF_MOVE_ALL) ||
	     (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) &&
	    !folio_isolate_hugetlb(folio, qp->pagelist))
		qp->nr_failed++;
unlock:
	spin_unlock(ptl);
	if (qp->nr_failed && strictly_unmovable(flags))
		return -EIO;
#endif
	return 0;
}
8201da177e4SLinus Torvalds
8215877231fSAneesh Kumar K.V #ifdef CONFIG_NUMA_BALANCING
822ca43034cSKefeng Wang /**
823ca43034cSKefeng Wang * folio_can_map_prot_numa() - check whether the folio can map prot numa
824ca43034cSKefeng Wang * @folio: The folio whose mapping considered for being made NUMA hintable
825ca43034cSKefeng Wang * @vma: The VMA that the folio belongs to.
826ca43034cSKefeng Wang * @is_private_single_threaded: Is this a single-threaded private VMA or not
827ca43034cSKefeng Wang *
828ca43034cSKefeng Wang * This function checks to see if the folio actually indicates that
829ca43034cSKefeng Wang * we need to make the mapping one which causes a NUMA hinting fault,
830ca43034cSKefeng Wang * as there are cases where it's simply unnecessary, and the folio's
831ca43034cSKefeng Wang * access time is adjusted for memory tiering if prot numa needed.
832ca43034cSKefeng Wang *
833ca43034cSKefeng Wang * Return: True if the mapping of the folio needs to be changed, false otherwise.
834ca43034cSKefeng Wang */
folio_can_map_prot_numa(struct folio * folio,struct vm_area_struct * vma,bool is_private_single_threaded)835ca43034cSKefeng Wang bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
836ca43034cSKefeng Wang bool is_private_single_threaded)
837ca43034cSKefeng Wang {
838ca43034cSKefeng Wang int nid;
839ca43034cSKefeng Wang
840ca43034cSKefeng Wang if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
841ca43034cSKefeng Wang return false;
842ca43034cSKefeng Wang
843ca43034cSKefeng Wang /* Also skip shared copy-on-write folios */
844ca43034cSKefeng Wang if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
845ca43034cSKefeng Wang return false;
846ca43034cSKefeng Wang
847ca43034cSKefeng Wang /* Folios are pinned and can't be migrated */
848ca43034cSKefeng Wang if (folio_maybe_dma_pinned(folio))
849ca43034cSKefeng Wang return false;
850ca43034cSKefeng Wang
851ca43034cSKefeng Wang /*
852ca43034cSKefeng Wang * While migration can move some dirty folios,
853ca43034cSKefeng Wang * it cannot move them all from MIGRATE_ASYNC
854ca43034cSKefeng Wang * context.
855ca43034cSKefeng Wang */
856ca43034cSKefeng Wang if (folio_is_file_lru(folio) && folio_test_dirty(folio))
857ca43034cSKefeng Wang return false;
858ca43034cSKefeng Wang
859ca43034cSKefeng Wang /*
860ca43034cSKefeng Wang * Don't mess with PTEs if folio is already on the node
861ca43034cSKefeng Wang * a single-threaded process is running on.
862ca43034cSKefeng Wang */
863ca43034cSKefeng Wang nid = folio_nid(folio);
864ca43034cSKefeng Wang if (is_private_single_threaded && (nid == numa_node_id()))
865ca43034cSKefeng Wang return false;
866ca43034cSKefeng Wang
867ca43034cSKefeng Wang /*
868ca43034cSKefeng Wang * Skip scanning top tier node if normal numa
869ca43034cSKefeng Wang * balancing is disabled
870ca43034cSKefeng Wang */
871ca43034cSKefeng Wang if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
872ca43034cSKefeng Wang node_is_toptier(nid))
873ca43034cSKefeng Wang return false;
874ca43034cSKefeng Wang
875ca43034cSKefeng Wang if (folio_use_access_time(folio))
876ca43034cSKefeng Wang folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
877ca43034cSKefeng Wang
878ca43034cSKefeng Wang return true;
879ca43034cSKefeng Wang }
880ca43034cSKefeng Wang
881b24f53a0SLee Schermerhorn /*
8824b10e7d5SMel Gorman * This is used to mark a range of virtual addresses to be inaccessible.
8834b10e7d5SMel Gorman * These are later cleared by a NUMA hinting fault. Depending on these
8844b10e7d5SMel Gorman * faults, pages may be migrated for better NUMA placement.
8854b10e7d5SMel Gorman *
8864b10e7d5SMel Gorman * This is assuming that NUMA faults are handled using PROT_NONE. If
8874b10e7d5SMel Gorman * an architecture makes a different choice, it will need further
8884b10e7d5SMel Gorman * changes to the core.
889b24f53a0SLee Schermerhorn */
change_prot_numa(struct vm_area_struct * vma,unsigned long addr,unsigned long end)8904b10e7d5SMel Gorman unsigned long change_prot_numa(struct vm_area_struct *vma,
8914b10e7d5SMel Gorman unsigned long addr, unsigned long end)
892b24f53a0SLee Schermerhorn {
8934a18419fSNadav Amit struct mmu_gather tlb;
894a79390f5SPeter Xu long nr_updated;
895b24f53a0SLee Schermerhorn
8964a18419fSNadav Amit tlb_gather_mmu(&tlb, vma->vm_mm);
8974a18419fSNadav Amit
8981ef488edSDavid Hildenbrand nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
899f77f0c75SKaiyang Zhao if (nr_updated > 0) {
90003c5a6e1SMel Gorman count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
901f77f0c75SKaiyang Zhao count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
902f77f0c75SKaiyang Zhao }
903b24f53a0SLee Schermerhorn
9044a18419fSNadav Amit tlb_finish_mmu(&tlb);
9054a18419fSNadav Amit
9064b10e7d5SMel Gorman return nr_updated;
907b24f53a0SLee Schermerhorn }
9085877231fSAneesh Kumar K.V #endif /* CONFIG_NUMA_BALANCING */
909b24f53a0SLee Schermerhorn
queue_pages_test_walk(unsigned long start,unsigned long end,struct mm_walk * walk)9106f4576e3SNaoya Horiguchi static int queue_pages_test_walk(unsigned long start, unsigned long end,
9116f4576e3SNaoya Horiguchi struct mm_walk *walk)
9121da177e4SLinus Torvalds {
91366850be5SLiam R. Howlett struct vm_area_struct *next, *vma = walk->vma;
9146f4576e3SNaoya Horiguchi struct queue_pages *qp = walk->private;
9156f4576e3SNaoya Horiguchi unsigned long flags = qp->flags;
916dc9aa5b9SChristoph Lameter
917a18b3ac2SLi Xinhai /* range check first */
918ce33135cSMiaohe Lin VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
919f18da660SLi Xinhai
920f18da660SLi Xinhai if (!qp->first) {
921f18da660SLi Xinhai qp->first = vma;
922f18da660SLi Xinhai if (!(flags & MPOL_MF_DISCONTIG_OK) &&
923f18da660SLi Xinhai (qp->start < vma->vm_start))
924f18da660SLi Xinhai /* hole at head side of range */
925a18b3ac2SLi Xinhai return -EFAULT;
926a18b3ac2SLi Xinhai }
92766850be5SLiam R. Howlett next = find_vma(vma->vm_mm, vma->vm_end);
928f18da660SLi Xinhai if (!(flags & MPOL_MF_DISCONTIG_OK) &&
929f18da660SLi Xinhai ((vma->vm_end < qp->end) &&
93066850be5SLiam R. Howlett (!next || vma->vm_end < next->vm_start)))
931f18da660SLi Xinhai /* hole at middle or tail of range */
932f18da660SLi Xinhai return -EFAULT;
933a18b3ac2SLi Xinhai
934a7f40cfeSYang Shi /*
935a7f40cfeSYang Shi * Need check MPOL_MF_STRICT to return -EIO if possible
936a7f40cfeSYang Shi * regardless of vma_migratable
937a7f40cfeSYang Shi */
938a7f40cfeSYang Shi if (!vma_migratable(vma) &&
939a7f40cfeSYang Shi !(flags & MPOL_MF_STRICT))
94048684a65SNaoya Horiguchi return 1;
94148684a65SNaoya Horiguchi
9421cb5d11aSHugh Dickins /*
9431cb5d11aSHugh Dickins * Check page nodes, and queue pages to move, in the current vma.
9441cb5d11aSHugh Dickins * But if no moving, and no strict checking, the scan can be skipped.
9451cb5d11aSHugh Dickins */
9461cb5d11aSHugh Dickins if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
9476f4576e3SNaoya Horiguchi return 0;
9486f4576e3SNaoya Horiguchi return 1;
9496f4576e3SNaoya Horiguchi }
950b24f53a0SLee Schermerhorn
9517b86ac33SChristoph Hellwig static const struct mm_walk_ops queue_pages_walk_ops = {
9520a2c1e81SVishal Moola (Oracle) .hugetlb_entry = queue_folios_hugetlb,
9533dae02bbSVishal Moola (Oracle) .pmd_entry = queue_folios_pte_range,
9547b86ac33SChristoph Hellwig .test_walk = queue_pages_test_walk,
95549b06385SSuren Baghdasaryan .walk_lock = PGWALK_RDLOCK,
95649b06385SSuren Baghdasaryan };
95749b06385SSuren Baghdasaryan
95849b06385SSuren Baghdasaryan static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
95949b06385SSuren Baghdasaryan .hugetlb_entry = queue_folios_hugetlb,
96049b06385SSuren Baghdasaryan .pmd_entry = queue_folios_pte_range,
96149b06385SSuren Baghdasaryan .test_walk = queue_pages_test_walk,
96249b06385SSuren Baghdasaryan .walk_lock = PGWALK_WRLOCK,
9637b86ac33SChristoph Hellwig };
9647b86ac33SChristoph Hellwig
9656f4576e3SNaoya Horiguchi /*
9666f4576e3SNaoya Horiguchi * Walk through page tables and collect pages to be migrated.
9676f4576e3SNaoya Horiguchi *
9681cb5d11aSHugh Dickins * If pages found in a given range are not on the required set of @nodes,
9691cb5d11aSHugh Dickins * and migration is allowed, they are isolated and queued to @pagelist.
970d8835445SYang Shi *
9711cb5d11aSHugh Dickins * queue_pages_range() may return:
9721cb5d11aSHugh Dickins * 0 - all pages already on the right node, or successfully queued for moving
9731cb5d11aSHugh Dickins * (or neither strict checking nor moving requested: only range checking).
9741cb5d11aSHugh Dickins * >0 - this number of misplaced folios could not be queued for moving
9751cb5d11aSHugh Dickins * (a hugetlbfs page or a transparent huge page being counted as 1).
9761cb5d11aSHugh Dickins * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
9771cb5d11aSHugh Dickins * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
9786f4576e3SNaoya Horiguchi */
9791cb5d11aSHugh Dickins static long
queue_pages_range(struct mm_struct * mm,unsigned long start,unsigned long end,nodemask_t * nodes,unsigned long flags,struct list_head * pagelist)9806f4576e3SNaoya Horiguchi queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
9816f4576e3SNaoya Horiguchi nodemask_t *nodes, unsigned long flags,
9821cb5d11aSHugh Dickins struct list_head *pagelist)
9836f4576e3SNaoya Horiguchi {
984f18da660SLi Xinhai int err;
9856f4576e3SNaoya Horiguchi struct queue_pages qp = {
9866f4576e3SNaoya Horiguchi .pagelist = pagelist,
9876f4576e3SNaoya Horiguchi .flags = flags,
9886f4576e3SNaoya Horiguchi .nmask = nodes,
989f18da660SLi Xinhai .start = start,
990f18da660SLi Xinhai .end = end,
991f18da660SLi Xinhai .first = NULL,
9926f4576e3SNaoya Horiguchi };
9931cb5d11aSHugh Dickins const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
99449b06385SSuren Baghdasaryan &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
9956f4576e3SNaoya Horiguchi
99649b06385SSuren Baghdasaryan err = walk_page_range(mm, start, end, ops, &qp);
997f18da660SLi Xinhai
998f18da660SLi Xinhai if (!qp.first)
999f18da660SLi Xinhai /* whole range in hole */
1000f18da660SLi Xinhai err = -EFAULT;
1001f18da660SLi Xinhai
10021cb5d11aSHugh Dickins return err ? : qp.nr_failed;
10031da177e4SLinus Torvalds }
10041da177e4SLinus Torvalds
1005869833f2SKOSAKI Motohiro /*
1006869833f2SKOSAKI Motohiro * Apply policy to a single VMA
1007c1e8d7c6SMichel Lespinasse * This must be called with the mmap_lock held for writing.
1008869833f2SKOSAKI Motohiro */
vma_replace_policy(struct vm_area_struct * vma,struct mempolicy * pol)1009869833f2SKOSAKI Motohiro static int vma_replace_policy(struct vm_area_struct *vma,
1010869833f2SKOSAKI Motohiro struct mempolicy *pol)
10118d34694cSKOSAKI Motohiro {
1012869833f2SKOSAKI Motohiro int err;
1013869833f2SKOSAKI Motohiro struct mempolicy *old;
1014869833f2SKOSAKI Motohiro struct mempolicy *new;
10158d34694cSKOSAKI Motohiro
10166c21e066SJann Horn vma_assert_write_locked(vma);
10176c21e066SJann Horn
1018869833f2SKOSAKI Motohiro new = mpol_dup(pol);
1019869833f2SKOSAKI Motohiro if (IS_ERR(new))
1020869833f2SKOSAKI Motohiro return PTR_ERR(new);
1021869833f2SKOSAKI Motohiro
1022869833f2SKOSAKI Motohiro if (vma->vm_ops && vma->vm_ops->set_policy) {
10238d34694cSKOSAKI Motohiro err = vma->vm_ops->set_policy(vma, new);
1024869833f2SKOSAKI Motohiro if (err)
1025869833f2SKOSAKI Motohiro goto err_out;
10268d34694cSKOSAKI Motohiro }
1027869833f2SKOSAKI Motohiro
1028869833f2SKOSAKI Motohiro old = vma->vm_policy;
1029190a8c48SHao-Yu Yang WRITE_ONCE(vma->vm_policy, new); /* protected by mmap_lock */
1030869833f2SKOSAKI Motohiro mpol_put(old);
1031869833f2SKOSAKI Motohiro
1032869833f2SKOSAKI Motohiro return 0;
1033869833f2SKOSAKI Motohiro err_out:
1034869833f2SKOSAKI Motohiro mpol_put(new);
10358d34694cSKOSAKI Motohiro return err;
10368d34694cSKOSAKI Motohiro }
10378d34694cSKOSAKI Motohiro
1038f4e9e0e6SLiam R. Howlett /* Split or merge the VMA (if required) and apply the new policy */
mbind_range(struct vma_iterator * vmi,struct vm_area_struct * vma,struct vm_area_struct ** prev,unsigned long start,unsigned long end,struct mempolicy * new_pol)1039f4e9e0e6SLiam R. Howlett static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
1040f4e9e0e6SLiam R. Howlett struct vm_area_struct **prev, unsigned long start,
10419d8cebd4SKOSAKI Motohiro unsigned long end, struct mempolicy *new_pol)
10421da177e4SLinus Torvalds {
1043f4e9e0e6SLiam R. Howlett unsigned long vmstart, vmend;
10441da177e4SLinus Torvalds
1045f4e9e0e6SLiam R. Howlett vmend = min(end, vma->vm_end);
1046f4e9e0e6SLiam R. Howlett if (start > vma->vm_start) {
1047f4e9e0e6SLiam R. Howlett *prev = vma;
1048f4e9e0e6SLiam R. Howlett vmstart = start;
1049f4e9e0e6SLiam R. Howlett } else {
1050f4e9e0e6SLiam R. Howlett vmstart = vma->vm_start;
1051f4e9e0e6SLiam R. Howlett }
10529d8cebd4SKOSAKI Motohiro
1053c36f6e6dSHugh Dickins if (mpol_equal(vma->vm_policy, new_pol)) {
105400ca0f2eSLorenzo Stoakes *prev = vma;
1055f4e9e0e6SLiam R. Howlett return 0;
105600ca0f2eSLorenzo Stoakes }
1057e26a5114SKOSAKI Motohiro
105894d7d923SLorenzo Stoakes vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
105994d7d923SLorenzo Stoakes if (IS_ERR(vma))
106094d7d923SLorenzo Stoakes return PTR_ERR(vma);
1061f4e9e0e6SLiam R. Howlett
1062f4e9e0e6SLiam R. Howlett *prev = vma;
1063f4e9e0e6SLiam R. Howlett return vma_replace_policy(vma, new_pol);
1064f4e9e0e6SLiam R. Howlett }
1065f4e9e0e6SLiam R. Howlett
10661da177e4SLinus Torvalds /* Set the process memory policy */
do_set_mempolicy(unsigned short mode,unsigned short flags,nodemask_t * nodes)1067028fec41SDavid Rientjes static long do_set_mempolicy(unsigned short mode, unsigned short flags,
1068028fec41SDavid Rientjes nodemask_t *nodes)
10691da177e4SLinus Torvalds {
107058568d2aSMiao Xie struct mempolicy *new, *old;
10714bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH(scratch);
107258568d2aSMiao Xie int ret;
10731da177e4SLinus Torvalds
10744bfc4495SKAMEZAWA Hiroyuki if (!scratch)
10754bfc4495SKAMEZAWA Hiroyuki return -ENOMEM;
1076f4e53d91SLee Schermerhorn
10774bfc4495SKAMEZAWA Hiroyuki new = mpol_new(mode, flags, nodes);
10784bfc4495SKAMEZAWA Hiroyuki if (IS_ERR(new)) {
10794bfc4495SKAMEZAWA Hiroyuki ret = PTR_ERR(new);
10804bfc4495SKAMEZAWA Hiroyuki goto out;
10814bfc4495SKAMEZAWA Hiroyuki }
10822c7c3a7dSOleg Nesterov
108312c1dc8eSAbel Wu task_lock(current);
10844bfc4495SKAMEZAWA Hiroyuki ret = mpol_set_nodemask(new, nodes, scratch);
108558568d2aSMiao Xie if (ret) {
108612c1dc8eSAbel Wu task_unlock(current);
108758568d2aSMiao Xie mpol_put(new);
10884bfc4495SKAMEZAWA Hiroyuki goto out;
108958568d2aSMiao Xie }
109012c1dc8eSAbel Wu
109158568d2aSMiao Xie old = current->mempolicy;
10921da177e4SLinus Torvalds current->mempolicy = new;
1093fa3bea4eSGregory Price if (new && (new->mode == MPOL_INTERLEAVE ||
1094fa3bea4eSGregory Price new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
109545816682SVlastimil Babka current->il_prev = MAX_NUMNODES-1;
1096fa3bea4eSGregory Price current->il_weight = 0;
1097fa3bea4eSGregory Price }
109858568d2aSMiao Xie task_unlock(current);
109958568d2aSMiao Xie mpol_put(old);
11004bfc4495SKAMEZAWA Hiroyuki ret = 0;
11014bfc4495SKAMEZAWA Hiroyuki out:
11024bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH_FREE(scratch);
11034bfc4495SKAMEZAWA Hiroyuki return ret;
11041da177e4SLinus Torvalds }
11051da177e4SLinus Torvalds
1106bea904d5SLee Schermerhorn /*
1107bea904d5SLee Schermerhorn * Return nodemask for policy for get_mempolicy() query
110858568d2aSMiao Xie *
110958568d2aSMiao Xie * Called with task's alloc_lock held
1110bea904d5SLee Schermerhorn */
get_policy_nodemask(struct mempolicy * pol,nodemask_t * nodes)1111c36f6e6dSHugh Dickins static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
11121da177e4SLinus Torvalds {
1113dfcd3c0dSAndi Kleen nodes_clear(*nodes);
1114c36f6e6dSHugh Dickins if (pol == &default_policy)
1115bea904d5SLee Schermerhorn return;
1116bea904d5SLee Schermerhorn
1117c36f6e6dSHugh Dickins switch (pol->mode) {
111819770b32SMel Gorman case MPOL_BIND:
11191da177e4SLinus Torvalds case MPOL_INTERLEAVE:
1120269fbe72SBen Widawsky case MPOL_PREFERRED:
1121b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
1122fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE:
1123c36f6e6dSHugh Dickins *nodes = pol->nodes;
11241da177e4SLinus Torvalds break;
11257858d7bcSFeng Tang case MPOL_LOCAL:
11267858d7bcSFeng Tang /* return empty node mask for local allocation */
11277858d7bcSFeng Tang break;
11281da177e4SLinus Torvalds default:
11291da177e4SLinus Torvalds BUG();
11301da177e4SLinus Torvalds }
11311da177e4SLinus Torvalds }
11321da177e4SLinus Torvalds
lookup_node(struct mm_struct * mm,unsigned long addr)11333b9aadf7SAndrea Arcangeli static int lookup_node(struct mm_struct *mm, unsigned long addr)
11341da177e4SLinus Torvalds {
1135ba841078SPeter Xu struct page *p = NULL;
1136f728b9c4SJohn Hubbard int ret;
11371da177e4SLinus Torvalds
1138f728b9c4SJohn Hubbard ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
1139f728b9c4SJohn Hubbard if (ret > 0) {
1140f728b9c4SJohn Hubbard ret = page_to_nid(p);
11411da177e4SLinus Torvalds put_page(p);
11421da177e4SLinus Torvalds }
1143f728b9c4SJohn Hubbard return ret;
11441da177e4SLinus Torvalds }
11451da177e4SLinus Torvalds
11461da177e4SLinus Torvalds /* Retrieve NUMA policy */
do_get_mempolicy(int * policy,nodemask_t * nmask,unsigned long addr,unsigned long flags)1147dbcb0f19SAdrian Bunk static long do_get_mempolicy(int *policy, nodemask_t *nmask,
11481da177e4SLinus Torvalds unsigned long addr, unsigned long flags)
11491da177e4SLinus Torvalds {
11508bccd85fSChristoph Lameter int err;
11511da177e4SLinus Torvalds struct mm_struct *mm = current->mm;
11521da177e4SLinus Torvalds struct vm_area_struct *vma = NULL;
11533b9aadf7SAndrea Arcangeli struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
11541da177e4SLinus Torvalds
1155754af6f5SLee Schermerhorn if (flags &
1156754af6f5SLee Schermerhorn ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
11571da177e4SLinus Torvalds return -EINVAL;
1158754af6f5SLee Schermerhorn
1159754af6f5SLee Schermerhorn if (flags & MPOL_F_MEMS_ALLOWED) {
1160754af6f5SLee Schermerhorn if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
1161754af6f5SLee Schermerhorn return -EINVAL;
1162754af6f5SLee Schermerhorn *policy = 0; /* just so it's initialized */
116358568d2aSMiao Xie task_lock(current);
1164754af6f5SLee Schermerhorn *nmask = cpuset_current_mems_allowed;
116558568d2aSMiao Xie task_unlock(current);
1166754af6f5SLee Schermerhorn return 0;
1167754af6f5SLee Schermerhorn }
1168754af6f5SLee Schermerhorn
11691da177e4SLinus Torvalds if (flags & MPOL_F_ADDR) {
1170ddc1a5cbSHugh Dickins pgoff_t ilx; /* ignored here */
1171bea904d5SLee Schermerhorn /*
1172bea904d5SLee Schermerhorn * Do NOT fall back to task policy if the
1173bea904d5SLee Schermerhorn * vma/shared policy at addr is NULL. We
1174bea904d5SLee Schermerhorn * want to return MPOL_DEFAULT in this case.
1175bea904d5SLee Schermerhorn */
1176d8ed45c5SMichel Lespinasse mmap_read_lock(mm);
117733e3575cSLiam Howlett vma = vma_lookup(mm, addr);
11781da177e4SLinus Torvalds if (!vma) {
1179d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
11801da177e4SLinus Torvalds return -EFAULT;
11811da177e4SLinus Torvalds }
1182ddc1a5cbSHugh Dickins pol = __get_vma_policy(vma, addr, &ilx);
11831da177e4SLinus Torvalds } else if (addr)
11841da177e4SLinus Torvalds return -EINVAL;
11851da177e4SLinus Torvalds
11861da177e4SLinus Torvalds if (!pol)
1187bea904d5SLee Schermerhorn pol = &default_policy; /* indicates default behavior */
11881da177e4SLinus Torvalds
11891da177e4SLinus Torvalds if (flags & MPOL_F_NODE) {
11901da177e4SLinus Torvalds if (flags & MPOL_F_ADDR) {
11913b9aadf7SAndrea Arcangeli /*
1192f728b9c4SJohn Hubbard * Take a refcount on the mpol, because we are about to
1193f728b9c4SJohn Hubbard * drop the mmap_lock, after which only "pol" remains
1194f728b9c4SJohn Hubbard * valid, "vma" is stale.
11953b9aadf7SAndrea Arcangeli */
11963b9aadf7SAndrea Arcangeli pol_refcount = pol;
11973b9aadf7SAndrea Arcangeli vma = NULL;
11983b9aadf7SAndrea Arcangeli mpol_get(pol);
1199f728b9c4SJohn Hubbard mmap_read_unlock(mm);
12003b9aadf7SAndrea Arcangeli err = lookup_node(mm, addr);
12011da177e4SLinus Torvalds if (err < 0)
12021da177e4SLinus Torvalds goto out;
12038bccd85fSChristoph Lameter *policy = err;
12041da177e4SLinus Torvalds } else if (pol == current->mempolicy &&
120545c4745aSLee Schermerhorn pol->mode == MPOL_INTERLEAVE) {
1206269fbe72SBen Widawsky *policy = next_node_in(current->il_prev, pol->nodes);
1207fa3bea4eSGregory Price } else if (pol == current->mempolicy &&
1208fa3bea4eSGregory Price pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
1209fa3bea4eSGregory Price if (current->il_weight)
1210fa3bea4eSGregory Price *policy = current->il_prev;
1211fa3bea4eSGregory Price else
1212fa3bea4eSGregory Price *policy = next_node_in(current->il_prev,
1213fa3bea4eSGregory Price pol->nodes);
12141da177e4SLinus Torvalds } else {
12151da177e4SLinus Torvalds err = -EINVAL;
12161da177e4SLinus Torvalds goto out;
12171da177e4SLinus Torvalds }
1218bea904d5SLee Schermerhorn } else {
1219bea904d5SLee Schermerhorn *policy = pol == &default_policy ? MPOL_DEFAULT :
1220bea904d5SLee Schermerhorn pol->mode;
1221d79df630SDavid Rientjes /*
1222d79df630SDavid Rientjes * Internal mempolicy flags must be masked off before exposing
1223d79df630SDavid Rientjes * the policy to userspace.
1224d79df630SDavid Rientjes */
1225d79df630SDavid Rientjes *policy |= (pol->flags & MPOL_MODE_FLAGS);
1226bea904d5SLee Schermerhorn }
12271da177e4SLinus Torvalds
12281da177e4SLinus Torvalds err = 0;
122958568d2aSMiao Xie if (nmask) {
1230c6b6ef8bSLee Schermerhorn if (mpol_store_user_nodemask(pol)) {
1231c6b6ef8bSLee Schermerhorn *nmask = pol->w.user_nodemask;
1232c6b6ef8bSLee Schermerhorn } else {
123358568d2aSMiao Xie task_lock(current);
1234bea904d5SLee Schermerhorn get_policy_nodemask(pol, nmask);
123558568d2aSMiao Xie task_unlock(current);
123658568d2aSMiao Xie }
1237c6b6ef8bSLee Schermerhorn }
12381da177e4SLinus Torvalds
12391da177e4SLinus Torvalds out:
124052cd3b07SLee Schermerhorn mpol_cond_put(pol);
12411da177e4SLinus Torvalds if (vma)
1242d8ed45c5SMichel Lespinasse mmap_read_unlock(mm);
12433b9aadf7SAndrea Arcangeli if (pol_refcount)
12443b9aadf7SAndrea Arcangeli mpol_put(pol_refcount);
12451da177e4SLinus Torvalds return err;
12461da177e4SLinus Torvalds }
12471da177e4SLinus Torvalds
12486ebf98d7SDavid Hildenbrand (Arm) #ifdef CONFIG_NUMA_MIGRATION
migrate_folio_add(struct folio * folio,struct list_head * foliolist,unsigned long flags)12491cb5d11aSHugh Dickins static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1250fc301289SChristoph Lameter unsigned long flags)
12516ce3c4c0SChristoph Lameter {
12526ce3c4c0SChristoph Lameter /*
12531cb5d11aSHugh Dickins * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
12541cb5d11aSHugh Dickins * Choosing not to migrate a shared folio is not counted as a failure.
12554a64981dSVishal Moola (Oracle) *
1256003fde44SDavid Hildenbrand * See folio_maybe_mapped_shared() on possible imprecision when we
1257ebb34f78SDavid Hildenbrand * cannot easily detect if a folio is shared.
12586ce3c4c0SChristoph Lameter */
1259003fde44SDavid Hildenbrand if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
1260be2d5756SBaolin Wang if (folio_isolate_lru(folio)) {
12614a64981dSVishal Moola (Oracle) list_add_tail(&folio->lru, foliolist);
12624a64981dSVishal Moola (Oracle) node_stat_mod_folio(folio,
12634a64981dSVishal Moola (Oracle) NR_ISOLATED_ANON + folio_is_file_lru(folio),
12644a64981dSVishal Moola (Oracle) folio_nr_pages(folio));
12651cb5d11aSHugh Dickins } else {
1266a53190a4SYang Shi /*
12674a64981dSVishal Moola (Oracle) * Non-movable folio may reach here. And, there may be
12684a64981dSVishal Moola (Oracle) * temporary off LRU folios or non-LRU movable folios.
12694a64981dSVishal Moola (Oracle) * Treat them as unmovable folios since they can't be
12701cb5d11aSHugh Dickins * isolated, so they can't be moved at the moment.
1271a53190a4SYang Shi */
12721cb5d11aSHugh Dickins return false;
127362695a84SNick Piggin }
127462695a84SNick Piggin }
12751cb5d11aSHugh Dickins return true;
12766ce3c4c0SChristoph Lameter }
12776ce3c4c0SChristoph Lameter
12786ce3c4c0SChristoph Lameter /*
12797e2ab150SChristoph Lameter * Migrate pages from one node to a target node.
12807e2ab150SChristoph Lameter * Returns error or the number of pages not migrated.
12817e2ab150SChristoph Lameter */
migrate_to_node(struct mm_struct * mm,int source,int dest,int flags)12821cb5d11aSHugh Dickins static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1283dbcb0f19SAdrian Bunk int flags)
12847e2ab150SChristoph Lameter {
12857e2ab150SChristoph Lameter nodemask_t nmask;
128666850be5SLiam R. Howlett struct vm_area_struct *vma;
12877e2ab150SChristoph Lameter LIST_HEAD(pagelist);
12881cb5d11aSHugh Dickins long nr_failed;
12891cb5d11aSHugh Dickins long err = 0;
1290a0976311SJoonsoo Kim struct migration_target_control mtc = {
1291a0976311SJoonsoo Kim .nid = dest,
1292a0976311SJoonsoo Kim .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1293e42dfe4eSBaolin Wang .reason = MR_SYSCALL,
1294a0976311SJoonsoo Kim };
12957e2ab150SChristoph Lameter
12967e2ab150SChristoph Lameter nodes_clear(nmask);
12977e2ab150SChristoph Lameter node_set(source, nmask);
12987e2ab150SChristoph Lameter
129908270807SMinchan Kim VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
130072e315f7SHugh Dickins
130172e315f7SHugh Dickins mmap_read_lock(mm);
13021cb5d11aSHugh Dickins vma = find_vma(mm, 0);
1303091c1dd2SDavid Hildenbrand if (unlikely(!vma)) {
1304091c1dd2SDavid Hildenbrand mmap_read_unlock(mm);
1305091c1dd2SDavid Hildenbrand return 0;
1306091c1dd2SDavid Hildenbrand }
13071cb5d11aSHugh Dickins
13081cb5d11aSHugh Dickins /*
13091cb5d11aSHugh Dickins * This does not migrate the range, but isolates all pages that
13101cb5d11aSHugh Dickins * need migration. Between passing in the full user address
13111cb5d11aSHugh Dickins * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
13121cb5d11aSHugh Dickins * but passes back the count of pages which could not be isolated.
13131cb5d11aSHugh Dickins */
13141cb5d11aSHugh Dickins nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
13151cb5d11aSHugh Dickins flags | MPOL_MF_DISCONTIG_OK, &pagelist);
131672e315f7SHugh Dickins mmap_read_unlock(mm);
13177e2ab150SChristoph Lameter
1318cf608ac1SMinchan Kim if (!list_empty(&pagelist)) {
1319a0976311SJoonsoo Kim err = migrate_pages(&pagelist, alloc_migration_target, NULL,
13205ac95884SYang Shi (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1321cf608ac1SMinchan Kim if (err)
1322e2d8cf40SNaoya Horiguchi putback_movable_pages(&pagelist);
1323cf608ac1SMinchan Kim }
132495a402c3SChristoph Lameter
13251cb5d11aSHugh Dickins if (err >= 0)
13261cb5d11aSHugh Dickins err += nr_failed;
13277e2ab150SChristoph Lameter return err;
13287e2ab150SChristoph Lameter }
13297e2ab150SChristoph Lameter
13307e2ab150SChristoph Lameter /*
13317e2ab150SChristoph Lameter * Move pages between the two nodesets so as to preserve the physical
13327e2ab150SChristoph Lameter * layout as much as possible.
133339743889SChristoph Lameter *
133439743889SChristoph Lameter * Returns the number of page that could not be moved.
133539743889SChristoph Lameter */
do_migrate_pages(struct mm_struct * mm,const nodemask_t * from,const nodemask_t * to,int flags)13360ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
13370ce72d4fSAndrew Morton const nodemask_t *to, int flags)
133839743889SChristoph Lameter {
13391cb5d11aSHugh Dickins long nr_failed = 0;
13401cb5d11aSHugh Dickins long err = 0;
13417e2ab150SChristoph Lameter nodemask_t tmp;
134239743889SChristoph Lameter
1343361a2a22SMinchan Kim lru_cache_disable();
13440aedadf9SChristoph Lameter
13457e2ab150SChristoph Lameter /*
13467e2ab150SChristoph Lameter * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
13477e2ab150SChristoph Lameter * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
13487e2ab150SChristoph Lameter * bit in 'tmp', and return that <source, dest> pair for migration.
13497e2ab150SChristoph Lameter * The pair of nodemasks 'to' and 'from' define the map.
13507e2ab150SChristoph Lameter *
13517e2ab150SChristoph Lameter * If no pair of bits is found that way, fallback to picking some
13527e2ab150SChristoph Lameter * pair of 'source' and 'dest' bits that are not the same. If the
13537e2ab150SChristoph Lameter * 'source' and 'dest' bits are the same, this represents a node
13547e2ab150SChristoph Lameter * that will be migrating to itself, so no pages need move.
13557e2ab150SChristoph Lameter *
13567e2ab150SChristoph Lameter * If no bits are left in 'tmp', or if all remaining bits left
13577e2ab150SChristoph Lameter * in 'tmp' correspond to the same bit in 'to', return false
13587e2ab150SChristoph Lameter * (nothing left to migrate).
13597e2ab150SChristoph Lameter *
13607e2ab150SChristoph Lameter * This lets us pick a pair of nodes to migrate between, such that
13617e2ab150SChristoph Lameter * if possible the dest node is not already occupied by some other
13627e2ab150SChristoph Lameter * source node, minimizing the risk of overloading the memory on a
13637e2ab150SChristoph Lameter * node that would happen if we migrated incoming memory to a node
13647e2ab150SChristoph Lameter * before migrating outgoing memory source that same node.
13657e2ab150SChristoph Lameter *
13667e2ab150SChristoph Lameter * A single scan of tmp is sufficient. As we go, we remember the
13677e2ab150SChristoph Lameter * most recent <s, d> pair that moved (s != d). If we find a pair
13687e2ab150SChristoph Lameter * that not only moved, but what's better, moved to an empty slot
13697e2ab150SChristoph Lameter * (d is not set in tmp), then we break out then, with that pair.
1370ae0e47f0SJustin P. Mattock * Otherwise when we finish scanning from_tmp, we at least have the
13717e2ab150SChristoph Lameter * most recent <s, d> pair that moved. If we get all the way through
13727e2ab150SChristoph Lameter * the scan of tmp without finding any node that moved, much less
13737e2ab150SChristoph Lameter * moved to an empty node, then there is nothing left worth migrating.
13747e2ab150SChristoph Lameter */
13757e2ab150SChristoph Lameter
13760ce72d4fSAndrew Morton tmp = *from;
13777e2ab150SChristoph Lameter while (!nodes_empty(tmp)) {
13787e2ab150SChristoph Lameter int s, d;
1379b76ac7e7SJianguo Wu int source = NUMA_NO_NODE;
13807e2ab150SChristoph Lameter int dest = 0;
13817e2ab150SChristoph Lameter
13827e2ab150SChristoph Lameter for_each_node_mask(s, tmp) {
13834a5b18ccSLarry Woodman
13844a5b18ccSLarry Woodman /*
13854a5b18ccSLarry Woodman * do_migrate_pages() tries to maintain the relative
13864a5b18ccSLarry Woodman * node relationship of the pages established between
13874a5b18ccSLarry Woodman * threads and memory areas.
13884a5b18ccSLarry Woodman *
13894a5b18ccSLarry Woodman * However if the number of source nodes is not equal to
13904a5b18ccSLarry Woodman * the number of destination nodes we can not preserve
13914a5b18ccSLarry Woodman * this node relative relationship. In that case, skip
13924a5b18ccSLarry Woodman * copying memory from a node that is in the destination
13934a5b18ccSLarry Woodman * mask.
13944a5b18ccSLarry Woodman *
13954a5b18ccSLarry Woodman * Example: [2,3,4] -> [3,4,5] moves everything.
13964a5b18ccSLarry Woodman * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
13974a5b18ccSLarry Woodman */
13984a5b18ccSLarry Woodman
13990ce72d4fSAndrew Morton if ((nodes_weight(*from) != nodes_weight(*to)) &&
14000ce72d4fSAndrew Morton (node_isset(s, *to)))
14014a5b18ccSLarry Woodman continue;
14024a5b18ccSLarry Woodman
14030ce72d4fSAndrew Morton d = node_remap(s, *from, *to);
14047e2ab150SChristoph Lameter if (s == d)
14057e2ab150SChristoph Lameter continue;
14067e2ab150SChristoph Lameter
14077e2ab150SChristoph Lameter source = s; /* Node moved. Memorize */
14087e2ab150SChristoph Lameter dest = d;
14097e2ab150SChristoph Lameter
14107e2ab150SChristoph Lameter /* dest not in remaining from nodes? */
14117e2ab150SChristoph Lameter if (!node_isset(dest, tmp))
14127e2ab150SChristoph Lameter break;
14137e2ab150SChristoph Lameter }
1414b76ac7e7SJianguo Wu if (source == NUMA_NO_NODE)
14157e2ab150SChristoph Lameter break;
14167e2ab150SChristoph Lameter
14177e2ab150SChristoph Lameter node_clear(source, tmp);
14187e2ab150SChristoph Lameter err = migrate_to_node(mm, source, dest, flags);
14197e2ab150SChristoph Lameter if (err > 0)
14201cb5d11aSHugh Dickins nr_failed += err;
14217e2ab150SChristoph Lameter if (err < 0)
14227e2ab150SChristoph Lameter break;
142339743889SChristoph Lameter }
1424d479960eSMinchan Kim
1425361a2a22SMinchan Kim lru_cache_enable();
14267e2ab150SChristoph Lameter if (err < 0)
14277e2ab150SChristoph Lameter return err;
14281cb5d11aSHugh Dickins return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
142939743889SChristoph Lameter }
143039743889SChristoph Lameter
14313ad33b24SLee Schermerhorn /*
143272e315f7SHugh Dickins * Allocate a new folio for page migration, according to NUMA mempolicy.
14333ad33b24SLee Schermerhorn */
alloc_migration_target_by_mpol(struct folio * src,unsigned long private)143472e315f7SHugh Dickins static struct folio *alloc_migration_target_by_mpol(struct folio *src,
143572e315f7SHugh Dickins unsigned long private)
143695a402c3SChristoph Lameter {
143788c91dc5SHugh Dickins struct migration_mpol *mmpol = (struct migration_mpol *)private;
143888c91dc5SHugh Dickins struct mempolicy *pol = mmpol->pol;
143988c91dc5SHugh Dickins pgoff_t ilx = mmpol->ilx;
144072e315f7SHugh Dickins unsigned int order;
144172e315f7SHugh Dickins int nid = numa_node_id();
144272e315f7SHugh Dickins gfp_t gfp;
144395a402c3SChristoph Lameter
144472e315f7SHugh Dickins order = folio_order(src);
144572e315f7SHugh Dickins ilx += src->index >> order;
14463ad33b24SLee Schermerhorn
1447d0ce0e47SSidhartha Kumar if (folio_test_hugetlb(src)) {
144872e315f7SHugh Dickins nodemask_t *nodemask;
144972e315f7SHugh Dickins struct hstate *h;
145072e315f7SHugh Dickins
145172e315f7SHugh Dickins h = folio_hstate(src);
145272e315f7SHugh Dickins gfp = htlb_alloc_mask(h);
145372e315f7SHugh Dickins nodemask = policy_nodemask(gfp, pol, ilx, &nid);
145442d0c3fbSBaolin Wang return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
145542d0c3fbSBaolin Wang htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
1456d0ce0e47SSidhartha Kumar }
1457c8633798SNaoya Horiguchi
1458ec4858e0SMatthew Wilcox (Oracle) if (folio_test_large(src))
1459ec4858e0SMatthew Wilcox (Oracle) gfp = GFP_TRANSHUGE;
146072e315f7SHugh Dickins else
146172e315f7SHugh Dickins gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
1462ec4858e0SMatthew Wilcox (Oracle)
14631d9cb785SKefeng Wang return folio_alloc_mpol(gfp, order, pol, ilx, nid);
146495a402c3SChristoph Lameter }
1465b20a3503SChristoph Lameter #else
1466b20a3503SChristoph Lameter
migrate_folio_add(struct folio * folio,struct list_head * foliolist,unsigned long flags)14671cb5d11aSHugh Dickins static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1468b20a3503SChristoph Lameter unsigned long flags)
1469b20a3503SChristoph Lameter {
14701cb5d11aSHugh Dickins return false;
1471b20a3503SChristoph Lameter }
1472b20a3503SChristoph Lameter
do_migrate_pages(struct mm_struct * mm,const nodemask_t * from,const nodemask_t * to,int flags)14730ce72d4fSAndrew Morton int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
14740ce72d4fSAndrew Morton const nodemask_t *to, int flags)
1475b20a3503SChristoph Lameter {
1476b20a3503SChristoph Lameter return -ENOSYS;
1477b20a3503SChristoph Lameter }
147895a402c3SChristoph Lameter
alloc_migration_target_by_mpol(struct folio * src,unsigned long private)147972e315f7SHugh Dickins static struct folio *alloc_migration_target_by_mpol(struct folio *src,
148072e315f7SHugh Dickins unsigned long private)
148195a402c3SChristoph Lameter {
148295a402c3SChristoph Lameter return NULL;
148395a402c3SChristoph Lameter }
1484b20a3503SChristoph Lameter #endif
1485b20a3503SChristoph Lameter
/*
 * Apply a memory policy to [start, start + len) of the current task's mm,
 * optionally migrating folios that are already in the range.
 *
 * @start:      page-aligned start address.
 * @len:        length in bytes; rounded up to a page multiple.
 * @mode:       policy mode handed to mpol_new().
 * @mode_flags: policy mode flags handed to mpol_new().
 * @nmask:      nodes for the policy.
 * @flags:      MPOL_MF_* flags; bits outside MPOL_MF_VALID are rejected.
 *
 * Returns 0 on success (including an empty range), -EINVAL for bad
 * arguments, -EPERM for MPOL_MF_MOVE_ALL without CAP_SYS_NICE, -ENOMEM if
 * the scratch nodemask cannot be allocated, -EIO under MPOL_MF_STRICT when
 * something could not be moved, or an error passed up from mpol_new(),
 * mpol_set_nodemask(), queue_pages_range() or mbind_range().
 */
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vma_iterator vmi;
	struct migration_mpol mmpol;
	struct mempolicy *mpol;
	unsigned long end;
	long ret;
	long nr_failed;
	bool do_migrate;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	/* MPOL_MF_STRICT is not honoured for MPOL_DEFAULT. */
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	/* end < start means the range wrapped around the address space. */
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	mpol = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(mpol))
		return PTR_ERR(mpol);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!mpol)
		flags |= MPOL_MF_DISCONTIG_OK;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_disable();
	{
		NODEMASK_SCRATCH(scratch);
		if (!scratch) {
			ret = -ENOMEM;
		} else {
			/* On success the write lock stays held past this block. */
			mmap_write_lock(mm);
			ret = mpol_set_nodemask(mpol, nmask, scratch);
			if (ret)
				mmap_write_unlock(mm);
		}
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (ret)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate,
	 * to ensure we don't miss a concurrently inserted page.
	 */
	nr_failed = queue_pages_range(mm, start, end, nmask,
			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

	if (nr_failed >= 0) {
		/* Install the new policy on every VMA in the range. */
		vma_iter_init(&vmi, mm, start);
		prev = vma_prev(&vmi);
		for_each_vma_range(vmi, vma, end) {
			ret = mbind_range(&vmi, vma, &prev, start, end, mpol);
			if (ret)
				break;
		}
	} else {
		/* A negative count is an error code, not a failure count. */
		ret = nr_failed;
		nr_failed = 0;
	}

	/* Neither ret nor pagelist changes before the migration step below. */
	do_migrate = !ret && !list_empty(&pagelist);

	if (do_migrate) {
		/* Convert MPOL_DEFAULT's NULL to task or default policy */
		if (!mpol) {
			mpol = get_task_policy(current);
			mpol_get(mpol);
		}
		mmpol.pol = mpol;
		mmpol.ilx = 0;

		/*
		 * In the interleaved case, attempt to allocate on exactly the
		 * targeted nodes, for the first VMA to be migrated; for later
		 * VMAs, the nodes will still be interleaved from the targeted
		 * nodemask, but one by one may be selected differently.
		 */
		if (mpol->mode == MPOL_INTERLEAVE ||
		    mpol->mode == MPOL_WEIGHTED_INTERLEAVE) {
			struct folio *folio;
			unsigned long addr = -EFAULT;

			/* Pick the first queued folio that is not KSM. */
			list_for_each_entry(folio, &pagelist, lru) {
				if (!folio_test_ksm(folio))
					break;
			}
			/* Find the first VMA in the range that maps it. */
			if (!list_entry_is_head(folio, &pagelist, lru)) {
				vma_iter_init(&vmi, mm, start);
				for_each_vma_range(vmi, vma, end) {
					addr = page_address_in_vma(folio,
						folio_page(folio, 0), vma);
					if (addr != -EFAULT)
						break;
				}
			}
			if (addr != -EFAULT) {
				unsigned int order = folio_order(folio);

				/* We already know the pol, but not the ilx */
				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
				/* Set base from which to increment by index */
				mmpol.ilx -= folio->index >> order;
			}
		}
	}

	mmap_write_unlock(mm);

	if (do_migrate) {
		/* Only "zero or not" is read from nr_failed afterwards. */
		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);
	}

	if (nr_failed && (flags & MPOL_MF_STRICT))
		ret = -EIO;
	/* Whatever is still queued goes back where it came from. */
	if (!list_empty(&pagelist))
		putback_movable_pages(&pagelist);
mpol_out:
	mpol_put(mpol);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return ret;
}
16306ce3c4c0SChristoph Lameter
163139743889SChristoph Lameter /*
16328bccd85fSChristoph Lameter * User space interface with variable sized bitmaps for nodelists.
16338bccd85fSChristoph Lameter */
/*
 * Copy a @maxnode-bit bitmap from the user buffer @nmask into @mask.
 *
 * A compat syscall goes through compat_get_bitmap(); otherwise
 * BITS_TO_LONGS(@maxnode) longs are copied with copy_from_user().  When
 * @maxnode is not a multiple of BITS_PER_LONG, the bits at and above
 * @maxnode in the last word are cleared.
 *
 * Returns 0 on success, -EFAULT if the copy failed.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long nlongs = BITS_TO_LONGS(maxnode);
	unsigned long tail_bits = maxnode % BITS_PER_LONG;
	int failed;

	if (!in_compat_syscall())
		failed = copy_from_user(mask, nmask,
					nlongs * sizeof(unsigned long));
	else
		failed = compat_get_bitmap(mask,
					   (const compat_ulong_t __user *)nmask,
					   maxnode);

	if (failed)
		return -EFAULT;

	/* Drop whatever user space left beyond the last requested bit. */
	if (tail_bits)
		mask[nlongs - 1] &= (1UL << tail_bits) - 1;

	return 0;
}
16568bccd85fSChristoph Lameter
16578bccd85fSChristoph Lameter /* Copy a node mask from user space. */
get_nodes(nodemask_t * nodes,const unsigned long __user * nmask,unsigned long maxnode)165839743889SChristoph Lameter static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
16598bccd85fSChristoph Lameter unsigned long maxnode)
16608bccd85fSChristoph Lameter {
16618bccd85fSChristoph Lameter --maxnode;
16628bccd85fSChristoph Lameter nodes_clear(*nodes);
16638bccd85fSChristoph Lameter if (maxnode == 0 || !nmask)
16648bccd85fSChristoph Lameter return 0;
1665a9c930baSAndi Kleen if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1666636f13c1SChris Wright return -EINVAL;
16678bccd85fSChristoph Lameter
166856521e7aSYisheng Xie /*
166956521e7aSYisheng Xie * When the user specified more nodes than supported just check
1670e130242dSArnd Bergmann * if the non supported part is all zero, one word at a time,
1671e130242dSArnd Bergmann * starting at the end.
167256521e7aSYisheng Xie */
1673e130242dSArnd Bergmann while (maxnode > MAX_NUMNODES) {
1674e130242dSArnd Bergmann unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1675e130242dSArnd Bergmann unsigned long t;
16768bccd85fSChristoph Lameter
1677000eca5dSTianyu Li if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
167856521e7aSYisheng Xie return -EFAULT;
1679e130242dSArnd Bergmann
1680e130242dSArnd Bergmann if (maxnode - bits >= MAX_NUMNODES) {
1681e130242dSArnd Bergmann maxnode -= bits;
1682e130242dSArnd Bergmann } else {
1683e130242dSArnd Bergmann maxnode = MAX_NUMNODES;
1684e130242dSArnd Bergmann t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1685e130242dSArnd Bergmann }
1686e130242dSArnd Bergmann if (t)
168756521e7aSYisheng Xie return -EINVAL;
168856521e7aSYisheng Xie }
168956521e7aSYisheng Xie
1690e130242dSArnd Bergmann return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
16918bccd85fSChristoph Lameter }
16928bccd85fSChristoph Lameter
16938bccd85fSChristoph Lameter /* Copy a kernel node mask to user space */
copy_nodes_to_user(unsigned long __user * mask,unsigned long maxnode,nodemask_t * nodes)16948bccd85fSChristoph Lameter static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
16958bccd85fSChristoph Lameter nodemask_t *nodes)
16968bccd85fSChristoph Lameter {
16978bccd85fSChristoph Lameter unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1698050c17f2SRalph Campbell unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1699e130242dSArnd Bergmann bool compat = in_compat_syscall();
1700e130242dSArnd Bergmann
1701e130242dSArnd Bergmann if (compat)
1702e130242dSArnd Bergmann nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
17038bccd85fSChristoph Lameter
17048bccd85fSChristoph Lameter if (copy > nbytes) {
17058bccd85fSChristoph Lameter if (copy > PAGE_SIZE)
17068bccd85fSChristoph Lameter return -EINVAL;
17078bccd85fSChristoph Lameter if (clear_user((char __user *)mask + nbytes, copy - nbytes))
17088bccd85fSChristoph Lameter return -EFAULT;
17098bccd85fSChristoph Lameter copy = nbytes;
1710e130242dSArnd Bergmann maxnode = nr_node_ids;
17118bccd85fSChristoph Lameter }
1712e130242dSArnd Bergmann
1713e130242dSArnd Bergmann if (compat)
1714e130242dSArnd Bergmann return compat_put_bitmap((compat_ulong_t __user *)mask,
1715e130242dSArnd Bergmann nodes_addr(*nodes), maxnode);
1716e130242dSArnd Bergmann
17178bccd85fSChristoph Lameter return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
17188bccd85fSChristoph Lameter }
17198bccd85fSChristoph Lameter
172095837924SFeng Tang /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
sanitize_mpol_flags(int * mode,unsigned short * flags)172195837924SFeng Tang static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
172295837924SFeng Tang {
172395837924SFeng Tang *flags = *mode & MPOL_MODE_FLAGS;
172495837924SFeng Tang *mode &= ~MPOL_MODE_FLAGS;
1725b27abaccSDave Hansen
1726a38a59fdSBen Widawsky if ((unsigned int)(*mode) >= MPOL_MAX)
172795837924SFeng Tang return -EINVAL;
172895837924SFeng Tang if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
172995837924SFeng Tang return -EINVAL;
17306d2aec9eSEric Dumazet if (*flags & MPOL_F_NUMA_BALANCING) {
1731133d04b1SDonet Tom if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
17326d2aec9eSEric Dumazet *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1733133d04b1SDonet Tom else
1734133d04b1SDonet Tom return -EINVAL;
17356d2aec9eSEric Dumazet }
173695837924SFeng Tang return 0;
173795837924SFeng Tang }
173895837924SFeng Tang
/*
 * Common mbind() entry: validate the mode word, fetch the node mask from
 * user space, and hand over to do_mbind().
 *
 * @mode carries both the policy mode and the mode flags;
 * sanitize_mpol_flags() separates them.  @start is passed through
 * untagged_addr() before being handed to do_mbind().
 *
 * Returns the first error from sanitize_mpol_flags() or get_nodes(),
 * otherwise the result of do_mbind().
 */
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int rc;

	start = untagged_addr(start);

	rc = sanitize_mpol_flags(&lmode, &mode_flags);
	if (!rc)
		rc = get_nodes(&nodes, nmask, maxnode);
	if (rc)
		return rc;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
17598bccd85fSChristoph Lameter
/*
 * set_mempolicy_home_node - set the home node of the VMA policies in a range
 * @start:     page aligned start address (an address tag is stripped first)
 * @len:       length of the range, rounded up to a page multiple
 * @home_node: node to record as home node; must be online
 * @flags:     reserved for future extension, must be 0
 *
 * Every VMA in [start, start + len) that carries an MPOL_BIND or
 * MPOL_PREFERRED_MANY policy gets a duplicate of that policy with
 * ->home_node set to @home_node. VMAs without a policy are skipped.
 *
 * Returns 0 if the range is empty or the last VMA policy was updated
 * successfully, -EINVAL for bad arguments, -ENOENT if no VMA in the range
 * had a policy to update, -EOPNOTSUPP if a VMA has a policy of another
 * mode, or the error returned by mpol_dup() / mbind_range().
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	struct vma_iterator vmi;
	unsigned long end;
	int err = -ENOENT;

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	/*
	 * Seed the iterator only now that @start has been untagged: seeding
	 * it from the raw syscall argument would begin the walk at the tagged
	 * address instead of at the start of the requested range.
	 */
	vma_iter_init(&vmi, mm, start);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error. We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		/* drop the reference obtained from mpol_dup() */
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}
1826c6018b4bSAneesh Kumar K.V
/* mbind() syscall entry: forwards all arguments unchanged to kernel_mbind(). */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	long ret;

	ret = kernel_mbind(start, len, mode, nmask, maxnode, flags);
	return ret;
}
1833e7dc9ad6SDominik Brodowski
18348bccd85fSChristoph Lameter /* Set the process memory policy */
kernel_set_mempolicy(int mode,const unsigned long __user * nmask,unsigned long maxnode)1835af03c4acSDominik Brodowski static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1836af03c4acSDominik Brodowski unsigned long maxnode)
18378bccd85fSChristoph Lameter {
183895837924SFeng Tang unsigned short mode_flags;
18398bccd85fSChristoph Lameter nodemask_t nodes;
184095837924SFeng Tang int lmode = mode;
184195837924SFeng Tang int err;
18428bccd85fSChristoph Lameter
184395837924SFeng Tang err = sanitize_mpol_flags(&lmode, &mode_flags);
184495837924SFeng Tang if (err)
184595837924SFeng Tang return err;
184695837924SFeng Tang
18478bccd85fSChristoph Lameter err = get_nodes(&nodes, nmask, maxnode);
18488bccd85fSChristoph Lameter if (err)
18498bccd85fSChristoph Lameter return err;
185095837924SFeng Tang
185195837924SFeng Tang return do_set_mempolicy(lmode, mode_flags, &nodes);
18528bccd85fSChristoph Lameter }
18538bccd85fSChristoph Lameter
/* set_mempolicy() syscall entry: forwards its arguments to kernel_set_mempolicy(). */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	long ret;

	ret = kernel_set_mempolicy(mode, nmask, maxnode);
	return ret;
}
1859af03c4acSDominik Brodowski
/*
 * Backend of the migrate_pages() syscall: move the pages of the task
 * identified by @pid (or of current when @pid is 0) from the nodes in
 * @old_nodes to the nodes in @new_nodes.
 *
 * Returns -ENOMEM if the scratch nodemasks cannot be set up, the error from
 * get_nodes() for a bad user mask, -ESRCH if no task matches @pid, -EPERM
 * if the ptrace access check fails or the target nodes are outside the
 * task's cpuset without CAP_SYS_NICE, -EINVAL if none of the target nodes
 * is allowed for current or the task has no mm, the error from
 * security_task_movememory(), or else the result of do_migrate_pages().
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct task_struct *task;
	struct mm_struct *mm;
	nodemask_t allowed;
	nodemask_t *from;
	nodemask_t *to;
	int err;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	from = &scratch->mask1;
	to = &scratch->mask2;

	err = get_nodes(from, old_nodes, maxnode);
	if (!err)
		err = get_nodes(to, new_nodes, maxnode);
	if (err)
		goto out_free;

	/* Look the task up and pin it while still under RCU. */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out_free;
	}
	get_task_struct(task);

	/*
	 * The caller must be allowed to modify the target process; this is
	 * decided by the regular ptrace_may_access() check.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put_task;
	}
	rcu_read_unlock();

	/* Target nodes outside the task's cpuset need CAP_SYS_NICE. */
	allowed = cpuset_mems_allowed(task);
	if (!nodes_subset(*to, allowed) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put_task;
	}

	/* Restrict the target nodes to what current itself may use. */
	allowed = cpuset_mems_allowed(current);
	if (!nodes_and(*to, *to, allowed)) {
		err = -EINVAL;
		goto out_put_task;
	}

	err = security_task_movememory(task);
	if (err)
		goto out_put_task;

	/* From here on only the mm is needed, so the task can be released. */
	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm) {
		err = -EINVAL;
		goto out_free;
	}

	err = do_migrate_pages(mm, from, to,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
	mmput(mm);
	goto out_free;

out_put_task:
	put_task_struct(task);
out_free:
	NODEMASK_SCRATCH_FREE(scratch);
	return err;
}
194539743889SChristoph Lameter
/* migrate_pages() syscall entry: forwards its arguments to kernel_migrate_pages(). */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	int ret;

	ret = kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
	return ret;
}
1952b6e9b0baSDominik Brodowski
19538bccd85fSChristoph Lameter /* Retrieve NUMA policy */
kernel_get_mempolicy(int __user * policy,unsigned long __user * nmask,unsigned long maxnode,unsigned long addr,unsigned long flags)1954af03c4acSDominik Brodowski static int kernel_get_mempolicy(int __user *policy,
1955af03c4acSDominik Brodowski unsigned long __user *nmask,
1956af03c4acSDominik Brodowski unsigned long maxnode,
1957af03c4acSDominik Brodowski unsigned long addr,
1958af03c4acSDominik Brodowski unsigned long flags)
19598bccd85fSChristoph Lameter {
1960dbcb0f19SAdrian Bunk int err;
19613f649ab7SKees Cook int pval;
19628bccd85fSChristoph Lameter nodemask_t nodes;
19638bccd85fSChristoph Lameter
1964050c17f2SRalph Campbell if (nmask != NULL && maxnode < nr_node_ids)
19658bccd85fSChristoph Lameter return -EINVAL;
19668bccd85fSChristoph Lameter
19674605f057SWenchao Hao addr = untagged_addr(addr);
19684605f057SWenchao Hao
19698bccd85fSChristoph Lameter err = do_get_mempolicy(&pval, &nodes, addr, flags);
19708bccd85fSChristoph Lameter
19718bccd85fSChristoph Lameter if (err)
19728bccd85fSChristoph Lameter return err;
19738bccd85fSChristoph Lameter
19748bccd85fSChristoph Lameter if (policy && put_user(pval, policy))
19758bccd85fSChristoph Lameter return -EFAULT;
19768bccd85fSChristoph Lameter
19778bccd85fSChristoph Lameter if (nmask)
19788bccd85fSChristoph Lameter err = copy_nodes_to_user(nmask, maxnode, &nodes);
19798bccd85fSChristoph Lameter
19808bccd85fSChristoph Lameter return err;
19818bccd85fSChristoph Lameter }
19828bccd85fSChristoph Lameter
/* get_mempolicy() syscall entry: forwards its arguments to kernel_get_mempolicy(). */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	int ret;

	ret = kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
	return ret;
}
1989af03c4acSDominik Brodowski
vma_migratable(struct vm_area_struct * vma)199020ca87f2SLi Xinhai bool vma_migratable(struct vm_area_struct *vma)
199120ca87f2SLi Xinhai {
199220ca87f2SLi Xinhai if (vma->vm_flags & (VM_IO | VM_PFNMAP))
199320ca87f2SLi Xinhai return false;
199420ca87f2SLi Xinhai
199520ca87f2SLi Xinhai /*
199620ca87f2SLi Xinhai * DAX device mappings require predictable access latency, so avoid
199720ca87f2SLi Xinhai * incurring periodic faults.
199820ca87f2SLi Xinhai */
199920ca87f2SLi Xinhai if (vma_is_dax(vma))
200020ca87f2SLi Xinhai return false;
200120ca87f2SLi Xinhai
200220ca87f2SLi Xinhai if (is_vm_hugetlb_page(vma) &&
200320ca87f2SLi Xinhai !hugepage_migration_supported(hstate_vma(vma)))
200420ca87f2SLi Xinhai return false;
200520ca87f2SLi Xinhai
200620ca87f2SLi Xinhai /*
200720ca87f2SLi Xinhai * Migration allocates pages in the highest zone. If we cannot
200820ca87f2SLi Xinhai * do so then migration (at least from node to node) is not
200920ca87f2SLi Xinhai * possible.
201020ca87f2SLi Xinhai */
201120ca87f2SLi Xinhai if (vma->vm_file &&
201220ca87f2SLi Xinhai gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
201320ca87f2SLi Xinhai < policy_zone)
201420ca87f2SLi Xinhai return false;
201520ca87f2SLi Xinhai return true;
201620ca87f2SLi Xinhai }
201720ca87f2SLi Xinhai
/*
 * Look up the policy attached to @vma at @addr without any fallback.
 *
 * *@ilx is reset to 0 first. When the VMA provides a ->get_policy() vm_op
 * its result is returned (it may update *@ilx); otherwise vma->vm_policy is
 * returned, which may be NULL.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				   unsigned long addr, pgoff_t *ilx)
{
	*ilx = 0;

	if (vma->vm_ops && vma->vm_ops->get_policy)
		return vma->vm_ops->get_policy(vma, addr, ilx);

	return vma->vm_policy;
}
202574d2c3a0SOleg Nesterov
202674d2c3a0SOleg Nesterov /*
2027ddc1a5cbSHugh Dickins * get_vma_policy(@vma, @addr, @order, @ilx)
202874d2c3a0SOleg Nesterov * @vma: virtual memory area whose policy is sought
202974d2c3a0SOleg Nesterov * @addr: address in @vma for shared policy lookup
2030ddc1a5cbSHugh Dickins * @order: 0, or appropriate huge_page_order for interleaving
2031fa3bea4eSGregory Price * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
2032fa3bea4eSGregory Price * MPOL_WEIGHTED_INTERLEAVE
203374d2c3a0SOleg Nesterov *
203474d2c3a0SOleg Nesterov * Returns effective policy for a VMA at specified address.
2035dd6eecb9SOleg Nesterov * Falls back to current->mempolicy or system default policy, as necessary.
203674d2c3a0SOleg Nesterov * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
203774d2c3a0SOleg Nesterov * count--added by the get_policy() vm_op, as appropriate--to protect against
203874d2c3a0SOleg Nesterov * freeing by another task. It is the caller's responsibility to free the
203974d2c3a0SOleg Nesterov * extra reference for shared policies.
204074d2c3a0SOleg Nesterov */
get_vma_policy(struct vm_area_struct * vma,unsigned long addr,int order,pgoff_t * ilx)2041ddc1a5cbSHugh Dickins struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
2042ddc1a5cbSHugh Dickins unsigned long addr, int order, pgoff_t *ilx)
204374d2c3a0SOleg Nesterov {
2044ddc1a5cbSHugh Dickins struct mempolicy *pol;
204574d2c3a0SOleg Nesterov
2046ddc1a5cbSHugh Dickins pol = __get_vma_policy(vma, addr, ilx);
20478d90274bSOleg Nesterov if (!pol)
2048dd6eecb9SOleg Nesterov pol = get_task_policy(current);
2049fa3bea4eSGregory Price if (pol->mode == MPOL_INTERLEAVE ||
2050fa3bea4eSGregory Price pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
2051ddc1a5cbSHugh Dickins *ilx += vma->vm_pgoff >> order;
2052ddc1a5cbSHugh Dickins *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
2053ddc1a5cbSHugh Dickins }
20541da177e4SLinus Torvalds return pol;
20551da177e4SLinus Torvalds }
20561da177e4SLinus Torvalds
vma_policy_mof(struct vm_area_struct * vma)20576b6482bbSOleg Nesterov bool vma_policy_mof(struct vm_area_struct *vma)
2058fc314724SMel Gorman {
20596b6482bbSOleg Nesterov struct mempolicy *pol;
2060f15ca78eSOleg Nesterov
2061fc314724SMel Gorman if (vma->vm_ops && vma->vm_ops->get_policy) {
2062fc314724SMel Gorman bool ret = false;
2063ddc1a5cbSHugh Dickins pgoff_t ilx; /* ignored here */
2064fc314724SMel Gorman
2065ddc1a5cbSHugh Dickins pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
2066fc314724SMel Gorman if (pol && (pol->flags & MPOL_F_MOF))
2067fc314724SMel Gorman ret = true;
2068fc314724SMel Gorman mpol_cond_put(pol);
2069fc314724SMel Gorman
2070fc314724SMel Gorman return ret;
20718d90274bSOleg Nesterov }
20728d90274bSOleg Nesterov
2073fc314724SMel Gorman pol = vma->vm_policy;
20748d90274bSOleg Nesterov if (!pol)
20756b6482bbSOleg Nesterov pol = get_task_policy(current);
2076fc314724SMel Gorman
2077fc314724SMel Gorman return pol->flags & MPOL_F_MOF;
2078fc314724SMel Gorman }
2079fc314724SMel Gorman
apply_policy_zone(struct mempolicy * policy,enum zone_type zone)2080d2226ebdSFeng Tang bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
2081d3eb1570SLai Jiangshan {
2082d3eb1570SLai Jiangshan enum zone_type dynamic_policy_zone = policy_zone;
2083d3eb1570SLai Jiangshan
2084d3eb1570SLai Jiangshan BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
2085d3eb1570SLai Jiangshan
2086d3eb1570SLai Jiangshan /*
2087269fbe72SBen Widawsky * if policy->nodes has movable memory only,
2088d3eb1570SLai Jiangshan * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
2089d3eb1570SLai Jiangshan *
2090269fbe72SBen Widawsky * policy->nodes is intersect with node_states[N_MEMORY].
2091f0953a1bSIngo Molnar * so if the following test fails, it implies
2092269fbe72SBen Widawsky * policy->nodes has movable memory only.
2093d3eb1570SLai Jiangshan */
2094269fbe72SBen Widawsky if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
2095d3eb1570SLai Jiangshan dynamic_policy_zone = ZONE_MOVABLE;
2096d3eb1570SLai Jiangshan
2097d3eb1570SLai Jiangshan return zone >= dynamic_policy_zone;
2098d3eb1570SLai Jiangshan }
2099d3eb1570SLai Jiangshan
weighted_interleave_nodes(struct mempolicy * policy)2100fa3bea4eSGregory Price static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
2101fa3bea4eSGregory Price {
2102274519edSGregory Price unsigned int node;
2103274519edSGregory Price unsigned int cpuset_mems_cookie;
2104fa3bea4eSGregory Price
2105274519edSGregory Price retry:
2106274519edSGregory Price /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
2107274519edSGregory Price cpuset_mems_cookie = read_mems_allowed_begin();
2108274519edSGregory Price node = current->il_prev;
2109fa3bea4eSGregory Price if (!current->il_weight || !node_isset(node, policy->nodes)) {
2110fa3bea4eSGregory Price node = next_node_in(node, policy->nodes);
2111274519edSGregory Price if (read_mems_allowed_retry(cpuset_mems_cookie))
2112274519edSGregory Price goto retry;
2113fa3bea4eSGregory Price if (node == MAX_NUMNODES)
2114fa3bea4eSGregory Price return node;
2115fa3bea4eSGregory Price current->il_prev = node;
2116fa3bea4eSGregory Price current->il_weight = get_il_weight(node);
2117fa3bea4eSGregory Price }
2118fa3bea4eSGregory Price current->il_weight--;
2119fa3bea4eSGregory Price return node;
2120fa3bea4eSGregory Price }
2121fa3bea4eSGregory Price
21221da177e4SLinus Torvalds /* Do dynamic interleaving for a process */
interleave_nodes(struct mempolicy * policy)2123c36f6e6dSHugh Dickins static unsigned int interleave_nodes(struct mempolicy *policy)
21241da177e4SLinus Torvalds {
2125c36f6e6dSHugh Dickins unsigned int nid;
2126274519edSGregory Price unsigned int cpuset_mems_cookie;
21271da177e4SLinus Torvalds
2128274519edSGregory Price /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
2129274519edSGregory Price do {
2130274519edSGregory Price cpuset_mems_cookie = read_mems_allowed_begin();
2131c36f6e6dSHugh Dickins nid = next_node_in(current->il_prev, policy->nodes);
2132274519edSGregory Price } while (read_mems_allowed_retry(cpuset_mems_cookie));
2133274519edSGregory Price
2134c36f6e6dSHugh Dickins if (nid < MAX_NUMNODES)
2135c36f6e6dSHugh Dickins current->il_prev = nid;
2136c36f6e6dSHugh Dickins return nid;
21371da177e4SLinus Torvalds }
21381da177e4SLinus Torvalds
2139dc85da15SChristoph Lameter /*
2140dc85da15SChristoph Lameter * Depending on the memory policy provide a node from which to allocate the
2141dc85da15SChristoph Lameter * next slab entry.
2142dc85da15SChristoph Lameter */
mempolicy_slab_node(void)21432a389610SDavid Rientjes unsigned int mempolicy_slab_node(void)
2144dc85da15SChristoph Lameter {
2145e7b691b0SAndi Kleen struct mempolicy *policy;
21462a389610SDavid Rientjes int node = numa_mem_id();
2147e7b691b0SAndi Kleen
214838b031ddSVasily Averin if (!in_task())
21492a389610SDavid Rientjes return node;
2150e7b691b0SAndi Kleen
2151e7b691b0SAndi Kleen policy = current->mempolicy;
21527858d7bcSFeng Tang if (!policy)
21532a389610SDavid Rientjes return node;
2154765c4507SChristoph Lameter
2155bea904d5SLee Schermerhorn switch (policy->mode) {
2156bea904d5SLee Schermerhorn case MPOL_PREFERRED:
2157269fbe72SBen Widawsky return first_node(policy->nodes);
2158bea904d5SLee Schermerhorn
2159dc85da15SChristoph Lameter case MPOL_INTERLEAVE:
2160dc85da15SChristoph Lameter return interleave_nodes(policy);
2161dc85da15SChristoph Lameter
2162fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE:
2163fa3bea4eSGregory Price return weighted_interleave_nodes(policy);
2164fa3bea4eSGregory Price
2165b27abaccSDave Hansen case MPOL_BIND:
2166b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
2167b27abaccSDave Hansen {
2168c33d6c06SMel Gorman struct zoneref *z;
2169c33d6c06SMel Gorman
2170dc85da15SChristoph Lameter /*
2171dc85da15SChristoph Lameter * Follow bind policy behavior and start allocation at the
2172dc85da15SChristoph Lameter * first node.
2173dc85da15SChristoph Lameter */
217419770b32SMel Gorman struct zonelist *zonelist;
217519770b32SMel Gorman enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
2176c9634cf0SAneesh Kumar K.V zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
2177c33d6c06SMel Gorman z = first_zones_zonelist(zonelist, highest_zoneidx,
2178269fbe72SBen Widawsky &policy->nodes);
217929943248SWei Yang return zonelist_zone(z) ? zonelist_node_idx(z) : node;
2180dd1a239fSMel Gorman }
21817858d7bcSFeng Tang case MPOL_LOCAL:
21827858d7bcSFeng Tang return node;
2183dc85da15SChristoph Lameter
2184dc85da15SChristoph Lameter default:
2185bea904d5SLee Schermerhorn BUG();
2186dc85da15SChristoph Lameter }
2187dc85da15SChristoph Lameter }
2188dc85da15SChristoph Lameter
/*
 * Copy @pol->nodes into @mask and return the number of nodes set in the
 * copy.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
					      nodemask_t *mask)
{
	unsigned int nr_set;

	/*
	 * The barriers stabilize the nodemask locally so that the copy can
	 * be iterated over safely without concern for changes. Allocators
	 * validate that node selection does not violate mems_allowed, so
	 * this is safe.
	 */
	barrier();
	memcpy(mask, &pol->nodes, sizeof(*mask));
	barrier();

	nr_set = nodes_weight(*mask);
	return nr_set;
}
22029685e6e3SGregory Price
/*
 * Do static weighted interleaving for interleave index @ilx.
 *
 * The per-node weights come from wi_state->iw_table; when wi_state is not
 * set up every node counts with weight 1. @ilx is reduced modulo the sum of
 * the weights of the policy's nodes, and the nodes are then walked in order,
 * consuming each node's weight, until the remainder falls inside a node.
 *
 * Returns that node, or numa_node_id() if the policy nodemask is empty.
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
	struct weighted_interleave_state *wi;
	unsigned int remaining, nr_set;
	unsigned int total = 0;
	nodemask_t mask;
	u8 *weights = NULL;
	int nid = 0;

	nr_set = read_once_policy_nodemask(pol, &mask);
	if (!nr_set)
		return numa_node_id();

	rcu_read_lock();

	/* Uninitialized wi_state means all weights are assumed to be 1 */
	wi = rcu_dereference(wi_state);
	if (wi)
		weights = wi->iw_table;

	/* Sum up the weights of all nodes in the policy */
	for_each_node_mask(nid, mask)
		total += weights ? weights[nid] : 1;

	/* Walk the nodes until the offset lands inside one of them */
	remaining = ilx % total;
	for (nid = first_node(mask); remaining;
	     nid = next_node_in(nid, mask)) {
		u8 node_weight = weights ? weights[nid] : 1;

		if (remaining < node_weight)
			break;
		remaining -= node_weight;
	}

	rcu_read_unlock();
	return nid;
}
2242fa3bea4eSGregory Price
2243fee83b3aSAndrew Morton /*
2244ddc1a5cbSHugh Dickins * Do static interleaving for interleave index @ilx. Returns the ilx'th
2245ddc1a5cbSHugh Dickins * node in pol->nodes (starting from ilx=0), wrapping around if ilx
2246ddc1a5cbSHugh Dickins * exceeds the number of present nodes.
2247fee83b3aSAndrew Morton */
interleave_nid(struct mempolicy * pol,pgoff_t ilx)2248ddc1a5cbSHugh Dickins static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
22491da177e4SLinus Torvalds {
22509685e6e3SGregory Price nodemask_t nodemask;
2251276aeee1Syanghui unsigned int target, nnodes;
2252fee83b3aSAndrew Morton int i;
2253fee83b3aSAndrew Morton int nid;
22541da177e4SLinus Torvalds
22559685e6e3SGregory Price nnodes = read_once_policy_nodemask(pol, &nodemask);
2256f5b087b5SDavid Rientjes if (!nnodes)
2257f5b087b5SDavid Rientjes return numa_node_id();
2258ddc1a5cbSHugh Dickins target = ilx % nnodes;
2259276aeee1Syanghui nid = first_node(nodemask);
2260fee83b3aSAndrew Morton for (i = 0; i < target; i++)
2261276aeee1Syanghui nid = next_node(nid, nodemask);
22621da177e4SLinus Torvalds return nid;
22631da177e4SLinus Torvalds }
22641da177e4SLinus Torvalds
22653b98b087SNishanth Aravamudan /*
2266ddc1a5cbSHugh Dickins * Return a nodemask representing a mempolicy for filtering nodes for
2267ddc1a5cbSHugh Dickins * page allocation, together with preferred node id (or the input node id).
22683b98b087SNishanth Aravamudan */
policy_nodemask(gfp_t gfp,struct mempolicy * pol,pgoff_t ilx,int * nid)2269ddc1a5cbSHugh Dickins static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
2270ddc1a5cbSHugh Dickins pgoff_t ilx, int *nid)
2271ddc1a5cbSHugh Dickins {
2272ddc1a5cbSHugh Dickins nodemask_t *nodemask = NULL;
2273ddc1a5cbSHugh Dickins
2274ddc1a5cbSHugh Dickins switch (pol->mode) {
2275ddc1a5cbSHugh Dickins case MPOL_PREFERRED:
2276ddc1a5cbSHugh Dickins /* Override input node id */
2277ddc1a5cbSHugh Dickins *nid = first_node(pol->nodes);
2278ddc1a5cbSHugh Dickins break;
2279ddc1a5cbSHugh Dickins case MPOL_PREFERRED_MANY:
2280ddc1a5cbSHugh Dickins nodemask = &pol->nodes;
2281ddc1a5cbSHugh Dickins if (pol->home_node != NUMA_NO_NODE)
2282ddc1a5cbSHugh Dickins *nid = pol->home_node;
2283ddc1a5cbSHugh Dickins break;
2284ddc1a5cbSHugh Dickins case MPOL_BIND:
2285ddc1a5cbSHugh Dickins /* Restrict to nodemask (but not on lower zones) */
2286ddc1a5cbSHugh Dickins if (apply_policy_zone(pol, gfp_zone(gfp)) &&
2287ddc1a5cbSHugh Dickins cpuset_nodemask_valid_mems_allowed(&pol->nodes))
2288ddc1a5cbSHugh Dickins nodemask = &pol->nodes;
2289ddc1a5cbSHugh Dickins if (pol->home_node != NUMA_NO_NODE)
2290ddc1a5cbSHugh Dickins *nid = pol->home_node;
2291ddc1a5cbSHugh Dickins /*
2292ddc1a5cbSHugh Dickins * __GFP_THISNODE shouldn't even be used with the bind policy
2293ddc1a5cbSHugh Dickins * because we might easily break the expectation to stay on the
2294ddc1a5cbSHugh Dickins * requested node and not break the policy.
2295ddc1a5cbSHugh Dickins */
2296ddc1a5cbSHugh Dickins WARN_ON_ONCE(gfp & __GFP_THISNODE);
2297ddc1a5cbSHugh Dickins break;
2298ddc1a5cbSHugh Dickins case MPOL_INTERLEAVE:
2299ddc1a5cbSHugh Dickins /* Override input node id */
2300ddc1a5cbSHugh Dickins *nid = (ilx == NO_INTERLEAVE_INDEX) ?
2301ddc1a5cbSHugh Dickins interleave_nodes(pol) : interleave_nid(pol, ilx);
2302ddc1a5cbSHugh Dickins break;
2303fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE:
2304fa3bea4eSGregory Price *nid = (ilx == NO_INTERLEAVE_INDEX) ?
2305fa3bea4eSGregory Price weighted_interleave_nodes(pol) :
2306fa3bea4eSGregory Price weighted_interleave_nid(pol, ilx);
2307fa3bea4eSGregory Price break;
2308ddc1a5cbSHugh Dickins }
2309ddc1a5cbSHugh Dickins
2310ddc1a5cbSHugh Dickins return nodemask;
23115da7ca86SChristoph Lameter }
23125da7ca86SChristoph Lameter
231300ac59adSChen, Kenneth W #ifdef CONFIG_HUGETLBFS
2314480eccf9SLee Schermerhorn /*
231504ec6264SVlastimil Babka * huge_node(@vma, @addr, @gfp_flags, @mpol)
2316b46e14acSFabian Frederick * @vma: virtual memory area whose policy is sought
2317b46e14acSFabian Frederick * @addr: address in @vma for shared policy lookup and interleave policy
2318b46e14acSFabian Frederick * @gfp_flags: for requested zone
2319b46e14acSFabian Frederick * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2320b27abaccSDave Hansen * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2321480eccf9SLee Schermerhorn *
232204ec6264SVlastimil Babka * Returns a nid suitable for a huge page allocation and a pointer
232352cd3b07SLee Schermerhorn * to the struct mempolicy for conditional unref after allocation.
2324b27abaccSDave Hansen * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2325b27abaccSDave Hansen * to the mempolicy's @nodemask for filtering the zonelist.
2326480eccf9SLee Schermerhorn */
huge_node(struct vm_area_struct * vma,unsigned long addr,gfp_t gfp_flags,struct mempolicy ** mpol,nodemask_t ** nodemask)232704ec6264SVlastimil Babka int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
232804ec6264SVlastimil Babka struct mempolicy **mpol, nodemask_t **nodemask)
23295da7ca86SChristoph Lameter {
2330ddc1a5cbSHugh Dickins pgoff_t ilx;
233104ec6264SVlastimil Babka int nid;
23325da7ca86SChristoph Lameter
2333ddc1a5cbSHugh Dickins nid = numa_node_id();
2334ddc1a5cbSHugh Dickins *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
2335ddc1a5cbSHugh Dickins *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
233604ec6264SVlastimil Babka return nid;
23375da7ca86SChristoph Lameter }
233806808b08SLee Schermerhorn
233906808b08SLee Schermerhorn /*
234006808b08SLee Schermerhorn * init_nodemask_of_mempolicy
234106808b08SLee Schermerhorn *
234206808b08SLee Schermerhorn * If the current task's mempolicy is "default" [NULL], return 'false'
234306808b08SLee Schermerhorn * to indicate default policy. Otherwise, extract the policy nodemask
234406808b08SLee Schermerhorn * for 'bind' or 'interleave' policy into the argument nodemask, or
234506808b08SLee Schermerhorn * initialize the argument nodemask to contain the single node for
234606808b08SLee Schermerhorn * 'preferred' or 'local' policy and return 'true' to indicate presence
234706808b08SLee Schermerhorn * of non-default mempolicy.
234806808b08SLee Schermerhorn *
234906808b08SLee Schermerhorn * We don't bother with reference counting the mempolicy [mpol_get/put]
235006808b08SLee Schermerhorn * because the current task is examining it's own mempolicy and a task's
235106808b08SLee Schermerhorn * mempolicy is only ever changed by the task itself.
235206808b08SLee Schermerhorn *
235306808b08SLee Schermerhorn * N.B., it is the caller's responsibility to free a returned nodemask.
235406808b08SLee Schermerhorn */
init_nodemask_of_mempolicy(nodemask_t * mask)235506808b08SLee Schermerhorn bool init_nodemask_of_mempolicy(nodemask_t *mask)
235606808b08SLee Schermerhorn {
235706808b08SLee Schermerhorn struct mempolicy *mempolicy;
235806808b08SLee Schermerhorn
235906808b08SLee Schermerhorn if (!(mask && current->mempolicy))
236006808b08SLee Schermerhorn return false;
236106808b08SLee Schermerhorn
2362c0ff7453SMiao Xie task_lock(current);
236306808b08SLee Schermerhorn mempolicy = current->mempolicy;
236406808b08SLee Schermerhorn switch (mempolicy->mode) {
236506808b08SLee Schermerhorn case MPOL_PREFERRED:
2366b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
236706808b08SLee Schermerhorn case MPOL_BIND:
236806808b08SLee Schermerhorn case MPOL_INTERLEAVE:
2369fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE:
2370269fbe72SBen Widawsky *mask = mempolicy->nodes;
237106808b08SLee Schermerhorn break;
237206808b08SLee Schermerhorn
23737858d7bcSFeng Tang case MPOL_LOCAL:
2374269fbe72SBen Widawsky init_nodemask_of_node(mask, numa_node_id());
23757858d7bcSFeng Tang break;
23767858d7bcSFeng Tang
237706808b08SLee Schermerhorn default:
237806808b08SLee Schermerhorn BUG();
237906808b08SLee Schermerhorn }
2380c0ff7453SMiao Xie task_unlock(current);
238106808b08SLee Schermerhorn
238206808b08SLee Schermerhorn return true;
238306808b08SLee Schermerhorn }
238400ac59adSChen, Kenneth W #endif
23855da7ca86SChristoph Lameter
23866f48d0ebSDavid Rientjes /*
2387b26e517aSFeng Tang * mempolicy_in_oom_domain
23886f48d0ebSDavid Rientjes *
2389b26e517aSFeng Tang * If tsk's mempolicy is "bind", check for intersection between mask and
2390b26e517aSFeng Tang * the policy nodemask. Otherwise, return true for all other policies
2391b26e517aSFeng Tang * including "interleave", as a tsk with "interleave" policy may have
2392b26e517aSFeng Tang * memory allocated from all nodes in system.
23936f48d0ebSDavid Rientjes *
23946f48d0ebSDavid Rientjes * Takes task_lock(tsk) to prevent freeing of its mempolicy.
23956f48d0ebSDavid Rientjes */
mempolicy_in_oom_domain(struct task_struct * tsk,const nodemask_t * mask)2396b26e517aSFeng Tang bool mempolicy_in_oom_domain(struct task_struct *tsk,
23976f48d0ebSDavid Rientjes const nodemask_t *mask)
23986f48d0ebSDavid Rientjes {
23996f48d0ebSDavid Rientjes struct mempolicy *mempolicy;
24006f48d0ebSDavid Rientjes bool ret = true;
24016f48d0ebSDavid Rientjes
24026f48d0ebSDavid Rientjes if (!mask)
24036f48d0ebSDavid Rientjes return ret;
2404b26e517aSFeng Tang
24056f48d0ebSDavid Rientjes task_lock(tsk);
24066f48d0ebSDavid Rientjes mempolicy = tsk->mempolicy;
2407b26e517aSFeng Tang if (mempolicy && mempolicy->mode == MPOL_BIND)
2408269fbe72SBen Widawsky ret = nodes_intersects(mempolicy->nodes, *mask);
24096f48d0ebSDavid Rientjes task_unlock(tsk);
2410b26e517aSFeng Tang
24116f48d0ebSDavid Rientjes return ret;
24126f48d0ebSDavid Rientjes }
24136f48d0ebSDavid Rientjes
/*
 * Two-pass allocation for MPOL_PREFERRED_MANY.
 *
 * Pass one is restricted to @nodemask (the preferred nodes); it is not
 * allowed to enter direct reclaim, may fail, and stays quiet when it does.
 * Pass two repeats the request with the caller's original @gfp and no
 * nodemask, i.e. it may use any node in the system.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
						int nid, nodemask_t *nodemask)
{
	gfp_t first_pass_gfp = (gfp | __GFP_NOWARN) &
			       ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
	struct page *page;

	page = __alloc_frozen_pages_noprof(first_pass_gfp, order, nid, nodemask);
	if (page)
		return page;

	return __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
}
24344c54d949SFeng Tang
24351da177e4SLinus Torvalds /**
2436ddc1a5cbSHugh Dickins * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
2437eb350739SMatthew Wilcox (Oracle) * @gfp: GFP flags.
2438ddc1a5cbSHugh Dickins * @order: Order of the page allocation.
2439ddc1a5cbSHugh Dickins * @pol: Pointer to the NUMA mempolicy.
2440ddc1a5cbSHugh Dickins * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2441ddc1a5cbSHugh Dickins * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
24421da177e4SLinus Torvalds *
2443ddc1a5cbSHugh Dickins * Return: The page on success or NULL if allocation fails.
24441da177e4SLinus Torvalds */
alloc_pages_mpol(gfp_t gfp,unsigned int order,struct mempolicy * pol,pgoff_t ilx,int nid)244538558b24SMatthew Wilcox (Oracle) static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2446ddc1a5cbSHugh Dickins struct mempolicy *pol, pgoff_t ilx, int nid)
24471da177e4SLinus Torvalds {
2448ddc1a5cbSHugh Dickins nodemask_t *nodemask;
2449adf88aa8SMatthew Wilcox (Oracle) struct page *page;
2450adf88aa8SMatthew Wilcox (Oracle)
2451ddc1a5cbSHugh Dickins nodemask = policy_nodemask(gfp, pol, ilx, &nid);
24524c54d949SFeng Tang
2453ddc1a5cbSHugh Dickins if (pol->mode == MPOL_PREFERRED_MANY)
2454ddc1a5cbSHugh Dickins return alloc_pages_preferred_many(gfp, order, nid, nodemask);
245519deb769SDavid Rientjes
2456ddc1a5cbSHugh Dickins if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2457ddc1a5cbSHugh Dickins /* filter "hugepage" allocation, unless from alloc_pages() */
2458b90c453dSNico Pache is_pmd_order(order) && ilx != NO_INTERLEAVE_INDEX) {
245919deb769SDavid Rientjes /*
246019deb769SDavid Rientjes * For hugepage allocation and non-interleave policy which
246119deb769SDavid Rientjes * allows the current node (or other explicitly preferred
246219deb769SDavid Rientjes * node) we only try to allocate from the current/preferred
246319deb769SDavid Rientjes * node and don't fall back to other nodes, as the cost of
246419deb769SDavid Rientjes * remote accesses would likely offset THP benefits.
246519deb769SDavid Rientjes *
2466b27abaccSDave Hansen * If the policy is interleave or does not allow the current
246719deb769SDavid Rientjes * node in its nodemask, we allocate the standard way.
246819deb769SDavid Rientjes */
2469ddc1a5cbSHugh Dickins if (pol->mode != MPOL_INTERLEAVE &&
2470fa3bea4eSGregory Price pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
2471ddc1a5cbSHugh Dickins (!nodemask || node_isset(nid, *nodemask))) {
2472cc638f32SVlastimil Babka /*
2473cc638f32SVlastimil Babka * First, try to allocate THP only on local node, but
2474cc638f32SVlastimil Babka * don't reclaim unnecessarily, just compact.
2475cc638f32SVlastimil Babka */
247664297524SMatthew Wilcox (Oracle) page = __alloc_frozen_pages_noprof(
247764297524SMatthew Wilcox (Oracle) gfp | __GFP_THISNODE | __GFP_NORETRY, order,
247864297524SMatthew Wilcox (Oracle) nid, NULL);
2479ddc1a5cbSHugh Dickins if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2480ddc1a5cbSHugh Dickins return page;
248176e654ccSDavid Rientjes /*
248276e654ccSDavid Rientjes * If hugepage allocations are configured to always
248376e654ccSDavid Rientjes * synchronous compact or the vma has been madvised
248476e654ccSDavid Rientjes * to prefer hugepage backing, retry allowing remote
2485cc638f32SVlastimil Babka * memory with both reclaim and compact as well.
248676e654ccSDavid Rientjes */
248719deb769SDavid Rientjes }
248819deb769SDavid Rientjes }
248919deb769SDavid Rientjes
249064297524SMatthew Wilcox (Oracle) page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
2491ddc1a5cbSHugh Dickins
2492264a88caSHonggyu Kim if (unlikely(pol->mode == MPOL_INTERLEAVE ||
2493264a88caSHonggyu Kim pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
2494ddc1a5cbSHugh Dickins /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2495ddc1a5cbSHugh Dickins if (static_branch_likely(&vm_numa_stat_key) &&
2496ddc1a5cbSHugh Dickins page_to_nid(page) == nid) {
2497ddc1a5cbSHugh Dickins preempt_disable();
2498ddc1a5cbSHugh Dickins __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2499ddc1a5cbSHugh Dickins preempt_enable();
2500ddc1a5cbSHugh Dickins }
2501ddc1a5cbSHugh Dickins }
2502ddc1a5cbSHugh Dickins
2503ddc1a5cbSHugh Dickins return page;
2504ddc1a5cbSHugh Dickins }
2505ddc1a5cbSHugh Dickins
/*
 * Folio flavour of alloc_pages_mpol(): always requests a compound page,
 * gives the returned page its reference count and converts it to a folio.
 * Returns NULL if the allocation fails.
 */
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
		struct mempolicy *pol, pgoff_t ilx, int nid)
{
	struct page *head;

	head = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, nid);
	if (head) {
		set_page_refcounted(head);
		return page_rmappable_folio(head);
	}

	return NULL;
}
2517a19621edSKefeng Wang
2518ddc1a5cbSHugh Dickins /**
2519ddc1a5cbSHugh Dickins * vma_alloc_folio - Allocate a folio for a VMA.
2520ddc1a5cbSHugh Dickins * @gfp: GFP flags.
2521ddc1a5cbSHugh Dickins * @order: Order of the folio.
2522ddc1a5cbSHugh Dickins * @vma: Pointer to VMA.
2523ddc1a5cbSHugh Dickins * @addr: Virtual address of the allocation. Must be inside @vma.
2524ddc1a5cbSHugh Dickins *
2525ddc1a5cbSHugh Dickins * Allocate a folio for a specific address in @vma, using the appropriate
2526ddc1a5cbSHugh Dickins * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
2527ddc1a5cbSHugh Dickins * VMA to prevent it from going away. Should be used for all allocations
2528ddc1a5cbSHugh Dickins * for folios that will be mapped into user space, excepting hugetlbfs, and
252938558b24SMatthew Wilcox (Oracle) * excepting where direct use of folio_alloc_mpol() is more appropriate.
2530ddc1a5cbSHugh Dickins *
2531ddc1a5cbSHugh Dickins * Return: The folio on success or NULL if allocation fails.
2532ddc1a5cbSHugh Dickins */
vma_alloc_folio_noprof(gfp_t gfp,int order,struct vm_area_struct * vma,unsigned long addr)2533b951aaffSSuren Baghdasaryan struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
25346359c39cSKefeng Wang unsigned long addr)
2535ddc1a5cbSHugh Dickins {
2536ddc1a5cbSHugh Dickins struct mempolicy *pol;
2537ddc1a5cbSHugh Dickins pgoff_t ilx;
25383174d70cSKefeng Wang struct folio *folio;
2539ddc1a5cbSHugh Dickins
25409651fcedSJason A. Donenfeld if (vma->vm_flags & VM_DROPPABLE)
25419651fcedSJason A. Donenfeld gfp |= __GFP_NOWARN;
25429651fcedSJason A. Donenfeld
2543ddc1a5cbSHugh Dickins pol = get_vma_policy(vma, addr, order, &ilx);
25443174d70cSKefeng Wang folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
2545d51e9894SVlastimil Babka mpol_cond_put(pol);
25463174d70cSKefeng Wang return folio;
2547f584b680SMatthew Wilcox (Oracle) }
2548b951aaffSSuren Baghdasaryan EXPORT_SYMBOL(vma_alloc_folio_noprof);
2549f584b680SMatthew Wilcox (Oracle)
/*
 * Allocate pages via alloc_pages_mpol() without setting up their reference
 * count; callers in this file that want a refcounted page call
 * set_page_refcounted() on the result.
 */
struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol;

	/*
	 * In interrupt context, or when the caller insists on this node,
	 * the task policy is ignored and the system default is used.
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy.
	 */
	if (in_interrupt() || (gfp & __GFP_THISNODE))
		pol = &default_policy;
	else
		pol = get_task_policy(current);

	return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
				numa_node_id());
}
256464297524SMatthew Wilcox (Oracle)
25651da177e4SLinus Torvalds /**
2566d7f946d0SMatthew Wilcox (Oracle) * alloc_pages - Allocate pages.
25676421ec76SMatthew Wilcox (Oracle) * @gfp: GFP flags.
25686421ec76SMatthew Wilcox (Oracle) * @order: Power of two of number of pages to allocate.
25691da177e4SLinus Torvalds *
25706421ec76SMatthew Wilcox (Oracle) * Allocate 1 << @order contiguous pages. The physical address of the
25716421ec76SMatthew Wilcox (Oracle) * first page is naturally aligned (eg an order-3 allocation will be aligned
25726421ec76SMatthew Wilcox (Oracle) * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
25736421ec76SMatthew Wilcox (Oracle) * process is honoured when in process context.
25741da177e4SLinus Torvalds *
25756421ec76SMatthew Wilcox (Oracle) * Context: Can be called from any context, providing the appropriate GFP
25766421ec76SMatthew Wilcox (Oracle) * flags are used.
25776421ec76SMatthew Wilcox (Oracle) * Return: The page on success or NULL if allocation fails.
25781da177e4SLinus Torvalds */
alloc_pages_noprof(gfp_t gfp,unsigned int order)2579b951aaffSSuren Baghdasaryan struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
25801da177e4SLinus Torvalds {
258164297524SMatthew Wilcox (Oracle) struct page *page = alloc_frozen_pages_noprof(gfp, order);
258252cd3b07SLee Schermerhorn
258364297524SMatthew Wilcox (Oracle) if (page)
258464297524SMatthew Wilcox (Oracle) set_page_refcounted(page);
258564297524SMatthew Wilcox (Oracle) return page;
25861da177e4SLinus Torvalds }
2587b951aaffSSuren Baghdasaryan EXPORT_SYMBOL(alloc_pages_noprof);
25881da177e4SLinus Torvalds
/*
 * Folio flavour of alloc_pages(): requests a compound page and hands the
 * result (possibly NULL) to page_rmappable_folio().
 */
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
	struct page *head = alloc_pages_noprof(gfp | __GFP_COMP, order);

	return page_rmappable_folio(head);
}
EXPORT_SYMBOL(folio_alloc_noprof);
2594cc09cb13SMatthew Wilcox (Oracle)
/*
 * Bulk allocation for MPOL_INTERLEAVE.
 *
 * @nr_pages is split evenly over the number of nodes in @pol->nodes; the
 * remainder is handed out one extra page at a time to the first nodes
 * visited.  The node for each batch comes from interleave_nodes(), called
 * once per batch.  @page_array is filled contiguously with whatever each
 * batch actually returned.
 *
 * Returns the total number of pages allocated, which may be less than
 * @nr_pages.
 */
static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	int nr_nodes = nodes_weight(pol->nodes);
	unsigned long share = nr_pages / nr_nodes;
	int extra = nr_pages - nr_nodes * share;
	unsigned long total = 0;
	int n;

	for (n = 0; n < nr_nodes; n++) {
		unsigned long want = share;
		unsigned long got;

		/* Spread the remainder: one more page while any is left. */
		if (extra) {
			want++;
			extra--;
		}

		got = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL,
					      want, page_array);
		page_array += got;
		total += got;
	}

	return total;
}
2629c00b6b96SChen Wandun
/*
 * Bulk allocation for MPOL_WEIGHTED_INTERLEAVE.
 *
 * The per-task cursor (current->il_prev, current->il_weight) records the
 * node the previous weighted-interleave allocation stopped on and how much
 * of that node's weight is still unused.  This function:
 *
 *   1. takes a stable snapshot of the policy nodemask;
 *   2. first uses up the leftover weight on the cursor node, if that node
 *      is still in the nodemask;
 *   3. distributes the remaining pages over the nodes in whole "rounds"
 *      (each node gets weight * rounds pages) plus a partial round, so that
 *      each node is handed to the bulk allocator at most once;
 *   4. stores the new cursor position back into the task.
 *
 * Returns the number of pages placed in @page_array, which may be less than
 * @nr_pages (allocation failure, empty nodemask, or failure to allocate the
 * temporary weight table).
 *
 * This runs on the _noprof path, so it calls alloc_pages_bulk_noprof()
 * directly, like the other bulk helpers in this file, rather than the
 * hook-wrapped __alloc_pages_bulk().
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	struct weighted_interleave_state *state;
	struct task_struct *me = current;
	unsigned int cpuset_mems_cookie;
	unsigned long total_allocated = 0;
	unsigned long nr_allocated = 0;
	unsigned long rounds;
	unsigned long node_pages, delta;
	u8 *weights, weight;
	unsigned int weight_total = 0;
	unsigned long rem_pages = nr_pages;
	nodemask_t nodes;
	int nnodes, node;
	int resume_node = MAX_NUMNODES - 1;
	u8 resume_weight = 0;
	int prev_node;
	int i;

	if (!nr_pages)
		return 0;

	/* read the nodes onto the stack, retry if done during rebind */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nnodes = read_once_policy_nodemask(pol, &nodes);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	/* if the nodemask has become invalid, we cannot do anything */
	if (!nnodes)
		return 0;

	/* Continue allocating from most recent node and adjust the nr_pages */
	node = me->il_prev;
	weight = me->il_weight;
	if (weight && node_isset(node, nodes)) {
		node_pages = min(rem_pages, weight);
		nr_allocated = alloc_pages_bulk_noprof(gfp, node, NULL,
						       node_pages, page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		/* if that's all the pages, no need to interleave */
		if (rem_pages <= weight) {
			me->il_weight -= rem_pages;
			return total_allocated;
		}
		/* Otherwise we adjust remaining pages, continue from there */
		rem_pages -= weight;
	}
	/* clear active weight in case of an allocation failure */
	me->il_weight = 0;
	prev_node = node;

	/*
	 * create a local copy of node weights to operate on outside rcu
	 *
	 * Note that this table is always allocated with GFP_KERNEL,
	 * independent of the caller's @gfp.
	 */
	weights = kzalloc(nr_node_ids, GFP_KERNEL);
	if (!weights)
		return total_allocated;

	rcu_read_lock();
	state = rcu_dereference(wi_state);
	if (state) {
		memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
		rcu_read_unlock();
	} else {
		/* No weight state published: every node gets weight 1. */
		rcu_read_unlock();
		for (i = 0; i < nr_node_ids; i++)
			weights[i] = 1;
	}

	/*
	 * calculate total, detect system default usage
	 *
	 * NOTE(review): weight_total is used as a divisor below; this relies
	 * on the weights of the nodes in the mask not all being zero, which
	 * is not enforced in this function -- confirm against the code that
	 * populates iw_table.
	 */
	for_each_node_mask(node, nodes)
		weight_total += weights[node];

	/*
	 * Calculate rounds/partial rounds to minimize bulk allocator calls.
	 * Track which node weighted interleave should resume from.
	 *
	 * if (rounds > 0) and (delta == 0), resume_node will always be
	 * the node following prev_node and its weight.
	 */
	rounds = rem_pages / weight_total;
	delta = rem_pages % weight_total;
	resume_node = next_node_in(prev_node, nodes);
	resume_weight = weights[resume_node];
	for (i = 0; i < nnodes; i++) {
		node = next_node_in(prev_node, nodes);
		weight = weights[node];
		node_pages = weight * rounds;
		/* If a delta exists, add this node's portion of the delta */
		if (delta > weight) {
			node_pages += weight;
			delta -= weight;
		} else if (delta) {
			/* when delta is depleted, resume from that node */
			node_pages += delta;
			resume_node = node;
			resume_weight = weight - delta;
			delta = 0;
		}
		/* node_pages can be 0 if an allocation fails and rounds == 0 */
		if (!node_pages)
			break;
		nr_allocated = alloc_pages_bulk_noprof(gfp, node, NULL,
						       node_pages, page_array);
		page_array += nr_allocated;
		total_allocated += nr_allocated;
		if (total_allocated == nr_pages)
			break;
		prev_node = node;
	}
	/* Publish the new cursor for the next weighted-interleave allocation. */
	me->il_prev = resume_node;
	me->il_weight = resume_weight;
	kfree(weights);
	return total_allocated;
}
2747fa3bea4eSGregory Price
/*
 * Bulk allocation for MPOL_PREFERRED_MANY, in two passes.
 *
 * Pass one asks for all @nr_pages from @pol->nodes starting at @nid, with
 * direct reclaim and __GFP_NOFAIL stripped and warnings suppressed.  If it
 * comes up short, pass two requests the missing pages with the caller's
 * original @gfp, no nodemask, and numa_node_id() as the preferred node,
 * appending them after the pages already obtained.
 *
 * Returns the total number of pages allocated.
 */
static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	gfp_t first_pass_gfp = (gfp | __GFP_NOWARN) &
			       ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
	unsigned long filled;

	filled = alloc_pages_bulk_noprof(first_pass_gfp, nid, &pol->nodes,
					 nr_pages, page_array);
	if (filled >= nr_pages)
		return filled;

	filled += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
					  nr_pages - filled,
					  page_array + filled);
	return filled;
}
2767c00b6b96SChen Wandun
2768c00b6b96SChen Wandun /* alloc pages bulk and mempolicy should be considered at the
2769c00b6b96SChen Wandun * same time in some situation such as vmalloc.
2770c00b6b96SChen Wandun *
2771c00b6b96SChen Wandun * It can accelerate memory allocation especially interleaving
2772c00b6b96SChen Wandun * allocate memory.
2773c00b6b96SChen Wandun */
alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,unsigned long nr_pages,struct page ** page_array)27746bf9b5b4SLuiz Capitulino unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
2775c00b6b96SChen Wandun unsigned long nr_pages, struct page **page_array)
2776c00b6b96SChen Wandun {
2777c00b6b96SChen Wandun struct mempolicy *pol = &default_policy;
2778ddc1a5cbSHugh Dickins nodemask_t *nodemask;
2779ddc1a5cbSHugh Dickins int nid;
2780c00b6b96SChen Wandun
2781c00b6b96SChen Wandun if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2782c00b6b96SChen Wandun pol = get_task_policy(current);
2783c00b6b96SChen Wandun
2784c00b6b96SChen Wandun if (pol->mode == MPOL_INTERLEAVE)
27856bf9b5b4SLuiz Capitulino return alloc_pages_bulk_interleave(gfp, pol,
2786c00b6b96SChen Wandun nr_pages, page_array);
2787c00b6b96SChen Wandun
2788fa3bea4eSGregory Price if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
27896bf9b5b4SLuiz Capitulino return alloc_pages_bulk_weighted_interleave(
2790fa3bea4eSGregory Price gfp, pol, nr_pages, page_array);
2791fa3bea4eSGregory Price
2792c00b6b96SChen Wandun if (pol->mode == MPOL_PREFERRED_MANY)
27936bf9b5b4SLuiz Capitulino return alloc_pages_bulk_preferred_many(gfp,
2794c00b6b96SChen Wandun numa_node_id(), pol, nr_pages, page_array);
2795c00b6b96SChen Wandun
2796ddc1a5cbSHugh Dickins nid = numa_node_id();
2797ddc1a5cbSHugh Dickins nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2798b951aaffSSuren Baghdasaryan return alloc_pages_bulk_noprof(gfp, nid, nodemask,
2799c8b97953SLuiz Capitulino nr_pages, page_array);
2800c00b6b96SChen Wandun }
2801c00b6b96SChen Wandun
vma_dup_policy(struct vm_area_struct * src,struct vm_area_struct * dst)2802ef0855d3SOleg Nesterov int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2803ef0855d3SOleg Nesterov {
2804c36f6e6dSHugh Dickins struct mempolicy *pol = mpol_dup(src->vm_policy);
2805ef0855d3SOleg Nesterov
2806ef0855d3SOleg Nesterov if (IS_ERR(pol))
2807ef0855d3SOleg Nesterov return PTR_ERR(pol);
2808ef0855d3SOleg Nesterov dst->vm_policy = pol;
2809ef0855d3SOleg Nesterov return 0;
2810ef0855d3SOleg Nesterov }
2811ef0855d3SOleg Nesterov
28124225399aSPaul Jackson /*
2813846a16bfSLee Schermerhorn * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
28144225399aSPaul Jackson * rebinds the mempolicy its copying by calling mpol_rebind_policy()
28154225399aSPaul Jackson * with the mems_allowed returned by cpuset_mems_allowed(). This
28164225399aSPaul Jackson * keeps mempolicies cpuset relative after its cpuset moves. See
28174225399aSPaul Jackson * further kernel/cpuset.c update_nodemask().
2818708c1bbcSMiao Xie *
2819708c1bbcSMiao Xie * current's mempolicy may be rebinded by the other task(the task that changes
2820708c1bbcSMiao Xie * cpuset's mems), so we needn't do rebind work for current task.
28214225399aSPaul Jackson */
28224225399aSPaul Jackson
2823846a16bfSLee Schermerhorn /* Slow path of a mempolicy duplicate */
__mpol_dup(struct mempolicy * old)2824846a16bfSLee Schermerhorn struct mempolicy *__mpol_dup(struct mempolicy *old)
28251da177e4SLinus Torvalds {
28261da177e4SLinus Torvalds struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
28271da177e4SLinus Torvalds
28281da177e4SLinus Torvalds if (!new)
28291da177e4SLinus Torvalds return ERR_PTR(-ENOMEM);
2830708c1bbcSMiao Xie
2831708c1bbcSMiao Xie /* task's mempolicy is protected by alloc_lock */
2832708c1bbcSMiao Xie if (old == current->mempolicy) {
2833708c1bbcSMiao Xie task_lock(current);
2834708c1bbcSMiao Xie *new = *old;
2835708c1bbcSMiao Xie task_unlock(current);
2836708c1bbcSMiao Xie } else
2837708c1bbcSMiao Xie *new = *old;
2838708c1bbcSMiao Xie
28394225399aSPaul Jackson if (current_cpuset_is_being_rebound()) {
28404225399aSPaul Jackson nodemask_t mems = cpuset_mems_allowed(current);
2841213980c0SVlastimil Babka mpol_rebind_policy(new, &mems);
28424225399aSPaul Jackson }
28431da177e4SLinus Torvalds atomic_set(&new->refcnt, 1);
28441da177e4SLinus Torvalds return new;
28451da177e4SLinus Torvalds }
28461da177e4SLinus Torvalds
28471da177e4SLinus Torvalds /* Slow path of a mempolicy comparison */
__mpol_equal(struct mempolicy * a,struct mempolicy * b)2848fcfb4dccSKOSAKI Motohiro bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
28491da177e4SLinus Torvalds {
28501da177e4SLinus Torvalds if (!a || !b)
2851fcfb4dccSKOSAKI Motohiro return false;
285245c4745aSLee Schermerhorn if (a->mode != b->mode)
2853fcfb4dccSKOSAKI Motohiro return false;
285419800502SBob Liu if (a->flags != b->flags)
2855fcfb4dccSKOSAKI Motohiro return false;
2856c6018b4bSAneesh Kumar K.V if (a->home_node != b->home_node)
2857c6018b4bSAneesh Kumar K.V return false;
285819800502SBob Liu if (mpol_store_user_nodemask(a))
285919800502SBob Liu if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2860fcfb4dccSKOSAKI Motohiro return false;
286119800502SBob Liu
286245c4745aSLee Schermerhorn switch (a->mode) {
286319770b32SMel Gorman case MPOL_BIND:
28641da177e4SLinus Torvalds case MPOL_INTERLEAVE:
28651da177e4SLinus Torvalds case MPOL_PREFERRED:
2866b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
2867fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE:
2868269fbe72SBen Widawsky return !!nodes_equal(a->nodes, b->nodes);
28697858d7bcSFeng Tang case MPOL_LOCAL:
28707858d7bcSFeng Tang return true;
28711da177e4SLinus Torvalds default:
28721da177e4SLinus Torvalds BUG();
2873fcfb4dccSKOSAKI Motohiro return false;
28741da177e4SLinus Torvalds }
28751da177e4SLinus Torvalds }
28761da177e4SLinus Torvalds
28771da177e4SLinus Torvalds /*
28781da177e4SLinus Torvalds * Shared memory backing store policy support.
28791da177e4SLinus Torvalds *
28801da177e4SLinus Torvalds * Remember policies even when nobody has shared memory mapped.
28811da177e4SLinus Torvalds * The policies are kept in Red-Black tree linked from the inode.
28824a8c7bb5SNathan Zimmer * They are protected by the sp->lock rwlock, which should be held
28831da177e4SLinus Torvalds * for any accesses to the tree.
28841da177e4SLinus Torvalds */
28851da177e4SLinus Torvalds
28864a8c7bb5SNathan Zimmer /*
28874a8c7bb5SNathan Zimmer * lookup first element intersecting start-end. Caller holds sp->lock for
28884a8c7bb5SNathan Zimmer * reading or for writing
28894a8c7bb5SNathan Zimmer */
sp_lookup(struct shared_policy * sp,pgoff_t start,pgoff_t end)289093397c3bSHugh Dickins static struct sp_node *sp_lookup(struct shared_policy *sp,
289193397c3bSHugh Dickins pgoff_t start, pgoff_t end)
28921da177e4SLinus Torvalds {
28931da177e4SLinus Torvalds struct rb_node *n = sp->root.rb_node;
28941da177e4SLinus Torvalds
28951da177e4SLinus Torvalds while (n) {
28961da177e4SLinus Torvalds struct sp_node *p = rb_entry(n, struct sp_node, nd);
28971da177e4SLinus Torvalds
28981da177e4SLinus Torvalds if (start >= p->end)
28991da177e4SLinus Torvalds n = n->rb_right;
29001da177e4SLinus Torvalds else if (end <= p->start)
29011da177e4SLinus Torvalds n = n->rb_left;
29021da177e4SLinus Torvalds else
29031da177e4SLinus Torvalds break;
29041da177e4SLinus Torvalds }
29051da177e4SLinus Torvalds if (!n)
29061da177e4SLinus Torvalds return NULL;
29071da177e4SLinus Torvalds for (;;) {
29081da177e4SLinus Torvalds struct sp_node *w = NULL;
29091da177e4SLinus Torvalds struct rb_node *prev = rb_prev(n);
29101da177e4SLinus Torvalds if (!prev)
29111da177e4SLinus Torvalds break;
29121da177e4SLinus Torvalds w = rb_entry(prev, struct sp_node, nd);
29131da177e4SLinus Torvalds if (w->end <= start)
29141da177e4SLinus Torvalds break;
29151da177e4SLinus Torvalds n = prev;
29161da177e4SLinus Torvalds }
29171da177e4SLinus Torvalds return rb_entry(n, struct sp_node, nd);
29181da177e4SLinus Torvalds }
29191da177e4SLinus Torvalds
29204a8c7bb5SNathan Zimmer /*
29214a8c7bb5SNathan Zimmer * Insert a new shared policy into the list. Caller holds sp->lock for
29224a8c7bb5SNathan Zimmer * writing.
29234a8c7bb5SNathan Zimmer */
sp_insert(struct shared_policy * sp,struct sp_node * new)29241da177e4SLinus Torvalds static void sp_insert(struct shared_policy *sp, struct sp_node *new)
29251da177e4SLinus Torvalds {
29261da177e4SLinus Torvalds struct rb_node **p = &sp->root.rb_node;
29271da177e4SLinus Torvalds struct rb_node *parent = NULL;
29281da177e4SLinus Torvalds struct sp_node *nd;
29291da177e4SLinus Torvalds
29301da177e4SLinus Torvalds while (*p) {
29311da177e4SLinus Torvalds parent = *p;
29321da177e4SLinus Torvalds nd = rb_entry(parent, struct sp_node, nd);
29331da177e4SLinus Torvalds if (new->start < nd->start)
29341da177e4SLinus Torvalds p = &(*p)->rb_left;
29351da177e4SLinus Torvalds else if (new->end > nd->end)
29361da177e4SLinus Torvalds p = &(*p)->rb_right;
29371da177e4SLinus Torvalds else
29381da177e4SLinus Torvalds BUG();
29391da177e4SLinus Torvalds }
29401da177e4SLinus Torvalds rb_link_node(&new->nd, parent, p);
29411da177e4SLinus Torvalds rb_insert_color(&new->nd, &sp->root);
29421da177e4SLinus Torvalds }
29431da177e4SLinus Torvalds
29441da177e4SLinus Torvalds /* Find shared policy intersecting idx */
mpol_shared_policy_lookup(struct shared_policy * sp,pgoff_t idx)294593397c3bSHugh Dickins struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
294693397c3bSHugh Dickins pgoff_t idx)
29471da177e4SLinus Torvalds {
29481da177e4SLinus Torvalds struct mempolicy *pol = NULL;
29491da177e4SLinus Torvalds struct sp_node *sn;
29501da177e4SLinus Torvalds
29511da177e4SLinus Torvalds if (!sp->root.rb_node)
29521da177e4SLinus Torvalds return NULL;
29534a8c7bb5SNathan Zimmer read_lock(&sp->lock);
29541da177e4SLinus Torvalds sn = sp_lookup(sp, idx, idx+1);
29551da177e4SLinus Torvalds if (sn) {
29561da177e4SLinus Torvalds mpol_get(sn->policy);
29571da177e4SLinus Torvalds pol = sn->policy;
29581da177e4SLinus Torvalds }
29594a8c7bb5SNathan Zimmer read_unlock(&sp->lock);
29601da177e4SLinus Torvalds return pol;
29611da177e4SLinus Torvalds }
2962f634f108SShivank Garg EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm");
29631da177e4SLinus Torvalds
sp_free(struct sp_node * n)296463f74ca2SKOSAKI Motohiro static void sp_free(struct sp_node *n)
296563f74ca2SKOSAKI Motohiro {
296663f74ca2SKOSAKI Motohiro mpol_put(n->policy);
296763f74ca2SKOSAKI Motohiro kmem_cache_free(sn_cache, n);
296863f74ca2SKOSAKI Motohiro }
296963f74ca2SKOSAKI Motohiro
2970771fb4d8SLee Schermerhorn /**
297175c70128SKefeng Wang * mpol_misplaced - check whether current folio node is valid in policy
2972771fb4d8SLee Schermerhorn *
297375c70128SKefeng Wang * @folio: folio to be checked
2974f8fd525bSDonet Tom * @vmf: structure describing the fault
297575c70128SKefeng Wang * @addr: virtual address in @vma for shared policy lookup and interleave policy
2976771fb4d8SLee Schermerhorn *
297775c70128SKefeng Wang * Lookup current policy node id for vma,addr and "compare to" folio's
29785f076944SMatthew Wilcox (Oracle) * node id. Policy determination "mimics" alloc_page_vma().
2979771fb4d8SLee Schermerhorn * Called from fault path where we know the vma and faulting address.
29805f076944SMatthew Wilcox (Oracle) *
2981062db293SBaolin Wang * Return: NUMA_NO_NODE if the page is in a node that is valid for this
298275c70128SKefeng Wang * policy, or a suitable node ID to allocate a replacement folio from.
2983771fb4d8SLee Schermerhorn */
mpol_misplaced(struct folio * folio,struct vm_fault * vmf,unsigned long addr)2984f8fd525bSDonet Tom int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
298575c70128SKefeng Wang unsigned long addr)
2986771fb4d8SLee Schermerhorn {
2987771fb4d8SLee Schermerhorn struct mempolicy *pol;
2988ddc1a5cbSHugh Dickins pgoff_t ilx;
2989c33d6c06SMel Gorman struct zoneref *z;
299075c70128SKefeng Wang int curnid = folio_nid(folio);
2991f8fd525bSDonet Tom struct vm_area_struct *vma = vmf->vma;
299290572890SPeter Zijlstra int thiscpu = raw_smp_processor_id();
2993f8fd525bSDonet Tom int thisnid = numa_node_id();
299498fa15f3SAnshuman Khandual int polnid = NUMA_NO_NODE;
2995062db293SBaolin Wang int ret = NUMA_NO_NODE;
2996771fb4d8SLee Schermerhorn
2997f8fd525bSDonet Tom /*
2998f8fd525bSDonet Tom * Make sure ptl is held so that we don't preempt and we
2999f8fd525bSDonet Tom * have a stable smp processor id
3000f8fd525bSDonet Tom */
3001f8fd525bSDonet Tom lockdep_assert_held(vmf->ptl);
3002ddc1a5cbSHugh Dickins pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
3003771fb4d8SLee Schermerhorn if (!(pol->flags & MPOL_F_MOF))
3004771fb4d8SLee Schermerhorn goto out;
3005771fb4d8SLee Schermerhorn
3006771fb4d8SLee Schermerhorn switch (pol->mode) {
3007771fb4d8SLee Schermerhorn case MPOL_INTERLEAVE:
3008ddc1a5cbSHugh Dickins polnid = interleave_nid(pol, ilx);
3009771fb4d8SLee Schermerhorn break;
3010771fb4d8SLee Schermerhorn
3011fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE:
3012fa3bea4eSGregory Price polnid = weighted_interleave_nid(pol, ilx);
3013fa3bea4eSGregory Price break;
3014fa3bea4eSGregory Price
3015771fb4d8SLee Schermerhorn case MPOL_PREFERRED:
3016b27abaccSDave Hansen if (node_isset(curnid, pol->nodes))
3017b27abaccSDave Hansen goto out;
3018269fbe72SBen Widawsky polnid = first_node(pol->nodes);
3019771fb4d8SLee Schermerhorn break;
3020771fb4d8SLee Schermerhorn
30217858d7bcSFeng Tang case MPOL_LOCAL:
30227858d7bcSFeng Tang polnid = numa_node_id();
30237858d7bcSFeng Tang break;
30247858d7bcSFeng Tang
3025771fb4d8SLee Schermerhorn case MPOL_BIND:
3026133d04b1SDonet Tom case MPOL_PREFERRED_MANY:
3027133d04b1SDonet Tom /*
3028133d04b1SDonet Tom * Even though MPOL_PREFERRED_MANY can allocate pages outside
3029133d04b1SDonet Tom * policy nodemask we don't allow numa migration to nodes
3030133d04b1SDonet Tom * outside policy nodemask for now. This is done so that if we
3031133d04b1SDonet Tom * want demotion to slow memory to happen, before allocating
3032133d04b1SDonet Tom * from some DRAM node say 'x', we will end up using a
3033133d04b1SDonet Tom * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
3034133d04b1SDonet Tom * we should not promote to node 'x' from slow memory node.
3035133d04b1SDonet Tom */
3036bda420b9SHuang Ying if (pol->flags & MPOL_F_MORON) {
3037133d04b1SDonet Tom /*
3038133d04b1SDonet Tom * Optimize placement among multiple nodes
3039133d04b1SDonet Tom * via NUMA balancing
3040133d04b1SDonet Tom */
3041269fbe72SBen Widawsky if (node_isset(thisnid, pol->nodes))
3042bda420b9SHuang Ying break;
3043bda420b9SHuang Ying goto out;
3044bda420b9SHuang Ying }
3045c33d6c06SMel Gorman
3046771fb4d8SLee Schermerhorn /*
3047771fb4d8SLee Schermerhorn * use current page if in policy nodemask,
3048771fb4d8SLee Schermerhorn * else select nearest allowed node, if any.
3049771fb4d8SLee Schermerhorn * If no allowed nodes, use current [!misplaced].
3050771fb4d8SLee Schermerhorn */
3051269fbe72SBen Widawsky if (node_isset(curnid, pol->nodes))
3052771fb4d8SLee Schermerhorn goto out;
3053c33d6c06SMel Gorman z = first_zones_zonelist(
3054f8fd525bSDonet Tom node_zonelist(thisnid, GFP_HIGHUSER),
3055771fb4d8SLee Schermerhorn gfp_zone(GFP_HIGHUSER),
3056269fbe72SBen Widawsky &pol->nodes);
305729943248SWei Yang polnid = zonelist_node_idx(z);
3058771fb4d8SLee Schermerhorn break;
3059771fb4d8SLee Schermerhorn
3060771fb4d8SLee Schermerhorn default:
3061771fb4d8SLee Schermerhorn BUG();
3062771fb4d8SLee Schermerhorn }
30635606e387SMel Gorman
306475c70128SKefeng Wang /* Migrate the folio towards the node whose CPU is referencing it */
3065e42c8ff2SMel Gorman if (pol->flags & MPOL_F_MORON) {
306690572890SPeter Zijlstra polnid = thisnid;
30675606e387SMel Gorman
30688c9ae56dSKefeng Wang if (!should_numa_migrate_memory(current, folio, curnid,
306975c70128SKefeng Wang thiscpu))
3070de1c9ce6SRik van Riel goto out;
3071de1c9ce6SRik van Riel }
3072e42c8ff2SMel Gorman
3073771fb4d8SLee Schermerhorn if (curnid != polnid)
3074771fb4d8SLee Schermerhorn ret = polnid;
3075771fb4d8SLee Schermerhorn out:
3076771fb4d8SLee Schermerhorn mpol_cond_put(pol);
3077771fb4d8SLee Schermerhorn
3078771fb4d8SLee Schermerhorn return ret;
3079771fb4d8SLee Schermerhorn }
3080771fb4d8SLee Schermerhorn
3081c11600e4SDavid Rientjes /*
3082c11600e4SDavid Rientjes * Drop the (possibly final) reference to task->mempolicy. It needs to be
3083c11600e4SDavid Rientjes * dropped after task->mempolicy is set to NULL so that any allocation done as
3084c11600e4SDavid Rientjes * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
3085c11600e4SDavid Rientjes * policy.
3086c11600e4SDavid Rientjes */
mpol_put_task_policy(struct task_struct * task)3087c11600e4SDavid Rientjes void mpol_put_task_policy(struct task_struct *task)
3088c11600e4SDavid Rientjes {
3089c11600e4SDavid Rientjes struct mempolicy *pol;
3090c11600e4SDavid Rientjes
3091c11600e4SDavid Rientjes task_lock(task);
3092c11600e4SDavid Rientjes pol = task->mempolicy;
3093c11600e4SDavid Rientjes task->mempolicy = NULL;
3094c11600e4SDavid Rientjes task_unlock(task);
3095c11600e4SDavid Rientjes mpol_put(pol);
3096c11600e4SDavid Rientjes }
3097c11600e4SDavid Rientjes
sp_delete(struct shared_policy * sp,struct sp_node * n)30981da177e4SLinus Torvalds static void sp_delete(struct shared_policy *sp, struct sp_node *n)
30991da177e4SLinus Torvalds {
31001da177e4SLinus Torvalds rb_erase(&n->nd, &sp->root);
310163f74ca2SKOSAKI Motohiro sp_free(n);
31021da177e4SLinus Torvalds }
31031da177e4SLinus Torvalds
sp_node_init(struct sp_node * node,unsigned long start,unsigned long end,struct mempolicy * pol)310442288fe3SMel Gorman static void sp_node_init(struct sp_node *node, unsigned long start,
310542288fe3SMel Gorman unsigned long end, struct mempolicy *pol)
310642288fe3SMel Gorman {
310742288fe3SMel Gorman node->start = start;
310842288fe3SMel Gorman node->end = end;
310942288fe3SMel Gorman node->policy = pol;
311042288fe3SMel Gorman }
311142288fe3SMel Gorman
sp_alloc(unsigned long start,unsigned long end,struct mempolicy * pol)3112dbcb0f19SAdrian Bunk static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
3113dbcb0f19SAdrian Bunk struct mempolicy *pol)
31141da177e4SLinus Torvalds {
3115869833f2SKOSAKI Motohiro struct sp_node *n;
3116869833f2SKOSAKI Motohiro struct mempolicy *newpol;
31171da177e4SLinus Torvalds
3118869833f2SKOSAKI Motohiro n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
31191da177e4SLinus Torvalds if (!n)
31201da177e4SLinus Torvalds return NULL;
3121869833f2SKOSAKI Motohiro
3122869833f2SKOSAKI Motohiro newpol = mpol_dup(pol);
3123869833f2SKOSAKI Motohiro if (IS_ERR(newpol)) {
3124869833f2SKOSAKI Motohiro kmem_cache_free(sn_cache, n);
3125869833f2SKOSAKI Motohiro return NULL;
3126869833f2SKOSAKI Motohiro }
3127869833f2SKOSAKI Motohiro newpol->flags |= MPOL_F_SHARED;
312842288fe3SMel Gorman sp_node_init(n, start, end, newpol);
3129869833f2SKOSAKI Motohiro
31301da177e4SLinus Torvalds return n;
31311da177e4SLinus Torvalds }
31321da177e4SLinus Torvalds
31331da177e4SLinus Torvalds /* Replace a policy range. */
shared_policy_replace(struct shared_policy * sp,pgoff_t start,pgoff_t end,struct sp_node * new)313493397c3bSHugh Dickins static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
313593397c3bSHugh Dickins pgoff_t end, struct sp_node *new)
31361da177e4SLinus Torvalds {
3137b22d127aSMel Gorman struct sp_node *n;
313842288fe3SMel Gorman struct sp_node *n_new = NULL;
313942288fe3SMel Gorman struct mempolicy *mpol_new = NULL;
3140b22d127aSMel Gorman int ret = 0;
31411da177e4SLinus Torvalds
314242288fe3SMel Gorman restart:
31434a8c7bb5SNathan Zimmer write_lock(&sp->lock);
31441da177e4SLinus Torvalds n = sp_lookup(sp, start, end);
31451da177e4SLinus Torvalds /* Take care of old policies in the same range. */
31461da177e4SLinus Torvalds while (n && n->start < end) {
31471da177e4SLinus Torvalds struct rb_node *next = rb_next(&n->nd);
31481da177e4SLinus Torvalds if (n->start >= start) {
31491da177e4SLinus Torvalds if (n->end <= end)
31501da177e4SLinus Torvalds sp_delete(sp, n);
31511da177e4SLinus Torvalds else
31521da177e4SLinus Torvalds n->start = end;
31531da177e4SLinus Torvalds } else {
31541da177e4SLinus Torvalds /* Old policy spanning whole new range. */
31551da177e4SLinus Torvalds if (n->end > end) {
315642288fe3SMel Gorman if (!n_new)
315742288fe3SMel Gorman goto alloc_new;
315842288fe3SMel Gorman
315942288fe3SMel Gorman *mpol_new = *n->policy;
316042288fe3SMel Gorman atomic_set(&mpol_new->refcnt, 1);
31617880639cSKOSAKI Motohiro sp_node_init(n_new, end, n->end, mpol_new);
31621da177e4SLinus Torvalds n->end = start;
31635ca39575SHillf Danton sp_insert(sp, n_new);
316442288fe3SMel Gorman n_new = NULL;
316542288fe3SMel Gorman mpol_new = NULL;
31661da177e4SLinus Torvalds break;
31671da177e4SLinus Torvalds } else
31681da177e4SLinus Torvalds n->end = start;
31691da177e4SLinus Torvalds }
31701da177e4SLinus Torvalds if (!next)
31711da177e4SLinus Torvalds break;
31721da177e4SLinus Torvalds n = rb_entry(next, struct sp_node, nd);
31731da177e4SLinus Torvalds }
31741da177e4SLinus Torvalds if (new)
31751da177e4SLinus Torvalds sp_insert(sp, new);
31764a8c7bb5SNathan Zimmer write_unlock(&sp->lock);
317742288fe3SMel Gorman ret = 0;
317842288fe3SMel Gorman
317942288fe3SMel Gorman err_out:
318042288fe3SMel Gorman if (mpol_new)
318142288fe3SMel Gorman mpol_put(mpol_new);
318242288fe3SMel Gorman if (n_new)
318342288fe3SMel Gorman kmem_cache_free(sn_cache, n_new);
318442288fe3SMel Gorman
3185b22d127aSMel Gorman return ret;
318642288fe3SMel Gorman
318742288fe3SMel Gorman alloc_new:
31884a8c7bb5SNathan Zimmer write_unlock(&sp->lock);
318942288fe3SMel Gorman ret = -ENOMEM;
319042288fe3SMel Gorman n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
319142288fe3SMel Gorman if (!n_new)
319242288fe3SMel Gorman goto err_out;
319342288fe3SMel Gorman mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
319442288fe3SMel Gorman if (!mpol_new)
319542288fe3SMel Gorman goto err_out;
31964ad09955SMiaohe Lin atomic_set(&mpol_new->refcnt, 1);
319742288fe3SMel Gorman goto restart;
31981da177e4SLinus Torvalds }
31991da177e4SLinus Torvalds
320071fe804bSLee Schermerhorn /**
320171fe804bSLee Schermerhorn * mpol_shared_policy_init - initialize shared policy for inode
320271fe804bSLee Schermerhorn * @sp: pointer to inode shared policy
320371fe804bSLee Schermerhorn * @mpol: struct mempolicy to install
320471fe804bSLee Schermerhorn *
320571fe804bSLee Schermerhorn * Install non-NULL @mpol in inode's shared policy rb-tree.
320671fe804bSLee Schermerhorn * On entry, the current task has a reference on a non-NULL @mpol.
320771fe804bSLee Schermerhorn * This must be released on exit.
32084bfc4495SKAMEZAWA Hiroyuki * This is called at get_inode() calls and we can use GFP_KERNEL.
320971fe804bSLee Schermerhorn */
mpol_shared_policy_init(struct shared_policy * sp,struct mempolicy * mpol)321071fe804bSLee Schermerhorn void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
32117339ff83SRobin Holt {
321258568d2aSMiao Xie int ret;
321358568d2aSMiao Xie
321471fe804bSLee Schermerhorn sp->root = RB_ROOT; /* empty tree == default mempolicy */
32154a8c7bb5SNathan Zimmer rwlock_init(&sp->lock);
32167339ff83SRobin Holt
321771fe804bSLee Schermerhorn if (mpol) {
321835ec8fa0SHugh Dickins struct sp_node *sn;
321935ec8fa0SHugh Dickins struct mempolicy *npol;
32204bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH(scratch);
32217339ff83SRobin Holt
32224bfc4495SKAMEZAWA Hiroyuki if (!scratch)
32235c0c1654SLee Schermerhorn goto put_mpol;
322435ec8fa0SHugh Dickins
322535ec8fa0SHugh Dickins /* contextualize the tmpfs mount point mempolicy to this file */
322635ec8fa0SHugh Dickins npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
322735ec8fa0SHugh Dickins if (IS_ERR(npol))
32280cae3457SDan Carpenter goto free_scratch; /* no valid nodemask intersection */
322958568d2aSMiao Xie
323058568d2aSMiao Xie task_lock(current);
323135ec8fa0SHugh Dickins ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
323258568d2aSMiao Xie task_unlock(current);
323315d77835SLee Schermerhorn if (ret)
323435ec8fa0SHugh Dickins goto put_npol;
323571fe804bSLee Schermerhorn
323635ec8fa0SHugh Dickins /* alloc node covering entire file; adds ref to file's npol */
323735ec8fa0SHugh Dickins sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
323835ec8fa0SHugh Dickins if (sn)
323935ec8fa0SHugh Dickins sp_insert(sp, sn);
324035ec8fa0SHugh Dickins put_npol:
324135ec8fa0SHugh Dickins mpol_put(npol); /* drop initial ref on file's npol */
32420cae3457SDan Carpenter free_scratch:
32434bfc4495SKAMEZAWA Hiroyuki NODEMASK_SCRATCH_FREE(scratch);
32445c0c1654SLee Schermerhorn put_mpol:
32455c0c1654SLee Schermerhorn mpol_put(mpol); /* drop our incoming ref on sb mpol */
32467339ff83SRobin Holt }
32477339ff83SRobin Holt }
3248f634f108SShivank Garg EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm");
32497339ff83SRobin Holt
mpol_set_shared_policy(struct shared_policy * sp,struct vm_area_struct * vma,struct mempolicy * pol)3250c36f6e6dSHugh Dickins int mpol_set_shared_policy(struct shared_policy *sp,
3251c36f6e6dSHugh Dickins struct vm_area_struct *vma, struct mempolicy *pol)
32521da177e4SLinus Torvalds {
32531da177e4SLinus Torvalds int err;
32541da177e4SLinus Torvalds struct sp_node *new = NULL;
32551da177e4SLinus Torvalds unsigned long sz = vma_pages(vma);
32561da177e4SLinus Torvalds
3257c36f6e6dSHugh Dickins if (pol) {
3258c36f6e6dSHugh Dickins new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
32591da177e4SLinus Torvalds if (!new)
32601da177e4SLinus Torvalds return -ENOMEM;
32611da177e4SLinus Torvalds }
3262c36f6e6dSHugh Dickins err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
32631da177e4SLinus Torvalds if (err && new)
326463f74ca2SKOSAKI Motohiro sp_free(new);
32651da177e4SLinus Torvalds return err;
32661da177e4SLinus Torvalds }
3267f634f108SShivank Garg EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm");
32681da177e4SLinus Torvalds
32691da177e4SLinus Torvalds /* Free a backing policy store on inode delete. */
mpol_free_shared_policy(struct shared_policy * sp)3270c36f6e6dSHugh Dickins void mpol_free_shared_policy(struct shared_policy *sp)
32711da177e4SLinus Torvalds {
32721da177e4SLinus Torvalds struct sp_node *n;
32731da177e4SLinus Torvalds struct rb_node *next;
32741da177e4SLinus Torvalds
3275c36f6e6dSHugh Dickins if (!sp->root.rb_node)
32761da177e4SLinus Torvalds return;
3277c36f6e6dSHugh Dickins write_lock(&sp->lock);
3278c36f6e6dSHugh Dickins next = rb_first(&sp->root);
32791da177e4SLinus Torvalds while (next) {
32801da177e4SLinus Torvalds n = rb_entry(next, struct sp_node, nd);
32811da177e4SLinus Torvalds next = rb_next(&n->nd);
3282c36f6e6dSHugh Dickins sp_delete(sp, n);
32831da177e4SLinus Torvalds }
3284c36f6e6dSHugh Dickins write_unlock(&sp->lock);
32851da177e4SLinus Torvalds }
3286f634f108SShivank Garg EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm");
32871da177e4SLinus Torvalds
32881a687c2eSMel Gorman #ifdef CONFIG_NUMA_BALANCING
3289c297663cSMel Gorman static int __initdata numabalancing_override;
32901a687c2eSMel Gorman
check_numabalancing_enable(void)32911a687c2eSMel Gorman static void __init check_numabalancing_enable(void)
32921a687c2eSMel Gorman {
32931a687c2eSMel Gorman bool numabalancing_default = false;
32941a687c2eSMel Gorman
32951a687c2eSMel Gorman if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
32961a687c2eSMel Gorman numabalancing_default = true;
32971a687c2eSMel Gorman
3298c297663cSMel Gorman /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
3299c297663cSMel Gorman if (numabalancing_override)
3300c297663cSMel Gorman set_numabalancing_state(numabalancing_override == 1);
3301c297663cSMel Gorman
3302b0dc2b9bSMel Gorman if (num_online_nodes() > 1 && !numabalancing_override) {
3303756a025fSJoe Perches pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
3304c297663cSMel Gorman numabalancing_default ? "Enabling" : "Disabling");
33051a687c2eSMel Gorman set_numabalancing_state(numabalancing_default);
33061a687c2eSMel Gorman }
33071a687c2eSMel Gorman }
33081a687c2eSMel Gorman
setup_numabalancing(char * str)33091a687c2eSMel Gorman static int __init setup_numabalancing(char *str)
33101a687c2eSMel Gorman {
33111a687c2eSMel Gorman int ret = 0;
33121a687c2eSMel Gorman if (!str)
33131a687c2eSMel Gorman goto out;
33141a687c2eSMel Gorman
33151a687c2eSMel Gorman if (!strcmp(str, "enable")) {
3316c297663cSMel Gorman numabalancing_override = 1;
33171a687c2eSMel Gorman ret = 1;
33181a687c2eSMel Gorman } else if (!strcmp(str, "disable")) {
3319c297663cSMel Gorman numabalancing_override = -1;
33201a687c2eSMel Gorman ret = 1;
33211a687c2eSMel Gorman }
33221a687c2eSMel Gorman out:
33231a687c2eSMel Gorman if (!ret)
33244a404beaSAndrew Morton pr_warn("Unable to parse numa_balancing=\n");
33251a687c2eSMel Gorman
33261a687c2eSMel Gorman return ret;
33271a687c2eSMel Gorman }
33281a687c2eSMel Gorman __setup("numa_balancing=", setup_numabalancing);
33291a687c2eSMel Gorman #else
check_numabalancing_enable(void)33301a687c2eSMel Gorman static inline void __init check_numabalancing_enable(void)
33311a687c2eSMel Gorman {
33321a687c2eSMel Gorman }
33331a687c2eSMel Gorman #endif /* CONFIG_NUMA_BALANCING */
33341a687c2eSMel Gorman
numa_policy_init(void)33351da177e4SLinus Torvalds void __init numa_policy_init(void)
33361da177e4SLinus Torvalds {
3337b71636e2SPaul Mundt nodemask_t interleave_nodes;
3338b71636e2SPaul Mundt unsigned long largest = 0;
3339b71636e2SPaul Mundt int nid, prefer = 0;
3340b71636e2SPaul Mundt
33411da177e4SLinus Torvalds policy_cache = kmem_cache_create("numa_policy",
33421da177e4SLinus Torvalds sizeof(struct mempolicy),
334320c2df83SPaul Mundt 0, SLAB_PANIC, NULL);
33441da177e4SLinus Torvalds
33451da177e4SLinus Torvalds sn_cache = kmem_cache_create("shared_policy_node",
33461da177e4SLinus Torvalds sizeof(struct sp_node),
334720c2df83SPaul Mundt 0, SLAB_PANIC, NULL);
33481da177e4SLinus Torvalds
33495606e387SMel Gorman for_each_node(nid) {
33505606e387SMel Gorman preferred_node_policy[nid] = (struct mempolicy) {
33515606e387SMel Gorman .refcnt = ATOMIC_INIT(1),
33525606e387SMel Gorman .mode = MPOL_PREFERRED,
33535606e387SMel Gorman .flags = MPOL_F_MOF | MPOL_F_MORON,
3354269fbe72SBen Widawsky .nodes = nodemask_of_node(nid),
33555606e387SMel Gorman };
33565606e387SMel Gorman }
33575606e387SMel Gorman
3358b71636e2SPaul Mundt /*
3359b71636e2SPaul Mundt * Set interleaving policy for system init. Interleaving is only
3360b71636e2SPaul Mundt * enabled across suitably sized nodes (default is >= 16MB), or
3361b71636e2SPaul Mundt * fall back to the largest node if they're all smaller.
3362b71636e2SPaul Mundt */
3363b71636e2SPaul Mundt nodes_clear(interleave_nodes);
336401f13bd6SLai Jiangshan for_each_node_state(nid, N_MEMORY) {
3365b71636e2SPaul Mundt unsigned long total_pages = node_present_pages(nid);
33661da177e4SLinus Torvalds
3367b71636e2SPaul Mundt /* Preserve the largest node */
3368b71636e2SPaul Mundt if (largest < total_pages) {
3369b71636e2SPaul Mundt largest = total_pages;
3370b71636e2SPaul Mundt prefer = nid;
3371b71636e2SPaul Mundt }
3372b71636e2SPaul Mundt
3373b71636e2SPaul Mundt /* Interleave this node? */
3374b71636e2SPaul Mundt if ((total_pages << PAGE_SHIFT) >= (16 << 20))
3375b71636e2SPaul Mundt node_set(nid, interleave_nodes);
3376b71636e2SPaul Mundt }
3377b71636e2SPaul Mundt
3378b71636e2SPaul Mundt /* All too small, use the largest */
3379b71636e2SPaul Mundt if (unlikely(nodes_empty(interleave_nodes)))
3380b71636e2SPaul Mundt node_set(prefer, interleave_nodes);
3381b71636e2SPaul Mundt
3382028fec41SDavid Rientjes if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
3383b1de0d13SMitchel Humpherys pr_err("%s: interleaving failed\n", __func__);
33841a687c2eSMel Gorman
33851a687c2eSMel Gorman check_numabalancing_enable();
33861da177e4SLinus Torvalds }
33871da177e4SLinus Torvalds
33888bccd85fSChristoph Lameter /* Reset policy of current process to default */
numa_default_policy(void)33891da177e4SLinus Torvalds void numa_default_policy(void)
33901da177e4SLinus Torvalds {
3391028fec41SDavid Rientjes do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
33921da177e4SLinus Torvalds }
339368860ec1SPaul Jackson
33944225399aSPaul Jackson /*
3395095f1fc4SLee Schermerhorn * Parse and format mempolicy from/to strings
3396095f1fc4SLee Schermerhorn */
3397345ace9cSLee Schermerhorn static const char * const policy_modes[] =
3398345ace9cSLee Schermerhorn {
3399345ace9cSLee Schermerhorn [MPOL_DEFAULT] = "default",
3400345ace9cSLee Schermerhorn [MPOL_PREFERRED] = "prefer",
3401345ace9cSLee Schermerhorn [MPOL_BIND] = "bind",
3402345ace9cSLee Schermerhorn [MPOL_INTERLEAVE] = "interleave",
3403fa3bea4eSGregory Price [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
3404d3a71033SLee Schermerhorn [MPOL_LOCAL] = "local",
3405b27abaccSDave Hansen [MPOL_PREFERRED_MANY] = "prefer (many)",
3406345ace9cSLee Schermerhorn };
34071a75a6c8SChristoph Lameter
3408095f1fc4SLee Schermerhorn #ifdef CONFIG_TMPFS
3409095f1fc4SLee Schermerhorn /**
3410f2a07f40SHugh Dickins * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
3411095f1fc4SLee Schermerhorn * @str: string containing mempolicy to parse
341271fe804bSLee Schermerhorn * @mpol: pointer to struct mempolicy pointer, returned on success.
3413095f1fc4SLee Schermerhorn *
3414095f1fc4SLee Schermerhorn * Format of input:
3415095f1fc4SLee Schermerhorn * <mode>[=<flags>][:<nodelist>]
3416095f1fc4SLee Schermerhorn *
3417dad5b023SRandy Dunlap * Return: %0 on success, else %1
3418095f1fc4SLee Schermerhorn */
mpol_parse_str(char * str,struct mempolicy ** mpol)3419a7a88b23SHugh Dickins int mpol_parse_str(char *str, struct mempolicy **mpol)
3420095f1fc4SLee Schermerhorn {
342171fe804bSLee Schermerhorn struct mempolicy *new = NULL;
3422f2a07f40SHugh Dickins unsigned short mode_flags;
342371fe804bSLee Schermerhorn nodemask_t nodes;
3424095f1fc4SLee Schermerhorn char *nodelist = strchr(str, ':');
3425095f1fc4SLee Schermerhorn char *flags = strchr(str, '=');
3426dedf2c73Szhong jiang int err = 1, mode;
3427095f1fc4SLee Schermerhorn
3428c7a91bc7SDan Carpenter if (flags)
3429c7a91bc7SDan Carpenter *flags++ = '\0'; /* terminate mode string */
3430c7a91bc7SDan Carpenter
3431095f1fc4SLee Schermerhorn if (nodelist) {
3432095f1fc4SLee Schermerhorn /* NUL-terminate mode or flags string */
3433095f1fc4SLee Schermerhorn *nodelist++ = '\0';
343471fe804bSLee Schermerhorn if (nodelist_parse(nodelist, nodes))
3435095f1fc4SLee Schermerhorn goto out;
343601f13bd6SLai Jiangshan if (!nodes_subset(nodes, node_states[N_MEMORY]))
3437095f1fc4SLee Schermerhorn goto out;
343871fe804bSLee Schermerhorn } else
343971fe804bSLee Schermerhorn nodes_clear(nodes);
344071fe804bSLee Schermerhorn
3441dedf2c73Szhong jiang mode = match_string(policy_modes, MPOL_MAX, str);
3442dedf2c73Szhong jiang if (mode < 0)
3443095f1fc4SLee Schermerhorn goto out;
3444095f1fc4SLee Schermerhorn
344571fe804bSLee Schermerhorn switch (mode) {
3446095f1fc4SLee Schermerhorn case MPOL_PREFERRED:
344771fe804bSLee Schermerhorn /*
3448aa9f7d51SRandy Dunlap * Insist on a nodelist of one node only, although later
3449aa9f7d51SRandy Dunlap * we use first_node(nodes) to grab a single node, so here
3450aa9f7d51SRandy Dunlap * nodelist (or nodes) cannot be empty.
345171fe804bSLee Schermerhorn */
3452095f1fc4SLee Schermerhorn if (nodelist) {
3453095f1fc4SLee Schermerhorn char *rest = nodelist;
3454095f1fc4SLee Schermerhorn while (isdigit(*rest))
3455095f1fc4SLee Schermerhorn rest++;
3456926f2ae0SKOSAKI Motohiro if (*rest)
3457926f2ae0SKOSAKI Motohiro goto out;
3458aa9f7d51SRandy Dunlap if (nodes_empty(nodes))
3459aa9f7d51SRandy Dunlap goto out;
3460095f1fc4SLee Schermerhorn }
3461095f1fc4SLee Schermerhorn break;
3462095f1fc4SLee Schermerhorn case MPOL_INTERLEAVE:
3463fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE:
3464095f1fc4SLee Schermerhorn /*
3465095f1fc4SLee Schermerhorn * Default to online nodes with memory if no nodelist
3466095f1fc4SLee Schermerhorn */
3467095f1fc4SLee Schermerhorn if (!nodelist)
346801f13bd6SLai Jiangshan nodes = node_states[N_MEMORY];
34693f226aa1SLee Schermerhorn break;
347071fe804bSLee Schermerhorn case MPOL_LOCAL:
34713f226aa1SLee Schermerhorn /*
347271fe804bSLee Schermerhorn * Don't allow a nodelist; mpol_new() checks flags
34733f226aa1SLee Schermerhorn */
347471fe804bSLee Schermerhorn if (nodelist)
34753f226aa1SLee Schermerhorn goto out;
34763f226aa1SLee Schermerhorn break;
3477413b43deSRavikiran G Thirumalai case MPOL_DEFAULT:
3478413b43deSRavikiran G Thirumalai /*
3479413b43deSRavikiran G Thirumalai * Insist on a empty nodelist
3480413b43deSRavikiran G Thirumalai */
3481413b43deSRavikiran G Thirumalai if (!nodelist)
3482413b43deSRavikiran G Thirumalai err = 0;
3483413b43deSRavikiran G Thirumalai goto out;
3484b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
3485d69b2e63SKOSAKI Motohiro case MPOL_BIND:
348671fe804bSLee Schermerhorn /*
3487d69b2e63SKOSAKI Motohiro * Insist on a nodelist
348871fe804bSLee Schermerhorn */
3489d69b2e63SKOSAKI Motohiro if (!nodelist)
3490d69b2e63SKOSAKI Motohiro goto out;
3491095f1fc4SLee Schermerhorn }
3492095f1fc4SLee Schermerhorn
349371fe804bSLee Schermerhorn mode_flags = 0;
3494095f1fc4SLee Schermerhorn if (flags) {
3495095f1fc4SLee Schermerhorn /*
3496095f1fc4SLee Schermerhorn * Currently, we only support two mutually exclusive
3497095f1fc4SLee Schermerhorn * mode flags.
3498095f1fc4SLee Schermerhorn */
3499095f1fc4SLee Schermerhorn if (!strcmp(flags, "static"))
350071fe804bSLee Schermerhorn mode_flags |= MPOL_F_STATIC_NODES;
3501095f1fc4SLee Schermerhorn else if (!strcmp(flags, "relative"))
350271fe804bSLee Schermerhorn mode_flags |= MPOL_F_RELATIVE_NODES;
3503095f1fc4SLee Schermerhorn else
3504926f2ae0SKOSAKI Motohiro goto out;
3505095f1fc4SLee Schermerhorn }
350671fe804bSLee Schermerhorn
350771fe804bSLee Schermerhorn new = mpol_new(mode, mode_flags, &nodes);
350871fe804bSLee Schermerhorn if (IS_ERR(new))
3509926f2ae0SKOSAKI Motohiro goto out;
3510926f2ae0SKOSAKI Motohiro
3511f2a07f40SHugh Dickins /*
3512f2a07f40SHugh Dickins * Save nodes for mpol_to_str() to show the tmpfs mount options
3513f2a07f40SHugh Dickins * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3514f2a07f40SHugh Dickins */
3515269fbe72SBen Widawsky if (mode != MPOL_PREFERRED) {
3516269fbe72SBen Widawsky new->nodes = nodes;
3517269fbe72SBen Widawsky } else if (nodelist) {
3518269fbe72SBen Widawsky nodes_clear(new->nodes);
3519269fbe72SBen Widawsky node_set(first_node(nodes), new->nodes);
3520269fbe72SBen Widawsky } else {
35217858d7bcSFeng Tang new->mode = MPOL_LOCAL;
3522269fbe72SBen Widawsky }
3523f2a07f40SHugh Dickins
3524f2a07f40SHugh Dickins /*
3525f2a07f40SHugh Dickins * Save nodes for contextualization: this will be used to "clone"
3526f2a07f40SHugh Dickins * the mempolicy in a specific context [cpuset] at a later time.
3527f2a07f40SHugh Dickins */
3528e17f74afSLee Schermerhorn new->w.user_nodemask = nodes;
3529f2a07f40SHugh Dickins
3530926f2ae0SKOSAKI Motohiro err = 0;
353171fe804bSLee Schermerhorn
3532095f1fc4SLee Schermerhorn out:
3533095f1fc4SLee Schermerhorn /* Restore string for error message */
3534095f1fc4SLee Schermerhorn if (nodelist)
3535095f1fc4SLee Schermerhorn *--nodelist = ':';
3536095f1fc4SLee Schermerhorn if (flags)
3537095f1fc4SLee Schermerhorn *--flags = '=';
353871fe804bSLee Schermerhorn if (!err)
353971fe804bSLee Schermerhorn *mpol = new;
3540095f1fc4SLee Schermerhorn return err;
3541095f1fc4SLee Schermerhorn }
3542095f1fc4SLee Schermerhorn #endif /* CONFIG_TMPFS */
3543095f1fc4SLee Schermerhorn
/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
 * interleave", plus the longest flags, "relative|balancing", and to
 * display at least a few node ids.
 */
mpol_to_str(char * buffer,int maxlen,struct mempolicy * pol)3555948927eeSDavid Rientjes void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
35561a75a6c8SChristoph Lameter {
35571a75a6c8SChristoph Lameter char *p = buffer;
3558948927eeSDavid Rientjes nodemask_t nodes = NODE_MASK_NONE;
3559948927eeSDavid Rientjes unsigned short mode = MPOL_DEFAULT;
3560948927eeSDavid Rientjes unsigned short flags = 0;
35611a75a6c8SChristoph Lameter
3562af649773STvrtko Ursulin if (pol &&
3563af649773STvrtko Ursulin pol != &default_policy &&
3564af649773STvrtko Ursulin !(pol >= &preferred_node_policy[0] &&
3565af649773STvrtko Ursulin pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
3566bea904d5SLee Schermerhorn mode = pol->mode;
3567948927eeSDavid Rientjes flags = pol->flags;
3568948927eeSDavid Rientjes }
3569bea904d5SLee Schermerhorn
35701a75a6c8SChristoph Lameter switch (mode) {
35711a75a6c8SChristoph Lameter case MPOL_DEFAULT:
35727858d7bcSFeng Tang case MPOL_LOCAL:
35731a75a6c8SChristoph Lameter break;
35741a75a6c8SChristoph Lameter case MPOL_PREFERRED:
3575b27abaccSDave Hansen case MPOL_PREFERRED_MANY:
35761a75a6c8SChristoph Lameter case MPOL_BIND:
35771a75a6c8SChristoph Lameter case MPOL_INTERLEAVE:
3578fa3bea4eSGregory Price case MPOL_WEIGHTED_INTERLEAVE:
3579269fbe72SBen Widawsky nodes = pol->nodes;
35801a75a6c8SChristoph Lameter break;
35811a75a6c8SChristoph Lameter default:
3582948927eeSDavid Rientjes WARN_ON_ONCE(1);
3583948927eeSDavid Rientjes snprintf(p, maxlen, "unknown");
3584948927eeSDavid Rientjes return;
35851a75a6c8SChristoph Lameter }
35861a75a6c8SChristoph Lameter
3587b7a9f420SDavid Rientjes p += snprintf(p, maxlen, "%s", policy_modes[mode]);
35881a75a6c8SChristoph Lameter
3589fc36b8d3SLee Schermerhorn if (flags & MPOL_MODE_FLAGS) {
3590948927eeSDavid Rientjes p += snprintf(p, buffer + maxlen - p, "=");
3591f5b087b5SDavid Rientjes
35922291990aSLee Schermerhorn /*
3593af649773STvrtko Ursulin * Static and relative are mutually exclusive.
35942291990aSLee Schermerhorn */
3595f5b087b5SDavid Rientjes if (flags & MPOL_F_STATIC_NODES)
35962291990aSLee Schermerhorn p += snprintf(p, buffer + maxlen - p, "static");
35972291990aSLee Schermerhorn else if (flags & MPOL_F_RELATIVE_NODES)
35982291990aSLee Schermerhorn p += snprintf(p, buffer + maxlen - p, "relative");
3599af649773STvrtko Ursulin
3600af649773STvrtko Ursulin if (flags & MPOL_F_NUMA_BALANCING) {
3601af649773STvrtko Ursulin if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
3602af649773STvrtko Ursulin p += snprintf(p, buffer + maxlen - p, "|");
3603af649773STvrtko Ursulin p += snprintf(p, buffer + maxlen - p, "balancing");
3604af649773STvrtko Ursulin }
3605f5b087b5SDavid Rientjes }
3606f5b087b5SDavid Rientjes
36079e763e0fSTejun Heo if (!nodes_empty(nodes))
36089e763e0fSTejun Heo p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
36099e763e0fSTejun Heo nodemask_pr_args(&nodes));
36101a75a6c8SChristoph Lameter }
3611dce41f5aSRakie Kim
3612dce41f5aSRakie Kim #ifdef CONFIG_SYSFS
3613dce41f5aSRakie Kim struct iw_node_attr {
3614dce41f5aSRakie Kim struct kobj_attribute kobj_attr;
3615dce41f5aSRakie Kim int nid;
3616dce41f5aSRakie Kim };
3617dce41f5aSRakie Kim
3618cf8cecf2SRakie Kim struct sysfs_wi_group {
3619cf8cecf2SRakie Kim struct kobject wi_kobj;
3620dec92bf9SRakie Kim struct mutex kobj_lock;
3621cf8cecf2SRakie Kim struct iw_node_attr *nattrs[];
3622cf8cecf2SRakie Kim };
3623cf8cecf2SRakie Kim
/* Single instance, allocated by add_weighted_interleave_group(). */
static struct sysfs_wi_group *wi_group;
3625cf8cecf2SRakie Kim
node_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3626dce41f5aSRakie Kim static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
3627dce41f5aSRakie Kim char *buf)
3628dce41f5aSRakie Kim {
3629dce41f5aSRakie Kim struct iw_node_attr *node_attr;
3630dce41f5aSRakie Kim u8 weight;
3631dce41f5aSRakie Kim
3632dce41f5aSRakie Kim node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3633dce41f5aSRakie Kim weight = get_il_weight(node_attr->nid);
3634dce41f5aSRakie Kim return sysfs_emit(buf, "%d\n", weight);
3635dce41f5aSRakie Kim }
3636dce41f5aSRakie Kim
node_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3637dce41f5aSRakie Kim static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
3638dce41f5aSRakie Kim const char *buf, size_t count)
3639dce41f5aSRakie Kim {
3640e341f9c3SJoshua Hahn struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3641dce41f5aSRakie Kim struct iw_node_attr *node_attr;
3642dce41f5aSRakie Kim u8 weight = 0;
3643e341f9c3SJoshua Hahn int i;
3644dce41f5aSRakie Kim
3645dce41f5aSRakie Kim node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
3646e341f9c3SJoshua Hahn if (count == 0 || sysfs_streq(buf, "") ||
3647e341f9c3SJoshua Hahn kstrtou8(buf, 0, &weight) || weight == 0)
3648dce41f5aSRakie Kim return -EINVAL;
3649dce41f5aSRakie Kim
365032a92f8cSLinus Torvalds new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids);
3651e341f9c3SJoshua Hahn if (!new_wi_state)
3652dce41f5aSRakie Kim return -ENOMEM;
3653dce41f5aSRakie Kim
3654e341f9c3SJoshua Hahn mutex_lock(&wi_state_lock);
3655e341f9c3SJoshua Hahn old_wi_state = rcu_dereference_protected(wi_state,
3656e341f9c3SJoshua Hahn lockdep_is_held(&wi_state_lock));
3657e341f9c3SJoshua Hahn if (old_wi_state) {
3658e341f9c3SJoshua Hahn memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3659e341f9c3SJoshua Hahn nr_node_ids * sizeof(u8));
3660e341f9c3SJoshua Hahn } else {
3661e341f9c3SJoshua Hahn for (i = 0; i < nr_node_ids; i++)
3662e341f9c3SJoshua Hahn new_wi_state->iw_table[i] = 1;
3663e341f9c3SJoshua Hahn }
3664e341f9c3SJoshua Hahn new_wi_state->iw_table[node_attr->nid] = weight;
3665e341f9c3SJoshua Hahn new_wi_state->mode_auto = false;
3666e341f9c3SJoshua Hahn
3667e341f9c3SJoshua Hahn rcu_assign_pointer(wi_state, new_wi_state);
3668e341f9c3SJoshua Hahn mutex_unlock(&wi_state_lock);
3669e341f9c3SJoshua Hahn if (old_wi_state) {
3670dce41f5aSRakie Kim synchronize_rcu();
3671e341f9c3SJoshua Hahn kfree(old_wi_state);
3672e341f9c3SJoshua Hahn }
3673e341f9c3SJoshua Hahn return count;
3674e341f9c3SJoshua Hahn }
3675e341f9c3SJoshua Hahn
weighted_interleave_auto_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3676e341f9c3SJoshua Hahn static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
3677e341f9c3SJoshua Hahn struct kobj_attribute *attr, char *buf)
3678e341f9c3SJoshua Hahn {
3679e341f9c3SJoshua Hahn struct weighted_interleave_state *state;
3680e341f9c3SJoshua Hahn bool wi_auto = true;
3681e341f9c3SJoshua Hahn
3682e341f9c3SJoshua Hahn rcu_read_lock();
3683e341f9c3SJoshua Hahn state = rcu_dereference(wi_state);
3684e341f9c3SJoshua Hahn if (state)
3685e341f9c3SJoshua Hahn wi_auto = state->mode_auto;
3686e341f9c3SJoshua Hahn rcu_read_unlock();
3687e341f9c3SJoshua Hahn
3688e341f9c3SJoshua Hahn return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
3689e341f9c3SJoshua Hahn }
3690e341f9c3SJoshua Hahn
weighted_interleave_auto_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)3691e341f9c3SJoshua Hahn static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
3692e341f9c3SJoshua Hahn struct kobj_attribute *attr, const char *buf, size_t count)
3693e341f9c3SJoshua Hahn {
3694e341f9c3SJoshua Hahn struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
3695e341f9c3SJoshua Hahn unsigned int *bw;
3696e341f9c3SJoshua Hahn bool input;
3697e341f9c3SJoshua Hahn int i;
3698e341f9c3SJoshua Hahn
3699e341f9c3SJoshua Hahn if (kstrtobool(buf, &input))
3700e341f9c3SJoshua Hahn return -EINVAL;
3701e341f9c3SJoshua Hahn
370232a92f8cSLinus Torvalds new_wi_state = kzalloc_flex(*new_wi_state, iw_table, nr_node_ids);
3703e341f9c3SJoshua Hahn if (!new_wi_state)
3704e341f9c3SJoshua Hahn return -ENOMEM;
3705e341f9c3SJoshua Hahn for (i = 0; i < nr_node_ids; i++)
3706e341f9c3SJoshua Hahn new_wi_state->iw_table[i] = 1;
3707e341f9c3SJoshua Hahn
3708e341f9c3SJoshua Hahn mutex_lock(&wi_state_lock);
3709e341f9c3SJoshua Hahn old_wi_state = rcu_dereference_protected(wi_state,
3710e341f9c3SJoshua Hahn lockdep_is_held(&wi_state_lock));
37116fae274cSJackie Liu
37126fae274cSJackie Liu if (old_wi_state && input == old_wi_state->mode_auto) {
3713e341f9c3SJoshua Hahn mutex_unlock(&wi_state_lock);
37146fae274cSJackie Liu kfree(new_wi_state);
3715e341f9c3SJoshua Hahn return count;
3716e341f9c3SJoshua Hahn }
3717e341f9c3SJoshua Hahn
37186fae274cSJackie Liu if (!input) {
37196fae274cSJackie Liu if (old_wi_state)
3720e341f9c3SJoshua Hahn memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
3721e341f9c3SJoshua Hahn nr_node_ids * sizeof(u8));
3722e341f9c3SJoshua Hahn goto update_wi_state;
3723e341f9c3SJoshua Hahn }
3724e341f9c3SJoshua Hahn
3725e341f9c3SJoshua Hahn bw = node_bw_table;
3726e341f9c3SJoshua Hahn if (!bw) {
3727e341f9c3SJoshua Hahn mutex_unlock(&wi_state_lock);
3728e341f9c3SJoshua Hahn kfree(new_wi_state);
3729e341f9c3SJoshua Hahn return -ENODEV;
3730e341f9c3SJoshua Hahn }
3731e341f9c3SJoshua Hahn
3732e341f9c3SJoshua Hahn new_wi_state->mode_auto = true;
3733e341f9c3SJoshua Hahn reduce_interleave_weights(bw, new_wi_state->iw_table);
3734e341f9c3SJoshua Hahn
3735e341f9c3SJoshua Hahn update_wi_state:
3736e341f9c3SJoshua Hahn rcu_assign_pointer(wi_state, new_wi_state);
3737e341f9c3SJoshua Hahn mutex_unlock(&wi_state_lock);
3738e341f9c3SJoshua Hahn if (old_wi_state) {
3739e341f9c3SJoshua Hahn synchronize_rcu();
3740e341f9c3SJoshua Hahn kfree(old_wi_state);
3741e341f9c3SJoshua Hahn }
3742dce41f5aSRakie Kim return count;
3743dce41f5aSRakie Kim }
3744dce41f5aSRakie Kim
sysfs_wi_node_delete(int nid)3745cf8cecf2SRakie Kim static void sysfs_wi_node_delete(int nid)
3746dce41f5aSRakie Kim {
3747dec92bf9SRakie Kim struct iw_node_attr *attr;
3748dec92bf9SRakie Kim
3749dec92bf9SRakie Kim if (nid < 0 || nid >= nr_node_ids)
3750dce41f5aSRakie Kim return;
3751cf8cecf2SRakie Kim
3752dec92bf9SRakie Kim mutex_lock(&wi_group->kobj_lock);
3753dec92bf9SRakie Kim attr = wi_group->nattrs[nid];
3754dec92bf9SRakie Kim if (!attr) {
3755dec92bf9SRakie Kim mutex_unlock(&wi_group->kobj_lock);
3756dec92bf9SRakie Kim return;
3757dec92bf9SRakie Kim }
3758dec92bf9SRakie Kim
3759dec92bf9SRakie Kim wi_group->nattrs[nid] = NULL;
3760dec92bf9SRakie Kim mutex_unlock(&wi_group->kobj_lock);
3761dec92bf9SRakie Kim
3762dec92bf9SRakie Kim sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
3763dec92bf9SRakie Kim kfree(attr->kobj_attr.attr.name);
3764dec92bf9SRakie Kim kfree(attr);
3765dce41f5aSRakie Kim }
3766dce41f5aSRakie Kim
sysfs_wi_node_delete_all(void)3767cf8cecf2SRakie Kim static void sysfs_wi_node_delete_all(void)
3768dce41f5aSRakie Kim {
3769bb52e89dSRakie Kim int nid;
3770dce41f5aSRakie Kim
3771bb52e89dSRakie Kim for (nid = 0; nid < nr_node_ids; nid++)
3772cf8cecf2SRakie Kim sysfs_wi_node_delete(nid);
3773bb52e89dSRakie Kim }
3774bb52e89dSRakie Kim
wi_state_free(void)3775e341f9c3SJoshua Hahn static void wi_state_free(void)
3776bb52e89dSRakie Kim {
3777e341f9c3SJoshua Hahn struct weighted_interleave_state *old_wi_state;
3778bb52e89dSRakie Kim
3779e341f9c3SJoshua Hahn mutex_lock(&wi_state_lock);
3780e341f9c3SJoshua Hahn old_wi_state = rcu_dereference_protected(wi_state,
3781e341f9c3SJoshua Hahn lockdep_is_held(&wi_state_lock));
3782e341f9c3SJoshua Hahn rcu_assign_pointer(wi_state, NULL);
3783e341f9c3SJoshua Hahn mutex_unlock(&wi_state_lock);
37841ec8a6e3SJoshua Hahn
37851ec8a6e3SJoshua Hahn if (old_wi_state) {
3786e341f9c3SJoshua Hahn synchronize_rcu();
3787e341f9c3SJoshua Hahn kfree(old_wi_state);
3788e341f9c3SJoshua Hahn }
37891ec8a6e3SJoshua Hahn }
3790e341f9c3SJoshua Hahn
3791*8fedac32SJoshua Hahn static struct kobj_attribute wi_auto_attr = {
3792*8fedac32SJoshua Hahn .attr = { .name = "auto", .mode = 0664 },
3793*8fedac32SJoshua Hahn .show = weighted_interleave_auto_show,
3794*8fedac32SJoshua Hahn .store = weighted_interleave_auto_store,
3795*8fedac32SJoshua Hahn };
3796e341f9c3SJoshua Hahn
wi_cleanup(void)3797cf8cecf2SRakie Kim static void wi_cleanup(void) {
3798e341f9c3SJoshua Hahn sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3799cf8cecf2SRakie Kim sysfs_wi_node_delete_all();
3800e341f9c3SJoshua Hahn wi_state_free();
3801bb52e89dSRakie Kim }
3802bb52e89dSRakie Kim
wi_kobj_release(struct kobject * wi_kobj)3803bb52e89dSRakie Kim static void wi_kobj_release(struct kobject *wi_kobj)
3804bb52e89dSRakie Kim {
3805cf8cecf2SRakie Kim kfree(wi_group);
3806dce41f5aSRakie Kim }
3807dce41f5aSRakie Kim
3808dce41f5aSRakie Kim static const struct kobj_type wi_ktype = {
3809dce41f5aSRakie Kim .sysfs_ops = &kobj_sysfs_ops,
3810bb52e89dSRakie Kim .release = wi_kobj_release,
3811dce41f5aSRakie Kim };
3812dce41f5aSRakie Kim
sysfs_wi_node_add(int nid)3813cf8cecf2SRakie Kim static int sysfs_wi_node_add(int nid)
3814dce41f5aSRakie Kim {
3815dec92bf9SRakie Kim int ret;
3816dce41f5aSRakie Kim char *name;
3817dec92bf9SRakie Kim struct iw_node_attr *new_attr;
3818dce41f5aSRakie Kim
3819dec92bf9SRakie Kim if (nid < 0 || nid >= nr_node_ids) {
3820dec92bf9SRakie Kim pr_err("invalid node id: %d\n", nid);
3821dec92bf9SRakie Kim return -EINVAL;
3822dec92bf9SRakie Kim }
3823dec92bf9SRakie Kim
3824bf4afc53SLinus Torvalds new_attr = kzalloc_obj(*new_attr);
3825dec92bf9SRakie Kim if (!new_attr)
3826dce41f5aSRakie Kim return -ENOMEM;
3827dce41f5aSRakie Kim
3828dce41f5aSRakie Kim name = kasprintf(GFP_KERNEL, "node%d", nid);
3829dce41f5aSRakie Kim if (!name) {
3830dec92bf9SRakie Kim kfree(new_attr);
3831dce41f5aSRakie Kim return -ENOMEM;
3832dce41f5aSRakie Kim }
3833dce41f5aSRakie Kim
3834dec92bf9SRakie Kim sysfs_attr_init(&new_attr->kobj_attr.attr);
3835dec92bf9SRakie Kim new_attr->kobj_attr.attr.name = name;
3836dec92bf9SRakie Kim new_attr->kobj_attr.attr.mode = 0644;
3837dec92bf9SRakie Kim new_attr->kobj_attr.show = node_show;
3838dec92bf9SRakie Kim new_attr->kobj_attr.store = node_store;
3839dec92bf9SRakie Kim new_attr->nid = nid;
3840dce41f5aSRakie Kim
3841dec92bf9SRakie Kim mutex_lock(&wi_group->kobj_lock);
3842dec92bf9SRakie Kim if (wi_group->nattrs[nid]) {
3843dec92bf9SRakie Kim mutex_unlock(&wi_group->kobj_lock);
3844dec92bf9SRakie Kim ret = -EEXIST;
3845dec92bf9SRakie Kim goto out;
3846dce41f5aSRakie Kim }
3847dce41f5aSRakie Kim
3848dec92bf9SRakie Kim ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
3849dec92bf9SRakie Kim if (ret) {
3850dec92bf9SRakie Kim mutex_unlock(&wi_group->kobj_lock);
3851dec92bf9SRakie Kim goto out;
3852dec92bf9SRakie Kim }
3853dec92bf9SRakie Kim wi_group->nattrs[nid] = new_attr;
3854dec92bf9SRakie Kim mutex_unlock(&wi_group->kobj_lock);
3855dce41f5aSRakie Kim return 0;
3856dec92bf9SRakie Kim
3857dec92bf9SRakie Kim out:
3858dec92bf9SRakie Kim kfree(new_attr->kobj_attr.attr.name);
3859dec92bf9SRakie Kim kfree(new_attr);
3860dec92bf9SRakie Kim return ret;
3861dec92bf9SRakie Kim }
3862dec92bf9SRakie Kim
wi_node_notifier(struct notifier_block * nb,unsigned long action,void * data)3863dec92bf9SRakie Kim static int wi_node_notifier(struct notifier_block *nb,
3864dec92bf9SRakie Kim unsigned long action, void *data)
3865dec92bf9SRakie Kim {
3866dec92bf9SRakie Kim int err;
3867cf0b61adSOscar Salvador struct node_notify *nn = data;
3868cf0b61adSOscar Salvador int nid = nn->nid;
3869dec92bf9SRakie Kim
3870dec92bf9SRakie Kim switch (action) {
3871cf0b61adSOscar Salvador case NODE_ADDED_FIRST_MEMORY:
3872dec92bf9SRakie Kim err = sysfs_wi_node_add(nid);
3873dec92bf9SRakie Kim if (err)
3874dec92bf9SRakie Kim pr_err("failed to add sysfs for node%d during hotplug: %d\n",
3875dec92bf9SRakie Kim nid, err);
3876dec92bf9SRakie Kim break;
3877cf0b61adSOscar Salvador case NODE_REMOVED_LAST_MEMORY:
3878dec92bf9SRakie Kim sysfs_wi_node_delete(nid);
3879dec92bf9SRakie Kim break;
3880dec92bf9SRakie Kim }
3881dec92bf9SRakie Kim
3882dec92bf9SRakie Kim return NOTIFY_OK;
3883dce41f5aSRakie Kim }
3884dce41f5aSRakie Kim
add_weighted_interleave_group(struct kobject * mempolicy_kobj)3885cf8cecf2SRakie Kim static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
3886dce41f5aSRakie Kim {
3887dce41f5aSRakie Kim int nid, err;
3888dce41f5aSRakie Kim
3889323bbfcfSLinus Torvalds wi_group = kzalloc_flex(*wi_group, nattrs, nr_node_ids);
3890cf8cecf2SRakie Kim if (!wi_group)
3891dce41f5aSRakie Kim return -ENOMEM;
3892dec92bf9SRakie Kim mutex_init(&wi_group->kobj_lock);
3893dce41f5aSRakie Kim
3894cf8cecf2SRakie Kim err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
3895dce41f5aSRakie Kim "weighted_interleave");
3896bb52e89dSRakie Kim if (err)
3897bb52e89dSRakie Kim goto err_put_kobj;
3898dce41f5aSRakie Kim
3899e341f9c3SJoshua Hahn err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
3900e341f9c3SJoshua Hahn if (err)
3901e341f9c3SJoshua Hahn goto err_put_kobj;
3902e341f9c3SJoshua Hahn
3903dec92bf9SRakie Kim for_each_online_node(nid) {
3904dec92bf9SRakie Kim if (!node_state(nid, N_MEMORY))
3905dec92bf9SRakie Kim continue;
3906dec92bf9SRakie Kim
3907cf8cecf2SRakie Kim err = sysfs_wi_node_add(nid);
3908dce41f5aSRakie Kim if (err) {
3909dec92bf9SRakie Kim pr_err("failed to add sysfs for node%d during init: %d\n",
3910dec92bf9SRakie Kim nid, err);
3911bb52e89dSRakie Kim goto err_cleanup_kobj;
3912dce41f5aSRakie Kim }
3913dce41f5aSRakie Kim }
3914bb52e89dSRakie Kim
3915cf0b61adSOscar Salvador hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
3916dce41f5aSRakie Kim return 0;
3917bb52e89dSRakie Kim
3918bb52e89dSRakie Kim err_cleanup_kobj:
3919cf8cecf2SRakie Kim wi_cleanup();
3920cf8cecf2SRakie Kim kobject_del(&wi_group->wi_kobj);
3921bb52e89dSRakie Kim err_put_kobj:
3922cf8cecf2SRakie Kim kobject_put(&wi_group->wi_kobj);
3923bb52e89dSRakie Kim return err;
3924dce41f5aSRakie Kim }
3925dce41f5aSRakie Kim
mempolicy_sysfs_init(void)3926dce41f5aSRakie Kim static int __init mempolicy_sysfs_init(void)
3927dce41f5aSRakie Kim {
3928dce41f5aSRakie Kim int err;
3929dce41f5aSRakie Kim static struct kobject *mempolicy_kobj;
3930dce41f5aSRakie Kim
3931bb52e89dSRakie Kim mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
3932bb52e89dSRakie Kim if (!mempolicy_kobj)
3933bb52e89dSRakie Kim return -ENOMEM;
3934dce41f5aSRakie Kim
3935dce41f5aSRakie Kim err = add_weighted_interleave_group(mempolicy_kobj);
3936bb52e89dSRakie Kim if (err)
3937bb52e89dSRakie Kim goto err_kobj;
3938dce41f5aSRakie Kim
3939bb52e89dSRakie Kim return 0;
3940bb52e89dSRakie Kim
3941bb52e89dSRakie Kim err_kobj:
3942bb52e89dSRakie Kim kobject_del(mempolicy_kobj);
3943bb52e89dSRakie Kim kobject_put(mempolicy_kobj);
3944dce41f5aSRakie Kim return err;
3945dce41f5aSRakie Kim }
3946dce41f5aSRakie Kim
3947dce41f5aSRakie Kim late_initcall(mempolicy_sysfs_init);
3948dce41f5aSRakie Kim #endif /* CONFIG_SYSFS */
3949