1/*
2 *  kernel/cpuset.c
3 *
4 *  Processor and Memory placement constraints for sets of tasks.
5 *
6 *  Copyright (C) 2003 BULL SA.
7 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
8 *  Copyright (C) 2006 Google, Inc
9 *
10 *  Portions derived from Patrick Mochel's sysfs code.
11 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
12 *
13 *  2003-10-10 Written by Simon Derr.
14 *  2003-10-22 Updates by Stephen Hemminger.
15 *  2004 May-July Rework by Paul Jackson.
16 *  2006 Rework by Paul Menage to use generic cgroups
17 *  2008 Rework of the scheduler domains and CPU hotplug handling
18 *       by Max Krasnyansky
19 *
20 *  This file is subject to the terms and conditions of the GNU General Public
21 *  License.  See the file COPYING in the main directory of the Linux
22 *  distribution for more details.
23 */
24
25#include <linux/cpu.h>
26#include <linux/cpumask.h>
27#include <linux/cpuset.h>
28#include <linux/delay.h>
29#include <linux/init.h>
30#include <linux/interrupt.h>
31#include <linux/kernel.h>
32#include <linux/mempolicy.h>
33#include <linux/mm.h>
34#include <linux/memory.h>
35#include <linux/export.h>
36#include <linux/rcupdate.h>
37#include <linux/sched.h>
38#include <linux/sched/deadline.h>
39#include <linux/sched/mm.h>
40#include <linux/sched/task.h>
41#include <linux/security.h>
42#include <linux/spinlock.h>
43#include <linux/oom.h>
44#include <linux/sched/isolation.h>
45#include <linux/cgroup.h>
46#include <linux/wait.h>
47#include <linux/workqueue.h>
48
49DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
50DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
51
/*
 * There could be abnormal cpuset configurations for cpu or memory
 * node binding; this key provides a quick, low-cost check for such
 * a situation.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
58
59/* See "Frequency meter" comments, below. */
60
61struct fmeter {
62	int cnt;		/* unprocessed events count */
63	int val;		/* most recent output value */
64	time64_t time;		/* clock (secs) when val computed */
65	spinlock_t lock;	/* guards read or write of above */
66};
67
68/*
69 * Invalid partition error code
70 */
71enum prs_errcode {
72	PERR_NONE = 0,
73	PERR_INVCPUS,
74	PERR_INVPARENT,
75	PERR_NOTPART,
76	PERR_NOTEXCL,
77	PERR_NOCPUS,
78	PERR_HOTPLUG,
79	PERR_CPUSEMPTY,
80	PERR_HKEEPING,
81};
82
83static const char * const perr_strings[] = {
84	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus.exclusive",
85	[PERR_INVPARENT] = "Parent is an invalid partition root",
86	[PERR_NOTPART]   = "Parent is not a partition root",
87	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
88	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
89	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
90	[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
91	[PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
92};
93
94struct cpuset {
95	struct cgroup_subsys_state css;
96
97	unsigned long flags;		/* "unsigned long" so bitops work */
98
	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset. They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective masks.
	 */
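	/*
	 * Worked example of the rule above (illustrative values only,
	 * not taken from any real configuration):
	 *
	 *	parent's effective_cpus    = 0-3
	 *	this cpuset's cpus_allowed = 2-5
	 *	=> this cpuset's effective_cpus = 2-3
	 *
	 * If cpus_allowed were 6-7 instead, the intersection would be
	 * empty and effective_cpus would fall back to the parent's 0-3.
	 */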
118
	/* user-configured CPUs and Memory Nodes allowed to tasks */
120	cpumask_var_t cpus_allowed;
121	nodemask_t mems_allowed;
122
	/* effective CPUs and Memory Nodes allowed to tasks */
124	cpumask_var_t effective_cpus;
125	nodemask_t effective_mems;
126
	/*
	 * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
	 *
	 * These exclusive CPUs must be a subset of cpus_allowed. A parent
	 * cgroup can only grant exclusive CPUs to one of its children.
	 *
	 * When the cgroup becomes a valid partition root, effective_xcpus
	 * defaults to cpus_allowed if not set. The effective_cpus of a valid
	 * partition root comes solely from its effective_xcpus and some of the
	 * effective_xcpus may be distributed to sub-partitions below & hence
	 * excluded from its effective_cpus.
	 */
139	cpumask_var_t effective_xcpus;
140
141	/*
142	 * Exclusive CPUs as requested by the user (default hierarchy only)
143	 */
144	cpumask_var_t exclusive_cpus;
145
	/*
	 * These are the old Memory Nodes that tasks in this cpuset took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have the tasks' nodemasks updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
156	nodemask_t old_mems_allowed;
157
158	struct fmeter fmeter;		/* memory_pressure filter */
159
160	/*
161	 * Tasks are being attached to this cpuset.  Used to prevent
162	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
163	 */
164	int attach_in_progress;
165
166	/* partition number for rebuild_sched_domains() */
167	int pn;
168
169	/* for custom sched domain */
170	int relax_domain_level;
171
172	/* number of valid sub-partitions */
173	int nr_subparts;
174
175	/* partition root state */
176	int partition_root_state;
177
178	/*
179	 * Default hierarchy only:
180	 * use_parent_ecpus - set if using parent's effective_cpus
181	 * child_ecpus_count - # of children with use_parent_ecpus set
182	 */
183	int use_parent_ecpus;
184	int child_ecpus_count;
185
186	/*
187	 * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
188	 * know when to rebuild associated root domain bandwidth information.
189	 */
190	int nr_deadline_tasks;
191	int nr_migrate_dl_tasks;
192	u64 sum_migrate_dl_bw;
193
194	/* Invalid partition error code, not lock protected */
195	enum prs_errcode prs_err;
196
197	/* Handle for cpuset.cpus.partition */
198	struct cgroup_file partition_file;
199
	/* Remote partition sibling list anchored at remote_children */
201	struct list_head remote_sibling;
202};
203
/*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
 */
207struct cpuset_remove_tasks_struct {
208	struct work_struct work;
209	struct cpuset *cs;
210};
211
212/*
213 * Exclusive CPUs distributed out to sub-partitions of top_cpuset
214 */
215static cpumask_var_t	subpartitions_cpus;
216
217/*
218 * Exclusive CPUs in isolated partitions
219 */
220static cpumask_var_t	isolated_cpus;
221
222/* List of remote partition root children */
223static struct list_head remote_children;
224
225/*
226 * Partition root states:
227 *
228 *   0 - member (not a partition root)
229 *   1 - partition root
230 *   2 - partition root without load balancing (isolated)
231 *  -1 - invalid partition root
232 *  -2 - invalid isolated partition root
233 */
234#define PRS_MEMBER		0
235#define PRS_ROOT		1
236#define PRS_ISOLATED		2
237#define PRS_INVALID_ROOT	-1
238#define PRS_INVALID_ISOLATED	-2
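/*
 * For example, when a valid partition root turns invalid, its state is
 * simply negated: PRS_ROOT (1) becomes PRS_INVALID_ROOT (-1) and
 * PRS_ISOLATED (2) becomes PRS_INVALID_ISOLATED (-2).  See
 * make_partition_invalid() below.
 */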
239
240static inline bool is_prs_invalid(int prs_state)
241{
242	return prs_state < 0;
243}
244
245/*
246 * Temporary cpumasks for working with partitions that are passed among
247 * functions to avoid memory allocation in inner functions.
248 */
249struct tmpmasks {
250	cpumask_var_t addmask, delmask;	/* For partition root */
251	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
252};
253
254static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
255{
256	return css ? container_of(css, struct cpuset, css) : NULL;
257}
258
259/* Retrieve the cpuset for a task */
260static inline struct cpuset *task_cs(struct task_struct *task)
261{
262	return css_cs(task_css(task, cpuset_cgrp_id));
263}
264
265static inline struct cpuset *parent_cs(struct cpuset *cs)
266{
267	return css_cs(cs->css.parent);
268}
269
270void inc_dl_tasks_cs(struct task_struct *p)
271{
272	struct cpuset *cs = task_cs(p);
273
274	cs->nr_deadline_tasks++;
275}
276
277void dec_dl_tasks_cs(struct task_struct *p)
278{
279	struct cpuset *cs = task_cs(p);
280
281	cs->nr_deadline_tasks--;
282}
283
284/* bits in struct cpuset flags field */
285typedef enum {
286	CS_ONLINE,
287	CS_CPU_EXCLUSIVE,
288	CS_MEM_EXCLUSIVE,
289	CS_MEM_HARDWALL,
290	CS_MEMORY_MIGRATE,
291	CS_SCHED_LOAD_BALANCE,
292	CS_SPREAD_PAGE,
293	CS_SPREAD_SLAB,
294} cpuset_flagbits_t;
295
296/* convenient tests for these bits */
297static inline bool is_cpuset_online(struct cpuset *cs)
298{
299	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
300}
301
302static inline int is_cpu_exclusive(const struct cpuset *cs)
303{
304	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
305}
306
307static inline int is_mem_exclusive(const struct cpuset *cs)
308{
309	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
310}
311
312static inline int is_mem_hardwall(const struct cpuset *cs)
313{
314	return test_bit(CS_MEM_HARDWALL, &cs->flags);
315}
316
317static inline int is_sched_load_balance(const struct cpuset *cs)
318{
319	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
320}
321
322static inline int is_memory_migrate(const struct cpuset *cs)
323{
324	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
325}
326
327static inline int is_spread_page(const struct cpuset *cs)
328{
329	return test_bit(CS_SPREAD_PAGE, &cs->flags);
330}
331
332static inline int is_spread_slab(const struct cpuset *cs)
333{
334	return test_bit(CS_SPREAD_SLAB, &cs->flags);
335}
336
337static inline int is_partition_valid(const struct cpuset *cs)
338{
339	return cs->partition_root_state > 0;
340}
341
342static inline int is_partition_invalid(const struct cpuset *cs)
343{
344	return cs->partition_root_state < 0;
345}
346
347/*
348 * Callers should hold callback_lock to modify partition_root_state.
349 */
350static inline void make_partition_invalid(struct cpuset *cs)
351{
352	if (cs->partition_root_state > 0)
353		cs->partition_root_state = -cs->partition_root_state;
354}
355
/*
 * Send a notification event whenever partition_root_state changes.
 */
359static inline void notify_partition_change(struct cpuset *cs, int old_prs)
360{
361	if (old_prs == cs->partition_root_state)
362		return;
363	cgroup_file_notify(&cs->partition_file);
364
365	/* Reset prs_err if not invalid */
366	if (is_partition_valid(cs))
367		WRITE_ONCE(cs->prs_err, PERR_NONE);
368}
369
370static struct cpuset top_cpuset = {
371	.flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
372		 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
373	.partition_root_state = PRS_ROOT,
374	.relax_domain_level = -1,
375	.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
376};
377
378/**
379 * cpuset_for_each_child - traverse online children of a cpuset
380 * @child_cs: loop cursor pointing to the current child
381 * @pos_css: used for iteration
382 * @parent_cs: target cpuset to walk children of
383 *
384 * Walk @child_cs through the online children of @parent_cs.  Must be used
385 * with RCU read locked.
386 */
387#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
388	css_for_each_child((pos_css), &(parent_cs)->css)		\
389		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
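/*
 * Usage sketch (illustrative only; visit_child_example() is a hypothetical
 * helper, not part of this file):
 *
 *	rcu_read_lock();
 *	cpuset_for_each_child(child, pos_css, parent)
 *		visit_child_example(child);
 *	rcu_read_unlock();
 */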
390
/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset whose descendants are walked
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree.  @root_cs is included in the
 * iteration and is the first node to be visited.
 */
402#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
403	css_for_each_descendant_pre((pos_css), &(root_cs)->css)		\
404		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
405
406/*
407 * There are two global locks guarding cpuset structures - cpuset_mutex and
408 * callback_lock. We also require taking task_lock() when dereferencing a
409 * task's cpuset pointer. See "The task_lock() exception", at the end of this
410 * comment.  The cpuset code uses only cpuset_mutex. Other kernel subsystems
411 * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
412 * structures. Note that cpuset_mutex needs to be a mutex as it is used in
413 * paths that rely on priority inheritance (e.g. scheduler - on RT) for
414 * correctness.
415 *
416 * A task must hold both locks to modify cpusets.  If a task holds
417 * cpuset_mutex, it blocks others, ensuring that it is the only task able to
418 * also acquire callback_lock and be able to modify cpusets.  It can perform
419 * various checks on the cpuset structure first, knowing nothing will change.
420 * It can also allocate memory while just holding cpuset_mutex.  While it is
421 * performing these checks, various callback routines can briefly acquire
422 * callback_lock to query cpusets.  Once it is ready to make the changes, it
423 * takes callback_lock, blocking everyone else.
424 *
425 * Calls to the kernel memory allocator can not be made while holding
426 * callback_lock, as that would risk double tripping on callback_lock
427 * from one of the callbacks into the cpuset code from within
428 * __alloc_pages().
429 *
430 * If a task is only holding callback_lock, then it has read-only
431 * access to cpusets.
432 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task; we use alloc_lock in the task_struct to protect
 * them.
436 *
437 * The cpuset_common_file_read() handlers only hold callback_lock across
438 * small pieces of code, such as when reading out possibly multi-word
439 * cpumasks and nodemasks.
440 *
441 * Accessing a task's cpuset should be done in accordance with the
442 * guidelines for accessing subsystem state in kernel/cgroup.c
443 */
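/*
 * Lock ordering sketch for a typical cpuset update path (illustrative
 * only; see the actual update functions below for the real sequences):
 *
 *	cpus_read_lock();
 *	mutex_lock(&cpuset_mutex);	(validate, allocate memory)
 *	spin_lock_irq(&callback_lock);	(publish the new masks)
 *	...
 *	spin_unlock_irq(&callback_lock);
 *	mutex_unlock(&cpuset_mutex);
 *	cpus_read_unlock();
 */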
444
445static DEFINE_MUTEX(cpuset_mutex);
446
447void cpuset_lock(void)
448{
449	mutex_lock(&cpuset_mutex);
450}
451
452void cpuset_unlock(void)
453{
454	mutex_unlock(&cpuset_mutex);
455}
456
457static DEFINE_SPINLOCK(callback_lock);
458
459static struct workqueue_struct *cpuset_migrate_mm_wq;
460
461static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
462
463static inline void check_insane_mems_config(nodemask_t *nodes)
464{
465	if (!cpusets_insane_config() &&
466		movable_only_nodes(nodes)) {
467		static_branch_enable(&cpusets_insane_config_key);
468		pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
469			"Cpuset allocations might fail even with a lot of memory available.\n",
470			nodemask_pr_args(nodes));
471	}
472}
473
474/*
475 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
476 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
477 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
478 * With v2 behavior, "cpus" and "mems" are always what the users have
479 * requested and won't be changed by hotplug events. Only the effective
480 * cpus or mems will be affected.
481 */
482static inline bool is_in_v2_mode(void)
483{
484	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
485	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
486}
487
488/**
489 * partition_is_populated - check if partition has tasks
490 * @cs: partition root to be checked
491 * @excluded_child: a child cpuset to be excluded in task checking
492 * Return: true if there are tasks, false otherwise
493 *
494 * It is assumed that @cs is a valid partition root. @excluded_child should
495 * be non-NULL when this cpuset is going to become a partition itself.
496 */
497static inline bool partition_is_populated(struct cpuset *cs,
498					  struct cpuset *excluded_child)
499{
500	struct cgroup_subsys_state *css;
501	struct cpuset *child;
502
503	if (cs->css.cgroup->nr_populated_csets)
504		return true;
505	if (!excluded_child && !cs->nr_subparts)
506		return cgroup_is_populated(cs->css.cgroup);
507
508	rcu_read_lock();
509	cpuset_for_each_child(child, css, cs) {
510		if (child == excluded_child)
511			continue;
512		if (is_partition_valid(child))
513			continue;
514		if (cgroup_is_populated(child->css.cgroup)) {
515			rcu_read_unlock();
516			return true;
517		}
518	}
519	rcu_read_unlock();
520	return false;
521}
522
523/*
 * Return in pmask the portion of a task's cpuset's cpus_allowed that
525 * are online and are capable of running the task.  If none are found,
526 * walk up the cpuset hierarchy until we find one that does have some
527 * appropriate cpus.
528 *
529 * One way or another, we guarantee to return some non-empty subset
530 * of cpu_online_mask.
531 *
532 * Call with callback_lock or cpuset_mutex held.
533 */
534static void guarantee_online_cpus(struct task_struct *tsk,
535				  struct cpumask *pmask)
536{
537	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
538	struct cpuset *cs;
539
540	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
541		cpumask_copy(pmask, cpu_online_mask);
542
543	rcu_read_lock();
544	cs = task_cs(tsk);
545
546	while (!cpumask_intersects(cs->effective_cpus, pmask))
547		cs = parent_cs(cs);
548
549	cpumask_and(pmask, pmask, cs->effective_cpus);
550	rcu_read_unlock();
551}
552
553/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
555 * are online, with memory.  If none are online with memory, walk
556 * up the cpuset hierarchy until we find one that does have some
557 * online mems.  The top cpuset always has some mems online.
558 *
559 * One way or another, we guarantee to return some non-empty subset
560 * of node_states[N_MEMORY].
561 *
562 * Call with callback_lock or cpuset_mutex held.
563 */
564static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
565{
566	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
567		cs = parent_cs(cs);
568	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
569}
570
/*
 * Update a task's spread flags to match the cpuset's page/slab spread flags.
 *
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
577static void cpuset_update_task_spread_flags(struct cpuset *cs,
578					struct task_struct *tsk)
579{
580	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
581		return;
582
583	if (is_spread_page(cs))
584		task_set_spread_page(tsk);
585	else
586		task_clear_spread_page(tsk);
587
588	if (is_spread_slab(cs))
589		task_set_spread_slab(tsk);
590	else
591		task_clear_spread_slab(tsk);
592}
593
594/*
595 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
596 *
597 * One cpuset is a subset of another if all its allowed CPUs and
598 * Memory Nodes are a subset of the other, and its exclusive flags
599 * are only set if the other's are set.  Call holding cpuset_mutex.
600 */
601
602static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
603{
604	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
605		nodes_subset(p->mems_allowed, q->mems_allowed) &&
606		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
607		is_mem_exclusive(p) <= is_mem_exclusive(q);
608}
609
/**
 * alloc_cpumasks - allocate cpumasks for a cpuset or a tmpmasks structure
 * @cs:  the cpuset that has cpumasks to be allocated
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.
 */
618static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
619{
620	cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
621
622	if (cs) {
623		pmask1 = &cs->cpus_allowed;
624		pmask2 = &cs->effective_cpus;
625		pmask3 = &cs->effective_xcpus;
626		pmask4 = &cs->exclusive_cpus;
627	} else {
628		pmask1 = &tmp->new_cpus;
629		pmask2 = &tmp->addmask;
630		pmask3 = &tmp->delmask;
631		pmask4 = NULL;
632	}
633
634	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
635		return -ENOMEM;
636
637	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
638		goto free_one;
639
640	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
641		goto free_two;
642
643	if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
644		goto free_three;
645
646
647	return 0;
648
649free_three:
650	free_cpumask_var(*pmask3);
651free_two:
652	free_cpumask_var(*pmask2);
653free_one:
654	free_cpumask_var(*pmask1);
655	return -ENOMEM;
656}
657
/**
 * free_cpumasks - free cpumasks in a cpuset or a tmpmasks structure
 * @cs:  the cpuset that has cpumasks to be freed
 * @tmp: the tmpmasks structure pointer
 */
663static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
664{
665	if (cs) {
666		free_cpumask_var(cs->cpus_allowed);
667		free_cpumask_var(cs->effective_cpus);
668		free_cpumask_var(cs->effective_xcpus);
669		free_cpumask_var(cs->exclusive_cpus);
670	}
671	if (tmp) {
672		free_cpumask_var(tmp->new_cpus);
673		free_cpumask_var(tmp->addmask);
674		free_cpumask_var(tmp->delmask);
675	}
676}
677
678/**
679 * alloc_trial_cpuset - allocate a trial cpuset
680 * @cs: the cpuset that the trial cpuset duplicates
681 */
682static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
683{
684	struct cpuset *trial;
685
686	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
687	if (!trial)
688		return NULL;
689
690	if (alloc_cpumasks(trial, NULL)) {
691		kfree(trial);
692		return NULL;
693	}
694
695	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
696	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
697	cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
698	cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
699	return trial;
700}
701
702/**
703 * free_cpuset - free the cpuset
704 * @cs: the cpuset to be freed
705 */
706static inline void free_cpuset(struct cpuset *cs)
707{
708	free_cpumasks(cs, NULL);
709	kfree(cs);
710}
711
712static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
713{
714	return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
715	       cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed
716						  : cs->effective_xcpus;
717}
718
719/*
720 * cpusets_are_exclusive() - check if two cpusets are exclusive
721 *
722 * Return true if exclusive, false if not
723 */
724static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
725{
726	struct cpumask *xcpus1 = fetch_xcpus(cs1);
727	struct cpumask *xcpus2 = fetch_xcpus(cs2);
728
729	if (cpumask_intersects(xcpus1, xcpus2))
730		return false;
731	return true;
732}
733
734/*
735 * validate_change_legacy() - Validate conditions specific to legacy (v1)
736 *                            behavior.
737 */
738static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
739{
740	struct cgroup_subsys_state *css;
741	struct cpuset *c, *par;
742	int ret;
743
744	WARN_ON_ONCE(!rcu_read_lock_held());
745
746	/* Each of our child cpusets must be a subset of us */
747	ret = -EBUSY;
748	cpuset_for_each_child(c, css, cur)
749		if (!is_cpuset_subset(c, trial))
750			goto out;
751
752	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
753	ret = -EACCES;
754	par = parent_cs(cur);
755	if (par && !is_cpuset_subset(trial, par))
756		goto out;
757
758	ret = 0;
759out:
760	return ret;
761}
762
763/*
764 * validate_change() - Used to validate that any proposed cpuset change
765 *		       follows the structural rules for cpusets.
766 *
767 * If we replaced the flag and mask values of the current cpuset
768 * (cur) with those values in the trial cpuset (trial), would
769 * our various subset and exclusive rules still be valid?  Presumes
770 * cpuset_mutex held.
771 *
772 * 'cur' is the address of an actual, in-use cpuset.  Operations
773 * such as list traversal that depend on the actual address of the
774 * cpuset in the list must use cur below, not trial.
775 *
776 * 'trial' is the address of bulk structure copy of cur, with
777 * perhaps one or more of the fields cpus_allowed, mems_allowed,
778 * or flags changed to new, trial values.
779 *
780 * Return 0 if valid, -errno if not.
781 */
782
783static int validate_change(struct cpuset *cur, struct cpuset *trial)
784{
785	struct cgroup_subsys_state *css;
786	struct cpuset *c, *par;
787	int ret = 0;
788
789	rcu_read_lock();
790
791	if (!is_in_v2_mode())
792		ret = validate_change_legacy(cur, trial);
793	if (ret)
794		goto out;
795
796	/* Remaining checks don't apply to root cpuset */
797	if (cur == &top_cpuset)
798		goto out;
799
800	par = parent_cs(cur);
801
802	/*
803	 * Cpusets with tasks - existing or newly being attached - can't
804	 * be changed to have empty cpus_allowed or mems_allowed.
805	 */
806	ret = -ENOSPC;
807	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
808		if (!cpumask_empty(cur->cpus_allowed) &&
809		    cpumask_empty(trial->cpus_allowed))
810			goto out;
811		if (!nodes_empty(cur->mems_allowed) &&
812		    nodes_empty(trial->mems_allowed))
813			goto out;
814	}
815
816	/*
817	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
818	 * tasks.
819	 */
820	ret = -EBUSY;
821	if (is_cpu_exclusive(cur) &&
822	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
823				       trial->cpus_allowed))
824		goto out;
825
826	/*
827	 * If either I or some sibling (!= me) is exclusive, we can't
828	 * overlap
829	 */
830	ret = -EINVAL;
831	cpuset_for_each_child(c, css, par) {
832		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
833		    c != cur) {
834			if (!cpusets_are_exclusive(trial, c))
835				goto out;
836		}
837		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
838		    c != cur &&
839		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
840			goto out;
841	}
842
843	ret = 0;
844out:
845	rcu_read_unlock();
846	return ret;
847}
848
849#ifdef CONFIG_SMP
850/*
851 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective_cpus masks?
853 */
854static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
855{
856	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
857}
858
859static void
860update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
861{
862	if (dattr->relax_domain_level < c->relax_domain_level)
863		dattr->relax_domain_level = c->relax_domain_level;
864	return;
865}
866
867static void update_domain_attr_tree(struct sched_domain_attr *dattr,
868				    struct cpuset *root_cs)
869{
870	struct cpuset *cp;
871	struct cgroup_subsys_state *pos_css;
872
873	rcu_read_lock();
874	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
875		/* skip the whole subtree if @cp doesn't have any CPU */
876		if (cpumask_empty(cp->cpus_allowed)) {
877			pos_css = css_rightmost_descendant(pos_css);
878			continue;
879		}
880
881		if (is_sched_load_balance(cp))
882			update_domain_attr(dattr, cp);
883	}
884	rcu_read_unlock();
885}
886
887/* Must be called with cpuset_mutex held.  */
888static inline int nr_cpusets(void)
889{
890	/* jump label reference count + the top-level cpuset */
891	return static_key_count(&cpusets_enabled_key.key) + 1;
892}
893
894/*
895 * generate_sched_domains()
896 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of the system's CPUs.
900 * The output of this function needs to be passed to kernel/sched/core.c
901 * partition_sched_domains() routine, which will rebuild the scheduler's
902 * load balancing domains (sched domains) as specified by that partial
903 * partition.
904 *
905 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
906 * for a background explanation of this.
907 *
908 * Does not return errors, on the theory that the callers of this
909 * routine would rather not worry about failures to rebuild sched
910 * domains when operating in the severe memory shortage situations
911 * that could cause allocation failures below.
912 *
913 * Must be called with cpuset_mutex held.
914 *
915 * The three key local variables below are:
 *    cp - cpuset pointer, used (together with pos_css) to perform a
 *	   top-down scan of all cpusets. For our purposes, rebuilding
 *	   the scheduler's sched domains, we can ignore cpusets that
 *	   are not is_sched_load_balance.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e. the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
927 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
928 *	   the kernel/sched/core.c routine partition_sched_domains() in a
929 *	   convenient format, that can be easily compared to the prior
930 *	   value to determine what partition elements (sched domains)
931 *	   were changed (added or removed.)
932 *
933 * Finding the best partition (set of domains):
 *	The triple nested loops below over i, j, k scan over the
 *	load balanced cpusets (using the array of cpuset pointers in
 *	csa[]) looking for pairs of cpusets that have overlapping
 *	cpus_allowed but don't yet have the same 'pn' partition
 *	number, and gives them the same partition number.  It keeps
 *	looping on the 'restart' label until it can no longer find
 *	any such pairs.
941 *
942 *	The union of the cpus_allowed masks from the set of
943 *	all cpusets having the same 'pn' value then form the one
944 *	element of the partition (one sched domain) to be passed to
945 *	partition_sched_domains().
946 */
947static int generate_sched_domains(cpumask_var_t **domains,
948			struct sched_domain_attr **attributes)
949{
950	struct cpuset *cp;	/* top-down scan of cpusets */
951	struct cpuset **csa;	/* array of all cpuset ptrs */
952	int csn;		/* how many cpuset ptrs in csa so far */
953	int i, j, k;		/* indices for partition finding loops */
954	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
955	struct sched_domain_attr *dattr;  /* attributes for custom domains */
956	int ndoms = 0;		/* number of sched domains in result */
957	int nslot;		/* next empty doms[] struct cpumask slot */
958	struct cgroup_subsys_state *pos_css;
959	bool root_load_balance = is_sched_load_balance(&top_cpuset);
960
961	doms = NULL;
962	dattr = NULL;
963	csa = NULL;
964
965	/* Special case for the 99% of systems with one, full, sched domain */
966	if (root_load_balance && !top_cpuset.nr_subparts) {
967		ndoms = 1;
968		doms = alloc_sched_domains(ndoms);
969		if (!doms)
970			goto done;
971
972		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
973		if (dattr) {
974			*dattr = SD_ATTR_INIT;
975			update_domain_attr_tree(dattr, &top_cpuset);
976		}
977		cpumask_and(doms[0], top_cpuset.effective_cpus,
978			    housekeeping_cpumask(HK_TYPE_DOMAIN));
979
980		goto done;
981	}
982
983	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
984	if (!csa)
985		goto done;
986	csn = 0;
987
988	rcu_read_lock();
989	if (root_load_balance)
990		csa[csn++] = &top_cpuset;
991	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
992		if (cp == &top_cpuset)
993			continue;
994		/*
995		 * Continue traversing beyond @cp iff @cp has some CPUs and
996		 * isn't load balancing.  The former is obvious.  The
997		 * latter: All child cpusets contain a subset of the
998		 * parent's cpus, so just skip them, and then we call
999		 * update_domain_attr_tree() to calc relax_domain_level of
1000		 * the corresponding sched domain.
1001		 *
1002		 * If root is load-balancing, we can skip @cp if it
1003		 * is a subset of the root's effective_cpus.
1004		 */
1005		if (!cpumask_empty(cp->cpus_allowed) &&
1006		    !(is_sched_load_balance(cp) &&
1007		      cpumask_intersects(cp->cpus_allowed,
1008					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
1009			continue;
1010
1011		if (root_load_balance &&
1012		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
1013			continue;
1014
1015		if (is_sched_load_balance(cp) &&
1016		    !cpumask_empty(cp->effective_cpus))
1017			csa[csn++] = cp;
1018
1019		/* skip @cp's subtree if not a partition root */
1020		if (!is_partition_valid(cp))
1021			pos_css = css_rightmost_descendant(pos_css);
1022	}
1023	rcu_read_unlock();
1024
1025	for (i = 0; i < csn; i++)
1026		csa[i]->pn = i;
1027	ndoms = csn;
1028
1029restart:
1030	/* Find the best partition (set of sched domains) */
1031	for (i = 0; i < csn; i++) {
1032		struct cpuset *a = csa[i];
1033		int apn = a->pn;
1034
1035		for (j = 0; j < csn; j++) {
1036			struct cpuset *b = csa[j];
1037			int bpn = b->pn;
1038
1039			if (apn != bpn && cpusets_overlap(a, b)) {
1040				for (k = 0; k < csn; k++) {
1041					struct cpuset *c = csa[k];
1042
1043					if (c->pn == bpn)
1044						c->pn = apn;
1045				}
1046				ndoms--;	/* one less element */
1047				goto restart;
1048			}
1049		}
1050	}
1051
1052	/*
1053	 * Now we know how many domains to create.
1054	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
1055	 */
1056	doms = alloc_sched_domains(ndoms);
1057	if (!doms)
1058		goto done;
1059
1060	/*
1061	 * The rest of the code, including the scheduler, can deal with
1062	 * dattr==NULL case. No need to abort if alloc fails.
1063	 */
1064	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
1065			      GFP_KERNEL);
1066
1067	for (nslot = 0, i = 0; i < csn; i++) {
1068		struct cpuset *a = csa[i];
1069		struct cpumask *dp;
1070		int apn = a->pn;
1071
1072		if (apn < 0) {
1073			/* Skip completed partitions */
1074			continue;
1075		}
1076
1077		dp = doms[nslot];
1078
1079		if (nslot == ndoms) {
1080			static int warnings = 10;
1081			if (warnings) {
1082				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
1083					nslot, ndoms, csn, i, apn);
1084				warnings--;
1085			}
1086			continue;
1087		}
1088
1089		cpumask_clear(dp);
1090		if (dattr)
1091			*(dattr + nslot) = SD_ATTR_INIT;
1092		for (j = i; j < csn; j++) {
1093			struct cpuset *b = csa[j];
1094
1095			if (apn == b->pn) {
1096				cpumask_or(dp, dp, b->effective_cpus);
1097				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
1098				if (dattr)
1099					update_domain_attr_tree(dattr + nslot, b);
1100
1101				/* Done with this partition */
1102				b->pn = -1;
1103			}
1104		}
1105		nslot++;
1106	}
1107	BUG_ON(nslot != ndoms);
1108
1109done:
1110	kfree(csa);
1111
1112	/*
1113	 * Fallback to the default domain if kmalloc() failed.
1114	 * See comments in partition_sched_domains().
1115	 */
1116	if (doms == NULL)
1117		ndoms = 1;
1118
1119	*domains    = doms;
1120	*attributes = dattr;
1121	return ndoms;
1122}
1123
1124static void dl_update_tasks_root_domain(struct cpuset *cs)
1125{
1126	struct css_task_iter it;
1127	struct task_struct *task;
1128
1129	if (cs->nr_deadline_tasks == 0)
1130		return;
1131
1132	css_task_iter_start(&cs->css, 0, &it);
1133
1134	while ((task = css_task_iter_next(&it)))
1135		dl_add_task_root_domain(task);
1136
1137	css_task_iter_end(&it);
1138}
1139
1140static void dl_rebuild_rd_accounting(void)
1141{
1142	struct cpuset *cs = NULL;
1143	struct cgroup_subsys_state *pos_css;
1144
1145	lockdep_assert_held(&cpuset_mutex);
1146	lockdep_assert_cpus_held();
1147	lockdep_assert_held(&sched_domains_mutex);
1148
1149	rcu_read_lock();
1150
1151	/*
1152	 * Clear default root domain DL accounting, it will be computed again
1153	 * if a task belongs to it.
1154	 */
1155	dl_clear_root_domain(&def_root_domain);
1156
1157	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1158
1159		if (cpumask_empty(cs->effective_cpus)) {
1160			pos_css = css_rightmost_descendant(pos_css);
1161			continue;
1162		}
1163
1164		css_get(&cs->css);
1165
1166		rcu_read_unlock();
1167
1168		dl_update_tasks_root_domain(cs);
1169
1170		rcu_read_lock();
1171		css_put(&cs->css);
1172	}
1173	rcu_read_unlock();
1174}
1175
1176static void
1177partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1178				    struct sched_domain_attr *dattr_new)
1179{
1180	mutex_lock(&sched_domains_mutex);
1181	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
1182	dl_rebuild_rd_accounting();
1183	mutex_unlock(&sched_domains_mutex);
1184}
1185
1186/*
1187 * Rebuild scheduler domains.
1188 *
1189 * If the flag 'sched_load_balance' of any cpuset with non-empty
1190 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
1191 * which has that flag enabled, or if any cpuset with a non-empty
1192 * 'cpus' is removed, then call this routine to rebuild the
1193 * scheduler's dynamic sched domains.
1194 *
1195 * Call with cpuset_mutex held.  Takes cpus_read_lock().
1196 */
1197static void rebuild_sched_domains_locked(void)
1198{
1199	struct cgroup_subsys_state *pos_css;
1200	struct sched_domain_attr *attr;
1201	cpumask_var_t *doms;
1202	struct cpuset *cs;
1203	int ndoms;
1204
1205	lockdep_assert_cpus_held();
1206	lockdep_assert_held(&cpuset_mutex);
1207
1208	/*
1209	 * If we have raced with CPU hotplug, return early to avoid
1210	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyway, cpuset_handle_hotplug() will rebuild the sched domains.
1212	 *
1213	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
1214	 * should be the same as the active CPUs, so checking only top_cpuset
1215	 * is enough to detect racing CPU offlines.
1216	 */
1217	if (cpumask_empty(subpartitions_cpus) &&
1218	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
1219		return;
1220
1221	/*
1222	 * With subpartition CPUs, however, the effective CPUs of a partition
1223	 * root should be only a subset of the active CPUs.  Since a CPU in any
1224	 * partition root could be offlined, all must be checked.
1225	 */
1226	if (top_cpuset.nr_subparts) {
1227		rcu_read_lock();
1228		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1229			if (!is_partition_valid(cs)) {
1230				pos_css = css_rightmost_descendant(pos_css);
1231				continue;
1232			}
1233			if (!cpumask_subset(cs->effective_cpus,
1234					    cpu_active_mask)) {
1235				rcu_read_unlock();
1236				return;
1237			}
1238		}
1239		rcu_read_unlock();
1240	}
1241
1242	/* Generate domain masks and attrs */
1243	ndoms = generate_sched_domains(&doms, &attr);
1244
1245	/* Have scheduler rebuild the domains */
1246	partition_and_rebuild_sched_domains(ndoms, doms, attr);
1247}
1248#else /* !CONFIG_SMP */
1249static void rebuild_sched_domains_locked(void)
1250{
1251}
1252#endif /* CONFIG_SMP */
1253
1254static void rebuild_sched_domains_cpuslocked(void)
1255{
1256	mutex_lock(&cpuset_mutex);
1257	rebuild_sched_domains_locked();
1258	mutex_unlock(&cpuset_mutex);
1259}
1260
1261void rebuild_sched_domains(void)
1262{
1263	cpus_read_lock();
1264	rebuild_sched_domains_cpuslocked();
1265	cpus_read_unlock();
1266}
1267
1268/**
1269 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
1270 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
1271 * @new_cpus: the temp variable for the new effective_cpus mask
1272 *
1273 * Iterate through each task of @cs updating its cpus_allowed to the
1274 * effective cpuset's.  As this function is called with cpuset_mutex held,
1275 * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
1276 * is used instead of effective_cpus to make sure all offline CPUs are also
1277 * included as hotplug code won't update cpumasks for tasks in top_cpuset.
1278 */
1279static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
1280{
1281	struct css_task_iter it;
1282	struct task_struct *task;
1283	bool top_cs = cs == &top_cpuset;
1284
1285	css_task_iter_start(&cs->css, 0, &it);
1286	while ((task = css_task_iter_next(&it))) {
1287		const struct cpumask *possible_mask = task_cpu_possible_mask(task);
1288
1289		if (top_cs) {
1290			/*
1291			 * Percpu kthreads in top_cpuset are ignored
1292			 */
1293			if (kthread_is_per_cpu(task))
1294				continue;
1295			cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
1296		} else {
1297			cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
1298		}
1299		set_cpus_allowed_ptr(task, new_cpus);
1300	}
1301	css_task_iter_end(&it);
1302}
1303
1304/**
1305 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
1306 * @new_cpus: the temp variable for the new effective_cpus mask
 * @cs: the cpuset whose new effective_cpus mask needs to be recomputed
1308 * @parent: the parent cpuset
1309 *
1310 * The result is valid only if the given cpuset isn't a partition root.
1311 */
1312static void compute_effective_cpumask(struct cpumask *new_cpus,
1313				      struct cpuset *cs, struct cpuset *parent)
1314{
1315	cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1316}
1317
1318/*
1319 * Commands for update_parent_effective_cpumask
1320 */
1321enum partition_cmd {
1322	partcmd_enable,		/* Enable partition root	  */
1323	partcmd_enablei,	/* Enable isolated partition root */
1324	partcmd_disable,	/* Disable partition root	  */
1325	partcmd_update,		/* Update parent's effective_cpus */
1326	partcmd_invalidate,	/* Make partition invalid	  */
1327};
1328
1329static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1330		       int turning_on);
1331static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1332				    struct tmpmasks *tmp);
1333
1334/*
1335 * Update partition exclusive flag
1336 *
1337 * Return: 0 if successful, an error code otherwise
1338 */
1339static int update_partition_exclusive(struct cpuset *cs, int new_prs)
1340{
1341	bool exclusive = (new_prs > 0);
1342
1343	if (exclusive && !is_cpu_exclusive(cs)) {
1344		if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
1345			return PERR_NOTEXCL;
1346	} else if (!exclusive && is_cpu_exclusive(cs)) {
1347		/* Turning off CS_CPU_EXCLUSIVE will not return error */
1348		update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1349	}
1350	return 0;
1351}
1352
1353/*
1354 * Update partition load balance flag and/or rebuild sched domain
1355 *
1356 * Changing load balance flag will automatically call
1357 * rebuild_sched_domains_locked().
1358 * This function is for cgroup v2 only.
1359 */
1360static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
1361{
1362	int new_prs = cs->partition_root_state;
1363	bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
1364	bool new_lb;
1365
1366	/*
1367	 * If cs is not a valid partition root, the load balance state
1368	 * will follow its parent.
1369	 */
1370	if (new_prs > 0) {
1371		new_lb = (new_prs != PRS_ISOLATED);
1372	} else {
1373		new_lb = is_sched_load_balance(parent_cs(cs));
1374	}
1375	if (new_lb != !!is_sched_load_balance(cs)) {
1376		rebuild_domains = true;
1377		if (new_lb)
1378			set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1379		else
1380			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1381	}
1382
1383	if (rebuild_domains)
1384		rebuild_sched_domains_locked();
1385}
1386
1387/*
1388 * tasks_nocpu_error - Return true if tasks will have no effective_cpus
1389 */
1390static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
1391			      struct cpumask *xcpus)
1392{
1393	/*
1394	 * A populated partition (cs or parent) can't have empty effective_cpus
1395	 */
1396	return (cpumask_subset(parent->effective_cpus, xcpus) &&
1397		partition_is_populated(parent, cs)) ||
1398	       (!cpumask_intersects(xcpus, cpu_active_mask) &&
1399		partition_is_populated(cs, NULL));
1400}
1401
1402static void reset_partition_data(struct cpuset *cs)
1403{
1404	struct cpuset *parent = parent_cs(cs);
1405
1406	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
1407		return;
1408
1409	lockdep_assert_held(&callback_lock);
1410
1411	cs->nr_subparts = 0;
1412	if (cpumask_empty(cs->exclusive_cpus)) {
1413		cpumask_clear(cs->effective_xcpus);
1414		if (is_cpu_exclusive(cs))
1415			clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
1416	}
1417	if (!cpumask_and(cs->effective_cpus,
1418			 parent->effective_cpus, cs->cpus_allowed)) {
1419		cs->use_parent_ecpus = true;
1420		parent->child_ecpus_count++;
1421		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1422	}
1423}
1424
1425/*
1426 * partition_xcpus_newstate - Exclusive CPUs state change
1427 * @old_prs: old partition_root_state
1428 * @new_prs: new partition_root_state
1429 * @xcpus: exclusive CPUs with state change
1430 */
1431static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
1432{
1433	WARN_ON_ONCE(old_prs == new_prs);
1434	if (new_prs == PRS_ISOLATED)
1435		cpumask_or(isolated_cpus, isolated_cpus, xcpus);
1436	else
1437		cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
1438}
1439
1440/*
1441 * partition_xcpus_add - Add new exclusive CPUs to partition
1442 * @new_prs: new partition_root_state
1443 * @parent: parent cpuset
1444 * @xcpus: exclusive CPUs to be added
1445 * Return: true if isolated_cpus modified, false otherwise
1446 *
1447 * Remote partition if parent == NULL
1448 */
1449static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
1450				struct cpumask *xcpus)
1451{
1452	bool isolcpus_updated;
1453
1454	WARN_ON_ONCE(new_prs < 0);
1455	lockdep_assert_held(&callback_lock);
1456	if (!parent)
1457		parent = &top_cpuset;
1458
1459
1460	if (parent == &top_cpuset)
1461		cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
1462
1463	isolcpus_updated = (new_prs != parent->partition_root_state);
1464	if (isolcpus_updated)
1465		partition_xcpus_newstate(parent->partition_root_state, new_prs,
1466					 xcpus);
1467
1468	cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
1469	return isolcpus_updated;
1470}
1471
1472/*
1473 * partition_xcpus_del - Remove exclusive CPUs from partition
1474 * @old_prs: old partition_root_state
1475 * @parent: parent cpuset
1476 * @xcpus: exclusive CPUs to be removed
1477 * Return: true if isolated_cpus modified, false otherwise
1478 *
1479 * Remote partition if parent == NULL
1480 */
1481static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
1482				struct cpumask *xcpus)
1483{
1484	bool isolcpus_updated;
1485
1486	WARN_ON_ONCE(old_prs < 0);
1487	lockdep_assert_held(&callback_lock);
1488	if (!parent)
1489		parent = &top_cpuset;
1490
1491	if (parent == &top_cpuset)
1492		cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
1493
1494	isolcpus_updated = (old_prs != parent->partition_root_state);
1495	if (isolcpus_updated)
1496		partition_xcpus_newstate(old_prs, parent->partition_root_state,
1497					 xcpus);
1498
1499	cpumask_and(xcpus, xcpus, cpu_active_mask);
1500	cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
1501	return isolcpus_updated;
1502}
1503
1504static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
1505{
1506	int ret;
1507
1508	lockdep_assert_cpus_held();
1509
1510	if (!isolcpus_updated)
1511		return;
1512
1513	ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
1514	WARN_ON_ONCE(ret < 0);
1515}
1516
1517/**
1518 * cpuset_cpu_is_isolated - Check if the given CPU is isolated
1519 * @cpu: the CPU number to be checked
1520 * Return: true if CPU is used in an isolated partition, false otherwise
1521 */
1522bool cpuset_cpu_is_isolated(int cpu)
1523{
1524	return cpumask_test_cpu(cpu, isolated_cpus);
1525}
1526EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
1527
1528/*
1529 * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
1530 * @cs: cpuset
1531 * @xcpus: effective exclusive CPUs value to be set
1532 * Return: true if xcpus is not empty, false otherwise.
1533 *
1534 * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
1535 * it must be a subset of cpus_allowed and parent's effective_xcpus.
1536 */
1537static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
1538						struct cpumask *xcpus)
1539{
1540	struct cpuset *parent = parent_cs(cs);
1541
1542	if (!xcpus)
1543		xcpus = cs->effective_xcpus;
1544
1545	if (!cpumask_empty(cs->exclusive_cpus))
1546		cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
1547	else
1548		cpumask_copy(xcpus, cs->cpus_allowed);
1549
1550	return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
1551}
1552
1553static inline bool is_remote_partition(struct cpuset *cs)
1554{
1555	return !list_empty(&cs->remote_sibling);
1556}
1557
1558static inline bool is_local_partition(struct cpuset *cs)
1559{
1560	return is_partition_valid(cs) && !is_remote_partition(cs);
1561}
1562
1563/*
1564 * remote_partition_enable - Enable current cpuset as a remote partition root
1565 * @cs: the cpuset to update
1566 * @new_prs: new partition_root_state
 * @tmp: temporary masks
1568 * Return: 1 if successful, 0 if error
1569 *
1570 * Enable the current cpuset to become a remote partition root taking CPUs
1571 * directly from the top cpuset. cpuset_mutex must be held by the caller.
1572 */
1573static int remote_partition_enable(struct cpuset *cs, int new_prs,
1574				   struct tmpmasks *tmp)
1575{
1576	bool isolcpus_updated;
1577
1578	/*
1579	 * The user must have sysadmin privilege.
1580	 */
1581	if (!capable(CAP_SYS_ADMIN))
1582		return 0;
1583
1584	/*
1585	 * The requested exclusive_cpus must not be allocated to other
1586	 * partitions and it can't use up all the root's effective_cpus.
1587	 *
1588	 * Note that if there is any local partition root above it or
1589	 * remote partition root underneath it, its exclusive_cpus must
1590	 * have overlapped with subpartitions_cpus.
1591	 */
1592	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
1593	if (cpumask_empty(tmp->new_cpus) ||
1594	    cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
1595	    cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
1596		return 0;
1597
1598	spin_lock_irq(&callback_lock);
1599	isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
1600	list_add(&cs->remote_sibling, &remote_children);
1601	if (cs->use_parent_ecpus) {
1602		struct cpuset *parent = parent_cs(cs);
1603
1604		cs->use_parent_ecpus = false;
1605		parent->child_ecpus_count--;
1606	}
1607	spin_unlock_irq(&callback_lock);
1608	update_unbound_workqueue_cpumask(isolcpus_updated);
1609
1610	/*
	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1612	 */
1613	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1614	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1615	return 1;
1616}
1617
1618/*
1619 * remote_partition_disable - Remove current cpuset from remote partition list
1620 * @cs: the cpuset to update
 * @tmp: temporary masks
1622 *
1623 * The effective_cpus is also updated.
1624 *
1625 * cpuset_mutex must be held by the caller.
1626 */
1627static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
1628{
1629	bool isolcpus_updated;
1630
1631	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
1632	WARN_ON_ONCE(!is_remote_partition(cs));
1633	WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
1634
1635	spin_lock_irq(&callback_lock);
1636	list_del_init(&cs->remote_sibling);
1637	isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
1638					       NULL, tmp->new_cpus);
1639	cs->partition_root_state = -cs->partition_root_state;
1640	if (!cs->prs_err)
1641		cs->prs_err = PERR_INVCPUS;
1642	reset_partition_data(cs);
1643	spin_unlock_irq(&callback_lock);
1644	update_unbound_workqueue_cpumask(isolcpus_updated);
1645
1646	/*
	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1648	 */
1649	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1650	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1651}
1652
1653/*
1654 * remote_cpus_update - cpus_exclusive change of remote partition
1655 * @cs: the cpuset to be updated
1656 * @newmask: the new effective_xcpus mask
 * @tmp: temporary masks
1658 *
1659 * top_cpuset and subpartitions_cpus will be updated or partition can be
1660 * invalidated.
1661 */
1662static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
1663			       struct tmpmasks *tmp)
1664{
1665	bool adding, deleting;
1666	int prs = cs->partition_root_state;
1667	int isolcpus_updated = 0;
1668
1669	if (WARN_ON_ONCE(!is_remote_partition(cs)))
1670		return;
1671
1672	WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
1673
1674	if (cpumask_empty(newmask))
1675		goto invalidate;
1676
1677	adding   = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
1678	deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
1679
1680	/*
	 * Additions of remote CPUs are only allowed if those CPUs are
1682	 * not allocated to other partitions and there are effective_cpus
1683	 * left in the top cpuset.
1684	 */
1685	if (adding && (!capable(CAP_SYS_ADMIN) ||
1686		       cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
1687		       cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)))
1688		goto invalidate;
1689
1690	spin_lock_irq(&callback_lock);
1691	if (adding)
1692		isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
1693	if (deleting)
1694		isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
1695	spin_unlock_irq(&callback_lock);
1696	update_unbound_workqueue_cpumask(isolcpus_updated);
1697
1698	/*
	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1700	 */
1701	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1702	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1703	return;
1704
1705invalidate:
1706	remote_partition_disable(cs, tmp);
1707}
1708
1709/*
1710 * remote_partition_check - check if a child remote partition needs update
1711 * @cs: the cpuset to be updated
1712 * @newmask: the new effective_xcpus mask
1713 * @delmask: temporary mask for deletion (not in tmp)
 * @tmp: temporary masks
1715 *
1716 * This should be called before the given cs has updated its cpus_allowed
1717 * and/or effective_xcpus.
1718 */
1719static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
1720				   struct cpumask *delmask, struct tmpmasks *tmp)
1721{
1722	struct cpuset *child, *next;
1723	int disable_cnt = 0;
1724
1725	/*
1726	 * Compute the effective exclusive CPUs that will be deleted.
1727	 */
1728	if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
1729	    !cpumask_intersects(delmask, subpartitions_cpus))
1730		return;	/* No deletion of exclusive CPUs in partitions */
1731
	/*
	 * Search the remote children list for those that will be
	 * impacted by the deletion of exclusive CPUs.
	 *
	 * Since a cpuset must be removed from the remote children list
	 * before it can go offline and holding cpuset_mutex prevents
	 * any change in cpuset status, the RCU read lock isn't needed.
	 */
1740	lockdep_assert_held(&cpuset_mutex);
1741	list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
1742		if (cpumask_intersects(child->effective_cpus, delmask)) {
1743			remote_partition_disable(child, tmp);
1744			disable_cnt++;
1745		}
1746	if (disable_cnt)
1747		rebuild_sched_domains_locked();
1748}
1749
1750/*
1751 * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
1752 * @prstate: partition root state to be checked
1753 * @new_cpus: cpu mask
1754 * Return: true if there is conflict, false otherwise
1755 *
1756 * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in
1757 * an isolated partition.
1758 */
1759static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
1760{
1761	const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN);
1762	bool all_in_hk = cpumask_subset(new_cpus, hk_domain);
1763
1764	if (!all_in_hk && (prstate != PRS_ISOLATED))
1765		return true;
1766
1767	return false;
1768}
1769
1770/**
1771 * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
1772 * @cs:      The cpuset that requests change in partition root state
1773 * @cmd:     Partition root state change command
1774 * @newmask: Optional new cpumask for partcmd_update
1775 * @tmp:     Temporary addmask and delmask
1776 * Return:   0 or a partition root state error code
1777 *
1778 * For partcmd_enable*, the cpuset is being transformed from a non-partition
1779 * root to a partition root. The effective_xcpus (cpus_allowed if
1780 * effective_xcpus not set) mask of the given cpuset will be taken away from
1781 * parent's effective_cpus. The function returns 0 if all the CPUs listed
1782 * in effective_xcpus can be granted, or an error code otherwise.
1783 *
1784 * For partcmd_disable, the cpuset is being transformed from a partition
1785 * root back to a non-partition root. Any CPUs in effective_xcpus will be
1786 * given back to parent's effective_cpus. 0 will always be returned.
1787 *
1788 * For partcmd_update, if the optional newmask is specified, the cpu list is
1789 * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
1790 * assumed to remain the same. The cpuset should be either a valid or an
1791 * invalid partition root. The partition root state may change from valid to
1792 * invalid or vice versa. An error code will be returned if transitioning from
1793 * invalid to valid violates the exclusivity rule.
1794 *
1795 * For partcmd_invalidate, the current partition will be made invalid.
1796 *
1797 * The partcmd_enable* and partcmd_disable commands are used by
1798 * update_prstate(). An error code may be returned and the caller will check
1799 * for error.
1800 *
1801 * The partcmd_update command is used by update_cpumasks_hier() with newmask
1802 * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
1803 * by update_cpumask() with NULL newmask. In both cases, the callers won't
1804 * check for error and so partition_root_state and prs_error will be updated
1805 * directly.
1806 */
1807static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
1808					   struct cpumask *newmask,
1809					   struct tmpmasks *tmp)
1810{
1811	struct cpuset *parent = parent_cs(cs);
1812	int adding;	/* Adding cpus to parent's effective_cpus	*/
1813	int deleting;	/* Deleting cpus from parent's effective_cpus	*/
1814	int old_prs, new_prs;
1815	int part_error = PERR_NONE;	/* Partition error? */
1816	int subparts_delta = 0;
1817	struct cpumask *xcpus;		/* cs effective_xcpus */
1818	int isolcpus_updated = 0;
1819	bool nocpu;
1820
1821	lockdep_assert_held(&cpuset_mutex);
1822
1823	/*
1824	 * new_prs will only be changed for the partcmd_update and
1825	 * partcmd_invalidate commands.
1826	 */
1827	adding = deleting = false;
1828	old_prs = new_prs = cs->partition_root_state;
1829	xcpus = !cpumask_empty(cs->exclusive_cpus)
1830		? cs->effective_xcpus : cs->cpus_allowed;
1831
1832	if (cmd == partcmd_invalidate) {
1833		if (is_prs_invalid(old_prs))
1834			return 0;
1835
1836		/*
1837		 * Make the current partition invalid.
1838		 */
1839		if (is_partition_valid(parent))
1840			adding = cpumask_and(tmp->addmask,
1841					     xcpus, parent->effective_xcpus);
1842		if (old_prs > 0) {
1843			new_prs = -old_prs;
1844			subparts_delta--;
1845		}
1846		goto write_error;
1847	}
1848
1849	/*
1850	 * The parent must be a partition root.
1851	 * The new cpumask, if present, or the current cpus_allowed must
1852	 * not be empty.
1853	 */
1854	if (!is_partition_valid(parent)) {
1855		return is_partition_invalid(parent)
1856		       ? PERR_INVPARENT : PERR_NOTPART;
1857	}
1858	if (!newmask && cpumask_empty(cs->cpus_allowed))
1859		return PERR_CPUSEMPTY;
1860
1861	nocpu = tasks_nocpu_error(parent, cs, xcpus);
1862
1863	if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
1864		/*
1865		 * Enabling partition root is not allowed if its
1866		 * effective_xcpus is empty or doesn't overlap with
1867		 * parent's effective_xcpus.
1868		 */
1869		if (cpumask_empty(xcpus) ||
1870		    !cpumask_intersects(xcpus, parent->effective_xcpus))
1871			return PERR_INVCPUS;
1872
1873		if (prstate_housekeeping_conflict(new_prs, xcpus))
1874			return PERR_HKEEPING;
1875
1876		/*
1877		 * A parent can be left with no CPU as long as there is no
1878		 * task directly associated with the parent partition.
1879		 */
1880		if (nocpu)
1881			return PERR_NOCPUS;
1882
1883		cpumask_copy(tmp->delmask, xcpus);
1884		deleting = true;
1885		subparts_delta++;
1886		new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
1887	} else if (cmd == partcmd_disable) {
1888		/*
1889		 * May need to add cpus to parent's effective_cpus for
1890		 * valid partition root.
1891		 */
1892		adding = !is_prs_invalid(old_prs) &&
1893			  cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
1894		if (adding)
1895			subparts_delta--;
1896		new_prs = PRS_MEMBER;
1897	} else if (newmask) {
1898		/*
1899		 * Empty cpumask is not allowed
1900		 */
1901		if (cpumask_empty(newmask)) {
1902			part_error = PERR_CPUSEMPTY;
1903			goto write_error;
1904		}
1905
1906		/*
1907		 * partcmd_update with newmask:
1908		 *
1909		 * Compute add/delete mask to/from effective_cpus
1910		 *
1911		 * For valid partition:
1912		 *   addmask = exclusive_cpus & ~newmask
1913		 *			      & parent->effective_xcpus
1914		 *   delmask = newmask & ~exclusive_cpus
1915		 *		       & parent->effective_xcpus
1916		 *
1917		 * For invalid partition:
1918		 *   delmask = newmask & parent->effective_xcpus
1919		 */
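		/*
		 * A hypothetical illustration of the valid-partition case:
		 *   exclusive_cpus          = 2-5
		 *   newmask                 = 4-7
		 *   parent->effective_xcpus = 0-7
		 * yields addmask = 2-3 (given back to the parent) and
		 * delmask = 6-7 (taken away from the parent).
		 */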
1920		if (is_prs_invalid(old_prs)) {
1921			adding = false;
1922			deleting = cpumask_and(tmp->delmask,
1923					newmask, parent->effective_xcpus);
1924		} else {
1925			cpumask_andnot(tmp->addmask, xcpus, newmask);
1926			adding = cpumask_and(tmp->addmask, tmp->addmask,
1927					     parent->effective_xcpus);
1928
1929			cpumask_andnot(tmp->delmask, newmask, xcpus);
1930			deleting = cpumask_and(tmp->delmask, tmp->delmask,
1931					       parent->effective_xcpus);
1932		}
1933		/*
1934		 * Make partition invalid if parent's effective_cpus could
1935		 * become empty and there are tasks in the parent.
1936		 */
1937		if (nocpu && (!adding ||
1938		    !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
1939			part_error = PERR_NOCPUS;
1940			deleting = false;
1941			adding = cpumask_and(tmp->addmask,
1942					     xcpus, parent->effective_xcpus);
1943		}
1944	} else {
1945		/*
1946		 * partcmd_update w/o newmask
1947		 *
1948		 * delmask = effective_xcpus & parent->effective_cpus
1949		 *
1950		 * This can be called from:
1951		 * 1) update_cpumasks_hier()
1952		 * 2) cpuset_hotplug_update_tasks()
1953		 *
1954		 * Check to see if it can be transitioned from valid to
1955		 * invalid partition or vice versa.
1956		 *
1957		 * A partition error happens when parent has tasks and all
1958		 * its effective CPUs will have to be distributed out.
1959		 */
1960		WARN_ON_ONCE(!is_partition_valid(parent));
1961		if (nocpu) {
1962			part_error = PERR_NOCPUS;
1963			if (is_partition_valid(cs))
1964				adding = cpumask_and(tmp->addmask,
1965						xcpus, parent->effective_xcpus);
1966		} else if (is_partition_invalid(cs) &&
1967			   cpumask_subset(xcpus, parent->effective_xcpus)) {
1968			struct cgroup_subsys_state *css;
1969			struct cpuset *child;
1970			bool exclusive = true;
1971
1972			/*
1973			 * Converting an invalid partition to a valid one
1974			 * has to pass the cpu exclusivity test.
1975			 */
1976			rcu_read_lock();
1977			cpuset_for_each_child(child, css, parent) {
1978				if (child == cs)
1979					continue;
1980				if (!cpusets_are_exclusive(cs, child)) {
1981					exclusive = false;
1982					break;
1983				}
1984			}
1985			rcu_read_unlock();
1986			if (exclusive)
1987				deleting = cpumask_and(tmp->delmask,
1988						xcpus, parent->effective_cpus);
1989			else
1990				part_error = PERR_NOTEXCL;
1991		}
1992	}
1993
1994write_error:
1995	if (part_error)
1996		WRITE_ONCE(cs->prs_err, part_error);
1997
1998	if (cmd == partcmd_update) {
1999		/*
2000		 * Check for possible transition between valid and invalid
2001		 * partition root.
2002		 */
2003		switch (cs->partition_root_state) {
2004		case PRS_ROOT:
2005		case PRS_ISOLATED:
2006			if (part_error) {
2007				new_prs = -old_prs;
2008				subparts_delta--;
2009			}
2010			break;
2011		case PRS_INVALID_ROOT:
2012		case PRS_INVALID_ISOLATED:
2013			if (!part_error) {
2014				new_prs = -old_prs;
2015				subparts_delta++;
2016			}
2017			break;
2018		}
2019	}
2020
2021	if (!adding && !deleting && (new_prs == old_prs))
2022		return 0;
2023
2024	/*
2025	 * Transitioning between invalid to valid or vice versa may require
2026	 * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
2027	 * validate_change() has already been successfully called and
2028	 * CPU lists in cs haven't been updated yet. So defer it to later.
2029	 */
2030	if ((old_prs != new_prs) && (cmd != partcmd_update))  {
2031		int err = update_partition_exclusive(cs, new_prs);
2032
2033		if (err)
2034			return err;
2035	}
2036
2037	/*
2038	 * Change the parent's effective_cpus & effective_xcpus (top cpuset
2039	 * only).
2040	 *
2041	 * CPUs newly taken by the partition (delmask) will be removed from
2042	 * the parent's effective_cpus and those released (addmask) added back.
2043	 */
2044	spin_lock_irq(&callback_lock);
2045	if (old_prs != new_prs) {
2046		cs->partition_root_state = new_prs;
2047		if (new_prs <= 0)
2048			cs->nr_subparts = 0;
2049	}
2050	/*
2051	 * Adding CPUs to the parent's effective_cpus means deleting them
2052	 * from cs and vice versa.
2053	 */
2054	if (adding)
2055		isolcpus_updated += partition_xcpus_del(old_prs, parent,
2056							tmp->addmask);
2057	if (deleting)
2058		isolcpus_updated += partition_xcpus_add(new_prs, parent,
2059							tmp->delmask);
2060
2061	if (is_partition_valid(parent)) {
2062		parent->nr_subparts += subparts_delta;
2063		WARN_ON_ONCE(parent->nr_subparts < 0);
2064	}
2065	spin_unlock_irq(&callback_lock);
2066	update_unbound_workqueue_cpumask(isolcpus_updated);
2067
2068	if ((old_prs != new_prs) && (cmd == partcmd_update))
2069		update_partition_exclusive(cs, new_prs);
2070
2071	if (adding || deleting) {
2072		update_tasks_cpumask(parent, tmp->addmask);
2073		update_sibling_cpumasks(parent, cs, tmp);
2074	}
2075
2076	/*
2077	 * For partcmd_update without newmask, it is being called from
2078	 * cpuset_handle_hotplug(). Update the load balance flag and
2079	 * scheduling domain accordingly.
2080	 */
2081	if ((cmd == partcmd_update) && !newmask)
2082		update_partition_sd_lb(cs, old_prs);
2083
2084	notify_partition_change(cs, old_prs);
2085	return 0;
2086}
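
/*
 * A minimal sketch (hypothetical masks) of the partcmd_enable path: a child
 * with effective_xcpus = 2-3 under a parent whose effective_cpus = 0-7 gets
 * 2-3 copied into tmp->delmask, so the parent is left with
 * effective_cpus = 0-1,4-7 and its nr_subparts incremented by one.
 */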
2087
2088/**
2089 * compute_partition_effective_cpumask - compute effective_cpus for partition
2090 * @cs: partition root cpuset
2091 * @new_ecpus: previously computed effective_cpus to be updated
2092 *
2093 * Compute the effective_cpus of a partition root by scanning effective_xcpus
2094 * of child partition roots and excluding their effective_xcpus.
2095 *
2096 * This has the side effect of invalidating valid child partition roots,
2097 * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
2098 * or update_cpumasks_hier() where parent and children are modified
2099 * successively, we don't need to call update_parent_effective_cpumask()
2100 * and the child's effective_cpus will be updated in later iterations.
2101 *
2102 * Note that rcu_read_lock() is assumed to be held.
2103 */
2104static void compute_partition_effective_cpumask(struct cpuset *cs,
2105						struct cpumask *new_ecpus)
2106{
2107	struct cgroup_subsys_state *css;
2108	struct cpuset *child;
2109	bool populated = partition_is_populated(cs, NULL);
2110
2111	/*
2112	 * Check child partition roots to see if they should be
2113	 * invalidated when:
2114	 *  1) the child's effective_xcpus is not a subset of the new
2115	 *     exclusive_cpus
2116	 *  2) all the effective_cpus will be used up and cs
2117	 *     has tasks
2118	 */
2119	compute_effective_exclusive_cpumask(cs, new_ecpus);
2120	cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
2121
2122	rcu_read_lock();
2123	cpuset_for_each_child(child, css, cs) {
2124		if (!is_partition_valid(child))
2125			continue;
2126
2127		child->prs_err = 0;
2128		if (!cpumask_subset(child->effective_xcpus,
2129				    cs->effective_xcpus))
2130			child->prs_err = PERR_INVCPUS;
2131		else if (populated &&
2132			 cpumask_subset(new_ecpus, child->effective_xcpus))
2133			child->prs_err = PERR_NOCPUS;
2134
2135		if (child->prs_err) {
2136			int old_prs = child->partition_root_state;
2137
2138			/*
2139			 * Invalidate child partition
2140			 */
2141			spin_lock_irq(&callback_lock);
2142			make_partition_invalid(child);
2143			cs->nr_subparts--;
2144			child->nr_subparts = 0;
2145			spin_unlock_irq(&callback_lock);
2146			notify_partition_change(child, old_prs);
2147			continue;
2148		}
2149		cpumask_andnot(new_ecpus, new_ecpus,
2150			       child->effective_xcpus);
2151	}
2152	rcu_read_unlock();
2153}
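
/*
 * A hypothetical example: if cs's effective exclusive CPUs work out to 0-7
 * and it has two valid child partitions whose effective_xcpus are 2-3 and
 * 6-7 (neither of which gets invalidated), new_ecpus ends up as 0-1,4-5,
 * i.e. the children's exclusive CPUs are carved out of cs's effective_cpus.
 */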
2154
2155/*
2156 * update_cpumasks_hier() flags
2157 */
2158#define HIER_CHECKALL		0x01	/* Check all cpusets with no skipping */
2159#define HIER_NO_SD_REBUILD	0x02	/* Don't rebuild sched domains */
2160
2161/*
2162 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
2163 * @cs:  the cpuset to consider
2164 * @tmp: temp variables for calculating effective_cpus & partition setup
2165 * @flags: HIER_* flags; HIER_CHECKALL means don't skip any descendant cpusets
2166 *
2167 * When configured cpumask is changed, the effective cpumasks of this cpuset
2168 * and all its descendants need to be updated.
2169 *
2170 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
2171 *
2172 * Called with cpuset_mutex held
2173 */
2174static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
2175				 int flags)
2176{
2177	struct cpuset *cp;
2178	struct cgroup_subsys_state *pos_css;
2179	bool need_rebuild_sched_domains = false;
2180	int old_prs, new_prs;
2181
2182	rcu_read_lock();
2183	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
2184		struct cpuset *parent = parent_cs(cp);
2185		bool remote = is_remote_partition(cp);
2186		bool update_parent = false;
2187
2188		/*
2189		 * Skip a descendant remote partition that acquires CPUs
2190		 * directly from the top cpuset unless it is cs.
2191		 */
2192		if (remote && (cp != cs)) {
2193			pos_css = css_rightmost_descendant(pos_css);
2194			continue;
2195		}
2196
2197		/*
2198		 * Update effective_xcpus if exclusive_cpus set.
2199		 * The case when exclusive_cpus isn't set is handled later.
2200		 */
2201		if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
2202			spin_lock_irq(&callback_lock);
2203			compute_effective_exclusive_cpumask(cp, NULL);
2204			spin_unlock_irq(&callback_lock);
2205		}
2206
2207		old_prs = new_prs = cp->partition_root_state;
2208		if (remote || (is_partition_valid(parent) &&
2209			       is_partition_valid(cp)))
2210			compute_partition_effective_cpumask(cp, tmp->new_cpus);
2211		else
2212			compute_effective_cpumask(tmp->new_cpus, cp, parent);
2213
2214		/*
2215		 * A partition with no effective_cpus is allowed as long as
2216		 * there is no task associated with it. Call
2217		 * update_parent_effective_cpumask() to check it.
2218		 */
2219		if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
2220			update_parent = true;
2221			goto update_parent_effective;
2222		}
2223
2224		/*
2225		 * If it becomes empty, inherit the effective mask of the
2226		 * parent, which is guaranteed to have some CPUs unless
2227		 * it is a partition root that has explicitly distributed
2228		 * out all its CPUs.
2229		 */
2230		if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) {
2231			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
2232			if (!cp->use_parent_ecpus) {
2233				cp->use_parent_ecpus = true;
2234				parent->child_ecpus_count++;
2235			}
2236		} else if (cp->use_parent_ecpus) {
2237			cp->use_parent_ecpus = false;
2238			WARN_ON_ONCE(!parent->child_ecpus_count);
2239			parent->child_ecpus_count--;
2240		}
2241
2242		if (remote)
2243			goto get_css;
2244
2245		/*
2246		 * Skip the whole subtree if
2247		 * 1) the cpumask remains the same,
2248		 * 2) has no partition root state,
2249		 * 3) HIER_CHECKALL flag not set, and
2250		 * 4) for v2 load balance state same as its parent.
2251		 */
2252		if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
2253		    cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
2254		    (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
2255		    (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
2256			pos_css = css_rightmost_descendant(pos_css);
2257			continue;
2258		}
2259
2260update_parent_effective:
2261		/*
2262		 * update_parent_effective_cpumask() should have been called
2263		 * for cs already in update_cpumask(). We should also call
2264		 * update_tasks_cpumask() again for tasks in the parent
2265		 * cpuset if the parent's effective_cpus changes.
2266		 */
2267		if ((cp != cs) && old_prs) {
2268			switch (parent->partition_root_state) {
2269			case PRS_ROOT:
2270			case PRS_ISOLATED:
2271				update_parent = true;
2272				break;
2273
2274			default:
2275				/*
2276				 * When parent is not a partition root or is
2277				 * invalid, child partition roots become
2278				 * invalid too.
2279				 */
2280				if (is_partition_valid(cp))
2281					new_prs = -cp->partition_root_state;
2282				WRITE_ONCE(cp->prs_err,
2283					   is_partition_invalid(parent)
2284					   ? PERR_INVPARENT : PERR_NOTPART);
2285				break;
2286			}
2287		}
2288get_css:
2289		if (!css_tryget_online(&cp->css))
2290			continue;
2291		rcu_read_unlock();
2292
2293		if (update_parent) {
2294			update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
2295			/*
2296			 * The cpuset partition_root_state may become
2297			 * invalid. Capture it.
2298			 */
2299			new_prs = cp->partition_root_state;
2300		}
2301
2302		spin_lock_irq(&callback_lock);
2303		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
2304		cp->partition_root_state = new_prs;
2305		/*
2306		 * Make sure effective_xcpus is properly set for a valid
2307		 * partition root.
2308		 */
2309		if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
2310			cpumask_and(cp->effective_xcpus,
2311				    cp->cpus_allowed, parent->effective_xcpus);
2312		else if (new_prs < 0)
2313			reset_partition_data(cp);
2314		spin_unlock_irq(&callback_lock);
2315
2316		notify_partition_change(cp, old_prs);
2317
2318		WARN_ON(!is_in_v2_mode() &&
2319			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
2320
2321		update_tasks_cpumask(cp, cp->effective_cpus);
2322
2323		/*
2324		 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
2325		 * from parent if current cpuset isn't a valid partition root
2326		 * and their load balance states differ.
2327		 */
2328		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
2329		    !is_partition_valid(cp) &&
2330		    (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
2331			if (is_sched_load_balance(parent))
2332				set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
2333			else
2334				clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
2335		}
2336
2337		/*
2338		 * On legacy hierarchy, if the effective cpumask of any non-
2339		 * empty cpuset is changed, we need to rebuild sched domains.
2340		 * On default hierarchy, the cpuset needs to be a partition
2341		 * root as well.
2342		 */
2343		if (!cpumask_empty(cp->cpus_allowed) &&
2344		    is_sched_load_balance(cp) &&
2345		   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
2346		    is_partition_valid(cp)))
2347			need_rebuild_sched_domains = true;
2348
2349		rcu_read_lock();
2350		css_put(&cp->css);
2351	}
2352	rcu_read_unlock();
2353
2354	if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
2355		rebuild_sched_domains_locked();
2356}
2357
2358/**
2359 * update_sibling_cpumasks - Update siblings' cpumasks
2360 * @parent:  Parent cpuset
2361 * @cs:      Current cpuset
2362 * @tmp:     Temp variables
2363 */
2364static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
2365				    struct tmpmasks *tmp)
2366{
2367	struct cpuset *sibling;
2368	struct cgroup_subsys_state *pos_css;
2369
2370	lockdep_assert_held(&cpuset_mutex);
2371
2372	/*
2373	 * Check all its siblings and call update_cpumasks_hier()
2374	 * if their effective_cpus will need to be changed.
2375	 *
2376	 * With the addition of effective_xcpus, which is a subset of
2377	 * cpus_allowed, it is possible that a change in the parent's
2378	 * effective_cpus due to a change in a child partition's effective_xcpus
2379	 * will impact its siblings even if they do not inherit the parent's
2380	 * effective_cpus directly.
2381	 *
2382	 * The update_cpumasks_hier() function may sleep. So we have to
2383	 * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
2384	 * flag is used to suppress rebuild of sched domains as the callers
2385	 * will take care of that.
2386	 */
2387	rcu_read_lock();
2388	cpuset_for_each_child(sibling, pos_css, parent) {
2389		if (sibling == cs)
2390			continue;
2391		if (!sibling->use_parent_ecpus &&
2392		    !is_partition_valid(sibling)) {
2393			compute_effective_cpumask(tmp->new_cpus, sibling,
2394						  parent);
2395			if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
2396				continue;
2397		}
2398		if (!css_tryget_online(&sibling->css))
2399			continue;
2400
2401		rcu_read_unlock();
2402		update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
2403		rcu_read_lock();
2404		css_put(&sibling->css);
2405	}
2406	rcu_read_unlock();
2407}
2408
2409/**
2410 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
2411 * @cs: the cpuset to consider
2412 * @trialcs: trial cpuset
2413 * @buf: buffer of cpu numbers written to this cpuset
2414 */
2415static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2416			  const char *buf)
2417{
2418	int retval;
2419	struct tmpmasks tmp;
2420	struct cpuset *parent = parent_cs(cs);
2421	bool invalidate = false;
2422	int hier_flags = 0;
2423	int old_prs = cs->partition_root_state;
2424
2425	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
2426	if (cs == &top_cpuset)
2427		return -EACCES;
2428
2429	/*
2430	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
2431	 * Since cpulist_parse() fails on an empty mask, we special case
2432	 * that parsing.  The validate_change() call ensures that cpusets
2433	 * with tasks have cpus.
2434	 */
2435	if (!*buf) {
2436		cpumask_clear(trialcs->cpus_allowed);
2437		cpumask_clear(trialcs->effective_xcpus);
2438	} else {
2439		retval = cpulist_parse(buf, trialcs->cpus_allowed);
2440		if (retval < 0)
2441			return retval;
2442
2443		if (!cpumask_subset(trialcs->cpus_allowed,
2444				    top_cpuset.cpus_allowed))
2445			return -EINVAL;
2446
2447		/*
2448		 * When exclusive_cpus isn't explicitly set, it is constrained
2449		 * by cpus_allowed and parent's effective_xcpus. Otherwise,
2450		 * trialcs->effective_xcpus is used as a temporary cpumask
2451		 * for checking validity of the partition root.
2452		 */
2453		if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
2454			compute_effective_exclusive_cpumask(trialcs, NULL);
2455	}
2456
2457	/* Nothing to do if the cpus didn't change */
2458	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
2459		return 0;
2460
2461	if (alloc_cpumasks(NULL, &tmp))
2462		return -ENOMEM;
2463
2464	if (old_prs) {
2465		if (is_partition_valid(cs) &&
2466		    cpumask_empty(trialcs->effective_xcpus)) {
2467			invalidate = true;
2468			cs->prs_err = PERR_INVCPUS;
2469		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
2470			invalidate = true;
2471			cs->prs_err = PERR_HKEEPING;
2472		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
2473			invalidate = true;
2474			cs->prs_err = PERR_NOCPUS;
2475		}
2476	}
2477
2478	/*
2479	 * Check all the descendants in update_cpumasks_hier() if
2480	 * effective_xcpus is to be changed.
2481	 */
2482	if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
2483		hier_flags = HIER_CHECKALL;
2484
2485	retval = validate_change(cs, trialcs);
2486
2487	if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
2488		struct cgroup_subsys_state *css;
2489		struct cpuset *cp;
2490
2491		/*
2492		 * The -EINVAL error code indicates that partition sibling
2493		 * CPU exclusivity rule has been violated. We still allow
2494		 * the cpumask change to proceed while invalidating the
2495		 * partition. However, any conflicting sibling partitions
2496		 * have to be marked as invalid too.
2497		 */
2498		invalidate = true;
2499		rcu_read_lock();
2500		cpuset_for_each_child(cp, css, parent) {
2501			struct cpumask *xcpus = fetch_xcpus(trialcs);
2502
2503			if (is_partition_valid(cp) &&
2504			    cpumask_intersects(xcpus, cp->effective_xcpus)) {
2505				rcu_read_unlock();
2506				update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
2507				rcu_read_lock();
2508			}
2509		}
2510		rcu_read_unlock();
2511		retval = 0;
2512	}
2513
2514	if (retval < 0)
2515		goto out_free;
2516
2517	if (is_partition_valid(cs) ||
2518	   (is_partition_invalid(cs) && !invalidate)) {
2519		struct cpumask *xcpus = trialcs->effective_xcpus;
2520
2521		if (cpumask_empty(xcpus) && is_partition_invalid(cs))
2522			xcpus = trialcs->cpus_allowed;
2523
2524		/*
2525		 * Call remote_cpus_update() to handle a valid remote partition
2526		 */
2527		if (is_remote_partition(cs))
2528			remote_cpus_update(cs, xcpus, &tmp);
2529		else if (invalidate)
2530			update_parent_effective_cpumask(cs, partcmd_invalidate,
2531							NULL, &tmp);
2532		else
2533			update_parent_effective_cpumask(cs, partcmd_update,
2534							xcpus, &tmp);
2535	} else if (!cpumask_empty(cs->exclusive_cpus)) {
2536		/*
2537		 * Use trialcs->effective_cpus as a temp cpumask
2538		 */
2539		remote_partition_check(cs, trialcs->effective_xcpus,
2540				       trialcs->effective_cpus, &tmp);
2541	}
2542
2543	spin_lock_irq(&callback_lock);
2544	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
2545	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
2546	if ((old_prs > 0) && !is_partition_valid(cs))
2547		reset_partition_data(cs);
2548	spin_unlock_irq(&callback_lock);
2549
2550	/* effective_cpus/effective_xcpus will be updated here */
2551	update_cpumasks_hier(cs, &tmp, hier_flags);
2552
2553	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2554	if (cs->partition_root_state)
2555		update_partition_sd_lb(cs, old_prs);
2556out_free:
2557	free_cpumasks(NULL, &tmp);
2558	return retval;
2559}
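
/*
 * From userspace this path is typically reached by a write such as
 * "echo 0-3 > cpuset.cpus" (an illustrative value); the subtree-wide effect
 * is then applied by the update_cpumasks_hier() call in update_cpumask().
 */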
2560
2561/**
2562 * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
2563 * @cs: the cpuset to consider
2564 * @trialcs: trial cpuset
2565 * @buf: buffer of cpu numbers written to this cpuset
2566 *
2567 * The tasks' cpumask will be updated if cs is a valid partition root.
2568 */
2569static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2570				    const char *buf)
2571{
2572	int retval;
2573	struct tmpmasks tmp;
2574	struct cpuset *parent = parent_cs(cs);
2575	bool invalidate = false;
2576	int hier_flags = 0;
2577	int old_prs = cs->partition_root_state;
2578
2579	if (!*buf) {
2580		cpumask_clear(trialcs->exclusive_cpus);
2581		cpumask_clear(trialcs->effective_xcpus);
2582	} else {
2583		retval = cpulist_parse(buf, trialcs->exclusive_cpus);
2584		if (retval < 0)
2585			return retval;
2586		if (!is_cpu_exclusive(cs))
2587			set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
2588	}
2589
2590	/* Nothing to do if the CPUs didn't change */
2591	if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
2592		return 0;
2593
2594	if (*buf)
2595		compute_effective_exclusive_cpumask(trialcs, NULL);
2596
2597	/*
2598	 * Check all the descendants in update_cpumasks_hier() if
2599	 * effective_xcpus is to be changed.
2600	 */
2601	if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
2602		hier_flags = HIER_CHECKALL;
2603
2604	retval = validate_change(cs, trialcs);
2605	if (retval)
2606		return retval;
2607
2608	if (alloc_cpumasks(NULL, &tmp))
2609		return -ENOMEM;
2610
2611	if (old_prs) {
2612		if (cpumask_empty(trialcs->effective_xcpus)) {
2613			invalidate = true;
2614			cs->prs_err = PERR_INVCPUS;
2615		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
2616			invalidate = true;
2617			cs->prs_err = PERR_HKEEPING;
2618		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
2619			invalidate = true;
2620			cs->prs_err = PERR_NOCPUS;
2621		}
2622
2623		if (is_remote_partition(cs)) {
2624			if (invalidate)
2625				remote_partition_disable(cs, &tmp);
2626			else
2627				remote_cpus_update(cs, trialcs->effective_xcpus,
2628						   &tmp);
2629		} else if (invalidate) {
2630			update_parent_effective_cpumask(cs, partcmd_invalidate,
2631							NULL, &tmp);
2632		} else {
2633			update_parent_effective_cpumask(cs, partcmd_update,
2634						trialcs->effective_xcpus, &tmp);
2635		}
2636	} else if (!cpumask_empty(trialcs->exclusive_cpus)) {
2637		/*
2638		 * Use trialcs->effective_cpus as a temp cpumask
2639		 */
2640		remote_partition_check(cs, trialcs->effective_xcpus,
2641				       trialcs->effective_cpus, &tmp);
2642	}
2643	spin_lock_irq(&callback_lock);
2644	cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
2645	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
2646	if ((old_prs > 0) && !is_partition_valid(cs))
2647		reset_partition_data(cs);
2648	spin_unlock_irq(&callback_lock);
2649
2650	/*
2651	 * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
2652	 * of the subtree when it is a valid partition root or effective_xcpus
2653	 * is updated.
2654	 */
2655	if (is_partition_valid(cs) || hier_flags)
2656		update_cpumasks_hier(cs, &tmp, hier_flags);
2657
2658	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2659	if (cs->partition_root_state)
2660		update_partition_sd_lb(cs, old_prs);
2661
2662	free_cpumasks(NULL, &tmp);
2663	return 0;
2664}
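
/*
 * A typical (illustrative) trigger is "echo 2-3 > cpuset.cpus.exclusive" on
 * the default hierarchy, requesting CPUs 2-3 as exclusive CPUs to be used
 * when this cpuset becomes a partition root.
 */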
2665
2666/*
2667 * Migrate memory region from one set of nodes to another.  This is
2668 * performed asynchronously as it can be called from process migration path
2669 * holding locks involved in process management.  All mm migrations are
2670 * performed in the queued order and can be waited for by flushing
2671 * cpuset_migrate_mm_wq.
2672 */
2673
2674struct cpuset_migrate_mm_work {
2675	struct work_struct	work;
2676	struct mm_struct	*mm;
2677	nodemask_t		from;
2678	nodemask_t		to;
2679};
2680
2681static void cpuset_migrate_mm_workfn(struct work_struct *work)
2682{
2683	struct cpuset_migrate_mm_work *mwork =
2684		container_of(work, struct cpuset_migrate_mm_work, work);
2685
2686	/* on a wq worker, no need to worry about %current's mems_allowed */
2687	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
2688	mmput(mwork->mm);
2689	kfree(mwork);
2690}
2691
2692static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
2693							const nodemask_t *to)
2694{
2695	struct cpuset_migrate_mm_work *mwork;
2696
2697	if (nodes_equal(*from, *to)) {
2698		mmput(mm);
2699		return;
2700	}
2701
2702	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
2703	if (mwork) {
2704		mwork->mm = mm;
2705		mwork->from = *from;
2706		mwork->to = *to;
2707		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
2708		queue_work(cpuset_migrate_mm_wq, &mwork->work);
2709	} else {
2710		mmput(mm);
2711	}
2712}
2713
2714static void cpuset_post_attach(void)
2715{
2716	flush_workqueue(cpuset_migrate_mm_wq);
2717}
2718
2719/*
2720 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
2721 * @tsk: the task to change
2722 * @newmems: new nodes that the task will be set
2723 *
2724 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
2725 * and rebind the task's mempolicy, if any. If the task is allocating in
2726 * parallel, it might temporarily see an empty intersection, which results in
2727 * a seqlock check and retry before OOM or allocation failure.
2728 */
2729static void cpuset_change_task_nodemask(struct task_struct *tsk,
2730					nodemask_t *newmems)
2731{
2732	task_lock(tsk);
2733
2734	local_irq_disable();
2735	write_seqcount_begin(&tsk->mems_allowed_seq);
2736
2737	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
2738	mpol_rebind_task(tsk, newmems);
2739	tsk->mems_allowed = *newmems;
2740
2741	write_seqcount_end(&tsk->mems_allowed_seq);
2742	local_irq_enable();
2743
2744	task_unlock(tsk);
2745}
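
/*
 * Readers such as the page allocator are expected to pair with the update
 * above via read_mems_allowed_begin()/read_mems_allowed_retry(), retrying
 * the allocation instead of failing on a transiently changing nodemask.
 */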
2746
2747static void *cpuset_being_rebound;
2748
2749/**
2750 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
2751 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
2752 *
2753 * Iterate through each task of @cs updating its mems_allowed to the
2754 * effective cpuset's.  As this function is called with cpuset_mutex held,
2755 * cpuset membership stays stable.
2756 */
2757static void update_tasks_nodemask(struct cpuset *cs)
2758{
2759	static nodemask_t newmems;	/* protected by cpuset_mutex */
2760	struct css_task_iter it;
2761	struct task_struct *task;
2762
2763	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
2764
2765	guarantee_online_mems(cs, &newmems);
2766
2767	/*
2768	 * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
2769	 * take while holding tasklist_lock.  Forks can happen - the
2770	 * mpol_dup() cpuset_being_rebound check will catch such forks,
2771	 * and rebind their vma mempolicies too.  Because we still hold
2772	 * the global cpuset_mutex, we know that no other rebind effort
2773	 * will be contending for the global variable cpuset_being_rebound.
2774	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
2775	 * is idempotent.  Also migrate pages in each mm to new nodes.
2776	 */
2777	css_task_iter_start(&cs->css, 0, &it);
2778	while ((task = css_task_iter_next(&it))) {
2779		struct mm_struct *mm;
2780		bool migrate;
2781
2782		cpuset_change_task_nodemask(task, &newmems);
2783
2784		mm = get_task_mm(task);
2785		if (!mm)
2786			continue;
2787
2788		migrate = is_memory_migrate(cs);
2789
2790		mpol_rebind_mm(mm, &cs->mems_allowed);
2791		if (migrate)
2792			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
2793		else
2794			mmput(mm);
2795	}
2796	css_task_iter_end(&it);
2797
2798	/*
2799	 * All the tasks' nodemasks have been updated, update
2800	 * cs->old_mems_allowed.
2801	 */
2802	cs->old_mems_allowed = newmems;
2803
2804	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
2805	cpuset_being_rebound = NULL;
2806}
2807
2808/*
2809 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
2810 * @cs: the cpuset to consider
2811 * @new_mems: a temp variable for calculating new effective_mems
2812 *
2813 * When configured nodemask is changed, the effective nodemasks of this cpuset
2814 * and all its descendants need to be updated.
2815 *
2816 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
2817 *
2818 * Called with cpuset_mutex held
2819 */
2820static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
2821{
2822	struct cpuset *cp;
2823	struct cgroup_subsys_state *pos_css;
2824
2825	rcu_read_lock();
2826	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
2827		struct cpuset *parent = parent_cs(cp);
2828
2829		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
2830
2831		/*
2832		 * If it becomes empty, inherit the effective mask of the
2833		 * parent, which is guaranteed to have some MEMs.
2834		 */
2835		if (is_in_v2_mode() && nodes_empty(*new_mems))
2836			*new_mems = parent->effective_mems;
2837
2838		/* Skip the whole subtree if the nodemask remains the same. */
2839		if (nodes_equal(*new_mems, cp->effective_mems)) {
2840			pos_css = css_rightmost_descendant(pos_css);
2841			continue;
2842		}
2843
2844		if (!css_tryget_online(&cp->css))
2845			continue;
2846		rcu_read_unlock();
2847
2848		spin_lock_irq(&callback_lock);
2849		cp->effective_mems = *new_mems;
2850		spin_unlock_irq(&callback_lock);
2851
2852		WARN_ON(!is_in_v2_mode() &&
2853			!nodes_equal(cp->mems_allowed, cp->effective_mems));
2854
2855		update_tasks_nodemask(cp);
2856
2857		rcu_read_lock();
2858		css_put(&cp->css);
2859	}
2860	rcu_read_unlock();
2861}
2862
2863/*
2864 * Handle user request to change the 'mems' memory placement
2865 * of a cpuset.  Needs to validate the request, update the
2866 * cpuset's mems_allowed, and for each task in the cpuset,
2867 * update mems_allowed and rebind the task's mempolicy and any vma
2868 * mempolicies and, if the cpuset is marked 'memory_migrate',
2869 * migrate the task's pages to the new memory.
2870 *
2871 * Call with cpuset_mutex held. May take callback_lock during call.
2872 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
2873 * lock each such task's mm->mmap_lock, scan its vma's and rebind
2874 * their mempolicies to the cpuset's new mems_allowed.
2875 */
2876static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
2877			   const char *buf)
2878{
2879	int retval;
2880
2881	/*
2882	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
2883	 * it's read-only
2884	 */
2885	if (cs == &top_cpuset) {
2886		retval = -EACCES;
2887		goto done;
2888	}
2889
2890	/*
2891	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
2892	 * Since nodelist_parse() fails on an empty mask, we special case
2893	 * that parsing.  The validate_change() call ensures that cpusets
2894	 * with tasks have memory.
2895	 */
2896	if (!*buf) {
2897		nodes_clear(trialcs->mems_allowed);
2898	} else {
2899		retval = nodelist_parse(buf, trialcs->mems_allowed);
2900		if (retval < 0)
2901			goto done;
2902
2903		if (!nodes_subset(trialcs->mems_allowed,
2904				  top_cpuset.mems_allowed)) {
2905			retval = -EINVAL;
2906			goto done;
2907		}
2908	}
2909
2910	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
2911		retval = 0;		/* Too easy - nothing to do */
2912		goto done;
2913	}
2914	retval = validate_change(cs, trialcs);
2915	if (retval < 0)
2916		goto done;
2917
2918	check_insane_mems_config(&trialcs->mems_allowed);
2919
2920	spin_lock_irq(&callback_lock);
2921	cs->mems_allowed = trialcs->mems_allowed;
2922	spin_unlock_irq(&callback_lock);
2923
2924	/* use trialcs->mems_allowed as a temp variable */
2925	update_nodemasks_hier(cs, &trialcs->mems_allowed);
2926done:
2927	return retval;
2928}
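
/*
 * An illustrative trigger: "echo 0-1 > cpuset.mems".  If the cpuset has
 * memory migration enabled (is_memory_migrate()), update_tasks_nodemask()
 * also queues asynchronous migration of each task's pages via
 * cpuset_migrate_mm().
 */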
2929
2930bool current_cpuset_is_being_rebound(void)
2931{
2932	bool ret;
2933
2934	rcu_read_lock();
2935	ret = task_cs(current) == cpuset_being_rebound;
2936	rcu_read_unlock();
2937
2938	return ret;
2939}
2940
2941static int update_relax_domain_level(struct cpuset *cs, s64 val)
2942{
2943#ifdef CONFIG_SMP
2944	if (val < -1 || val > sched_domain_level_max + 1)
2945		return -EINVAL;
2946#endif
2947
2948	if (val != cs->relax_domain_level) {
2949		cs->relax_domain_level = val;
2950		if (!cpumask_empty(cs->cpus_allowed) &&
2951		    is_sched_load_balance(cs))
2952			rebuild_sched_domains_locked();
2953	}
2954
2955	return 0;
2956}
2957
2958/**
2959 * update_tasks_flags - update the spread flags of tasks in the cpuset.
2960 * @cs: the cpuset in which each task's spread flags need to be changed
2961 *
2962 * Iterate through each task of @cs updating its spread flags.  As this
2963 * function is called with cpuset_mutex held, cpuset membership stays
2964 * stable.
2965 */
2966static void update_tasks_flags(struct cpuset *cs)
2967{
2968	struct css_task_iter it;
2969	struct task_struct *task;
2970
2971	css_task_iter_start(&cs->css, 0, &it);
2972	while ((task = css_task_iter_next(&it)))
2973		cpuset_update_task_spread_flags(cs, task);
2974	css_task_iter_end(&it);
2975}
2976
2977/*
2978 * update_flag - read a 0 or a 1 in a file and update associated flag
2979 * bit:		the bit to update (see cpuset_flagbits_t)
2980 * cs:		the cpuset to update
2981 * turning_on: 	whether the flag is being set or cleared
2982 *
2983 * Call with cpuset_mutex held.
2984 */
2985
2986static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
2987		       int turning_on)
2988{
2989	struct cpuset *trialcs;
2990	int balance_flag_changed;
2991	int spread_flag_changed;
2992	int err;
2993
2994	trialcs = alloc_trial_cpuset(cs);
2995	if (!trialcs)
2996		return -ENOMEM;
2997
2998	if (turning_on)
2999		set_bit(bit, &trialcs->flags);
3000	else
3001		clear_bit(bit, &trialcs->flags);
3002
3003	err = validate_change(cs, trialcs);
3004	if (err < 0)
3005		goto out;
3006
3007	balance_flag_changed = (is_sched_load_balance(cs) !=
3008				is_sched_load_balance(trialcs));
3009
3010	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
3011			|| (is_spread_page(cs) != is_spread_page(trialcs)));
3012
3013	spin_lock_irq(&callback_lock);
3014	cs->flags = trialcs->flags;
3015	spin_unlock_irq(&callback_lock);
3016
3017	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
3018		rebuild_sched_domains_locked();
3019
3020	if (spread_flag_changed)
3021		update_tasks_flags(cs);
3022out:
3023	free_cpuset(trialcs);
3024	return err;
3025}
3026
3027/**
3028 * update_prstate - update partition_root_state
3029 * @cs: the cpuset to update
3030 * @new_prs: new partition root state
3031 * Return: 0 if successful, != 0 if error
3032 *
3033 * Call with cpuset_mutex held.
3034 */
3035static int update_prstate(struct cpuset *cs, int new_prs)
3036{
3037	int err = PERR_NONE, old_prs = cs->partition_root_state;
3038	struct cpuset *parent = parent_cs(cs);
3039	struct tmpmasks tmpmask;
3040	bool new_xcpus_state = false;
3041
3042	if (old_prs == new_prs)
3043		return 0;
3044
3045	/*
3046	 * Treat a previously invalid partition root as if it is a "member".
3047	 */
3048	if (new_prs && is_prs_invalid(old_prs))
3049		old_prs = PRS_MEMBER;
3050
3051	if (alloc_cpumasks(NULL, &tmpmask))
3052		return -ENOMEM;
3053
3054	/*
3055	 * Set up effective_xcpus if not properly set yet; it will be cleared
3056	 * later if the partition becomes invalid.
3057	 */
3058	if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
3059		spin_lock_irq(&callback_lock);
3060		cpumask_and(cs->effective_xcpus,
3061			    cs->cpus_allowed, parent->effective_xcpus);
3062		spin_unlock_irq(&callback_lock);
3063	}
3064
3065	err = update_partition_exclusive(cs, new_prs);
3066	if (err)
3067		goto out;
3068
3069	if (!old_prs) {
3070		enum partition_cmd cmd = (new_prs == PRS_ROOT)
3071				       ? partcmd_enable : partcmd_enablei;
3072
3073		/*
3074		 * cpus_allowed cannot be empty.
3075		 */
3076		if (cpumask_empty(cs->cpus_allowed)) {
3077			err = PERR_CPUSEMPTY;
3078			goto out;
3079		}
3080
3081		err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
3082		/*
3083		 * If an attempt to become a local partition root fails,
3084		 * try to become a remote partition root instead.
3085		 */
3086		if (err && remote_partition_enable(cs, new_prs, &tmpmask))
3087			err = 0;
3088	} else if (old_prs && new_prs) {
3089		/*
3090		 * A change in load balance state only, no change in cpumasks.
3091		 */
3092		new_xcpus_state = true;
3093	} else {
3094		/*
3095		 * Switching back to member is always allowed even if it
3096		 * disables child partitions.
3097		 */
3098		if (is_remote_partition(cs))
3099			remote_partition_disable(cs, &tmpmask);
3100		else
3101			update_parent_effective_cpumask(cs, partcmd_disable,
3102							NULL, &tmpmask);
3103
3104		/*
3105		 * Invalidation of child partitions will be done in
3106		 * update_cpumasks_hier().
3107		 */
3108	}
3109out:
3110	/*
3111	 * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
3112	 * happens.
3113	 */
3114	if (err) {
3115		new_prs = -new_prs;
3116		update_partition_exclusive(cs, new_prs);
3117	}
3118
3119	spin_lock_irq(&callback_lock);
3120	cs->partition_root_state = new_prs;
3121	WRITE_ONCE(cs->prs_err, err);
3122	if (!is_partition_valid(cs))
3123		reset_partition_data(cs);
3124	else if (new_xcpus_state)
3125		partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
3126	spin_unlock_irq(&callback_lock);
3127	update_unbound_workqueue_cpumask(new_xcpus_state);
3128
3129	/* Force update if switching back to member */
3130	update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
3131
3132	/* Update sched domains and load balance flag */
3133	update_partition_sd_lb(cs, old_prs);
3134
3135	notify_partition_change(cs, old_prs);
3136	free_cpumasks(NULL, &tmpmask);
3137	return 0;
3138}
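
/*
 * In cgroup v2 this is driven by writes to cpuset.cpus.partition, e.g.
 * "echo root > cpuset.cpus.partition" or "echo isolated >
 * cpuset.cpus.partition" to enable a partition and "echo member >
 * cpuset.cpus.partition" to revert.  An invalid transition is typically
 * reported when reading the file back rather than as a write error, since
 * this function returns 0 even when err is set.
 */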
3139
3140/*
3141 * Frequency meter - How fast is some event occurring?
3142 *
3143 * These routines manage a digitally filtered, constant time based,
3144 * event frequency meter.  There are four routines:
3145 *   fmeter_init() - initialize a frequency meter.
3146 *   fmeter_markevent() - called each time the event happens.
3147 *   fmeter_getrate() - returns the recent rate of such events.
3148 *   fmeter_update() - internal routine used to update fmeter.
3149 *
3150 * A common data structure is passed to each of these routines,
3151 * which is used to keep track of the state required to manage the
3152 * frequency meter and its digital filter.
3153 *
3154 * The filter works on the number of events marked per unit time.
3155 * The filter is single-pole low-pass recursive (IIR).  The time unit
3156 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
3157 * simulate 3 decimal digits of precision (multiplied by 1000).
3158 *
3159 * With an FM_COEF of 933, and a time base of 1 second, the filter
3160 * has a half-life of 10 seconds, meaning that if the events quit
3161 * happening, then the rate returned from the fmeter_getrate()
3162 * will be cut in half each 10 seconds, until it converges to zero.
3163 *
3164 * It is not worth doing a real infinitely recursive filter.  If more
3165 * than FM_MAXTICKS ticks have elapsed since the last filter event,
3166 * just compute FM_MAXTICKS ticks worth, by which point the level
3167 * will be stable.
3168 *
3169 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
3170 * arithmetic overflow in the fmeter_update() routine.
3171 *
3172 * Given the simple 32 bit integer arithmetic used, this meter works
3173 * best for reporting rates between one per millisecond (msec) and
3174 * one per 32 (approx) seconds.  At constant rates faster than one
3175 * per msec it maxes out at values just under 1,000,000.  At constant
3176 * rates between one per msec, and one per second it will stabilize
3177 * to a value N*1000, where N is the rate of events per second.
3178 * At constant rates between one per second and one per 32 seconds,
3179 * it will be choppy, moving up on the seconds that have an event,
3180 * and then decaying until the next event.  At rates slower than
3181 * about one in 32 seconds, it decays all the way back to zero between
3182 * each event.
3183 */
3184
3185#define FM_COEF 933		/* coefficient for half-life of 10 secs */
3186#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
3187#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
3188#define FM_SCALE 1000		/* faux fixed point scale */
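
/*
 * A worked example of the arithmetic above (values are illustrative):
 * one marked event (cnt == FM_SCALE) adds (FM_SCALE - FM_COEF) = 67 to
 * val.  Each idle second then scales val by FM_COEF/FM_SCALE = 0.933,
 * and 0.933^10 ~= 0.5, hence the 10 second half-life.  At a steady one
 * event per second, val converges to 67 / (1 - 0.933) ~= 1000, i.e.
 * N * FM_SCALE for a rate of N events per second.
 */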
3189
3190/* Initialize a frequency meter */
3191static void fmeter_init(struct fmeter *fmp)
3192{
3193	fmp->cnt = 0;
3194	fmp->val = 0;
3195	fmp->time = 0;
3196	spin_lock_init(&fmp->lock);
3197}
3198
3199/* Internal meter update - process cnt events and update value */
3200static void fmeter_update(struct fmeter *fmp)
3201{
3202	time64_t now;
3203	u32 ticks;
3204
3205	now = ktime_get_seconds();
3206	ticks = now - fmp->time;
3207
3208	if (ticks == 0)
3209		return;
3210
3211	ticks = min(FM_MAXTICKS, ticks);
3212	while (ticks-- > 0)
3213		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
3214	fmp->time = now;
3215
3216	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
3217	fmp->cnt = 0;
3218}
3219
3220/* Process any previous ticks, then bump cnt by one (times scale). */
3221static void fmeter_markevent(struct fmeter *fmp)
3222{
3223	spin_lock(&fmp->lock);
3224	fmeter_update(fmp);
3225	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
3226	spin_unlock(&fmp->lock);
3227}
3228
3229/* Process any previous ticks, then return current value. */
3230static int fmeter_getrate(struct fmeter *fmp)
3231{
3232	int val;
3233
3234	spin_lock(&fmp->lock);
3235	fmeter_update(fmp);
3236	val = fmp->val;
3237	spin_unlock(&fmp->lock);
3238	return val;
3239}
3240
3241static struct cpuset *cpuset_attach_old_cs;
3242
3243/*
3244 * Check to see if a cpuset can accept a new task
3245 * For v1, cpus_allowed and mems_allowed can't be empty.
3246 * For v2, effective_cpus can't be empty.
3247 * Note that in v1, effective_cpus = cpus_allowed.
3248 */
3249static int cpuset_can_attach_check(struct cpuset *cs)
3250{
3251	if (cpumask_empty(cs->effective_cpus) ||
3252	   (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
3253		return -ENOSPC;
3254	return 0;
3255}
3256
3257static void reset_migrate_dl_data(struct cpuset *cs)
3258{
3259	cs->nr_migrate_dl_tasks = 0;
3260	cs->sum_migrate_dl_bw = 0;
3261}
3262
3263/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
3264static int cpuset_can_attach(struct cgroup_taskset *tset)
3265{
3266	struct cgroup_subsys_state *css;
3267	struct cpuset *cs, *oldcs;
3268	struct task_struct *task;
3269	bool cpus_updated, mems_updated;
3270	int ret;
3271
3272	/* used later by cpuset_attach() */
3273	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
3274	oldcs = cpuset_attach_old_cs;
3275	cs = css_cs(css);
3276
3277	mutex_lock(&cpuset_mutex);
3278
3279	/* Check to see if task is allowed in the cpuset */
3280	ret = cpuset_can_attach_check(cs);
3281	if (ret)
3282		goto out_unlock;
3283
3284	cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
3285	mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
3286
3287	cgroup_taskset_for_each(task, css, tset) {
3288		ret = task_can_attach(task);
3289		if (ret)
3290			goto out_unlock;
3291
3292		/*
3293		 * Skip the rights-over-task check in v2 when nothing changes;
3294		 * migration permission derives from hierarchy ownership in
3295		 * cgroup_procs_write_permission().
3296		 */
3297		if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
3298		    (cpus_updated || mems_updated)) {
3299			ret = security_task_setscheduler(task);
3300			if (ret)
3301				goto out_unlock;
3302		}
3303
3304		if (dl_task(task)) {
3305			cs->nr_migrate_dl_tasks++;
3306			cs->sum_migrate_dl_bw += task->dl.dl_bw;
3307		}
3308	}
3309
3310	if (!cs->nr_migrate_dl_tasks)
3311		goto out_success;
3312
3313	if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
3314		int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
3315
3316		if (unlikely(cpu >= nr_cpu_ids)) {
3317			reset_migrate_dl_data(cs);
3318			ret = -EINVAL;
3319			goto out_unlock;
3320		}
3321
3322		ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
3323		if (ret) {
3324			reset_migrate_dl_data(cs);
3325			goto out_unlock;
3326		}
3327	}
3328
3329out_success:
3330	/*
3331	 * Mark that attach is in progress.  This makes validate_change() fail
3332	 * changes which zero cpus/mems_allowed.
3333	 */
3334	cs->attach_in_progress++;
3335out_unlock:
3336	mutex_unlock(&cpuset_mutex);
3337	return ret;
3338}
3339
3340static void cpuset_cancel_attach(struct cgroup_taskset *tset)
3341{
3342	struct cgroup_subsys_state *css;
3343	struct cpuset *cs;
3344
3345	cgroup_taskset_first(tset, &css);
3346	cs = css_cs(css);
3347
3348	mutex_lock(&cpuset_mutex);
3349	cs->attach_in_progress--;
3350	if (!cs->attach_in_progress)
3351		wake_up(&cpuset_attach_wq);
3352
3353	if (cs->nr_migrate_dl_tasks) {
3354		int cpu = cpumask_any(cs->effective_cpus);
3355
3356		dl_bw_free(cpu, cs->sum_migrate_dl_bw);
3357		reset_migrate_dl_data(cs);
3358	}
3359
3360	mutex_unlock(&cpuset_mutex);
3361}
3362
3363/*
3364 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
3365 * but we can't allocate it dynamically there.  Define it globally and
3366 * allocate it from cpuset_init().
3367 */
3368static cpumask_var_t cpus_attach;
3369static nodemask_t cpuset_attach_nodemask_to;
3370
3371static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
3372{
3373	lockdep_assert_held(&cpuset_mutex);
3374
3375	if (cs != &top_cpuset)
3376		guarantee_online_cpus(task, cpus_attach);
3377	else
3378		cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
3379			       subpartitions_cpus);
3380	/*
3381	 * can_attach beforehand should guarantee that this doesn't
3382	 * fail.  TODO: have a better way to handle failure here
3383	 */
3384	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
3385
3386	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
3387	cpuset_update_task_spread_flags(cs, task);
3388}
3389
3390static void cpuset_attach(struct cgroup_taskset *tset)
3391{
3392	struct task_struct *task;
3393	struct task_struct *leader;
3394	struct cgroup_subsys_state *css;
3395	struct cpuset *cs;
3396	struct cpuset *oldcs = cpuset_attach_old_cs;
3397	bool cpus_updated, mems_updated;
3398
3399	cgroup_taskset_first(tset, &css);
3400	cs = css_cs(css);
3401
3402	lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */
3403	mutex_lock(&cpuset_mutex);
3404	cpus_updated = !cpumask_equal(cs->effective_cpus,
3405				      oldcs->effective_cpus);
3406	mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
3407
3408	/*
3409	 * In the default hierarchy, enabling cpuset in the child cgroups
3410	 * will trigger a number of cpuset_attach() calls with no change
3411	 * in effective cpus and mems. In that case, we can optimize out
3412	 * by skipping the task iteration and update.
3413	 */
3414	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
3415	    !cpus_updated && !mems_updated) {
3416		cpuset_attach_nodemask_to = cs->effective_mems;
3417		goto out;
3418	}
3419
3420	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
3421
3422	cgroup_taskset_for_each(task, css, tset)
3423		cpuset_attach_task(cs, task);
3424
3425	/*
3426	 * Change mm for all threadgroup leaders. This is expensive and may
3427	 * sleep and should be moved outside migration path proper. Skip it
3428	 * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
3429	 * not set.
3430	 */
3431	cpuset_attach_nodemask_to = cs->effective_mems;
3432	if (!is_memory_migrate(cs) && !mems_updated)
3433		goto out;
3434
3435	cgroup_taskset_for_each_leader(leader, css, tset) {
3436		struct mm_struct *mm = get_task_mm(leader);
3437
3438		if (mm) {
3439			mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
3440
3441			/*
3442			 * old_mems_allowed is the same as mems_allowed
3443			 * here, except if this task is being moved
3444			 * automatically due to hotplug.  In that case
3445			 * @mems_allowed has been updated and is empty, so
3446			 * @old_mems_allowed is the right nodemask that we
3447			 * migrate mm from.
3448			 */
3449			if (is_memory_migrate(cs))
3450				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
3451						  &cpuset_attach_nodemask_to);
3452			else
3453				mmput(mm);
3454		}
3455	}
3456
3457out:
3458	cs->old_mems_allowed = cpuset_attach_nodemask_to;
3459
3460	if (cs->nr_migrate_dl_tasks) {
3461		cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
3462		oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
3463		reset_migrate_dl_data(cs);
3464	}
3465
3466	cs->attach_in_progress--;
3467	if (!cs->attach_in_progress)
3468		wake_up(&cpuset_attach_wq);
3469
3470	mutex_unlock(&cpuset_mutex);
3471}
3472
3473/* The various types of files and directories in a cpuset file system */
3474
3475typedef enum {
3476	FILE_MEMORY_MIGRATE,
3477	FILE_CPULIST,
3478	FILE_MEMLIST,
3479	FILE_EFFECTIVE_CPULIST,
3480	FILE_EFFECTIVE_MEMLIST,
3481	FILE_SUBPARTS_CPULIST,
3482	FILE_EXCLUSIVE_CPULIST,
3483	FILE_EFFECTIVE_XCPULIST,
3484	FILE_ISOLATED_CPULIST,
3485	FILE_CPU_EXCLUSIVE,
3486	FILE_MEM_EXCLUSIVE,
3487	FILE_MEM_HARDWALL,
3488	FILE_SCHED_LOAD_BALANCE,
3489	FILE_PARTITION_ROOT,
3490	FILE_SCHED_RELAX_DOMAIN_LEVEL,
3491	FILE_MEMORY_PRESSURE_ENABLED,
3492	FILE_MEMORY_PRESSURE,
3493	FILE_SPREAD_PAGE,
3494	FILE_SPREAD_SLAB,
3495} cpuset_filetype_t;
3496
3497static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
3498			    u64 val)
3499{
3500	struct cpuset *cs = css_cs(css);
3501	cpuset_filetype_t type = cft->private;
3502	int retval = 0;
3503
3504	cpus_read_lock();
3505	mutex_lock(&cpuset_mutex);
3506	if (!is_cpuset_online(cs)) {
3507		retval = -ENODEV;
3508		goto out_unlock;
3509	}
3510
3511	switch (type) {
3512	case FILE_CPU_EXCLUSIVE:
3513		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
3514		break;
3515	case FILE_MEM_EXCLUSIVE:
3516		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
3517		break;
3518	case FILE_MEM_HARDWALL:
3519		retval = update_flag(CS_MEM_HARDWALL, cs, val);
3520		break;
3521	case FILE_SCHED_LOAD_BALANCE:
3522		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
3523		break;
3524	case FILE_MEMORY_MIGRATE:
3525		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
3526		break;
3527	case FILE_MEMORY_PRESSURE_ENABLED:
3528		cpuset_memory_pressure_enabled = !!val;
3529		break;
3530	case FILE_SPREAD_PAGE:
3531		retval = update_flag(CS_SPREAD_PAGE, cs, val);
3532		break;
3533	case FILE_SPREAD_SLAB:
3534		retval = update_flag(CS_SPREAD_SLAB, cs, val);
3535		break;
3536	default:
3537		retval = -EINVAL;
3538		break;
3539	}
3540out_unlock:
3541	mutex_unlock(&cpuset_mutex);
3542	cpus_read_unlock();
3543	return retval;
3544}
3545
3546static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
3547			    s64 val)
3548{
3549	struct cpuset *cs = css_cs(css);
3550	cpuset_filetype_t type = cft->private;
3551	int retval = -ENODEV;
3552
3553	cpus_read_lock();
3554	mutex_lock(&cpuset_mutex);
3555	if (!is_cpuset_online(cs))
3556		goto out_unlock;
3557
3558	switch (type) {
3559	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
3560		retval = update_relax_domain_level(cs, val);
3561		break;
3562	default:
3563		retval = -EINVAL;
3564		break;
3565	}
3566out_unlock:
3567	mutex_unlock(&cpuset_mutex);
3568	cpus_read_unlock();
3569	return retval;
3570}
3571
3572/*
3573 * Common handling for a write to a "cpus" or "mems" file.
3574 */
3575static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
3576				    char *buf, size_t nbytes, loff_t off)
3577{
3578	struct cpuset *cs = css_cs(of_css(of));
3579	struct cpuset *trialcs;
3580	int retval = -ENODEV;
3581
3582	buf = strstrip(buf);
3583
3584	/*
3585	 * CPU or memory hotunplug may leave @cs w/o any execution
3586	 * resources, in which case the hotplug code asynchronously updates
3587	 * configuration and transfers all tasks to the nearest ancestor
3588	 * which can execute.
3589	 *
3590	 * As writes to "cpus" or "mems" may restore @cs's execution
3591	 * resources, wait for the previously scheduled operations before
3592	 * proceeding, so that we don't end up repeatedly removing tasks
3593	 * added after execution capability is restored.
3594	 *
3595	 * cpuset_handle_hotplug may call back into cgroup core asynchronously
3596	 * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
3597	 * operation like this one can lead to a deadlock through kernfs
3598	 * active_ref protection.  Let's break the protection.  Losing the
3599	 * protection is okay as we check whether @cs is online after
3600	 * grabbing cpuset_mutex anyway.  This only happens on the legacy
3601	 * hierarchies.
3602	 */
3603	css_get(&cs->css);
3604	kernfs_break_active_protection(of->kn);
3605
3606	cpus_read_lock();
3607	mutex_lock(&cpuset_mutex);
3608	if (!is_cpuset_online(cs))
3609		goto out_unlock;
3610
3611	trialcs = alloc_trial_cpuset(cs);
3612	if (!trialcs) {
3613		retval = -ENOMEM;
3614		goto out_unlock;
3615	}
3616
3617	switch (of_cft(of)->private) {
3618	case FILE_CPULIST:
3619		retval = update_cpumask(cs, trialcs, buf);
3620		break;
3621	case FILE_EXCLUSIVE_CPULIST:
3622		retval = update_exclusive_cpumask(cs, trialcs, buf);
3623		break;
3624	case FILE_MEMLIST:
3625		retval = update_nodemask(cs, trialcs, buf);
3626		break;
3627	default:
3628		retval = -EINVAL;
3629		break;
3630	}
3631
3632	free_cpuset(trialcs);
3633out_unlock:
3634	mutex_unlock(&cpuset_mutex);
3635	cpus_read_unlock();
3636	kernfs_unbreak_active_protection(of->kn);
3637	css_put(&cs->css);
3638	flush_workqueue(cpuset_migrate_mm_wq);
3639	return retval ?: nbytes;
3640}
3641
3642/*
3643 * These ASCII lists should be read in a single call, using a user
3644 * buffer large enough to hold the entire map.  If read in smaller
3645 * chunks, there is no guarantee of atomicity.  Since the display format
3646 * used, a list of ranges of sequential numbers, is variable length,
3647 * and since these maps can change dynamically, a partial read that
3648 * races with an update could return gibberish.
3649 */
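/*
 * Illustrative user-space sketch (not kernel code) of the single-call
 * read described above.  The path assumes a cgroup v2 mount at
 * /sys/fs/cgroup and a child group "mygrp" made up for the example:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	char buf[4096];		// large enough to hold the whole map
 *	int fd = open("/sys/fs/cgroup/mygrp/cpuset.cpus.effective", O_RDONLY);
 *	ssize_t n = read(fd, buf, sizeof(buf));	// one read(), one consistent snapshot
 *	close(fd);
 */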
3650static int cpuset_common_seq_show(struct seq_file *sf, void *v)
3651{
3652	struct cpuset *cs = css_cs(seq_css(sf));
3653	cpuset_filetype_t type = seq_cft(sf)->private;
3654	int ret = 0;
3655
3656	spin_lock_irq(&callback_lock);
3657
3658	switch (type) {
3659	case FILE_CPULIST:
3660		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
3661		break;
3662	case FILE_MEMLIST:
3663		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
3664		break;
3665	case FILE_EFFECTIVE_CPULIST:
3666		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
3667		break;
3668	case FILE_EFFECTIVE_MEMLIST:
3669		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
3670		break;
3671	case FILE_EXCLUSIVE_CPULIST:
3672		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
3673		break;
3674	case FILE_EFFECTIVE_XCPULIST:
3675		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
3676		break;
3677	case FILE_SUBPARTS_CPULIST:
3678		seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
3679		break;
3680	case FILE_ISOLATED_CPULIST:
3681		seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
3682		break;
3683	default:
3684		ret = -EINVAL;
3685	}
3686
3687	spin_unlock_irq(&callback_lock);
3688	return ret;
3689}
3690
3691static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
3692{
3693	struct cpuset *cs = css_cs(css);
3694	cpuset_filetype_t type = cft->private;
3695	switch (type) {
3696	case FILE_CPU_EXCLUSIVE:
3697		return is_cpu_exclusive(cs);
3698	case FILE_MEM_EXCLUSIVE:
3699		return is_mem_exclusive(cs);
3700	case FILE_MEM_HARDWALL:
3701		return is_mem_hardwall(cs);
3702	case FILE_SCHED_LOAD_BALANCE:
3703		return is_sched_load_balance(cs);
3704	case FILE_MEMORY_MIGRATE:
3705		return is_memory_migrate(cs);
3706	case FILE_MEMORY_PRESSURE_ENABLED:
3707		return cpuset_memory_pressure_enabled;
3708	case FILE_MEMORY_PRESSURE:
3709		return fmeter_getrate(&cs->fmeter);
3710	case FILE_SPREAD_PAGE:
3711		return is_spread_page(cs);
3712	case FILE_SPREAD_SLAB:
3713		return is_spread_slab(cs);
3714	default:
3715		BUG();
3716	}
3717
3718	/* Unreachable but makes gcc happy */
3719	return 0;
3720}
3721
3722static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
3723{
3724	struct cpuset *cs = css_cs(css);
3725	cpuset_filetype_t type = cft->private;
3726	switch (type) {
3727	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
3728		return cs->relax_domain_level;
3729	default:
3730		BUG();
3731	}
3732
3733	/* Unreachable but makes gcc happy */
3734	return 0;
3735}
3736
3737static int sched_partition_show(struct seq_file *seq, void *v)
3738{
3739	struct cpuset *cs = css_cs(seq_css(seq));
3740	const char *err, *type = NULL;
3741
3742	switch (cs->partition_root_state) {
3743	case PRS_ROOT:
3744		seq_puts(seq, "root\n");
3745		break;
3746	case PRS_ISOLATED:
3747		seq_puts(seq, "isolated\n");
3748		break;
3749	case PRS_MEMBER:
3750		seq_puts(seq, "member\n");
3751		break;
3752	case PRS_INVALID_ROOT:
3753		type = "root";
3754		fallthrough;
3755	case PRS_INVALID_ISOLATED:
3756		if (!type)
3757			type = "isolated";
3758		err = perr_strings[READ_ONCE(cs->prs_err)];
3759		if (err)
3760			seq_printf(seq, "%s invalid (%s)\n", type, err);
3761		else
3762			seq_printf(seq, "%s invalid\n", type);
3763		break;
3764	}
3765	return 0;
3766}
3767
3768static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
3769				     size_t nbytes, loff_t off)
3770{
3771	struct cpuset *cs = css_cs(of_css(of));
3772	int val;
3773	int retval = -ENODEV;
3774
3775	buf = strstrip(buf);
3776
3777	if (!strcmp(buf, "root"))
3778		val = PRS_ROOT;
3779	else if (!strcmp(buf, "member"))
3780		val = PRS_MEMBER;
3781	else if (!strcmp(buf, "isolated"))
3782		val = PRS_ISOLATED;
3783	else
3784		return -EINVAL;
3785
3786	css_get(&cs->css);
3787	cpus_read_lock();
3788	mutex_lock(&cpuset_mutex);
3789	if (!is_cpuset_online(cs))
3790		goto out_unlock;
3791
3792	retval = update_prstate(cs, val);
3793out_unlock:
3794	mutex_unlock(&cpuset_mutex);
3795	cpus_read_unlock();
3796	css_put(&cs->css);
3797	return retval ?: nbytes;
3798}
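/*
 * Illustrative user-space sketch (not kernel code): the only writes
 * accepted above are "root", "member" and "isolated".  The path assumes
 * a cgroup v2 mount at /sys/fs/cgroup and a child group "mygrp":
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/sys/fs/cgroup/mygrp/cpuset.cpus.partition", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "isolated", strlen("isolated"));
 *		close(fd);
 *	}
 */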
3799
3800/*
3801 * for the common functions, 'private' gives the type of file
3802 */
3803
3804static struct cftype legacy_files[] = {
3805	{
3806		.name = "cpus",
3807		.seq_show = cpuset_common_seq_show,
3808		.write = cpuset_write_resmask,
3809		.max_write_len = (100U + 6 * NR_CPUS),
3810		.private = FILE_CPULIST,
3811	},
3812
3813	{
3814		.name = "mems",
3815		.seq_show = cpuset_common_seq_show,
3816		.write = cpuset_write_resmask,
3817		.max_write_len = (100U + 6 * MAX_NUMNODES),
3818		.private = FILE_MEMLIST,
3819	},
3820
3821	{
3822		.name = "effective_cpus",
3823		.seq_show = cpuset_common_seq_show,
3824		.private = FILE_EFFECTIVE_CPULIST,
3825	},
3826
3827	{
3828		.name = "effective_mems",
3829		.seq_show = cpuset_common_seq_show,
3830		.private = FILE_EFFECTIVE_MEMLIST,
3831	},
3832
3833	{
3834		.name = "cpu_exclusive",
3835		.read_u64 = cpuset_read_u64,
3836		.write_u64 = cpuset_write_u64,
3837		.private = FILE_CPU_EXCLUSIVE,
3838	},
3839
3840	{
3841		.name = "mem_exclusive",
3842		.read_u64 = cpuset_read_u64,
3843		.write_u64 = cpuset_write_u64,
3844		.private = FILE_MEM_EXCLUSIVE,
3845	},
3846
3847	{
3848		.name = "mem_hardwall",
3849		.read_u64 = cpuset_read_u64,
3850		.write_u64 = cpuset_write_u64,
3851		.private = FILE_MEM_HARDWALL,
3852	},
3853
3854	{
3855		.name = "sched_load_balance",
3856		.read_u64 = cpuset_read_u64,
3857		.write_u64 = cpuset_write_u64,
3858		.private = FILE_SCHED_LOAD_BALANCE,
3859	},
3860
3861	{
3862		.name = "sched_relax_domain_level",
3863		.read_s64 = cpuset_read_s64,
3864		.write_s64 = cpuset_write_s64,
3865		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
3866	},
3867
3868	{
3869		.name = "memory_migrate",
3870		.read_u64 = cpuset_read_u64,
3871		.write_u64 = cpuset_write_u64,
3872		.private = FILE_MEMORY_MIGRATE,
3873	},
3874
3875	{
3876		.name = "memory_pressure",
3877		.read_u64 = cpuset_read_u64,
3878		.private = FILE_MEMORY_PRESSURE,
3879	},
3880
3881	{
3882		.name = "memory_spread_page",
3883		.read_u64 = cpuset_read_u64,
3884		.write_u64 = cpuset_write_u64,
3885		.private = FILE_SPREAD_PAGE,
3886	},
3887
3888	{
3889		/* obsolete, may be removed in the future */
3890		.name = "memory_spread_slab",
3891		.read_u64 = cpuset_read_u64,
3892		.write_u64 = cpuset_write_u64,
3893		.private = FILE_SPREAD_SLAB,
3894	},
3895
3896	{
3897		.name = "memory_pressure_enabled",
3898		.flags = CFTYPE_ONLY_ON_ROOT,
3899		.read_u64 = cpuset_read_u64,
3900		.write_u64 = cpuset_write_u64,
3901		.private = FILE_MEMORY_PRESSURE_ENABLED,
3902	},
3903
3904	{ }	/* terminate */
3905};
3906
3907/*
3908 * This is currently a minimal set for the default hierarchy. It can be
3909 * expanded later on by migrating more features and control files from v1.
3910 */
3911static struct cftype dfl_files[] = {
3912	{
3913		.name = "cpus",
3914		.seq_show = cpuset_common_seq_show,
3915		.write = cpuset_write_resmask,
3916		.max_write_len = (100U + 6 * NR_CPUS),
3917		.private = FILE_CPULIST,
3918		.flags = CFTYPE_NOT_ON_ROOT,
3919	},
3920
3921	{
3922		.name = "mems",
3923		.seq_show = cpuset_common_seq_show,
3924		.write = cpuset_write_resmask,
3925		.max_write_len = (100U + 6 * MAX_NUMNODES),
3926		.private = FILE_MEMLIST,
3927		.flags = CFTYPE_NOT_ON_ROOT,
3928	},
3929
3930	{
3931		.name = "cpus.effective",
3932		.seq_show = cpuset_common_seq_show,
3933		.private = FILE_EFFECTIVE_CPULIST,
3934	},
3935
3936	{
3937		.name = "mems.effective",
3938		.seq_show = cpuset_common_seq_show,
3939		.private = FILE_EFFECTIVE_MEMLIST,
3940	},
3941
3942	{
3943		.name = "cpus.partition",
3944		.seq_show = sched_partition_show,
3945		.write = sched_partition_write,
3946		.private = FILE_PARTITION_ROOT,
3947		.flags = CFTYPE_NOT_ON_ROOT,
3948		.file_offset = offsetof(struct cpuset, partition_file),
3949	},
3950
3951	{
3952		.name = "cpus.exclusive",
3953		.seq_show = cpuset_common_seq_show,
3954		.write = cpuset_write_resmask,
3955		.max_write_len = (100U + 6 * NR_CPUS),
3956		.private = FILE_EXCLUSIVE_CPULIST,
3957		.flags = CFTYPE_NOT_ON_ROOT,
3958	},
3959
3960	{
3961		.name = "cpus.exclusive.effective",
3962		.seq_show = cpuset_common_seq_show,
3963		.private = FILE_EFFECTIVE_XCPULIST,
3964		.flags = CFTYPE_NOT_ON_ROOT,
3965	},
3966
3967	{
3968		.name = "cpus.subpartitions",
3969		.seq_show = cpuset_common_seq_show,
3970		.private = FILE_SUBPARTS_CPULIST,
3971		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
3972	},
3973
3974	{
3975		.name = "cpus.isolated",
3976		.seq_show = cpuset_common_seq_show,
3977		.private = FILE_ISOLATED_CPULIST,
3978		.flags = CFTYPE_ONLY_ON_ROOT,
3979	},
3980
3981	{ }	/* terminate */
3982};
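/*
 * Note: cgroup core normally prefixes these file names with the
 * controller name, so the entries above show up in cgroupfs as
 * "cpuset.cpus", "cpuset.cpus.effective", "cpuset.cpus.partition" and
 * so on (legacy hierarchies can drop the prefix with the "noprefix"
 * mount option).
 */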
3983
3984
3985/**
3986 * cpuset_css_alloc - Allocate a cpuset css
3987 * @parent_css: Parent css of the control group that the new cpuset will be
3988 *              part of
3989 * Return: cpuset css on success, -ENOMEM on failure.
3990 *
3991 * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
3992 * top cpuset css otherwise.
3993 */
3994static struct cgroup_subsys_state *
3995cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
3996{
3997	struct cpuset *cs;
3998
3999	if (!parent_css)
4000		return &top_cpuset.css;
4001
4002	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
4003	if (!cs)
4004		return ERR_PTR(-ENOMEM);
4005
4006	if (alloc_cpumasks(cs, NULL)) {
4007		kfree(cs);
4008		return ERR_PTR(-ENOMEM);
4009	}
4010
4011	__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
4012	nodes_clear(cs->mems_allowed);
4013	nodes_clear(cs->effective_mems);
4014	fmeter_init(&cs->fmeter);
4015	cs->relax_domain_level = -1;
4016	INIT_LIST_HEAD(&cs->remote_sibling);
4017
4018	/* Set CS_MEMORY_MIGRATE for default hierarchy */
4019	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
4020		__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
4021
4022	return &cs->css;
4023}
4024
4025static int cpuset_css_online(struct cgroup_subsys_state *css)
4026{
4027	struct cpuset *cs = css_cs(css);
4028	struct cpuset *parent = parent_cs(cs);
4029	struct cpuset *tmp_cs;
4030	struct cgroup_subsys_state *pos_css;
4031
4032	if (!parent)
4033		return 0;
4034
4035	cpus_read_lock();
4036	mutex_lock(&cpuset_mutex);
4037
4038	set_bit(CS_ONLINE, &cs->flags);
4039	if (is_spread_page(parent))
4040		set_bit(CS_SPREAD_PAGE, &cs->flags);
4041	if (is_spread_slab(parent))
4042		set_bit(CS_SPREAD_SLAB, &cs->flags);
4043
4044	cpuset_inc();
4045
4046	spin_lock_irq(&callback_lock);
4047	if (is_in_v2_mode()) {
4048		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
4049		cs->effective_mems = parent->effective_mems;
4050		cs->use_parent_ecpus = true;
4051		parent->child_ecpus_count++;
4052	}
4053
4054	/*
4055	 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
4056	 */
4057	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
4058	    !is_sched_load_balance(parent))
4059		clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
4060
4061	spin_unlock_irq(&callback_lock);
4062
4063	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
4064		goto out_unlock;
4065
4066	/*
4067	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
4068	 * set.  This flag handling is implemented in cgroup core for
4069	 * historical reasons - the flag may be specified during mount.
4070	 *
4071	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
4072	 * refuse to clone the configuration - thereby refusing to let the
4073	 * task enter, and as a result refusing the sys_unshare() or
4074	 * clone() which initiated it.  If this becomes a problem for some
4075	 * users who wish to allow that scenario, then this could be
4076	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
4077	 * (and likewise for mems) to the new cgroup.
4078	 */
4079	rcu_read_lock();
4080	cpuset_for_each_child(tmp_cs, pos_css, parent) {
4081		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
4082			rcu_read_unlock();
4083			goto out_unlock;
4084		}
4085	}
4086	rcu_read_unlock();
4087
4088	spin_lock_irq(&callback_lock);
4089	cs->mems_allowed = parent->mems_allowed;
4090	cs->effective_mems = parent->mems_allowed;
4091	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
4092	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
4093	spin_unlock_irq(&callback_lock);
4094out_unlock:
4095	mutex_unlock(&cpuset_mutex);
4096	cpus_read_unlock();
4097	return 0;
4098}
4099
4100/*
4101 * If the cpuset being removed has its flag 'sched_load_balance'
4102 * enabled, then simulate turning sched_load_balance off, which
4103 * will call rebuild_sched_domains_locked(). That is not needed
4104 * in the default hierarchy where only changes in partition
4105 * will cause repartitioning.
4106 *
4107 * If the cpuset has the 'sched.partition' flag enabled, simulate
4108 * turning 'sched.partition' off.
4109 */
4110
4111static void cpuset_css_offline(struct cgroup_subsys_state *css)
4112{
4113	struct cpuset *cs = css_cs(css);
4114
4115	cpus_read_lock();
4116	mutex_lock(&cpuset_mutex);
4117
4118	if (is_partition_valid(cs))
4119		update_prstate(cs, 0);
4120
4121	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
4122	    is_sched_load_balance(cs))
4123		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
4124
4125	if (cs->use_parent_ecpus) {
4126		struct cpuset *parent = parent_cs(cs);
4127
4128		cs->use_parent_ecpus = false;
4129		parent->child_ecpus_count--;
4130	}
4131
4132	cpuset_dec();
4133	clear_bit(CS_ONLINE, &cs->flags);
4134
4135	mutex_unlock(&cpuset_mutex);
4136	cpus_read_unlock();
4137}
4138
4139static void cpuset_css_free(struct cgroup_subsys_state *css)
4140{
4141	struct cpuset *cs = css_cs(css);
4142
4143	free_cpuset(cs);
4144}
4145
4146static void cpuset_bind(struct cgroup_subsys_state *root_css)
4147{
4148	mutex_lock(&cpuset_mutex);
4149	spin_lock_irq(&callback_lock);
4150
4151	if (is_in_v2_mode()) {
4152		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
4153		cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
4154		top_cpuset.mems_allowed = node_possible_map;
4155	} else {
4156		cpumask_copy(top_cpuset.cpus_allowed,
4157			     top_cpuset.effective_cpus);
4158		top_cpuset.mems_allowed = top_cpuset.effective_mems;
4159	}
4160
4161	spin_unlock_irq(&callback_lock);
4162	mutex_unlock(&cpuset_mutex);
4163}
4164
4165/*
4166 * In case the child is cloned into a cpuset different from its parent,
4167 * additional checks are done to see if the move is allowed.
4168 */
4169static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
4170{
4171	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
4172	bool same_cs;
4173	int ret;
4174
4175	rcu_read_lock();
4176	same_cs = (cs == task_cs(current));
4177	rcu_read_unlock();
4178
4179	if (same_cs)
4180		return 0;
4181
4182	lockdep_assert_held(&cgroup_mutex);
4183	mutex_lock(&cpuset_mutex);
4184
4185	/* Check to see if task is allowed in the cpuset */
4186	ret = cpuset_can_attach_check(cs);
4187	if (ret)
4188		goto out_unlock;
4189
4190	ret = task_can_attach(task);
4191	if (ret)
4192		goto out_unlock;
4193
4194	ret = security_task_setscheduler(task);
4195	if (ret)
4196		goto out_unlock;
4197
4198	/*
4199	 * Mark that an attach is in progress.  This makes validate_change() fail
4200	 * changes which zero cpus/mems_allowed.
4201	 */
4202	cs->attach_in_progress++;
4203out_unlock:
4204	mutex_unlock(&cpuset_mutex);
4205	return ret;
4206}
4207
4208static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
4209{
4210	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
4211	bool same_cs;
4212
4213	rcu_read_lock();
4214	same_cs = (cs == task_cs(current));
4215	rcu_read_unlock();
4216
4217	if (same_cs)
4218		return;
4219
4220	mutex_lock(&cpuset_mutex);
4221	cs->attach_in_progress--;
4222	if (!cs->attach_in_progress)
4223		wake_up(&cpuset_attach_wq);
4224	mutex_unlock(&cpuset_mutex);
4225}
4226
4227/*
4228 * Make sure the new task conforms to the current state of its parent,
4229 * which could have been changed by cpuset just after it inherits the
4230 * state from the parent and before it sits on the cgroup's task list.
4231 */
4232static void cpuset_fork(struct task_struct *task)
4233{
4234	struct cpuset *cs;
4235	bool same_cs;
4236
4237	rcu_read_lock();
4238	cs = task_cs(task);
4239	same_cs = (cs == task_cs(current));
4240	rcu_read_unlock();
4241
4242	if (same_cs) {
4243		if (cs == &top_cpuset)
4244			return;
4245
4246		set_cpus_allowed_ptr(task, current->cpus_ptr);
4247		task->mems_allowed = current->mems_allowed;
4248		return;
4249	}
4250
4251	/* CLONE_INTO_CGROUP */
4252	mutex_lock(&cpuset_mutex);
4253	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
4254	cpuset_attach_task(cs, task);
4255
4256	cs->attach_in_progress--;
4257	if (!cs->attach_in_progress)
4258		wake_up(&cpuset_attach_wq);
4259
4260	mutex_unlock(&cpuset_mutex);
4261}
4262
4263struct cgroup_subsys cpuset_cgrp_subsys = {
4264	.css_alloc	= cpuset_css_alloc,
4265	.css_online	= cpuset_css_online,
4266	.css_offline	= cpuset_css_offline,
4267	.css_free	= cpuset_css_free,
4268	.can_attach	= cpuset_can_attach,
4269	.cancel_attach	= cpuset_cancel_attach,
4270	.attach		= cpuset_attach,
4271	.post_attach	= cpuset_post_attach,
4272	.bind		= cpuset_bind,
4273	.can_fork	= cpuset_can_fork,
4274	.cancel_fork	= cpuset_cancel_fork,
4275	.fork		= cpuset_fork,
4276	.legacy_cftypes	= legacy_files,
4277	.dfl_cftypes	= dfl_files,
4278	.early_init	= true,
4279	.threaded	= true,
4280};
4281
4282/**
4283 * cpuset_init - initialize cpusets at system boot
4284 *
4285 * Description: Initialize top_cpuset
4286 **/
4287
4288int __init cpuset_init(void)
4289{
4290	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
4291	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
4292	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
4293	BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
4294	BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
4295	BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
4296
4297	cpumask_setall(top_cpuset.cpus_allowed);
4298	nodes_setall(top_cpuset.mems_allowed);
4299	cpumask_setall(top_cpuset.effective_cpus);
4300	cpumask_setall(top_cpuset.effective_xcpus);
4301	cpumask_setall(top_cpuset.exclusive_cpus);
4302	nodes_setall(top_cpuset.effective_mems);
4303
4304	fmeter_init(&top_cpuset.fmeter);
4305	INIT_LIST_HEAD(&remote_children);
4306
4307	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
4308
4309	return 0;
4310}
4311
4312/*
4313 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
4314 * or memory nodes, we need to walk over the cpuset hierarchy,
4315 * removing that CPU or node from all cpusets.  If this removes the
4316 * last CPU or node from a cpuset, then move the tasks in the empty
4317 * cpuset to its next-highest non-empty parent.
4318 */
4319static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
4320{
4321	struct cpuset *parent;
4322
4323	/*
4324	 * Find its next-highest non-empty parent (the top cpuset
4325	 * has online cpus, so it can't be empty).
4326	 */
4327	parent = parent_cs(cs);
4328	while (cpumask_empty(parent->cpus_allowed) ||
4329			nodes_empty(parent->mems_allowed))
4330		parent = parent_cs(parent);
4331
4332	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
4333		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
4334		pr_cont_cgroup_name(cs->css.cgroup);
4335		pr_cont("\n");
4336	}
4337}
4338
4339static void cpuset_migrate_tasks_workfn(struct work_struct *work)
4340{
4341	struct cpuset_remove_tasks_struct *s;
4342
4343	s = container_of(work, struct cpuset_remove_tasks_struct, work);
4344	remove_tasks_in_empty_cpuset(s->cs);
4345	css_put(&s->cs->css);
4346	kfree(s);
4347}
4348
4349static void
4350hotplug_update_tasks_legacy(struct cpuset *cs,
4351			    struct cpumask *new_cpus, nodemask_t *new_mems,
4352			    bool cpus_updated, bool mems_updated)
4353{
4354	bool is_empty;
4355
4356	spin_lock_irq(&callback_lock);
4357	cpumask_copy(cs->cpus_allowed, new_cpus);
4358	cpumask_copy(cs->effective_cpus, new_cpus);
4359	cs->mems_allowed = *new_mems;
4360	cs->effective_mems = *new_mems;
4361	spin_unlock_irq(&callback_lock);
4362
4363	/*
4364	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
4365	 * as the tasks will be migrated to an ancestor.
4366	 */
4367	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
4368		update_tasks_cpumask(cs, new_cpus);
4369	if (mems_updated && !nodes_empty(cs->mems_allowed))
4370		update_tasks_nodemask(cs);
4371
4372	is_empty = cpumask_empty(cs->cpus_allowed) ||
4373		   nodes_empty(cs->mems_allowed);
4374
4375	/*
4376	 * Move tasks to the nearest ancestor with execution resources.
4377	 * This is a full cgroup operation which will also call back into
4378	 * cpuset. Execute it asynchronously using a workqueue.
4379	 */
4380	if (is_empty && cs->css.cgroup->nr_populated_csets &&
4381	    css_tryget_online(&cs->css)) {
4382		struct cpuset_remove_tasks_struct *s;
4383
4384		s = kzalloc(sizeof(*s), GFP_KERNEL);
4385		if (WARN_ON_ONCE(!s)) {
4386			css_put(&cs->css);
4387			return;
4388		}
4389
4390		s->cs = cs;
4391		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
4392		schedule_work(&s->work);
4393	}
4394}
4395
4396static void
4397hotplug_update_tasks(struct cpuset *cs,
4398		     struct cpumask *new_cpus, nodemask_t *new_mems,
4399		     bool cpus_updated, bool mems_updated)
4400{
4401	/* A partition root is allowed to have empty effective cpus */
4402	if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
4403		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
4404	if (nodes_empty(*new_mems))
4405		*new_mems = parent_cs(cs)->effective_mems;
4406
4407	spin_lock_irq(&callback_lock);
4408	cpumask_copy(cs->effective_cpus, new_cpus);
4409	cs->effective_mems = *new_mems;
4410	spin_unlock_irq(&callback_lock);
4411
4412	if (cpus_updated)
4413		update_tasks_cpumask(cs, new_cpus);
4414	if (mems_updated)
4415		update_tasks_nodemask(cs);
4416}
4417
4418static bool force_rebuild;
4419
4420void cpuset_force_rebuild(void)
4421{
4422	force_rebuild = true;
4423}
4424
4425/**
4426 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
4427 * @cs: cpuset in interest
4428 * @tmp: the tmpmasks structure pointer
4429 *
4430 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
4431 * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
4432 * all its tasks are moved to the nearest ancestor with both resources.
4433 */
4434static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
4435{
4436	static cpumask_t new_cpus;
4437	static nodemask_t new_mems;
4438	bool cpus_updated;
4439	bool mems_updated;
4440	bool remote;
4441	int partcmd = -1;
4442	struct cpuset *parent;
4443retry:
4444	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
4445
4446	mutex_lock(&cpuset_mutex);
4447
4448	/*
4449	 * We have raced with task attaching. We wait until attaching
4450	 * is finished, so we won't attach a task to an empty cpuset.
4451	 */
4452	if (cs->attach_in_progress) {
4453		mutex_unlock(&cpuset_mutex);
4454		goto retry;
4455	}
4456
4457	parent = parent_cs(cs);
4458	compute_effective_cpumask(&new_cpus, cs, parent);
4459	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
4460
4461	if (!tmp || !cs->partition_root_state)
4462		goto update_tasks;
4463
4464	/*
4465	 * Compute effective_cpus for valid partition root, may invalidate
4466	 * child partition roots if necessary.
4467	 */
4468	remote = is_remote_partition(cs);
4469	if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
4470		compute_partition_effective_cpumask(cs, &new_cpus);
4471
4472	if (remote && cpumask_empty(&new_cpus) &&
4473	    partition_is_populated(cs, NULL)) {
4474		remote_partition_disable(cs, tmp);
4475		compute_effective_cpumask(&new_cpus, cs, parent);
4476		remote = false;
4477		cpuset_force_rebuild();
4478	}
4479
4480	/*
4481	 * Force the partition to become invalid if either one of
4482	 * the following conditions hold:
4483	 * 1) empty effective cpus but not valid empty partition.
4484	 * 2) parent is invalid or doesn't grant any cpus to child
4485	 *    partitions.
4486	 */
4487	if (is_local_partition(cs) && (!is_partition_valid(parent) ||
4488				tasks_nocpu_error(parent, cs, &new_cpus)))
4489		partcmd = partcmd_invalidate;
4490	/*
4491	 * On the other hand, an invalid partition root may be transitioned
4492	 * back to a regular one.
4493	 */
4494	else if (is_partition_valid(parent) && is_partition_invalid(cs))
4495		partcmd = partcmd_update;
4496
4497	if (partcmd >= 0) {
4498		update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
4499		if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
4500			compute_partition_effective_cpumask(cs, &new_cpus);
4501			cpuset_force_rebuild();
4502		}
4503	}
4504
4505update_tasks:
4506	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
4507	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
4508	if (!cpus_updated && !mems_updated)
4509		goto unlock;	/* Hotplug doesn't affect this cpuset */
4510
4511	if (mems_updated)
4512		check_insane_mems_config(&new_mems);
4513
4514	if (is_in_v2_mode())
4515		hotplug_update_tasks(cs, &new_cpus, &new_mems,
4516				     cpus_updated, mems_updated);
4517	else
4518		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
4519					    cpus_updated, mems_updated);
4520
4521unlock:
4522	mutex_unlock(&cpuset_mutex);
4523}
4524
4525/**
4526 * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
4527 *
4528 * This function is called after either CPU or memory configuration has
4529 * changed and updates cpuset accordingly.  The top_cpuset is always
4530 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
4531 * order to make cpusets transparent (of no effect) on systems that are
4532 * actively using CPU hotplug but making no active use of cpusets.
4533 *
4534 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
4535 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
4536 * all descendants.
4537 *
4538 * Note that CPU offlining during suspend is ignored.  We don't modify
4539 * cpusets across suspend/resume cycles at all.
4540 *
4541 * CPU / memory hotplug is handled synchronously.
4542 */
4543static void cpuset_handle_hotplug(void)
4544{
4545	static cpumask_t new_cpus;
4546	static nodemask_t new_mems;
4547	bool cpus_updated, mems_updated;
4548	bool on_dfl = is_in_v2_mode();
4549	struct tmpmasks tmp, *ptmp = NULL;
4550
4551	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
4552		ptmp = &tmp;
4553
4554	lockdep_assert_cpus_held();
4555	mutex_lock(&cpuset_mutex);
4556
4557	/* fetch the available cpus/mems and find out which changed how */
4558	cpumask_copy(&new_cpus, cpu_active_mask);
4559	new_mems = node_states[N_MEMORY];
4560
4561	/*
4562	 * If subpartitions_cpus is populated, it is likely that the check
4563	 * below will produce a false positive on cpus_updated when the cpu
4564	 * list isn't changed. It is extra work, but it is better to be safe.
4565	 */
4566	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
4567		       !cpumask_empty(subpartitions_cpus);
4568	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
4569
4570	/*
4571	 * In the rare case that hotplug removes all the cpus in
4572	 * subpartitions_cpus, we assume that cpus have been updated.
4573	 */
4574	if (!cpus_updated && top_cpuset.nr_subparts)
4575		cpus_updated = true;
4576
4577	/* For v1, synchronize cpus_allowed to cpu_active_mask */
4578	if (cpus_updated) {
4579		spin_lock_irq(&callback_lock);
4580		if (!on_dfl)
4581			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
4582		/*
4583		 * Make sure that CPUs allocated to child partitions
4584		 * do not show up in effective_cpus. If no CPU is left,
4585		 * we clear the subpartitions_cpus & let the child partitions
4586		 * fight for the CPUs again.
4587		 */
4588		if (!cpumask_empty(subpartitions_cpus)) {
4589			if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
4590				top_cpuset.nr_subparts = 0;
4591				cpumask_clear(subpartitions_cpus);
4592			} else {
4593				cpumask_andnot(&new_cpus, &new_cpus,
4594					       subpartitions_cpus);
4595			}
4596		}
4597		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
4598		spin_unlock_irq(&callback_lock);
4599		/* we don't mess with cpumasks of tasks in top_cpuset */
4600	}
4601
4602	/* synchronize mems_allowed to N_MEMORY */
4603	if (mems_updated) {
4604		spin_lock_irq(&callback_lock);
4605		if (!on_dfl)
4606			top_cpuset.mems_allowed = new_mems;
4607		top_cpuset.effective_mems = new_mems;
4608		spin_unlock_irq(&callback_lock);
4609		update_tasks_nodemask(&top_cpuset);
4610	}
4611
4612	mutex_unlock(&cpuset_mutex);
4613
4614	/* if cpus or mems changed, we need to propagate to descendants */
4615	if (cpus_updated || mems_updated) {
4616		struct cpuset *cs;
4617		struct cgroup_subsys_state *pos_css;
4618
4619		rcu_read_lock();
4620		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
4621			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
4622				continue;
4623			rcu_read_unlock();
4624
4625			cpuset_hotplug_update_tasks(cs, ptmp);
4626
4627			rcu_read_lock();
4628			css_put(&cs->css);
4629		}
4630		rcu_read_unlock();
4631	}
4632
4633	/* rebuild sched domains if cpus_allowed has changed */
4634	if (cpus_updated || force_rebuild) {
4635		force_rebuild = false;
4636		rebuild_sched_domains_cpuslocked();
4637	}
4638
4639	free_cpumasks(NULL, ptmp);
4640}
4641
4642void cpuset_update_active_cpus(void)
4643{
4644	/*
4645	 * We're inside a cpu hotplug critical region which usually nests
4646	 * inside cgroup synchronization.  Hotplug processing is handled
4647	 * synchronously here by cpuset_handle_hotplug().
4648	 */
4649	cpuset_handle_hotplug();
4650}
4651
4652/*
4653 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
4654 * Call this routine anytime after node_states[N_MEMORY] changes.
4655 * See cpuset_update_active_cpus() for CPU hotplug handling.
4656 */
4657static int cpuset_track_online_nodes(struct notifier_block *self,
4658				unsigned long action, void *arg)
4659{
4660	cpuset_handle_hotplug();
4661	return NOTIFY_OK;
4662}
4663
4664/**
4665 * cpuset_init_smp - initialize cpus_allowed
4666 *
4667 * Description: Finish top cpuset after cpu, node maps are initialized
4668 */
4669void __init cpuset_init_smp(void)
4670{
4671	/*
4672	 * cpus_allowed/mems_allowed set to v2 values in the initial
4673	 * cpuset_bind() call will be reset to v1 values in another
4674	 * cpuset_bind() call when v1 cpuset is mounted.
4675	 */
4676	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
4677
4678	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
4679	top_cpuset.effective_mems = node_states[N_MEMORY];
4680
4681	hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
4682
4683	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
4684	BUG_ON(!cpuset_migrate_mm_wq);
4685}
4686
4687/**
4688 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
4689 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
4690 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
4691 *
4692 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
4693 * attached to the specified @tsk.  Guaranteed to return some non-empty
4694 * subset of cpu_online_mask, even if this means going outside the
4695 * task's cpuset, except when the task is in the top cpuset.
4696 **/
4697
4698void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
4699{
4700	unsigned long flags;
4701	struct cpuset *cs;
4702
4703	spin_lock_irqsave(&callback_lock, flags);
4704	rcu_read_lock();
4705
4706	cs = task_cs(tsk);
4707	if (cs != &top_cpuset)
4708		guarantee_online_cpus(tsk, pmask);
4709	/*
4710	 * Tasks in the top cpuset won't get updates to their cpumasks
4711	 * when a hotplug online/offline event happens. So we include all
4712	 * offline cpus in the allowed cpu list.
4713	 */
4714	if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
4715		const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
4716
4717		/*
4718		 * We first exclude cpus allocated to partitions. If there is no
4719		 * allowable online cpu left, we fall back to all possible cpus.
4720		 */
4721		cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
4722		if (!cpumask_intersects(pmask, cpu_online_mask))
4723			cpumask_copy(pmask, possible_mask);
4724	}
4725
4726	rcu_read_unlock();
4727	spin_unlock_irqrestore(&callback_lock, flags);
4728}
4729
4730/**
4731 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
4732 * @tsk: pointer to task_struct with which the scheduler is struggling
4733 *
4734 * Description: In the case that the scheduler cannot find an allowed cpu in
4735 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
4736 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
4737 * which will not contain a sane cpumask during cases such as cpu hotplugging.
4738 * This is the absolute last resort for the scheduler and it is only used if
4739 * _every_ other avenue has been traveled.
4740 *
4741 * Returns true if the affinity of @tsk was changed, false otherwise.
4742 **/
4743
4744bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
4745{
4746	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
4747	const struct cpumask *cs_mask;
4748	bool changed = false;
4749
4750	rcu_read_lock();
4751	cs_mask = task_cs(tsk)->cpus_allowed;
4752	if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
4753		do_set_cpus_allowed(tsk, cs_mask);
4754		changed = true;
4755	}
4756	rcu_read_unlock();
4757
4758	/*
4759	 * We own tsk->cpus_allowed, nobody can change it under us.
4760	 *
4761	 * But we used cs && cs->cpus_allowed locklessly and thus can
4762	 * race with cgroup_attach_task() or update_cpumask() and get
4763	 * the wrong tsk->cpus_allowed. However, both cases imply the
4764	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
4765	 * which takes task_rq_lock().
4766	 *
4767	 * If we are called after it dropped the lock we must see all
4768	 * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
4769	 * set any mask even if it is not right from task_cs()'s pov;
4770	 * the pending set_cpus_allowed_ptr() will fix things.
4771	 *
4772	 * select_fallback_rq() will fix things up and set cpu_possible_mask
4773	 * if required.
4774	 */
4775	return changed;
4776}
4777
4778void __init cpuset_init_current_mems_allowed(void)
4779{
4780	nodes_setall(current->mems_allowed);
4781}
4782
4783/**
4784 * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
4785 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
4786 *
4787 * Description: Returns the nodemask_t mems_allowed of the cpuset
4788 * attached to the specified @tsk.  Guaranteed to return some non-empty
4789 * subset of node_states[N_MEMORY], even if this means going outside the
4790 * task's cpuset.
4791 **/
4792
4793nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
4794{
4795	nodemask_t mask;
4796	unsigned long flags;
4797
4798	spin_lock_irqsave(&callback_lock, flags);
4799	rcu_read_lock();
4800	guarantee_online_mems(task_cs(tsk), &mask);
4801	rcu_read_unlock();
4802	spin_unlock_irqrestore(&callback_lock, flags);
4803
4804	return mask;
4805}
4806
4807/**
4808 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
4809 * @nodemask: the nodemask to be checked
4810 *
4811 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
4812 */
4813int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
4814{
4815	return nodes_intersects(*nodemask, current->mems_allowed);
4816}
4817
4818/*
4819 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
4820 * mem_hardwall ancestor to the specified cpuset.  Call holding
4821 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
4822 * (an unusual configuration), then returns the root cpuset.
4823 */
4824static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
4825{
4826	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
4827		cs = parent_cs(cs);
4828	return cs;
4829}
4830
4831/*
4832 * cpuset_node_allowed - Can we allocate on a memory node?
4833 * @node: is this an allowed node?
4834 * @gfp_mask: memory allocation flags
4835 *
4836 * If we're in interrupt, yes, we can always allocate.  If @node is set in
4837 * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
4838 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
4839 * yes.  If current has access to memory reserves as an oom victim, yes.
4840 * Otherwise, no.
4841 *
4842 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
4843 * and do not allow allocations outside the current task's cpuset
4844 * unless the task has been OOM killed.
4845 * GFP_KERNEL allocations are not so marked, so can escape to the
4846 * nearest enclosing hardwalled ancestor cpuset.
4847 *
4848 * Scanning up parent cpusets requires callback_lock.  The
4849 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
4850 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
4851 * current task's mems_allowed came up empty on the first pass over
4852 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
4853 * cpuset are short of memory, might require taking the callback_lock.
4854 *
4855 * The first call here from mm/page_alloc:get_page_from_freelist()
4856 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
4857 * so no allocation on a node outside the cpuset is allowed (unless
4858 * in interrupt, of course).
4859 *
4860 * The second pass through get_page_from_freelist() doesn't even call
4861 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
4862 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
4863 * in alloc_flags.  That logic and the checks below have the combined
4864 * effect that:
4865 *	in_interrupt - any node ok (current task context irrelevant)
4866 *	GFP_ATOMIC   - any node ok
4867 *	tsk_is_oom_victim   - any node ok
4868 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
4869 *	GFP_USER     - only nodes in current task's mems_allowed ok.
4870 */
4871bool cpuset_node_allowed(int node, gfp_t gfp_mask)
4872{
4873	struct cpuset *cs;		/* current cpuset ancestors */
4874	bool allowed;			/* is allocation on this node allowed? */
4875	unsigned long flags;
4876
4877	if (in_interrupt())
4878		return true;
4879	if (node_isset(node, current->mems_allowed))
4880		return true;
4881	/*
4882	 * Allow tasks that have access to memory reserves because they have
4883	 * been OOM killed to get memory anywhere.
4884	 */
4885	if (unlikely(tsk_is_oom_victim(current)))
4886		return true;
4887	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
4888		return false;
4889
4890	if (current->flags & PF_EXITING) /* Let dying task have memory */
4891		return true;
4892
4893	/* Not hardwall and node outside mems_allowed: scan up cpusets */
4894	spin_lock_irqsave(&callback_lock, flags);
4895
4896	rcu_read_lock();
4897	cs = nearest_hardwall_ancestor(task_cs(current));
4898	allowed = node_isset(node, cs->mems_allowed);
4899	rcu_read_unlock();
4900
4901	spin_unlock_irqrestore(&callback_lock, flags);
4902	return allowed;
4903}
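/*
 * Illustrative sketch (not an actual call site): for a candidate node
 * nid, a GFP_USER allocation (__GFP_HARDWALL is set) is confined to
 * current's mems_allowed, while a GFP_KERNEL allocation may also use
 * nodes of the nearest hardwalled ancestor cpuset:
 *
 *	if (cpuset_node_allowed(nid, GFP_USER))
 *		;	// nid is in current->mems_allowed, or current is an OOM victim
 *	if (cpuset_node_allowed(nid, GFP_KERNEL))
 *		;	// nid may additionally come from the nearest hardwalled ancestor
 */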
4904
4905/**
4906 * cpuset_spread_node() - On which node to begin search for a page
4907 * @rotor: round robin rotor
4908 *
4909 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
4910 * tasks in a cpuset with is_spread_page or is_spread_slab set),
4911 * and if the memory allocation used cpuset_mem_spread_node()
4912 * to determine on which node to start looking (as it will for
4913 * certain page cache or slab cache pages such as those used for
4914 * file system buffers and inode caches), then instead of starting
4915 * the search for a free page on the local node, spread the starting
4916 * node around the task's mems_allowed nodes.
4917 *
4918 * We don't have to worry about the returned node being offline
4919 * because "it can't happen", and even if it did, it would be ok.
4920 *
4921 * The routines calling guarantee_online_mems() are careful to
4922 * only set nodes in task->mems_allowed that are online.  So it
4923 * should not be possible for the following code to return an
4924 * offline node.  But if it did, that would be ok, as this routine
4925 * is not returning the node where the allocation must be, only
4926 * the node where the search should start.  The zonelist passed to
4927 * __alloc_pages() will include all nodes.  If the slab allocator
4928 * is passed an offline node, it will fall back to the local node.
4929 * See kmem_cache_alloc_node().
4930 */
4931static int cpuset_spread_node(int *rotor)
4932{
4933	return *rotor = next_node_in(*rotor, current->mems_allowed);
4934}
4935
4936/**
4937 * cpuset_mem_spread_node() - On which node to begin search for a file page
4938 */
4939int cpuset_mem_spread_node(void)
4940{
4941	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
4942		current->cpuset_mem_spread_rotor =
4943			node_random(&current->mems_allowed);
4944
4945	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
4946}
4947
4948/**
4949 * cpuset_slab_spread_node() - On which node to begin search for a slab page
4950 */
4951int cpuset_slab_spread_node(void)
4952{
4953	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
4954		current->cpuset_slab_spread_rotor =
4955			node_random(&current->mems_allowed);
4956
4957	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
4958}
4959EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
4960
4961/**
4962 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
4963 * @tsk1: pointer to task_struct of some task.
4964 * @tsk2: pointer to task_struct of some other task.
4965 *
4966 * Description: Return true if @tsk1's mems_allowed intersects the
4967 * mems_allowed of @tsk2.  Used by the OOM killer to determine whether
4968 * the memory usage of one task might impact the memory available
4969 * to the other.
4970 **/
4971
4972int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
4973				   const struct task_struct *tsk2)
4974{
4975	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
4976}
4977
4978/**
4979 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
4980 *
4981 * Description: Prints current's name, cpuset name, and cached copy of its
4982 * mems_allowed to the kernel log.
4983 */
4984void cpuset_print_current_mems_allowed(void)
4985{
4986	struct cgroup *cgrp;
4987
4988	rcu_read_lock();
4989
4990	cgrp = task_cs(current)->css.cgroup;
4991	pr_cont(",cpuset=");
4992	pr_cont_cgroup_name(cgrp);
4993	pr_cont(",mems_allowed=%*pbl",
4994		nodemask_pr_args(&current->mems_allowed));
4995
4996	rcu_read_unlock();
4997}
4998
4999/*
5000 * Collection of memory_pressure is suppressed unless
5001 * this flag is enabled by writing "1" to the special
5002 * cpuset file 'memory_pressure_enabled' in the root cpuset.
5003 */
5004
5005int cpuset_memory_pressure_enabled __read_mostly;
5006
5007/*
5008 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
5009 *
5010 * Keep a running average of the rate of synchronous (direct)
5011 * page reclaim efforts initiated by tasks in each cpuset.
5012 *
5013 * This represents the rate at which some task in the cpuset
5014 * ran low on memory on all nodes it was allowed to use, and
5015 * had to enter the kernel's page reclaim code in an effort to
5016 * create more free memory by tossing clean pages or swapping
5017 * or writing dirty pages.
5018 *
5019 * Display to user space in the per-cpuset read-only file
5020 * "memory_pressure".  Value displayed is an integer
5021 * representing the recent rate of entry into the synchronous
5022 * (direct) page reclaim by any task attached to the cpuset.
5023 */
5024
5025void __cpuset_memory_pressure_bump(void)
5026{
5027	rcu_read_lock();
5028	fmeter_markevent(&task_cs(current)->fmeter);
5029	rcu_read_unlock();
5030}
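/*
 * Illustrative user-space sketch (not kernel code) for the legacy
 * hierarchy: enable collection on the root cpuset, then read the
 * per-cpuset rate.  The mount point /sys/fs/cgroup/cpuset and the
 * group name "mygrp" are assumptions made for the example:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	char buf[32];
 *	int fd = open("/sys/fs/cgroup/cpuset/cpuset.memory_pressure_enabled",
 *		      O_WRONLY);
 *	write(fd, "1", 1);
 *	close(fd);
 *
 *	fd = open("/sys/fs/cgroup/cpuset/mygrp/cpuset.memory_pressure",
 *		  O_RDONLY);
 *	read(fd, buf, sizeof(buf) - 1);	// recent direct-reclaim entry rate
 *	close(fd);
 */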
5031
5032#ifdef CONFIG_PROC_PID_CPUSET
5033/*
5034 * proc_cpuset_show()
5035 *  - Print the task's cpuset path into seq_file.
5036 *  - Used for /proc/<pid>/cpuset.
5037 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
5038 *    doesn't really matter if tsk->cpuset changes after we read it,
5039 *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
5040 *    anyway.
5041 */
5042int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
5043		     struct pid *pid, struct task_struct *tsk)
5044{
5045	char *buf;
5046	struct cgroup_subsys_state *css;
5047	int retval;
5048
5049	retval = -ENOMEM;
5050	buf = kmalloc(PATH_MAX, GFP_KERNEL);
5051	if (!buf)
5052		goto out;
5053
5054	css = task_get_css(tsk, cpuset_cgrp_id);
5055	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
5056				current->nsproxy->cgroup_ns);
5057	css_put(css);
5058	if (retval == -E2BIG)
5059		retval = -ENAMETOOLONG;
5060	if (retval < 0)
5061		goto out_free;
5062	seq_puts(m, buf);
5063	seq_putc(m, '\n');
5064	retval = 0;
5065out_free:
5066	kfree(buf);
5067out:
5068	return retval;
5069}
5070#endif /* CONFIG_PROC_PID_CPUSET */
5071
5072/* Display task mems_allowed in /proc/<pid>/status file. */
5073void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
5074{
5075	seq_printf(m, "Mems_allowed:\t%*pb\n",
5076		   nodemask_pr_args(&task->mems_allowed));
5077	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
5078		   nodemask_pr_args(&task->mems_allowed));
5079}
5080