1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 *
5 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 *  Interactivity improvements by Mike Galbraith
8 *  (C) 2007 Mike Galbraith <efault@gmx.de>
9 *
10 *  Various enhancements by Dmitry Adamushko.
11 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 *
13 *  Group scheduling enhancements by Srivatsa Vaddagiri
14 *  Copyright IBM Corporation, 2007
15 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 *
17 *  Scaled math optimizations by Thomas Gleixner
18 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 *
20 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 */
23#include <linux/energy_model.h>
24#include <linux/mmap_lock.h>
25#include <linux/hugetlb_inline.h>
26#include <linux/jiffies.h>
27#include <linux/mm_api.h>
28#include <linux/highmem.h>
29#include <linux/spinlock_api.h>
30#include <linux/cpumask_api.h>
31#include <linux/lockdep_api.h>
32#include <linux/softirq.h>
33#include <linux/refcount_api.h>
34#include <linux/topology.h>
35#include <linux/sched/clock.h>
36#include <linux/sched/cond_resched.h>
37#include <linux/sched/cputime.h>
38#include <linux/sched/isolation.h>
39#include <linux/sched/nohz.h>
40
41#include <linux/cpuidle.h>
42#include <linux/interrupt.h>
43#include <linux/memory-tiers.h>
44#include <linux/mempolicy.h>
45#include <linux/mutex_api.h>
46#include <linux/profile.h>
47#include <linux/psi.h>
48#include <linux/ratelimit.h>
49#include <linux/task_work.h>
50#include <linux/rbtree_augmented.h>
51
52#include <asm/switch_to.h>
53
54#include "sched.h"
55#include "stats.h"
56#include "autogroup.h"
57
58/*
59 * The initial- and re-scaling of tunables is configurable
60 *
61 * Options are:
62 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
 *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 *
 * (default: SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
68 */
69unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
70
71/*
72 * Minimal preemption granularity for CPU-bound tasks:
73 *
74 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
75 */
76unsigned int sysctl_sched_base_slice			= 750000ULL;
77static unsigned int normalized_sysctl_sched_base_slice	= 750000ULL;
78
79const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
80
81static int __init setup_sched_thermal_decay_shift(char *str)
82{
83	pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
84	return 1;
85}
86__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
87
88#ifdef CONFIG_SMP
89/*
90 * For asym packing, by default the lower numbered CPU has higher priority.
91 */
92int __weak arch_asym_cpu_priority(int cpu)
93{
94	return -cpu;
95}
96
97/*
98 * The margin used when comparing utilization with CPU capacity.
99 *
100 * (default: ~20%)
101 */
102#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)
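
/*
 * Worked example (illustrative numbers, not from the original source):
 * with max = 1024, a utilization of 819 still fits (819 * 1280 = 1048320 <
 * 1024 * 1024 = 1048576) while 820 does not (820 * 1280 = 1049600), i.e.
 * utilization must stay below roughly 80% of the capacity.
 */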
103
/*
 * The margin used when comparing CPU capacities:
 * is 'cap1' noticeably greater than 'cap2'?
 *
 * (default: ~5%)
 */
110#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
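
/*
 * Worked example (illustrative numbers): with cap2 = 1024, cap1 must be at
 * least 1079 to be "noticeably greater" (1079 * 1024 = 1104896 >
 * 1024 * 1078 = 1103872), i.e. roughly a 5% difference.
 */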
111#endif
112
113#ifdef CONFIG_CFS_BANDWIDTH
114/*
115 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
116 * each time a cfs_rq requests quota.
117 *
118 * Note: in the case that the slice exceeds the runtime remaining (either due
119 * to consumption or the quota being specified to be smaller than the slice)
120 * we will always only issue the remaining available time.
121 *
122 * (default: 5 msec, units: microseconds)
123 */
124static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
125#endif
126
127#ifdef CONFIG_NUMA_BALANCING
128/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
129static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
130#endif
131
132#ifdef CONFIG_SYSCTL
133static struct ctl_table sched_fair_sysctls[] = {
134#ifdef CONFIG_CFS_BANDWIDTH
135	{
136		.procname       = "sched_cfs_bandwidth_slice_us",
137		.data           = &sysctl_sched_cfs_bandwidth_slice,
138		.maxlen         = sizeof(unsigned int),
139		.mode           = 0644,
140		.proc_handler   = proc_dointvec_minmax,
141		.extra1         = SYSCTL_ONE,
142	},
143#endif
144#ifdef CONFIG_NUMA_BALANCING
145	{
146		.procname	= "numa_balancing_promote_rate_limit_MBps",
147		.data		= &sysctl_numa_balancing_promote_rate_limit,
148		.maxlen		= sizeof(unsigned int),
149		.mode		= 0644,
150		.proc_handler	= proc_dointvec_minmax,
151		.extra1		= SYSCTL_ZERO,
152	},
153#endif /* CONFIG_NUMA_BALANCING */
154};
155
156static int __init sched_fair_sysctl_init(void)
157{
158	register_sysctl_init("kernel", sched_fair_sysctls);
159	return 0;
160}
161late_initcall(sched_fair_sysctl_init);
162#endif
163
164static inline void update_load_add(struct load_weight *lw, unsigned long inc)
165{
166	lw->weight += inc;
167	lw->inv_weight = 0;
168}
169
170static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
171{
172	lw->weight -= dec;
173	lw->inv_weight = 0;
174}
175
176static inline void update_load_set(struct load_weight *lw, unsigned long w)
177{
178	lw->weight = w;
179	lw->inv_weight = 0;
180}
181
182/*
183 * Increase the granularity value when there are more CPUs,
184 * because with more CPUs the 'effective latency' as visible
185 * to users decreases. But the relationship is not linear,
186 * so pick a second-best guess by going with the log2 of the
187 * number of CPUs.
188 *
189 * This idea comes from the SD scheduler of Con Kolivas:
190 */
191static unsigned int get_update_sysctl_factor(void)
192{
193	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
194	unsigned int factor;
195
196	switch (sysctl_sched_tunable_scaling) {
197	case SCHED_TUNABLESCALING_NONE:
198		factor = 1;
199		break;
200	case SCHED_TUNABLESCALING_LINEAR:
201		factor = cpus;
202		break;
203	case SCHED_TUNABLESCALING_LOG:
204	default:
205		factor = 1 + ilog2(cpus);
206		break;
207	}
208
209	return factor;
210}
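
/*
 * Example (assuming the default SCHED_TUNABLESCALING_LOG): with 8 or more
 * online CPUs, cpus is clamped to 8 and factor = 1 + ilog2(8) = 4, so the
 * default base slice of 0.75 msec is scaled to 3 msec; with 2 CPUs the
 * factor is 2 (1.5 msec), and SCHED_TUNABLESCALING_NONE leaves it at
 * 0.75 msec.
 */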
211
212static void update_sysctl(void)
213{
214	unsigned int factor = get_update_sysctl_factor();
215
216#define SET_SYSCTL(name) \
217	(sysctl_##name = (factor) * normalized_sysctl_##name)
218	SET_SYSCTL(sched_base_slice);
219#undef SET_SYSCTL
220}
221
222void __init sched_init_granularity(void)
223{
224	update_sysctl();
225}
226
227#define WMULT_CONST	(~0U)
228#define WMULT_SHIFT	32
229
230static void __update_inv_weight(struct load_weight *lw)
231{
232	unsigned long w;
233
234	if (likely(lw->inv_weight))
235		return;
236
237	w = scale_load_down(lw->weight);
238
239	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
240		lw->inv_weight = 1;
241	else if (unlikely(!w))
242		lw->inv_weight = WMULT_CONST;
243	else
244		lw->inv_weight = WMULT_CONST / w;
245}
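
/*
 * Example: for a nice-0 load, scale_load_down(lw->weight) == 1024, so
 * inv_weight == WMULT_CONST / 1024 ~= 2^22; multiplying by inv_weight and
 * shifting right by WMULT_SHIFT then approximates dividing by the weight.
 */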
246
247/*
248 * delta_exec * weight / lw.weight
249 *   OR
250 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
251 *
 * Either weight := NICE_0_LOAD and lw is an element of sched_prio_to_wmult[],
 * in which case we're guaranteed the shift stays positive because inv_weight
 * is guaranteed to fit in 32 bits, and NICE_0_LOAD gives another 10 bits;
 * therefore shift >= 22.
 *
 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
258 */
259static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
260{
261	u64 fact = scale_load_down(weight);
262	u32 fact_hi = (u32)(fact >> 32);
263	int shift = WMULT_SHIFT;
264	int fs;
265
266	__update_inv_weight(lw);
267
268	if (unlikely(fact_hi)) {
269		fs = fls(fact_hi);
270		shift -= fs;
271		fact >>= fs;
272	}
273
274	fact = mul_u32_u32(fact, lw->inv_weight);
275
276	fact_hi = (u32)(fact >> 32);
277	if (fact_hi) {
278		fs = fls(fact_hi);
279		shift -= fs;
280		fact >>= fs;
281	}
282
283	return mul_u64_u32_shr(delta_exec, fact, shift);
284}
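
/*
 * Worked example (illustrative numbers): delta_exec = 1,000,000 ns,
 * weight = 1024 (a nice-0 entity) and scale_load_down(lw->weight) == 2048
 * (say, two nice-0 entities). Then inv_weight = (2^32 - 1) / 2048 = 2097151,
 * fact = 1024 * 2097151 ~= 2^31 (no extra shift needed), and the result is
 * (1,000,000 * fact) >> 32 = 499,999 ns -- i.e. delta_exec * 1024 / 2048,
 * up to rounding.
 */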
285
286/*
287 * delta /= w
288 */
289static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
290{
291	if (unlikely(se->load.weight != NICE_0_LOAD))
292		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
293
294	return delta;
295}
296
297const struct sched_class fair_sched_class;
298
299/**************************************************************
300 * CFS operations on generic schedulable entities:
301 */
302
303#ifdef CONFIG_FAIR_GROUP_SCHED
304
305/* Walk up scheduling entities hierarchy */
306#define for_each_sched_entity(se) \
307		for (; se; se = se->parent)
308
309static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
310{
311	struct rq *rq = rq_of(cfs_rq);
312	int cpu = cpu_of(rq);
313
314	if (cfs_rq->on_list)
315		return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
316
317	cfs_rq->on_list = 1;
318
319	/*
320	 * Ensure we either appear before our parent (if already
321	 * enqueued) or force our parent to appear after us when it is
322	 * enqueued. The fact that we always enqueue bottom-up
323	 * reduces this to two cases and a special case for the root
324	 * cfs_rq. Furthermore, it also means that we will always reset
	 * tmp_alone_branch either when the branch is connected
	 * to the tree or when we reach the top of the tree.
	 */
328	if (cfs_rq->tg->parent &&
329	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
		/*
		 * If the parent is already on the list, we add the child
		 * just before. Thanks to the circular property of the
		 * list, this means putting the child at the tail of the
		 * list that starts with the parent.
		 */
336		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
337			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
338		/*
339		 * The branch is now connected to its tree so we can
340		 * reset tmp_alone_branch to the beginning of the
341		 * list.
342		 */
343		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
344		return true;
345	}
346
347	if (!cfs_rq->tg->parent) {
		/*
		 * A cfs_rq without a parent should be put
		 * at the tail of the list.
		 */
352		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
353			&rq->leaf_cfs_rq_list);
		/*
		 * We have reached the top of the tree, so we can reset
		 * tmp_alone_branch to the beginning of the list.
		 */
358		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
359		return true;
360	}
361
	/*
	 * The parent has not been added yet, so we want to
	 * make sure that it will be put after us.
	 * tmp_alone_branch points to the beginning of the branch
	 * where we will add the parent.
	 */
368	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
	/*
	 * Update tmp_alone_branch to point to the new beginning
	 * of the branch.
	 */
373	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
374	return false;
375}
376
377static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
378{
379	if (cfs_rq->on_list) {
380		struct rq *rq = rq_of(cfs_rq);
381
		/*
		 * With the cfs_rq being unthrottled/throttled during an enqueue,
		 * it can happen that tmp_alone_branch points to the leaf that
		 * we finally want to delete. In this case, tmp_alone_branch is
		 * moved to the previous element, but it will point back to
		 * rq->leaf_cfs_rq_list at the end of the enqueue.
		 */
389		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
390			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
391
392		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
393		cfs_rq->on_list = 0;
394	}
395}
396
397static inline void assert_list_leaf_cfs_rq(struct rq *rq)
398{
399	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
400}
401
402/* Iterate through all leaf cfs_rq's on a runqueue */
403#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
404	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
405				 leaf_cfs_rq_list)
406
/* Do the two (enqueued) entities belong to the same group? */
408static inline struct cfs_rq *
409is_same_group(struct sched_entity *se, struct sched_entity *pse)
410{
411	if (se->cfs_rq == pse->cfs_rq)
412		return se->cfs_rq;
413
414	return NULL;
415}
416
417static inline struct sched_entity *parent_entity(const struct sched_entity *se)
418{
419	return se->parent;
420}
421
422static void
423find_matching_se(struct sched_entity **se, struct sched_entity **pse)
424{
425	int se_depth, pse_depth;
426
	/*
	 * A preemption test can only be made between sibling entities that are
	 * in the same cfs_rq, i.e. that have a common parent. Walk up the
	 * hierarchy of both tasks until we find ancestors that are siblings
	 * under a common parent.
	 */
433
	/* First walk up until both entities are at the same depth */
435	se_depth = (*se)->depth;
436	pse_depth = (*pse)->depth;
437
438	while (se_depth > pse_depth) {
439		se_depth--;
440		*se = parent_entity(*se);
441	}
442
443	while (pse_depth > se_depth) {
444		pse_depth--;
445		*pse = parent_entity(*pse);
446	}
447
448	while (!is_same_group(*se, *pse)) {
449		*se = parent_entity(*se);
450		*pse = parent_entity(*pse);
451	}
452}
453
454static int tg_is_idle(struct task_group *tg)
455{
456	return tg->idle > 0;
457}
458
459static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
460{
461	return cfs_rq->idle > 0;
462}
463
464static int se_is_idle(struct sched_entity *se)
465{
466	if (entity_is_task(se))
467		return task_has_idle_policy(task_of(se));
468	return cfs_rq_is_idle(group_cfs_rq(se));
469}
470
471#else	/* !CONFIG_FAIR_GROUP_SCHED */
472
473#define for_each_sched_entity(se) \
474		for (; se; se = NULL)
475
476static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
477{
478	return true;
479}
480
481static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
482{
483}
484
485static inline void assert_list_leaf_cfs_rq(struct rq *rq)
486{
487}
488
489#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
490		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
491
492static inline struct sched_entity *parent_entity(struct sched_entity *se)
493{
494	return NULL;
495}
496
497static inline void
498find_matching_se(struct sched_entity **se, struct sched_entity **pse)
499{
500}
501
502static inline int tg_is_idle(struct task_group *tg)
503{
504	return 0;
505}
506
507static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
508{
509	return 0;
510}
511
512static int se_is_idle(struct sched_entity *se)
513{
514	return 0;
515}
516
517#endif	/* CONFIG_FAIR_GROUP_SCHED */
518
519static __always_inline
520void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
521
522/**************************************************************
523 * Scheduling class tree data structure manipulation methods:
524 */
525
526static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
527{
528	s64 delta = (s64)(vruntime - max_vruntime);
529	if (delta > 0)
530		max_vruntime = vruntime;
531
532	return max_vruntime;
533}
534
535static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
536{
537	s64 delta = (s64)(vruntime - min_vruntime);
538	if (delta < 0)
539		min_vruntime = vruntime;
540
541	return min_vruntime;
542}
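
/*
 * Note on the (s64) casts above: vruntime values are compared by signed
 * difference rather than directly, so the comparison stays correct even if
 * the u64 counters wrap. For example, with max_vruntime == ULLONG_MAX - 100
 * and a wrapped vruntime == 50, the delta is +150 and vruntime is correctly
 * treated as the later value.
 */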
543
544static inline bool entity_before(const struct sched_entity *a,
545				 const struct sched_entity *b)
546{
547	/*
548	 * Tiebreak on vruntime seems unnecessary since it can
549	 * hardly happen.
550	 */
551	return (s64)(a->deadline - b->deadline) < 0;
552}
553
554static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
555{
556	return (s64)(se->vruntime - cfs_rq->min_vruntime);
557}
558
559#define __node_2_se(node) \
560	rb_entry((node), struct sched_entity, run_node)
561
562/*
563 * Compute virtual time from the per-task service numbers:
564 *
565 * Fair schedulers conserve lag:
566 *
567 *   \Sum lag_i = 0
568 *
569 * Where lag_i is given by:
570 *
571 *   lag_i = S - s_i = w_i * (V - v_i)
572 *
 * Where S is the ideal service time and V is its virtual time counterpart.
574 * Therefore:
575 *
576 *   \Sum lag_i = 0
577 *   \Sum w_i * (V - v_i) = 0
578 *   \Sum w_i * V - w_i * v_i = 0
579 *
580 * From which we can solve an expression for V in v_i (which we have in
581 * se->vruntime):
582 *
583 *       \Sum v_i * w_i   \Sum v_i * w_i
584 *   V = -------------- = --------------
585 *          \Sum w_i            W
586 *
587 * Specifically, this is the weighted average of all entity virtual runtimes.
588 *
589 * [[ NOTE: this is only equal to the ideal scheduler under the condition
590 *          that join/leave operations happen at lag_i = 0, otherwise the
591 *          virtual time has non-contiguous motion equivalent to:
592 *
593 *	      V +-= lag_i / W
594 *
595 *	    Also see the comment in place_entity() that deals with this. ]]
596 *
 * However, since v_i is u64 and the multiplication could easily overflow,
 * transform it into a relative form that uses smaller quantities:
599 *
600 * Substitute: v_i == (v_i - v0) + v0
601 *
602 *     \Sum ((v_i - v0) + v0) * w_i   \Sum (v_i - v0) * w_i
603 * V = ---------------------------- = --------------------- + v0
604 *                  W                            W
605 *
606 * Which we track using:
607 *
608 *                    v0 := cfs_rq->min_vruntime
609 * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
610 *              \Sum w_i := cfs_rq->avg_load
611 *
 * Since min_vruntime is a monotonically increasing variable that closely
 * tracks the per-task service, these deltas (v_i - v0) will be on the order
 * of the maximal (virtual) lag induced in the system due to quantisation.
615 *
616 * Also, we use scale_load_down() to reduce the size.
617 *
618 * As measured, the max (key * weight) value was ~44 bits for a kernel build.
619 */
620static void
621avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
622{
623	unsigned long weight = scale_load_down(se->load.weight);
624	s64 key = entity_key(cfs_rq, se);
625
626	cfs_rq->avg_vruntime += key * weight;
627	cfs_rq->avg_load += weight;
628}
629
630static void
631avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
632{
633	unsigned long weight = scale_load_down(se->load.weight);
634	s64 key = entity_key(cfs_rq, se);
635
636	cfs_rq->avg_vruntime -= key * weight;
637	cfs_rq->avg_load -= weight;
638}
639
640static inline
641void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
642{
	/*
	 * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
	 */
646	cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
647}
648
/*
 * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true.
 * For this to be so, the result of this function must have a left bias.
 */
653u64 avg_vruntime(struct cfs_rq *cfs_rq)
654{
655	struct sched_entity *curr = cfs_rq->curr;
656	s64 avg = cfs_rq->avg_vruntime;
657	long load = cfs_rq->avg_load;
658
659	if (curr && curr->on_rq) {
660		unsigned long weight = scale_load_down(curr->load.weight);
661
662		avg += entity_key(cfs_rq, curr) * weight;
663		load += weight;
664	}
665
666	if (load) {
667		/* sign flips effective floor / ceiling */
668		if (avg < 0)
669			avg -= (load - 1);
670		avg = div_s64(avg, load);
671	}
672
673	return cfs_rq->min_vruntime + avg;
674}
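
/*
 * Worked example for avg_vruntime() (illustrative numbers): with
 * min_vruntime = 1000 and two queued entities of (scaled-down) weights 1024
 * and 2048 whose keys (v_i - v0) are +30 and -61, avg_vruntime is
 * 30*1024 - 61*2048 = -94208 and avg_load is 3072. The exact weighted mean
 * is -30.67; the negative-avg adjustment makes the division round down to
 * -31, so V = 1000 - 31 = 969, giving the required left bias.
 */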
675
676/*
677 * lag_i = S - s_i = w_i * (V - v_i)
678 *
679 * However, since V is approximated by the weighted average of all entities it
680 * is possible -- by addition/removal/reweight to the tree -- to move V around
681 * and end up with a larger lag than we started with.
682 *
 * Limit this to double the slice length, with a minimum of TICK_NSEC,
 * since that is the timing granularity.
685 *
686 * EEVDF gives the following limit for a steady state system:
687 *
688 *   -r_max < lag < max(r_max, q)
689 *
690 * XXX could add max_slice to the augmented data to track this.
691 */
692static s64 entity_lag(u64 avruntime, struct sched_entity *se)
693{
694	s64 vlag, limit;
695
696	vlag = avruntime - se->vruntime;
697	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
698
699	return clamp(vlag, -limit, limit);
700}
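
/*
 * Example (assuming HZ=1000, so TICK_NSEC is 1 msec, and a nice-0 entity
 * with the default 0.75 msec slice): the clamp limit is
 * max(2 * 750000, 1000000) = 1.5 msec of virtual time, so a raw lag of
 * +5 msec would be stored as +1.5 msec.
 */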
701
702static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
703{
704	SCHED_WARN_ON(!se->on_rq);
705
706	se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
707}
708
/*
 * An entity is eligible once it has received less service than it ought to
 * have, i.e. lag >= 0.
 *
713 * lag_i = S - s_i = w_i*(V - v_i)
714 *
715 * lag_i >= 0 -> V >= v_i
716 *
717 *     \Sum (v_i - v)*w_i
718 * V = ------------------ + v
719 *          \Sum w_i
720 *
721 * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
722 *
723 * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
724 *       to the loss in precision caused by the division.
725 */
726static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
727{
728	struct sched_entity *curr = cfs_rq->curr;
729	s64 avg = cfs_rq->avg_vruntime;
730	long load = cfs_rq->avg_load;
731
732	if (curr && curr->on_rq) {
733		unsigned long weight = scale_load_down(curr->load.weight);
734
735		avg += entity_key(cfs_rq, curr) * weight;
736		load += weight;
737	}
738
739	return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
740}
741
742int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
743{
744	return vruntime_eligible(cfs_rq, se->vruntime);
745}
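
/*
 * Example, reusing the avg_vruntime() numbers above: the entity with key
 * -61 is eligible because avg (-94208) >= -61 * 3072 = -187392, while the
 * entity with key +30 is not (avg < 30 * 3072 = 92160). Comparing the
 * cross-multiplied forms avoids the precision loss of dividing avg by the
 * load first.
 */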
746
747static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
748{
749	u64 min_vruntime = cfs_rq->min_vruntime;
750	/*
751	 * open coded max_vruntime() to allow updating avg_vruntime
752	 */
753	s64 delta = (s64)(vruntime - min_vruntime);
754	if (delta > 0) {
755		avg_vruntime_update(cfs_rq, delta);
756		min_vruntime = vruntime;
757	}
758	return min_vruntime;
759}
760
761static void update_min_vruntime(struct cfs_rq *cfs_rq)
762{
763	struct sched_entity *se = __pick_root_entity(cfs_rq);
764	struct sched_entity *curr = cfs_rq->curr;
765	u64 vruntime = cfs_rq->min_vruntime;
766
767	if (curr) {
768		if (curr->on_rq)
769			vruntime = curr->vruntime;
770		else
771			curr = NULL;
772	}
773
774	if (se) {
775		if (!curr)
776			vruntime = se->min_vruntime;
777		else
778			vruntime = min_vruntime(vruntime, se->min_vruntime);
779	}
780
781	/* ensure we never gain time by being placed backwards. */
782	u64_u32_store(cfs_rq->min_vruntime,
783		      __update_min_vruntime(cfs_rq, vruntime));
784}
785
786static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
787{
788	return entity_before(__node_2_se(a), __node_2_se(b));
789}
790
791#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
792
793static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
794{
795	if (node) {
796		struct sched_entity *rse = __node_2_se(node);
797		if (vruntime_gt(min_vruntime, se, rse))
798			se->min_vruntime = rse->min_vruntime;
799	}
800}
801
802/*
803 * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
804 */
805static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
806{
807	u64 old_min_vruntime = se->min_vruntime;
808	struct rb_node *node = &se->run_node;
809
810	se->min_vruntime = se->vruntime;
811	__min_vruntime_update(se, node->rb_right);
812	__min_vruntime_update(se, node->rb_left);
813
814	return se->min_vruntime == old_min_vruntime;
815}
816
817RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
818		     run_node, min_vruntime, min_vruntime_update);
819
820/*
821 * Enqueue an entity into the rb-tree:
822 */
823static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
824{
825	avg_vruntime_add(cfs_rq, se);
826	se->min_vruntime = se->vruntime;
827	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
828				__entity_less, &min_vruntime_cb);
829}
830
831static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
832{
833	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
834				  &min_vruntime_cb);
835	avg_vruntime_sub(cfs_rq, se);
836}
837
838struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
839{
840	struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
841
842	if (!root)
843		return NULL;
844
845	return __node_2_se(root);
846}
847
848struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
849{
850	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
851
852	if (!left)
853		return NULL;
854
855	return __node_2_se(left);
856}
857
858/*
859 * Earliest Eligible Virtual Deadline First
860 *
861 * In order to provide latency guarantees for different request sizes
862 * EEVDF selects the best runnable task from two criteria:
863 *
864 *  1) the task must be eligible (must be owed service)
865 *
866 *  2) from those tasks that meet 1), we select the one
867 *     with the earliest virtual deadline.
868 *
869 * We can do this in O(log n) time due to an augmented RB-tree. The
870 * tree keeps the entries sorted on deadline, but also functions as a
871 * heap based on the vruntime by keeping:
872 *
873 *  se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
874 *
875 * Which allows tree pruning through eligibility.
876 */
877static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
878{
879	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
880	struct sched_entity *se = __pick_first_entity(cfs_rq);
881	struct sched_entity *curr = cfs_rq->curr;
882	struct sched_entity *best = NULL;
883
	/*
	 * We can safely skip the eligibility check if there is only one
	 * entity in this cfs_rq, saving some cycles.
	 */
888	if (cfs_rq->nr_running == 1)
889		return curr && curr->on_rq ? curr : se;
890
891	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
892		curr = NULL;
893
894	/*
895	 * Once selected, run a task until it either becomes non-eligible or
896	 * until it gets a new slice. See the HACK in set_next_entity().
897	 */
898	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
899		return curr;
900
901	/* Pick the leftmost entity if it's eligible */
902	if (se && entity_eligible(cfs_rq, se)) {
903		best = se;
904		goto found;
905	}
906
907	/* Heap search for the EEVD entity */
908	while (node) {
909		struct rb_node *left = node->rb_left;
910
911		/*
912		 * Eligible entities in left subtree are always better
913		 * choices, since they have earlier deadlines.
914		 */
915		if (left && vruntime_eligible(cfs_rq,
916					__node_2_se(left)->min_vruntime)) {
917			node = left;
918			continue;
919		}
920
921		se = __node_2_se(node);
922
		/*
		 * The left subtree is either empty or has no eligible
		 * entity, so check the current node since it is the one
		 * with the earliest deadline that might be eligible.
		 */
928		if (entity_eligible(cfs_rq, se)) {
929			best = se;
930			break;
931		}
932
933		node = node->rb_right;
934	}
935found:
936	if (!best || (curr && entity_before(curr, best)))
937		best = curr;
938
939	return best;
940}
941
942#ifdef CONFIG_SCHED_DEBUG
943struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
944{
945	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
946
947	if (!last)
948		return NULL;
949
950	return __node_2_se(last);
951}
952
953/**************************************************************
954 * Scheduling class statistics methods:
955 */
956#ifdef CONFIG_SMP
957int sched_update_scaling(void)
958{
959	unsigned int factor = get_update_sysctl_factor();
960
961#define WRT_SYSCTL(name) \
962	(normalized_sysctl_##name = sysctl_##name / (factor))
963	WRT_SYSCTL(sched_base_slice);
964#undef WRT_SYSCTL
965
966	return 0;
967}
968#endif
969#endif
970
971static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
972
/*
 * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i;
 * this is probably good enough.
 */
977static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
978{
979	if ((s64)(se->vruntime - se->deadline) < 0)
980		return;
981
982	/*
983	 * For EEVDF the virtual time slope is determined by w_i (iow.
984	 * nice) while the request time r_i is determined by
985	 * sysctl_sched_base_slice.
986	 */
987	se->slice = sysctl_sched_base_slice;
988
989	/*
990	 * EEVDF: vd_i = ve_i + r_i / w_i
991	 */
992	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
993
994	/*
995	 * The task has consumed its request, reschedule.
996	 */
997	if (cfs_rq->nr_running > 1) {
998		resched_curr(rq_of(cfs_rq));
999		clear_buddies(cfs_rq, se);
1000	}
1001}
1002
1003#include "pelt.h"
1004#ifdef CONFIG_SMP
1005
1006static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
1007static unsigned long task_h_load(struct task_struct *p);
1008static unsigned long capacity_of(int cpu);
1009
/* Give a new sched_entity start runnable values so it is seen as heavily loaded in its infancy */
1011void init_entity_runnable_average(struct sched_entity *se)
1012{
1013	struct sched_avg *sa = &se->avg;
1014
1015	memset(sa, 0, sizeof(*sa));
1016
1017	/*
1018	 * Tasks are initialized with full load to be seen as heavy tasks until
1019	 * they get a chance to stabilize to their real load level.
1020	 * Group entities are initialized with zero load to reflect the fact that
1021	 * nothing has been attached to the task group yet.
1022	 */
1023	if (entity_is_task(se))
1024		sa->load_avg = scale_load_down(se->load.weight);
1025
1026	/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
1027}
1028
1029/*
1030 * With new tasks being created, their initial util_avgs are extrapolated
1031 * based on the cfs_rq's current util_avg:
1032 *
1033 *   util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1)
1034 *		* se_weight(se)
1035 *
 * However, in many cases, the above util_avg does not give the desired
 * value. Moreover, the sum of the util_avgs may be divergent, such
 * as when the series is a harmonic series.
 *
 * To solve this problem, we also cap the util_avg of successive tasks to
 * only 1/2 of the remaining utilization budget:
1042 *
1043 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
1044 *
1045 * where n denotes the nth task and cpu_scale the CPU capacity.
1046 *
 * For example, for a CPU with a capacity of 1024, the simplest series from
 * the beginning would look like:
1049 *
1050 *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
1051 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
1052 *
1053 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
1054 * if util_avg > util_avg_cap.
1055 */
1056void post_init_entity_util_avg(struct task_struct *p)
1057{
1058	struct sched_entity *se = &p->se;
1059	struct cfs_rq *cfs_rq = cfs_rq_of(se);
1060	struct sched_avg *sa = &se->avg;
1061	long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
1062	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
1063
1064	if (p->sched_class != &fair_sched_class) {
1065		/*
1066		 * For !fair tasks do:
1067		 *
1068		update_cfs_rq_load_avg(now, cfs_rq);
1069		attach_entity_load_avg(cfs_rq, se);
1070		switched_from_fair(rq, p);
1071		 *
1072		 * such that the next switched_to_fair() has the
1073		 * expected state.
1074		 */
1075		se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
1076		return;
1077	}
1078
1079	if (cap > 0) {
1080		if (cfs_rq->avg.util_avg != 0) {
1081			sa->util_avg  = cfs_rq->avg.util_avg * se_weight(se);
1082			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
1083
1084			if (sa->util_avg > cap)
1085				sa->util_avg = cap;
1086		} else {
1087			sa->util_avg = cap;
1088		}
1089	}
1090
1091	sa->runnable_avg = sa->util_avg;
1092}
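
/*
 * Worked example (illustrative numbers): on a CPU with cpu_scale = 1024 and
 * a cfs_rq with util_avg = 600 and load_avg = 1023, the cap is
 * (1024 - 600) / 2 = 212. A new nice-0 task (se_weight = 1024) would get
 * 600 * 1024 / (1023 + 1) = 600, which is then clamped to 212; its
 * runnable_avg starts at the same value.
 */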
1093
1094#else /* !CONFIG_SMP */
1095void init_entity_runnable_average(struct sched_entity *se)
1096{
1097}
1098void post_init_entity_util_avg(struct task_struct *p)
1099{
1100}
1101static void update_tg_load_avg(struct cfs_rq *cfs_rq)
1102{
1103}
1104#endif /* CONFIG_SMP */
1105
1106static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
1107{
1108	u64 now = rq_clock_task(rq);
1109	s64 delta_exec;
1110
1111	delta_exec = now - curr->exec_start;
1112	if (unlikely(delta_exec <= 0))
1113		return delta_exec;
1114
1115	curr->exec_start = now;
1116	curr->sum_exec_runtime += delta_exec;
1117
1118	if (schedstat_enabled()) {
1119		struct sched_statistics *stats;
1120
1121		stats = __schedstats_from_se(curr);
1122		__schedstat_set(stats->exec_max,
1123				max(delta_exec, stats->exec_max));
1124	}
1125
1126	return delta_exec;
1127}
1128
1129static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
1130{
1131	trace_sched_stat_runtime(p, delta_exec);
1132	account_group_exec_runtime(p, delta_exec);
1133	cgroup_account_cputime(p, delta_exec);
1134	if (p->dl_server)
1135		dl_server_update(p->dl_server, delta_exec);
1136}
1137
1138/*
1139 * Used by other classes to account runtime.
1140 */
1141s64 update_curr_common(struct rq *rq)
1142{
1143	struct task_struct *curr = rq->curr;
1144	s64 delta_exec;
1145
1146	delta_exec = update_curr_se(rq, &curr->se);
1147	if (likely(delta_exec > 0))
1148		update_curr_task(curr, delta_exec);
1149
1150	return delta_exec;
1151}
1152
1153/*
1154 * Update the current task's runtime statistics.
1155 */
1156static void update_curr(struct cfs_rq *cfs_rq)
1157{
1158	struct sched_entity *curr = cfs_rq->curr;
1159	s64 delta_exec;
1160
1161	if (unlikely(!curr))
1162		return;
1163
1164	delta_exec = update_curr_se(rq_of(cfs_rq), curr);
1165	if (unlikely(delta_exec <= 0))
1166		return;
1167
1168	curr->vruntime += calc_delta_fair(delta_exec, curr);
1169	update_deadline(cfs_rq, curr);
1170	update_min_vruntime(cfs_rq);
1171
1172	if (entity_is_task(curr))
1173		update_curr_task(task_of(curr), delta_exec);
1174
1175	account_cfs_rq_runtime(cfs_rq, delta_exec);
1176}
1177
1178static void update_curr_fair(struct rq *rq)
1179{
1180	update_curr(cfs_rq_of(&rq->curr->se));
1181}
1182
1183static inline void
1184update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1185{
1186	struct sched_statistics *stats;
1187	struct task_struct *p = NULL;
1188
1189	if (!schedstat_enabled())
1190		return;
1191
1192	stats = __schedstats_from_se(se);
1193
1194	if (entity_is_task(se))
1195		p = task_of(se);
1196
1197	__update_stats_wait_start(rq_of(cfs_rq), p, stats);
1198}
1199
1200static inline void
1201update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1202{
1203	struct sched_statistics *stats;
1204	struct task_struct *p = NULL;
1205
1206	if (!schedstat_enabled())
1207		return;
1208
1209	stats = __schedstats_from_se(se);
1210
	/*
	 * When sched_schedstat changes from 0 to 1, some sched entities
	 * may already be in the runqueue with a wait_start of 0, which
	 * would make the computed delta wrong. We need to avoid this
	 * scenario.
	 */
1217	if (unlikely(!schedstat_val(stats->wait_start)))
1218		return;
1219
1220	if (entity_is_task(se))
1221		p = task_of(se);
1222
1223	__update_stats_wait_end(rq_of(cfs_rq), p, stats);
1224}
1225
1226static inline void
1227update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1228{
1229	struct sched_statistics *stats;
1230	struct task_struct *tsk = NULL;
1231
1232	if (!schedstat_enabled())
1233		return;
1234
1235	stats = __schedstats_from_se(se);
1236
1237	if (entity_is_task(se))
1238		tsk = task_of(se);
1239
1240	__update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
1241}
1242
1243/*
1244 * Task is being enqueued - update stats:
1245 */
1246static inline void
1247update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1248{
1249	if (!schedstat_enabled())
1250		return;
1251
1252	/*
1253	 * Are we enqueueing a waiting task? (for current tasks
1254	 * a dequeue/enqueue event is a NOP)
1255	 */
1256	if (se != cfs_rq->curr)
1257		update_stats_wait_start_fair(cfs_rq, se);
1258
1259	if (flags & ENQUEUE_WAKEUP)
1260		update_stats_enqueue_sleeper_fair(cfs_rq, se);
1261}
1262
1263static inline void
1264update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1265{
1266
1267	if (!schedstat_enabled())
1268		return;
1269
1270	/*
1271	 * Mark the end of the wait period if dequeueing a
1272	 * waiting task:
1273	 */
1274	if (se != cfs_rq->curr)
1275		update_stats_wait_end_fair(cfs_rq, se);
1276
1277	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1278		struct task_struct *tsk = task_of(se);
1279		unsigned int state;
1280
1281		/* XXX racy against TTWU */
1282		state = READ_ONCE(tsk->__state);
1283		if (state & TASK_INTERRUPTIBLE)
1284			__schedstat_set(tsk->stats.sleep_start,
1285				      rq_clock(rq_of(cfs_rq)));
1286		if (state & TASK_UNINTERRUPTIBLE)
1287			__schedstat_set(tsk->stats.block_start,
1288				      rq_clock(rq_of(cfs_rq)));
1289	}
1290}
1291
1292/*
1293 * We are picking a new current task - update its stats:
1294 */
1295static inline void
1296update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1297{
1298	/*
1299	 * We are starting a new run period:
1300	 */
1301	se->exec_start = rq_clock_task(rq_of(cfs_rq));
1302}
1303
1304/**************************************************
1305 * Scheduling class queueing methods:
1306 */
1307
1308static inline bool is_core_idle(int cpu)
1309{
1310#ifdef CONFIG_SCHED_SMT
1311	int sibling;
1312
1313	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1314		if (cpu == sibling)
1315			continue;
1316
1317		if (!idle_cpu(sibling))
1318			return false;
1319	}
1320#endif
1321
1322	return true;
1323}
1324
1325#ifdef CONFIG_NUMA
1326#define NUMA_IMBALANCE_MIN 2
1327
1328static inline long
1329adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
1330{
	/*
	 * Allow a NUMA imbalance if the number of busy CPUs is less than the
	 * maximum threshold. Above this threshold, individual tasks may be
	 * contending for both memory bandwidth and any shared HT resources.
	 * This is an approximation, as the number of running tasks may not be
	 * related to the number of busy CPUs due to sched_setaffinity.
	 */
1338	if (dst_running > imb_numa_nr)
1339		return imbalance;
1340
1341	/*
1342	 * Allow a small imbalance based on a simple pair of communicating
1343	 * tasks that remain local when the destination is lightly loaded.
1344	 */
1345	if (imbalance <= NUMA_IMBALANCE_MIN)
1346		return 0;
1347
1348	return imbalance;
1349}
1350#endif /* CONFIG_NUMA */
1351
1352#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task, in ms. The task scan period is
 * calculated based on the task's virtual memory size and
 * numa_balancing_scan_size.
 */
1358unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1359unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1360
1361/* Portion of address space to scan in MB */
1362unsigned int sysctl_numa_balancing_scan_size = 256;
1363
1364/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1365unsigned int sysctl_numa_balancing_scan_delay = 1000;
1366
/* A page with a hint page fault latency (in ms) below this threshold is considered hot */
1368unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
1369
1370struct numa_group {
1371	refcount_t refcount;
1372
1373	spinlock_t lock; /* nr_tasks, tasks */
1374	int nr_tasks;
1375	pid_t gid;
1376	int active_nodes;
1377
1378	struct rcu_head rcu;
1379	unsigned long total_faults;
1380	unsigned long max_faults_cpu;
1381	/*
1382	 * faults[] array is split into two regions: faults_mem and faults_cpu.
1383	 *
1384	 * Faults_cpu is used to decide whether memory should move
1385	 * towards the CPU. As a consequence, these stats are weighted
1386	 * more by CPU use than by memory faults.
1387	 */
1388	unsigned long faults[];
1389};
1390
1391/*
1392 * For functions that can be called in multiple contexts that permit reading
1393 * ->numa_group (see struct task_struct for locking rules).
1394 */
1395static struct numa_group *deref_task_numa_group(struct task_struct *p)
1396{
1397	return rcu_dereference_check(p->numa_group, p == current ||
1398		(lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
1399}
1400
1401static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1402{
1403	return rcu_dereference_protected(p->numa_group, p == current);
1404}
1405
1406static inline unsigned long group_faults_priv(struct numa_group *ng);
1407static inline unsigned long group_faults_shared(struct numa_group *ng);
1408
1409static unsigned int task_nr_scan_windows(struct task_struct *p)
1410{
1411	unsigned long rss = 0;
1412	unsigned long nr_scan_pages;
1413
	/*
	 * Calculations are based on RSS, as non-present and empty pages are
	 * skipped by the PTE scanner and NUMA hinting faults should be trapped
	 * based on resident pages.
	 */
1419	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1420	rss = get_mm_rss(p->mm);
1421	if (!rss)
1422		rss = nr_scan_pages;
1423
1424	rss = round_up(rss, nr_scan_pages);
1425	return rss / nr_scan_pages;
1426}
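
/*
 * Example (assuming 4K pages and the default 256MB scan size):
 * nr_scan_pages = 256 << (20 - 12) = 65536 pages, so a task with a 1GB RSS
 * (262144 pages) is covered in 262144 / 65536 = 4 scan windows.
 */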
1427
1428/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1429#define MAX_SCAN_WINDOW 2560
1430
1431static unsigned int task_scan_min(struct task_struct *p)
1432{
1433	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1434	unsigned int scan, floor;
1435	unsigned int windows = 1;
1436
1437	if (scan_size < MAX_SCAN_WINDOW)
1438		windows = MAX_SCAN_WINDOW / scan_size;
1439	floor = 1000 / windows;
1440
1441	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1442	return max_t(unsigned int, floor, scan);
1443}
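
/*
 * Example (default tunables, 4K pages): scan_size = 256 gives
 * windows = 2560 / 256 = 10 and a floor of 100 ms. For the 1GB-RSS task
 * above (4 scan windows), scan = 1000 / 4 = 250 ms, so task_scan_min()
 * returns 250 ms; a much larger task would instead be limited by the
 * 100 ms floor.
 */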
1444
1445static unsigned int task_scan_start(struct task_struct *p)
1446{
1447	unsigned long smin = task_scan_min(p);
1448	unsigned long period = smin;
1449	struct numa_group *ng;
1450
1451	/* Scale the maximum scan period with the amount of shared memory. */
1452	rcu_read_lock();
1453	ng = rcu_dereference(p->numa_group);
1454	if (ng) {
1455		unsigned long shared = group_faults_shared(ng);
1456		unsigned long private = group_faults_priv(ng);
1457
1458		period *= refcount_read(&ng->refcount);
1459		period *= shared + 1;
1460		period /= private + shared + 1;
1461	}
1462	rcu_read_unlock();
1463
1464	return max(smin, period);
1465}
1466
1467static unsigned int task_scan_max(struct task_struct *p)
1468{
1469	unsigned long smin = task_scan_min(p);
1470	unsigned long smax;
1471	struct numa_group *ng;
1472
1473	/* Watch for min being lower than max due to floor calculations */
1474	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1475
1476	/* Scale the maximum scan period with the amount of shared memory. */
1477	ng = deref_curr_numa_group(p);
1478	if (ng) {
1479		unsigned long shared = group_faults_shared(ng);
1480		unsigned long private = group_faults_priv(ng);
1481		unsigned long period = smax;
1482
1483		period *= refcount_read(&ng->refcount);
1484		period *= shared + 1;
1485		period /= private + shared + 1;
1486
1487		smax = max(smax, period);
1488	}
1489
1490	return max(smin, smax);
1491}
1492
1493static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1494{
1495	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1496	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1497}
1498
1499static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1500{
1501	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1502	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1503}
1504
1505/* Shared or private faults. */
1506#define NR_NUMA_HINT_FAULT_TYPES 2
1507
1508/* Memory and CPU locality */
1509#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1510
1511/* Averaged statistics, and temporary buffers. */
1512#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1513
1514pid_t task_numa_group_id(struct task_struct *p)
1515{
1516	struct numa_group *ng;
1517	pid_t gid = 0;
1518
1519	rcu_read_lock();
1520	ng = rcu_dereference(p->numa_group);
1521	if (ng)
1522		gid = ng->gid;
1523	rcu_read_unlock();
1524
1525	return gid;
1526}
1527
1528/*
1529 * The averaged statistics, shared & private, memory & CPU,
1530 * occupy the first half of the array. The second half of the
1531 * array is for current counters, which are averaged into the
1532 * first set by task_numa_placement.
1533 */
1534static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1535{
1536	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1537}
1538
1539static inline unsigned long task_faults(struct task_struct *p, int nid)
1540{
1541	if (!p->numa_faults)
1542		return 0;
1543
1544	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1545		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1546}
1547
1548static inline unsigned long group_faults(struct task_struct *p, int nid)
1549{
1550	struct numa_group *ng = deref_task_numa_group(p);
1551
1552	if (!ng)
1553		return 0;
1554
1555	return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1556		ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1557}
1558
1559static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1560{
1561	return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
1562		group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
1563}
1564
1565static inline unsigned long group_faults_priv(struct numa_group *ng)
1566{
1567	unsigned long faults = 0;
1568	int node;
1569
1570	for_each_online_node(node) {
1571		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1572	}
1573
1574	return faults;
1575}
1576
1577static inline unsigned long group_faults_shared(struct numa_group *ng)
1578{
1579	unsigned long faults = 0;
1580	int node;
1581
1582	for_each_online_node(node) {
1583		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1584	}
1585
1586	return faults;
1587}
1588
1589/*
1590 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1591 * considered part of a numa group's pseudo-interleaving set. Migrations
1592 * between these nodes are slowed down, to allow things to settle down.
1593 */
1594#define ACTIVE_NODE_FRACTION 3
1595
1596static bool numa_is_active_node(int nid, struct numa_group *ng)
1597{
1598	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1599}
1600
1601/* Handle placement on systems where not all nodes are directly connected. */
1602static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1603					int lim_dist, bool task)
1604{
1605	unsigned long score = 0;
1606	int node, max_dist;
1607
1608	/*
1609	 * All nodes are directly connected, and the same distance
1610	 * from each other. No need for fancy placement algorithms.
1611	 */
1612	if (sched_numa_topology_type == NUMA_DIRECT)
1613		return 0;
1614
1615	/* sched_max_numa_distance may be changed in parallel. */
1616	max_dist = READ_ONCE(sched_max_numa_distance);
1617	/*
1618	 * This code is called for each node, introducing N^2 complexity,
1619	 * which should be OK given the number of nodes rarely exceeds 8.
1620	 */
1621	for_each_online_node(node) {
1622		unsigned long faults;
1623		int dist = node_distance(nid, node);
1624
1625		/*
1626		 * The furthest away nodes in the system are not interesting
1627		 * for placement; nid was already counted.
1628		 */
1629		if (dist >= max_dist || node == nid)
1630			continue;
1631
1632		/*
1633		 * On systems with a backplane NUMA topology, compare groups
1634		 * of nodes, and move tasks towards the group with the most
1635		 * memory accesses. When comparing two nodes at distance
1636		 * "hoplimit", only nodes closer by than "hoplimit" are part
1637		 * of each group. Skip other nodes.
1638		 */
1639		if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
1640			continue;
1641
1642		/* Add up the faults from nearby nodes. */
1643		if (task)
1644			faults = task_faults(p, node);
1645		else
1646			faults = group_faults(p, node);
1647
1648		/*
1649		 * On systems with a glueless mesh NUMA topology, there are
1650		 * no fixed "groups of nodes". Instead, nodes that are not
1651		 * directly connected bounce traffic through intermediate
1652		 * nodes; a numa_group can occupy any set of nodes.
1653		 * The further away a node is, the less the faults count.
1654		 * This seems to result in good task placement.
1655		 */
1656		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1657			faults *= (max_dist - dist);
1658			faults /= (max_dist - LOCAL_DISTANCE);
1659		}
1660
1661		score += faults;
1662	}
1663
1664	return score;
1665}
1666
1667/*
1668 * These return the fraction of accesses done by a particular task, or
1669 * task group, on a particular numa node.  The group weight is given a
1670 * larger multiplier, in order to group tasks together that are almost
1671 * evenly spread out between numa nodes.
1672 */
1673static inline unsigned long task_weight(struct task_struct *p, int nid,
1674					int dist)
1675{
1676	unsigned long faults, total_faults;
1677
1678	if (!p->numa_faults)
1679		return 0;
1680
1681	total_faults = p->total_numa_faults;
1682
1683	if (!total_faults)
1684		return 0;
1685
1686	faults = task_faults(p, nid);
1687	faults += score_nearby_nodes(p, nid, dist, true);
1688
1689	return 1000 * faults / total_faults;
1690}
1691
1692static inline unsigned long group_weight(struct task_struct *p, int nid,
1693					 int dist)
1694{
1695	struct numa_group *ng = deref_task_numa_group(p);
1696	unsigned long faults, total_faults;
1697
1698	if (!ng)
1699		return 0;
1700
1701	total_faults = ng->total_faults;
1702
1703	if (!total_faults)
1704		return 0;
1705
1706	faults = group_faults(p, nid);
1707	faults += score_nearby_nodes(p, nid, dist, false);
1708
1709	return 1000 * faults / total_faults;
1710}
1711
/*
 * If memory tiering mode is enabled, the cpupid of a slow memory page is
 * used to record the scan time instead of the CPU and PID.  When tiering
 * mode is disabled at run time, the scan time (in cpupid) will be
 * interpreted as CPU and PID.  So the CPU needs to be checked to avoid
 * out-of-bounds array access.
 */
1719static inline bool cpupid_valid(int cpupid)
1720{
1721	return cpupid_to_cpu(cpupid) < nr_cpu_ids;
1722}
1723
/*
 * In memory tiering mode, if there are enough free pages (more than the
 * watermark defined here) in the fast memory node, then, to take full
 * advantage of the fast memory capacity, all recently accessed slow
 * memory pages will be migrated to the fast memory node without
 * considering the hot threshold.
 */
1731static bool pgdat_free_space_enough(struct pglist_data *pgdat)
1732{
1733	int z;
1734	unsigned long enough_wmark;
1735
1736	enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
1737			   pgdat->node_present_pages >> 4);
1738	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1739		struct zone *zone = pgdat->node_zones + z;
1740
1741		if (!populated_zone(zone))
1742			continue;
1743
1744		if (zone_watermark_ok(zone, 0,
1745				      wmark_pages(zone, WMARK_PROMO) + enough_wmark,
1746				      ZONE_MOVABLE, 0))
1747			return true;
1748	}
1749	return false;
1750}
1751
1752/*
 * In memory tiering mode, when page tables are scanned, the scan time is
 * recorded in struct page in addition to making slow memory pages
 * PROT_NONE.  So when the page is accessed, the hint page fault handler
 * calculates the hint page fault latency as:
1758 *
1759 *	hint page fault latency = hint page fault time - scan time
1760 *
1761 * The smaller the hint page fault latency, the higher the possibility
1762 * for the page to be hot.
1763 */
1764static int numa_hint_fault_latency(struct folio *folio)
1765{
1766	int last_time, time;
1767
1768	time = jiffies_to_msecs(jiffies);
1769	last_time = folio_xchg_access_time(folio, time);
1770
1771	return (time - last_time) & PAGE_ACCESS_TIME_MASK;
1772}
1773
/*
 * In memory tiering mode, too high a promotion/demotion throughput may
 * hurt application latency.  So we provide a mechanism to rate limit
 * the number of pages that are candidates for promotion.
 */
1779static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
1780				      unsigned long rate_limit, int nr)
1781{
1782	unsigned long nr_cand;
1783	unsigned int now, start;
1784
1785	now = jiffies_to_msecs(jiffies);
1786	mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
1787	nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1788	start = pgdat->nbp_rl_start;
1789	if (now - start > MSEC_PER_SEC &&
1790	    cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
1791		pgdat->nbp_rl_nr_cand = nr_cand;
1792	if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
1793		return true;
1794	return false;
1795}
1796
1797#define NUMA_MIGRATION_ADJUST_STEPS	16
1798
1799static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
1800					    unsigned long rate_limit,
1801					    unsigned int ref_th)
1802{
1803	unsigned int now, start, th_period, unit_th, th;
1804	unsigned long nr_cand, ref_cand, diff_cand;
1805
1806	now = jiffies_to_msecs(jiffies);
1807	th_period = sysctl_numa_balancing_scan_period_max;
1808	start = pgdat->nbp_th_start;
1809	if (now - start > th_period &&
1810	    cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
1811		ref_cand = rate_limit *
1812			sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
1813		nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1814		diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
1815		unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
1816		th = pgdat->nbp_threshold ? : ref_th;
1817		if (diff_cand > ref_cand * 11 / 10)
1818			th = max(th - unit_th, unit_th);
1819		else if (diff_cand < ref_cand * 9 / 10)
1820			th = min(th + unit_th, ref_th * 2);
1821		pgdat->nbp_th_nr_cand = nr_cand;
1822		pgdat->nbp_threshold = th;
1823	}
1824}
1825
1826bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
1827				int src_nid, int dst_cpu)
1828{
1829	struct numa_group *ng = deref_curr_numa_group(p);
1830	int dst_nid = cpu_to_node(dst_cpu);
1831	int last_cpupid, this_cpupid;
1832
1833	/*
1834	 * Cannot migrate to memoryless nodes.
1835	 */
1836	if (!node_state(dst_nid, N_MEMORY))
1837		return false;
1838
	/*
	 * Pages in the slow memory node should be migrated according
	 * to hot/cold status instead of private/shared status.
	 */
1843	if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
1844	    !node_is_toptier(src_nid)) {
1845		struct pglist_data *pgdat;
1846		unsigned long rate_limit;
1847		unsigned int latency, th, def_th;
1848
1849		pgdat = NODE_DATA(dst_nid);
1850		if (pgdat_free_space_enough(pgdat)) {
1851			/* workload changed, reset hot threshold */
1852			pgdat->nbp_threshold = 0;
1853			return true;
1854		}
1855
1856		def_th = sysctl_numa_balancing_hot_threshold;
1857		rate_limit = sysctl_numa_balancing_promote_rate_limit << \
1858			(20 - PAGE_SHIFT);
1859		numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
1860
1861		th = pgdat->nbp_threshold ? : def_th;
1862		latency = numa_hint_fault_latency(folio);
1863		if (latency >= th)
1864			return false;
1865
1866		return !numa_promotion_rate_limit(pgdat, rate_limit,
1867						  folio_nr_pages(folio));
1868	}
1869
1870	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1871	last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
1872
1873	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
1874	    !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
1875		return false;
1876
1877	/*
1878	 * Allow first faults or private faults to migrate immediately early in
1879	 * the lifetime of a task. The magic number 4 is based on waiting for
1880	 * two full passes of the "multi-stage node selection" test that is
1881	 * executed below.
1882	 */
1883	if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1884	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1885		return true;
1886
1887	/*
1888	 * Multi-stage node selection is used in conjunction with a periodic
1889	 * migration fault to build a temporal task<->page relation. By using
1890	 * a two-stage filter we remove short/unlikely relations.
1891	 *
1892	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1893	 * a task's usage of a particular page (n_p) per total usage of this
1894	 * page (n_t) (in a given time-span) to a probability.
1895	 *
	 * Our periodic faults will sample this probability, and the chance
	 * of getting the same result twice in a row, given these samples
	 * are fully independent, is then P(p)^2, provided our sample period
	 * is sufficiently short compared to the usage pattern.
	 *
	 * This quadratic squishes small probabilities, making it less likely
	 * we act on an unlikely task<->page relation.
1903	 */
1904	if (!cpupid_pid_unset(last_cpupid) &&
1905				cpupid_to_nid(last_cpupid) != dst_nid)
1906		return false;
1907
1908	/* Always allow migrate on private faults */
1909	if (cpupid_match_pid(p, last_cpupid))
1910		return true;
1911
1912	/* A shared fault, but p->numa_group has not been set up yet. */
1913	if (!ng)
1914		return true;
1915
1916	/*
1917	 * Destination node is much more heavily used than the source
1918	 * node? Allow migration.
1919	 */
1920	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1921					ACTIVE_NODE_FRACTION)
1922		return true;
1923
1924	/*
1925	 * Distribute memory according to CPU & memory use on each node,
1926	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1927	 *
1928	 * faults_cpu(dst)   3   faults_cpu(src)
1929	 * --------------- * - > ---------------
1930	 * faults_mem(dst)   4   faults_mem(src)
1931	 */
1932	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1933	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1934}
1935
1936/*
1937 * 'numa_type' describes the node at the moment of load balancing.
1938 */
1939enum numa_type {
1940	/* The node has spare capacity that can be used to run more tasks.  */
1941	node_has_spare = 0,
1942	/*
1943	 * The node is fully used and the tasks don't compete for more CPU
1944	 * cycles. Nevertheless, some tasks might wait before running.
1945	 */
1946	node_fully_busy,
1947	/*
1948	 * The node is overloaded and can't provide expected CPU cycles to all
1949	 * tasks.
1950	 */
1951	node_overloaded
1952};
1953
1954/* Cached statistics for all CPUs within a node */
1955struct numa_stats {
1956	unsigned long load;
1957	unsigned long runnable;
1958	unsigned long util;
1959	/* Total compute capacity of CPUs on a node */
1960	unsigned long compute_capacity;
1961	unsigned int nr_running;
1962	unsigned int weight;
1963	enum numa_type node_type;
1964	int idle_cpu;
1965};
1966
1967struct task_numa_env {
1968	struct task_struct *p;
1969
1970	int src_cpu, src_nid;
1971	int dst_cpu, dst_nid;
1972	int imb_numa_nr;
1973
1974	struct numa_stats src_stats, dst_stats;
1975
1976	int imbalance_pct;
1977	int dist;
1978
1979	struct task_struct *best_task;
1980	long best_imp;
1981	int best_cpu;
1982};
1983
1984static unsigned long cpu_load(struct rq *rq);
1985static unsigned long cpu_runnable(struct rq *rq);
1986
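/*
 * Classify a node for NUMA balancing. As an illustration, with an
 * imbalance_pct of 112 (a typical value, see task_numa_migrate()) a node is
 * overloaded when it runs more tasks than it has CPUs and either utilization
 * exceeds ~89% of its compute capacity (capacity * 100 / 112) or runnable
 * load exceeds 112% of it; it has spare capacity when it has fewer tasks
 * than CPUs or both measures stay below those margins.
 */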
1987static inline enum
1988numa_type numa_classify(unsigned int imbalance_pct,
1989			 struct numa_stats *ns)
1990{
1991	if ((ns->nr_running > ns->weight) &&
1992	    (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1993	     ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1994		return node_overloaded;
1995
1996	if ((ns->nr_running < ns->weight) ||
1997	    (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1998	     ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1999		return node_has_spare;
2000
2001	return node_fully_busy;
2002}
2003
2004#ifdef CONFIG_SCHED_SMT
2005/* Forward declarations of select_idle_sibling helpers */
2006static inline bool test_idle_cores(int cpu);
2007static inline int numa_idle_core(int idle_core, int cpu)
2008{
2009	if (!static_branch_likely(&sched_smt_present) ||
2010	    idle_core >= 0 || !test_idle_cores(cpu))
2011		return idle_core;
2012
2013	/*
2014	 * Prefer cores instead of packing HT siblings
2015	 * and triggering future load balancing.
2016	 */
2017	if (is_core_idle(cpu))
2018		idle_core = cpu;
2019
2020	return idle_core;
2021}
2022#else
2023static inline int numa_idle_core(int idle_core, int cpu)
2024{
2025	return idle_core;
2026}
2027#endif
2028
2029/*
2030 * Gather all necessary information to make NUMA balancing placement
 * decisions that are compatible with the standard load balancer. This
2032 * borrows code and logic from update_sg_lb_stats but sharing a
2033 * common implementation is impractical.
2034 */
2035static void update_numa_stats(struct task_numa_env *env,
2036			      struct numa_stats *ns, int nid,
2037			      bool find_idle)
2038{
2039	int cpu, idle_core = -1;
2040
2041	memset(ns, 0, sizeof(*ns));
2042	ns->idle_cpu = -1;
2043
2044	rcu_read_lock();
2045	for_each_cpu(cpu, cpumask_of_node(nid)) {
2046		struct rq *rq = cpu_rq(cpu);
2047
2048		ns->load += cpu_load(rq);
2049		ns->runnable += cpu_runnable(rq);
2050		ns->util += cpu_util_cfs(cpu);
2051		ns->nr_running += rq->cfs.h_nr_running;
2052		ns->compute_capacity += capacity_of(cpu);
2053
2054		if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
2055			if (READ_ONCE(rq->numa_migrate_on) ||
2056			    !cpumask_test_cpu(cpu, env->p->cpus_ptr))
2057				continue;
2058
2059			if (ns->idle_cpu == -1)
2060				ns->idle_cpu = cpu;
2061
2062			idle_core = numa_idle_core(idle_core, cpu);
2063		}
2064	}
2065	rcu_read_unlock();
2066
2067	ns->weight = cpumask_weight(cpumask_of_node(nid));
2068
2069	ns->node_type = numa_classify(env->imbalance_pct, ns);
2070
2071	if (idle_core >= 0)
2072		ns->idle_cpu = idle_core;
2073}
2074
2075static void task_numa_assign(struct task_numa_env *env,
2076			     struct task_struct *p, long imp)
2077{
2078	struct rq *rq = cpu_rq(env->dst_cpu);
2079
	/* Check if the run-queue is part of an active NUMA balance. */
2081	if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
2082		int cpu;
2083		int start = env->dst_cpu;
2084
2085		/* Find alternative idle CPU. */
2086		for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
2087			if (cpu == env->best_cpu || !idle_cpu(cpu) ||
2088			    !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
2089				continue;
2090			}
2091
2092			env->dst_cpu = cpu;
2093			rq = cpu_rq(env->dst_cpu);
2094			if (!xchg(&rq->numa_migrate_on, 1))
2095				goto assign;
2096		}
2097
2098		/* Failed to find an alternative idle CPU */
2099		return;
2100	}
2101
2102assign:
2103	/*
	 * Clear the previous best_cpu/rq numa-migrate flag, since the task
	 * has now found a better CPU to move/swap to.
2106	 */
2107	if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
2108		rq = cpu_rq(env->best_cpu);
2109		WRITE_ONCE(rq->numa_migrate_on, 0);
2110	}
2111
2112	if (env->best_task)
2113		put_task_struct(env->best_task);
2114	if (p)
2115		get_task_struct(p);
2116
2117	env->best_task = p;
2118	env->best_imp = imp;
2119	env->best_cpu = env->dst_cpu;
2120}
2121
2122static bool load_too_imbalanced(long src_load, long dst_load,
2123				struct task_numa_env *env)
2124{
2125	long imb, old_imb;
2126	long orig_src_load, orig_dst_load;
2127	long src_capacity, dst_capacity;
2128
2129	/*
2130	 * The load is corrected for the CPU capacity available on each node.
2131	 *
2132	 * src_load        dst_load
2133	 * ------------ vs ---------
2134	 * src_capacity    dst_capacity
2135	 */
2136	src_capacity = env->src_stats.compute_capacity;
2137	dst_capacity = env->dst_stats.compute_capacity;
2138
2139	imb = abs(dst_load * src_capacity - src_load * dst_capacity);
2140
2141	orig_src_load = env->src_stats.load;
2142	orig_dst_load = env->dst_stats.load;
2143
2144	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
2145
2146	/* Would this change make things worse? */
2147	return (imb > old_imb);
2148}
2149
2150/*
2151 * Maximum NUMA importance can be 1998 (2*999);
2152 * SMALLIMP @ 30 would be close to 1998/64.
2153 * Used to deter task migration.
2154 */
2155#define SMALLIMP	30
2156
2157/*
 * This checks whether the overall compute and NUMA accesses of the system
 * would be improved if the source task was migrated to the target dst_cpu,
 * taking into account that it might be best to exchange it with the task
 * currently running on the dst_cpu.
2162 */
2163static bool task_numa_compare(struct task_numa_env *env,
2164			      long taskimp, long groupimp, bool maymove)
2165{
2166	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
2167	struct rq *dst_rq = cpu_rq(env->dst_cpu);
2168	long imp = p_ng ? groupimp : taskimp;
2169	struct task_struct *cur;
2170	long src_load, dst_load;
2171	int dist = env->dist;
2172	long moveimp = imp;
2173	long load;
2174	bool stopsearch = false;
2175
2176	if (READ_ONCE(dst_rq->numa_migrate_on))
2177		return false;
2178
2179	rcu_read_lock();
2180	cur = rcu_dereference(dst_rq->curr);
2181	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
2182		cur = NULL;
2183
2184	/*
2185	 * Because we have preemption enabled we can get migrated around and
	 * end up trying to select ourselves (current == env->p) as a swap candidate.
2187	 */
2188	if (cur == env->p) {
2189		stopsearch = true;
2190		goto unlock;
2191	}
2192
2193	if (!cur) {
2194		if (maymove && moveimp >= env->best_imp)
2195			goto assign;
2196		else
2197			goto unlock;
2198	}
2199
	/* Skip this swap candidate if it cannot move to the source CPU. */
2201	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
2202		goto unlock;
2203
2204	/*
2205	 * Skip this swap candidate if it is not moving to its preferred
2206	 * node and the best task is.
2207	 */
2208	if (env->best_task &&
2209	    env->best_task->numa_preferred_nid == env->src_nid &&
2210	    cur->numa_preferred_nid != env->src_nid) {
2211		goto unlock;
2212	}
2213
2214	/*
2215	 * "imp" is the fault differential for the source task between the
2216	 * source and destination node. Calculate the total differential for
2217	 * the source task and potential destination task. The more negative
2218	 * the value is, the more remote accesses that would be expected to
2219	 * be incurred if the tasks were swapped.
2220	 *
2221	 * If dst and source tasks are in the same NUMA group, or not
2222	 * in any group then look only at task weights.
2223	 */
2224	cur_ng = rcu_dereference(cur->numa_group);
2225	if (cur_ng == p_ng) {
2226		/*
2227		 * Do not swap within a group or between tasks that have
2228		 * no group if there is spare capacity. Swapping does
2229		 * not address the load imbalance and helps one task at
2230		 * the cost of punishing another.
2231		 */
2232		if (env->dst_stats.node_type == node_has_spare)
2233			goto unlock;
2234
2235		imp = taskimp + task_weight(cur, env->src_nid, dist) -
2236		      task_weight(cur, env->dst_nid, dist);
2237		/*
2238		 * Add some hysteresis to prevent swapping the
2239		 * tasks within a group over tiny differences.
2240		 */
2241		if (cur_ng)
2242			imp -= imp / 16;
2243	} else {
2244		/*
2245		 * Compare the group weights. If a task is all by itself
2246		 * (not part of a group), use the task weight instead.
2247		 */
2248		if (cur_ng && p_ng)
2249			imp += group_weight(cur, env->src_nid, dist) -
2250			       group_weight(cur, env->dst_nid, dist);
2251		else
2252			imp += task_weight(cur, env->src_nid, dist) -
2253			       task_weight(cur, env->dst_nid, dist);
2254	}
2255
2256	/* Discourage picking a task already on its preferred node */
2257	if (cur->numa_preferred_nid == env->dst_nid)
2258		imp -= imp / 16;
2259
2260	/*
2261	 * Encourage picking a task that moves to its preferred node.
	 * This potentially makes imp larger than its maximum of
2263	 * 1998 (see SMALLIMP and task_weight for why) but in this
2264	 * case, it does not matter.
2265	 */
2266	if (cur->numa_preferred_nid == env->src_nid)
2267		imp += imp / 8;
2268
2269	if (maymove && moveimp > imp && moveimp > env->best_imp) {
2270		imp = moveimp;
2271		cur = NULL;
2272		goto assign;
2273	}
2274
2275	/*
2276	 * Prefer swapping with a task moving to its preferred node over a
2277	 * task that is not.
2278	 */
2279	if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
2280	    env->best_task->numa_preferred_nid != env->src_nid) {
2281		goto assign;
2282	}
2283
2284	/*
2285	 * If the NUMA importance is less than SMALLIMP,
2286	 * task migration might only result in ping pong
2287	 * of tasks and also hurt performance due to cache
2288	 * misses.
2289	 */
2290	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
2291		goto unlock;
2292
2293	/*
2294	 * In the overloaded case, try and keep the load balanced.
2295	 */
2296	load = task_h_load(env->p) - task_h_load(cur);
2297	if (!load)
2298		goto assign;
2299
2300	dst_load = env->dst_stats.load + load;
2301	src_load = env->src_stats.load - load;
2302
2303	if (load_too_imbalanced(src_load, dst_load, env))
2304		goto unlock;
2305
2306assign:
2307	/* Evaluate an idle CPU for a task numa move. */
2308	if (!cur) {
2309		int cpu = env->dst_stats.idle_cpu;
2310
2311		/* Nothing cached so current CPU went idle since the search. */
2312		if (cpu < 0)
2313			cpu = env->dst_cpu;
2314
2315		/*
2316		 * If the CPU is no longer truly idle and the previous best CPU
2317		 * is, keep using it.
2318		 */
2319		if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
2320		    idle_cpu(env->best_cpu)) {
2321			cpu = env->best_cpu;
2322		}
2323
2324		env->dst_cpu = cpu;
2325	}
2326
2327	task_numa_assign(env, cur, imp);
2328
2329	/*
2330	 * If a move to idle is allowed because there is capacity or load
2331	 * balance improves then stop the search. While a better swap
2332	 * candidate may exist, a search is not free.
2333	 */
2334	if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
2335		stopsearch = true;
2336
2337	/*
2338	 * If a swap candidate must be identified and the current best task
	 * moves to its preferred node then stop the search.
2340	 */
2341	if (!maymove && env->best_task &&
2342	    env->best_task->numa_preferred_nid == env->src_nid) {
2343		stopsearch = true;
2344	}
2345unlock:
2346	rcu_read_unlock();
2347
2348	return stopsearch;
2349}
2350
2351static void task_numa_find_cpu(struct task_numa_env *env,
2352				long taskimp, long groupimp)
2353{
2354	bool maymove = false;
2355	int cpu;
2356
2357	/*
2358	 * If dst node has spare capacity, then check if there is an
2359	 * imbalance that would be overruled by the load balancer.
2360	 */
2361	if (env->dst_stats.node_type == node_has_spare) {
2362		unsigned int imbalance;
2363		int src_running, dst_running;
2364
2365		/*
		 * Would movement cause an imbalance? Note that if src has
		 * more running tasks then the imbalance is ignored as the
		 * move improves the imbalance from the perspective of the
		 * CPU load balancer.
		 */
2371		src_running = env->src_stats.nr_running - 1;
2372		dst_running = env->dst_stats.nr_running + 1;
2373		imbalance = max(0, dst_running - src_running);
2374		imbalance = adjust_numa_imbalance(imbalance, dst_running,
2375						  env->imb_numa_nr);
2376
2377		/* Use idle CPU if there is no imbalance */
2378		if (!imbalance) {
2379			maymove = true;
2380			if (env->dst_stats.idle_cpu >= 0) {
2381				env->dst_cpu = env->dst_stats.idle_cpu;
2382				task_numa_assign(env, NULL, 0);
2383				return;
2384			}
2385		}
2386	} else {
2387		long src_load, dst_load, load;
2388		/*
		 * If the improvement from just moving env->p to the destination
		 * is better than swapping tasks around, check if a move is
		 * possible.
2391		 */
2392		load = task_h_load(env->p);
2393		dst_load = env->dst_stats.load + load;
2394		src_load = env->src_stats.load - load;
2395		maymove = !load_too_imbalanced(src_load, dst_load, env);
2396	}
2397
2398	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
2399		/* Skip this CPU if the source task cannot migrate */
2400		if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
2401			continue;
2402
2403		env->dst_cpu = cpu;
2404		if (task_numa_compare(env, taskimp, groupimp, maymove))
2405			break;
2406	}
2407}
2408
2409static int task_numa_migrate(struct task_struct *p)
2410{
2411	struct task_numa_env env = {
2412		.p = p,
2413
2414		.src_cpu = task_cpu(p),
2415		.src_nid = task_node(p),
2416
2417		.imbalance_pct = 112,
2418
2419		.best_task = NULL,
2420		.best_imp = 0,
2421		.best_cpu = -1,
2422	};
2423	unsigned long taskweight, groupweight;
2424	struct sched_domain *sd;
2425	long taskimp, groupimp;
2426	struct numa_group *ng;
2427	struct rq *best_rq;
2428	int nid, ret, dist;
2429
2430	/*
2431	 * Pick the lowest SD_NUMA domain, as that would have the smallest
2432	 * imbalance and would be the first to start moving tasks about.
2433	 *
	 * And we want to avoid any moving of tasks about, as that would create
	 * random movement of tasks -- countering the numa conditions we're
	 * trying to satisfy here.
2437	 */
2438	rcu_read_lock();
2439	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
2440	if (sd) {
2441		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
2442		env.imb_numa_nr = sd->imb_numa_nr;
2443	}
2444	rcu_read_unlock();
2445
2446	/*
2447	 * Cpusets can break the scheduler domain tree into smaller
2448	 * balance domains, some of which do not cross NUMA boundaries.
2449	 * Tasks that are "trapped" in such domains cannot be migrated
2450	 * elsewhere, so there is no point in (re)trying.
2451	 */
2452	if (unlikely(!sd)) {
2453		sched_setnuma(p, task_node(p));
2454		return -EINVAL;
2455	}
2456
2457	env.dst_nid = p->numa_preferred_nid;
2458	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
2459	taskweight = task_weight(p, env.src_nid, dist);
2460	groupweight = group_weight(p, env.src_nid, dist);
2461	update_numa_stats(&env, &env.src_stats, env.src_nid, false);
2462	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
2463	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
2464	update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2465
2466	/* Try to find a spot on the preferred nid. */
2467	task_numa_find_cpu(&env, taskimp, groupimp);
2468
2469	/*
2470	 * Look at other nodes in these cases:
2471	 * - there is no space available on the preferred_nid
2472	 * - the task is part of a numa_group that is interleaved across
2473	 *   multiple NUMA nodes; in order to better consolidate the group,
2474	 *   we need to check other locations.
2475	 */
2476	ng = deref_curr_numa_group(p);
2477	if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
2478		for_each_node_state(nid, N_CPU) {
2479			if (nid == env.src_nid || nid == p->numa_preferred_nid)
2480				continue;
2481
2482			dist = node_distance(env.src_nid, env.dst_nid);
2483			if (sched_numa_topology_type == NUMA_BACKPLANE &&
2484						dist != env.dist) {
2485				taskweight = task_weight(p, env.src_nid, dist);
2486				groupweight = group_weight(p, env.src_nid, dist);
2487			}
2488
2489			/* Only consider nodes where both task and groups benefit */
2490			taskimp = task_weight(p, nid, dist) - taskweight;
2491			groupimp = group_weight(p, nid, dist) - groupweight;
2492			if (taskimp < 0 && groupimp < 0)
2493				continue;
2494
2495			env.dist = dist;
2496			env.dst_nid = nid;
2497			update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2498			task_numa_find_cpu(&env, taskimp, groupimp);
2499		}
2500	}
2501
2502	/*
2503	 * If the task is part of a workload that spans multiple NUMA nodes,
2504	 * and is migrating into one of the workload's active nodes, remember
2505	 * this node as the task's preferred numa node, so the workload can
2506	 * settle down.
2507	 * A task that migrated to a second choice node will be better off
2508	 * trying for a better one later. Do not set the preferred node here.
2509	 */
2510	if (ng) {
2511		if (env.best_cpu == -1)
2512			nid = env.src_nid;
2513		else
2514			nid = cpu_to_node(env.best_cpu);
2515
2516		if (nid != p->numa_preferred_nid)
2517			sched_setnuma(p, nid);
2518	}
2519
2520	/* No better CPU than the current one was found. */
2521	if (env.best_cpu == -1) {
2522		trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
2523		return -EAGAIN;
2524	}
2525
2526	best_rq = cpu_rq(env.best_cpu);
2527	if (env.best_task == NULL) {
2528		ret = migrate_task_to(p, env.best_cpu);
2529		WRITE_ONCE(best_rq->numa_migrate_on, 0);
2530		if (ret != 0)
2531			trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2532		return ret;
2533	}
2534
2535	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2536	WRITE_ONCE(best_rq->numa_migrate_on, 0);
2537
2538	if (ret != 0)
2539		trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2540	put_task_struct(env.best_task);
2541	return ret;
2542}
2543
2544/* Attempt to migrate a task to a CPU on the preferred node. */
2545static void numa_migrate_preferred(struct task_struct *p)
2546{
2547	unsigned long interval = HZ;
2548
2549	/* This task has no NUMA fault statistics yet */
2550	if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2551		return;
2552
2553	/* Periodically retry migrating the task to the preferred node */
2554	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
2555	p->numa_migrate_retry = jiffies + interval;
2556
2557	/* Success if task is already running on preferred CPU */
2558	if (task_node(p) == p->numa_preferred_nid)
2559		return;
2560
2561	/* Otherwise, try migrate to a CPU on the preferred node */
2562	task_numa_migrate(p);
2563}
2564
2565/*
2566 * Find out how many nodes the workload is actively running on. Do this by
2567 * tracking the nodes from which NUMA hinting faults are triggered. This can
2568 * be different from the set of nodes where the workload's memory is currently
2569 * located.
2570 */
2571static void numa_group_count_active_nodes(struct numa_group *numa_group)
2572{
2573	unsigned long faults, max_faults = 0;
2574	int nid, active_nodes = 0;
2575
2576	for_each_node_state(nid, N_CPU) {
2577		faults = group_faults_cpu(numa_group, nid);
2578		if (faults > max_faults)
2579			max_faults = faults;
2580	}
2581
2582	for_each_node_state(nid, N_CPU) {
2583		faults = group_faults_cpu(numa_group, nid);
2584		if (faults * ACTIVE_NODE_FRACTION > max_faults)
2585			active_nodes++;
2586	}
2587
2588	numa_group->max_faults_cpu = max_faults;
2589	numa_group->active_nodes = active_nodes;
2590}
2591
2592/*
2593 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
2594 * increments. The more local the fault statistics are, the higher the scan
2595 * period will be for the next scan window. If local/(local+remote) ratio is
2596 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
2597 * the scan period will decrease. Aim for 70% local accesses.
2598 */
2599#define NUMA_PERIOD_SLOTS 10
2600#define NUMA_PERIOD_THRESHOLD 7
2601
2602/*
2603 * Increase the scan period (slow down scanning) if the majority of
2604 * our memory is already on our local node, or if the majority of
2605 * the page accesses are shared with other processes.
2606 * Otherwise, decrease the scan period.
2607 */
2608static void update_task_scan_period(struct task_struct *p,
2609			unsigned long shared, unsigned long private)
2610{
2611	unsigned int period_slot;
2612	int lr_ratio, ps_ratio;
2613	int diff;
2614
2615	unsigned long remote = p->numa_faults_locality[0];
2616	unsigned long local = p->numa_faults_locality[1];
2617
2618	/*
	 * If there were no recorded hinting faults then either the task is
	 * completely idle or all activity is in areas that are not of interest
	 * to automatic numa balancing. Related to that, if there were failed
	 * migrations then it implies we are migrating too quickly or the local
	 * node is overloaded. In either case, scan slower.
2624	 */
2625	if (local + shared == 0 || p->numa_faults_locality[2]) {
2626		p->numa_scan_period = min(p->numa_scan_period_max,
2627			p->numa_scan_period << 1);
2628
2629		p->mm->numa_next_scan = jiffies +
2630			msecs_to_jiffies(p->numa_scan_period);
2631
2632		return;
2633	}
2634
2635	/*
	 * Prepare to scale scan period relative to the current period.
	 *	 <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
2640	 */
2641	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2642	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2643	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2644
2645	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2646		/*
2647		 * Most memory accesses are local. There is no need to
2648		 * do fast NUMA scanning, since memory is already local.
2649		 */
2650		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2651		if (!slot)
2652			slot = 1;
2653		diff = slot * period_slot;
2654	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2655		/*
2656		 * Most memory accesses are shared with other tasks.
2657		 * There is no point in continuing fast NUMA scanning,
2658		 * since other tasks may just move the memory elsewhere.
2659		 */
2660		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2661		if (!slot)
2662			slot = 1;
2663		diff = slot * period_slot;
2664	} else {
2665		/*
2666		 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2667		 * yet they are not on the local NUMA node. Speed up
2668		 * NUMA scanning to get the memory moved over.
2669		 */
2670		int ratio = max(lr_ratio, ps_ratio);
2671		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2672	}
2673
2674	p->numa_scan_period = clamp(p->numa_scan_period + diff,
2675			task_scan_min(p), task_scan_max(p));
2676	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2677}
2678
2679/*
2680 * Get the fraction of time the task has been running since the last
2681 * NUMA placement cycle. The scheduler keeps similar statistics, but
2682 * decays those on a 32ms period, which is orders of magnitude off
2683 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2684 * stats only if the task is so new there are no NUMA statistics yet.
2685 */
2686static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2687{
2688	u64 runtime, delta, now;
2689	/* Use the start of this time slice to avoid calculations. */
2690	now = p->se.exec_start;
2691	runtime = p->se.sum_exec_runtime;
2692
2693	if (p->last_task_numa_placement) {
2694		delta = runtime - p->last_sum_exec_runtime;
2695		*period = now - p->last_task_numa_placement;
2696
2697		/* Avoid time going backwards, prevent potential divide error: */
2698		if (unlikely((s64)*period < 0))
2699			*period = 0;
2700	} else {
2701		delta = p->se.avg.load_sum;
2702		*period = LOAD_AVG_MAX;
2703	}
2704
2705	p->last_sum_exec_runtime = runtime;
2706	p->last_task_numa_placement = now;
2707
2708	return delta;
2709}
2710
2711/*
2712 * Determine the preferred nid for a task in a numa_group. This needs to
2713 * be done in a way that produces consistent results with group_weight,
2714 * otherwise workloads might not converge.
2715 */
2716static int preferred_group_nid(struct task_struct *p, int nid)
2717{
2718	nodemask_t nodes;
2719	int dist;
2720
2721	/* Direct connections between all NUMA nodes. */
2722	if (sched_numa_topology_type == NUMA_DIRECT)
2723		return nid;
2724
2725	/*
2726	 * On a system with glueless mesh NUMA topology, group_weight
2727	 * scores nodes according to the number of NUMA hinting faults on
2728	 * both the node itself, and on nearby nodes.
2729	 */
2730	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2731		unsigned long score, max_score = 0;
2732		int node, max_node = nid;
2733
2734		dist = sched_max_numa_distance;
2735
2736		for_each_node_state(node, N_CPU) {
2737			score = group_weight(p, node, dist);
2738			if (score > max_score) {
2739				max_score = score;
2740				max_node = node;
2741			}
2742		}
2743		return max_node;
2744	}
2745
2746	/*
2747	 * Finding the preferred nid in a system with NUMA backplane
2748	 * interconnect topology is more involved. The goal is to locate
2749	 * tasks from numa_groups near each other in the system, and
2750	 * untangle workloads from different sides of the system. This requires
2751	 * searching down the hierarchy of node groups, recursively searching
2752	 * inside the highest scoring group of nodes. The nodemask tricks
2753	 * keep the complexity of the search down.
2754	 */
2755	nodes = node_states[N_CPU];
2756	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2757		unsigned long max_faults = 0;
2758		nodemask_t max_group = NODE_MASK_NONE;
2759		int a, b;
2760
2761		/* Are there nodes at this distance from each other? */
2762		if (!find_numa_distance(dist))
2763			continue;
2764
2765		for_each_node_mask(a, nodes) {
2766			unsigned long faults = 0;
2767			nodemask_t this_group;
2768			nodes_clear(this_group);
2769
2770			/* Sum group's NUMA faults; includes a==b case. */
2771			for_each_node_mask(b, nodes) {
2772				if (node_distance(a, b) < dist) {
2773					faults += group_faults(p, b);
2774					node_set(b, this_group);
2775					node_clear(b, nodes);
2776				}
2777			}
2778
2779			/* Remember the top group. */
2780			if (faults > max_faults) {
2781				max_faults = faults;
2782				max_group = this_group;
2783				/*
2784				 * subtle: at the smallest distance there is
2785				 * just one node left in each "group", the
2786				 * winner is the preferred nid.
2787				 */
2788				nid = a;
2789			}
2790		}
2791		/* Next round, evaluate the nodes within max_group. */
2792		if (!max_faults)
2793			break;
2794		nodes = max_group;
2795	}
2796	return nid;
2797}
2798
2799static void task_numa_placement(struct task_struct *p)
2800{
2801	int seq, nid, max_nid = NUMA_NO_NODE;
2802	unsigned long max_faults = 0;
2803	unsigned long fault_types[2] = { 0, 0 };
2804	unsigned long total_faults;
2805	u64 runtime, period;
2806	spinlock_t *group_lock = NULL;
2807	struct numa_group *ng;
2808
2809	/*
2810	 * The p->mm->numa_scan_seq field gets updated without
2811	 * exclusive access. Use READ_ONCE() here to ensure
2812	 * that the field is read in a single access:
2813	 */
2814	seq = READ_ONCE(p->mm->numa_scan_seq);
2815	if (p->numa_scan_seq == seq)
2816		return;
2817	p->numa_scan_seq = seq;
2818	p->numa_scan_period_max = task_scan_max(p);
2819
2820	total_faults = p->numa_faults_locality[0] +
2821		       p->numa_faults_locality[1];
2822	runtime = numa_get_avg_runtime(p, &period);
2823
2824	/* If the task is part of a group prevent parallel updates to group stats */
2825	ng = deref_curr_numa_group(p);
2826	if (ng) {
2827		group_lock = &ng->lock;
2828		spin_lock_irq(group_lock);
2829	}
2830
2831	/* Find the node with the highest number of faults */
2832	for_each_online_node(nid) {
2833		/* Keep track of the offsets in numa_faults array */
2834		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2835		unsigned long faults = 0, group_faults = 0;
2836		int priv;
2837
2838		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2839			long diff, f_diff, f_weight;
2840
2841			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2842			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2843			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2844			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2845
2846			/* Decay existing window, copy faults since last scan */
2847			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2848			fault_types[priv] += p->numa_faults[membuf_idx];
2849			p->numa_faults[membuf_idx] = 0;
2850
2851			/*
2852			 * Normalize the faults_from, so all tasks in a group
2853			 * count according to CPU use, instead of by the raw
2854			 * number of faults. Tasks with little runtime have
2855			 * little over-all impact on throughput, and thus their
2856			 * faults are less important.
2857			 */
2858			f_weight = div64_u64(runtime << 16, period + 1);
2859			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2860				   (total_faults + 1);
2861			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2862			p->numa_faults[cpubuf_idx] = 0;
2863
2864			p->numa_faults[mem_idx] += diff;
2865			p->numa_faults[cpu_idx] += f_diff;
2866			faults += p->numa_faults[mem_idx];
2867			p->total_numa_faults += diff;
2868			if (ng) {
2869				/*
2870				 * safe because we can only change our own group
2871				 *
2872				 * mem_idx represents the offset for a given
2873				 * nid and priv in a specific region because it
2874				 * is at the beginning of the numa_faults array.
2875				 */
2876				ng->faults[mem_idx] += diff;
2877				ng->faults[cpu_idx] += f_diff;
2878				ng->total_faults += diff;
2879				group_faults += ng->faults[mem_idx];
2880			}
2881		}
2882
2883		if (!ng) {
2884			if (faults > max_faults) {
2885				max_faults = faults;
2886				max_nid = nid;
2887			}
2888		} else if (group_faults > max_faults) {
2889			max_faults = group_faults;
2890			max_nid = nid;
2891		}
2892	}
2893
2894	/* Cannot migrate task to CPU-less node */
2895	max_nid = numa_nearest_node(max_nid, N_CPU);
2896
2897	if (ng) {
2898		numa_group_count_active_nodes(ng);
2899		spin_unlock_irq(group_lock);
2900		max_nid = preferred_group_nid(p, max_nid);
2901	}
2902
2903	if (max_faults) {
2904		/* Set the new preferred node */
2905		if (max_nid != p->numa_preferred_nid)
2906			sched_setnuma(p, max_nid);
2907	}
2908
2909	update_task_scan_period(p, fault_types[0], fault_types[1]);
2910}
2911
2912static inline int get_numa_group(struct numa_group *grp)
2913{
2914	return refcount_inc_not_zero(&grp->refcount);
2915}
2916
2917static inline void put_numa_group(struct numa_group *grp)
2918{
2919	if (refcount_dec_and_test(&grp->refcount))
2920		kfree_rcu(grp, rcu);
2921}
2922
2923static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2924			int *priv)
2925{
2926	struct numa_group *grp, *my_grp;
2927	struct task_struct *tsk;
2928	bool join = false;
2929	int cpu = cpupid_to_cpu(cpupid);
2930	int i;
2931
2932	if (unlikely(!deref_curr_numa_group(p))) {
2933		unsigned int size = sizeof(struct numa_group) +
2934				    NR_NUMA_HINT_FAULT_STATS *
2935				    nr_node_ids * sizeof(unsigned long);
2936
2937		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2938		if (!grp)
2939			return;
2940
2941		refcount_set(&grp->refcount, 1);
2942		grp->active_nodes = 1;
2943		grp->max_faults_cpu = 0;
2944		spin_lock_init(&grp->lock);
2945		grp->gid = p->pid;
2946
2947		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2948			grp->faults[i] = p->numa_faults[i];
2949
2950		grp->total_faults = p->total_numa_faults;
2951
2952		grp->nr_tasks++;
2953		rcu_assign_pointer(p->numa_group, grp);
2954	}
2955
2956	rcu_read_lock();
2957	tsk = READ_ONCE(cpu_rq(cpu)->curr);
2958
2959	if (!cpupid_match_pid(tsk, cpupid))
2960		goto no_join;
2961
2962	grp = rcu_dereference(tsk->numa_group);
2963	if (!grp)
2964		goto no_join;
2965
2966	my_grp = deref_curr_numa_group(p);
2967	if (grp == my_grp)
2968		goto no_join;
2969
2970	/*
	 * Only join the other group if it's bigger; if we're the bigger group,
2972	 * the other task will join us.
2973	 */
2974	if (my_grp->nr_tasks > grp->nr_tasks)
2975		goto no_join;
2976
2977	/*
2978	 * Tie-break on the grp address.
2979	 */
2980	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2981		goto no_join;
2982
2983	/* Always join threads in the same process. */
2984	if (tsk->mm == current->mm)
2985		join = true;
2986
2987	/* Simple filter to avoid false positives due to PID collisions */
2988	if (flags & TNF_SHARED)
2989		join = true;
2990
2991	/* Update priv based on whether false sharing was detected */
2992	*priv = !join;
2993
2994	if (join && !get_numa_group(grp))
2995		goto no_join;
2996
2997	rcu_read_unlock();
2998
2999	if (!join)
3000		return;
3001
3002	WARN_ON_ONCE(irqs_disabled());
3003	double_lock_irq(&my_grp->lock, &grp->lock);
3004
3005	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
3006		my_grp->faults[i] -= p->numa_faults[i];
3007		grp->faults[i] += p->numa_faults[i];
3008	}
3009	my_grp->total_faults -= p->total_numa_faults;
3010	grp->total_faults += p->total_numa_faults;
3011
3012	my_grp->nr_tasks--;
3013	grp->nr_tasks++;
3014
3015	spin_unlock(&my_grp->lock);
3016	spin_unlock_irq(&grp->lock);
3017
3018	rcu_assign_pointer(p->numa_group, grp);
3019
3020	put_numa_group(my_grp);
3021	return;
3022
3023no_join:
3024	rcu_read_unlock();
3025	return;
3026}
3027
3028/*
3029 * Get rid of NUMA statistics associated with a task (either current or dead).
3030 * If @final is set, the task is dead and has reached refcount zero, so we can
3031 * safely free all relevant data structures. Otherwise, there might be
3032 * concurrent reads from places like load balancing and procfs, and we should
3033 * reset the data back to default state without freeing ->numa_faults.
3034 */
3035void task_numa_free(struct task_struct *p, bool final)
3036{
3037	/* safe: p either is current or is being freed by current */
3038	struct numa_group *grp = rcu_dereference_raw(p->numa_group);
3039	unsigned long *numa_faults = p->numa_faults;
3040	unsigned long flags;
3041	int i;
3042
3043	if (!numa_faults)
3044		return;
3045
3046	if (grp) {
3047		spin_lock_irqsave(&grp->lock, flags);
3048		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3049			grp->faults[i] -= p->numa_faults[i];
3050		grp->total_faults -= p->total_numa_faults;
3051
3052		grp->nr_tasks--;
3053		spin_unlock_irqrestore(&grp->lock, flags);
3054		RCU_INIT_POINTER(p->numa_group, NULL);
3055		put_numa_group(grp);
3056	}
3057
3058	if (final) {
3059		p->numa_faults = NULL;
3060		kfree(numa_faults);
3061	} else {
3062		p->total_numa_faults = 0;
3063		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3064			numa_faults[i] = 0;
3065	}
3066}
3067
3068/*
3069 * Got a PROT_NONE fault for a page on @node.
3070 */
3071void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
3072{
3073	struct task_struct *p = current;
3074	bool migrated = flags & TNF_MIGRATED;
3075	int cpu_node = task_node(current);
3076	int local = !!(flags & TNF_FAULT_LOCAL);
3077	struct numa_group *ng;
3078	int priv;
3079
3080	if (!static_branch_likely(&sched_numa_balancing))
3081		return;
3082
3083	/* for example, ksmd faulting in a user's mm */
3084	if (!p->mm)
3085		return;
3086
3087	/*
	 * NUMA fault statistics are unnecessary for the slow memory
	 * node in memory tiering mode.
3090	 */
3091	if (!node_is_toptier(mem_node) &&
3092	    (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
3093	     !cpupid_valid(last_cpupid)))
3094		return;
3095
3096	/* Allocate buffer to track faults on a per-node basis */
3097	if (unlikely(!p->numa_faults)) {
3098		int size = sizeof(*p->numa_faults) *
3099			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
3100
3101		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
3102		if (!p->numa_faults)
3103			return;
3104
3105		p->total_numa_faults = 0;
3106		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
3107	}
3108
3109	/*
3110	 * First accesses are treated as private, otherwise consider accesses
3111	 * to be private if the accessing pid has not changed
3112	 */
3113	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
3114		priv = 1;
3115	} else {
3116		priv = cpupid_match_pid(p, last_cpupid);
3117		if (!priv && !(flags & TNF_NO_GROUP))
3118			task_numa_group(p, last_cpupid, flags, &priv);
3119	}
3120
3121	/*
3122	 * If a workload spans multiple NUMA nodes, a shared fault that
3123	 * occurs wholly within the set of nodes that the workload is
3124	 * actively using should be counted as local. This allows the
3125	 * scan rate to slow down when a workload has settled down.
3126	 */
3127	ng = deref_curr_numa_group(p);
3128	if (!priv && !local && ng && ng->active_nodes > 1 &&
3129				numa_is_active_node(cpu_node, ng) &&
3130				numa_is_active_node(mem_node, ng))
3131		local = 1;
3132
3133	/*
3134	 * Retry to migrate task to preferred node periodically, in case it
3135	 * previously failed, or the scheduler moved us.
3136	 */
3137	if (time_after(jiffies, p->numa_migrate_retry)) {
3138		task_numa_placement(p);
3139		numa_migrate_preferred(p);
3140	}
3141
3142	if (migrated)
3143		p->numa_pages_migrated += pages;
3144	if (flags & TNF_MIGRATE_FAIL)
3145		p->numa_faults_locality[2] += pages;
3146
3147	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
3148	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
3149	p->numa_faults_locality[local] += pages;
3150}
3151
3152static void reset_ptenuma_scan(struct task_struct *p)
3153{
3154	/*
3155	 * We only did a read acquisition of the mmap sem, so
3156	 * p->mm->numa_scan_seq is written to without exclusive access
3157	 * and the update is not guaranteed to be atomic. That's not
3158	 * much of an issue though, since this is just used for
3159	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
3160	 * expensive, to avoid any form of compiler optimizations:
3161	 */
3162	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
3163	p->mm->numa_scan_offset = 0;
3164}
3165
3166static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
3167{
3168	unsigned long pids;
3169	/*
	 * Allow unconditional access for the first two scans, so that all
	 * pages of the VMA get a prot_none fault introduced irrespective of
	 * accesses. This is also done to avoid any side effect of task
	 * scanning amplifying the unfairness of disjoint sets of VMA accesses.
3174	 */
3175	if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
3176		return true;
3177
3178	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
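	/*
	 * Each task hashes its PID onto one of BITS_PER_LONG bits of the
	 * per-VMA pids_active[] window; a set bit means some task hashing to
	 * that bit recently faulted in this VMA. Hash collisions can only
	 * cause extra scanning, never a missed scan.
	 */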
3179	if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
3180		return true;
3181
3182	/*
3183	 * Complete a scan that has already started regardless of PID access, or
3184	 * some VMAs may never be scanned in multi-threaded applications:
3185	 */
3186	if (mm->numa_scan_offset > vma->vm_start) {
3187		trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
3188		return true;
3189	}
3190
3191	return false;
3192}
3193
3194#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
3195
3196/*
3197 * The expensive part of numa migration is done from task_work context.
3198 * Triggered from task_tick_numa().
3199 */
3200static void task_numa_work(struct callback_head *work)
3201{
3202	unsigned long migrate, next_scan, now = jiffies;
3203	struct task_struct *p = current;
3204	struct mm_struct *mm = p->mm;
3205	u64 runtime = p->se.sum_exec_runtime;
3206	struct vm_area_struct *vma;
3207	unsigned long start, end;
3208	unsigned long nr_pte_updates = 0;
3209	long pages, virtpages;
3210	struct vma_iterator vmi;
3211	bool vma_pids_skipped;
3212	bool vma_pids_forced = false;
3213
3214	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
3215
3216	work->next = work;
3217	/*
3218	 * Who cares about NUMA placement when they're dying.
3219	 *
3220	 * NOTE: make sure not to dereference p->mm before this check,
3221	 * exit_task_work() happens _after_ exit_mm() so we could be called
3222	 * without p->mm even though we still had it when we enqueued this
3223	 * work.
3224	 */
3225	if (p->flags & PF_EXITING)
3226		return;
3227
3228	if (!mm->numa_next_scan) {
3229		mm->numa_next_scan = now +
3230			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3231	}
3232
3233	/*
3234	 * Enforce maximal scan/migration frequency..
3235	 */
3236	migrate = mm->numa_next_scan;
3237	if (time_before(now, migrate))
3238		return;
3239
3240	if (p->numa_scan_period == 0) {
3241		p->numa_scan_period_max = task_scan_max(p);
3242		p->numa_scan_period = task_scan_start(p);
3243	}
3244
3245	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
3246	if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
3247		return;
3248
3249	/*
3250	 * Delay this task enough that another task of this mm will likely win
3251	 * the next time around.
3252	 */
3253	p->node_stamp += 2 * TICK_NSEC;
3254
3255	pages = sysctl_numa_balancing_scan_size;
3256	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
3257	virtpages = pages * 8;	   /* Scan up to this much virtual space */
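	/*
	 * For example, with the default 256MB scan size and 4K pages this is
	 * 65536 pages to update, while scanning at most 524288 pages (2GB)
	 * of virtual address space per pass.
	 */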
3258	if (!pages)
3259		return;
3260
3261
3262	if (!mmap_read_trylock(mm))
3263		return;
3264
3265	/*
3266	 * VMAs are skipped if the current PID has not trapped a fault within
3267	 * the VMA recently. Allow scanning to be forced if there is no
3268	 * suitable VMA remaining.
3269	 */
3270	vma_pids_skipped = false;
3271
3272retry_pids:
3273	start = mm->numa_scan_offset;
3274	vma_iter_init(&vmi, mm, start);
3275	vma = vma_next(&vmi);
3276	if (!vma) {
3277		reset_ptenuma_scan(p);
3278		start = 0;
3279		vma_iter_set(&vmi, start);
3280		vma = vma_next(&vmi);
3281	}
3282
3283	do {
3284		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
3285			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
3286			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
3287			continue;
3288		}
3289
3290		/*
3291		 * Shared library pages mapped by multiple processes are not
3292		 * migrated as it is expected they are cache replicated. Avoid
3293		 * hinting faults in read-only file-backed mappings or the vDSO
3294		 * as migrating the pages will be of marginal benefit.
3295		 */
3296		if (!vma->vm_mm ||
3297		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
3298			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
3299			continue;
3300		}
3301
3302		/*
3303		 * Skip inaccessible VMAs to avoid any confusion between
3304		 * PROT_NONE and NUMA hinting PTEs
3305		 */
3306		if (!vma_is_accessible(vma)) {
3307			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
3308			continue;
3309		}
3310
3311		/* Initialise new per-VMA NUMAB state. */
3312		if (!vma->numab_state) {
3313			vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
3314				GFP_KERNEL);
3315			if (!vma->numab_state)
3316				continue;
3317
3318			vma->numab_state->start_scan_seq = mm->numa_scan_seq;
3319
3320			vma->numab_state->next_scan = now +
3321				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3322
			/* Reset happens after 4 times the scan delay from scan start */
3324			vma->numab_state->pids_active_reset =  vma->numab_state->next_scan +
3325				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3326
3327			/*
3328			 * Ensure prev_scan_seq does not match numa_scan_seq,
3329			 * to prevent VMAs being skipped prematurely on the
3330			 * first scan:
3331			 */
3332			 vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
3333		}
3334
3335		/*
		 * Scanning the VMAs of short-lived tasks adds more overhead,
		 * so delay the scan for new VMAs.
3338		 */
3339		if (mm->numa_scan_seq && time_before(jiffies,
3340						vma->numab_state->next_scan)) {
3341			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
3342			continue;
3343		}
3344
3345		/* RESET access PIDs regularly for old VMAs. */
3346		if (mm->numa_scan_seq &&
3347				time_after(jiffies, vma->numab_state->pids_active_reset)) {
3348			vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
3349				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3350			vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
3351			vma->numab_state->pids_active[1] = 0;
3352		}
3353
3354		/* Do not rescan VMAs twice within the same sequence. */
3355		if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
3356			mm->numa_scan_offset = vma->vm_end;
3357			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
3358			continue;
3359		}
3360
3361		/*
		 * Do not scan the VMA if the task has not accessed it, unless
		 * no other VMA candidate exists.
3364		 */
3365		if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
3366			vma_pids_skipped = true;
3367			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
3368			continue;
3369		}
3370
3371		do {
3372			start = max(start, vma->vm_start);
3373			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
3374			end = min(end, vma->vm_end);
3375			nr_pte_updates = change_prot_numa(vma, start, end);
3376
3377			/*
			 * Try to scan sysctl_numa_balancing_scan_size worth of
3379			 * hpages that have at least one present PTE that
3380			 * is not already PTE-numa. If the VMA contains
3381			 * areas that are unused or already full of prot_numa
3382			 * PTEs, scan up to virtpages, to skip through those
3383			 * areas faster.
3384			 */
3385			if (nr_pte_updates)
3386				pages -= (end - start) >> PAGE_SHIFT;
3387			virtpages -= (end - start) >> PAGE_SHIFT;
3388
3389			start = end;
3390			if (pages <= 0 || virtpages <= 0)
3391				goto out;
3392
3393			cond_resched();
3394		} while (end != vma->vm_end);
3395
3396		/* VMA scan is complete, do not scan until next sequence. */
3397		vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
3398
3399		/*
3400		 * Only force scan within one VMA at a time, to limit the
3401		 * cost of scanning a potentially uninteresting VMA.
3402		 */
3403		if (vma_pids_forced)
3404			break;
3405	} for_each_vma(vmi, vma);
3406
3407	/*
3408	 * If no VMAs are remaining and VMAs were skipped due to the PID
3409	 * not accessing the VMA previously, then force a scan to ensure
3410	 * forward progress:
3411	 */
3412	if (!vma && !vma_pids_forced && vma_pids_skipped) {
3413		vma_pids_forced = true;
3414		goto retry_pids;
3415	}
3416
3417out:
3418	/*
3419	 * It is possible to reach the end of the VMA list but the last few
	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start, so check it now.
3423	 */
3424	if (vma)
3425		mm->numa_scan_offset = start;
3426	else
3427		reset_ptenuma_scan(p);
3428	mmap_read_unlock(mm);
3429
3430	/*
3431	 * Make sure tasks use at least 32x as much time to run other code
3432	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
3433	 * Usually update_task_scan_period slows down scanning enough; on an
3434	 * overloaded system we need to limit overhead on a per task basis.
3435	 */
3436	if (unlikely(p->se.sum_exec_runtime != runtime)) {
3437		u64 diff = p->se.sum_exec_runtime - runtime;
3438		p->node_stamp += 32 * diff;
3439	}
3440}
3441
3442void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
3443{
3444	int mm_users = 0;
3445	struct mm_struct *mm = p->mm;
3446
3447	if (mm) {
3448		mm_users = atomic_read(&mm->mm_users);
3449		if (mm_users == 1) {
3450			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3451			mm->numa_scan_seq = 0;
3452		}
3453	}
3454	p->node_stamp			= 0;
3455	p->numa_scan_seq		= mm ? mm->numa_scan_seq : 0;
3456	p->numa_scan_period		= sysctl_numa_balancing_scan_delay;
3457	p->numa_migrate_retry		= 0;
3458	/* Protect against double add, see task_tick_numa and task_numa_work */
3459	p->numa_work.next		= &p->numa_work;
3460	p->numa_faults			= NULL;
3461	p->numa_pages_migrated		= 0;
3462	p->total_numa_faults		= 0;
3463	RCU_INIT_POINTER(p->numa_group, NULL);
3464	p->last_task_numa_placement	= 0;
3465	p->last_sum_exec_runtime	= 0;
3466
3467	init_task_work(&p->numa_work, task_numa_work);
3468
3469	/* New address space, reset the preferred nid */
3470	if (!(clone_flags & CLONE_VM)) {
3471		p->numa_preferred_nid = NUMA_NO_NODE;
3472		return;
3473	}
3474
3475	/*
3476	 * New thread, keep existing numa_preferred_nid which should be copied
3477	 * already by arch_dup_task_struct but stagger when scans start.
3478	 */
3479	if (mm) {
3480		unsigned int delay;
3481
3482		delay = min_t(unsigned int, task_scan_max(current),
3483			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
3484		delay += 2 * TICK_NSEC;
3485		p->node_stamp = delay;
3486	}
3487}
3488
3489/*
3490 * Drive the periodic memory faults..
3491 */
3492static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3493{
3494	struct callback_head *work = &curr->numa_work;
3495	u64 period, now;
3496
3497	/*
3498	 * We don't care about NUMA placement if we don't have memory.
3499	 */
3500	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
3501		return;
3502
3503	/*
3504	 * Using runtime rather than walltime has the dual advantage that
3505	 * we (mostly) drive the selection from busy threads and that the
3506	 * task needs to have done some actual work before we bother with
3507	 * NUMA placement.
3508	 */
3509	now = curr->se.sum_exec_runtime;
3510	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
3511
3512	if (now > curr->node_stamp + period) {
3513		if (!curr->node_stamp)
3514			curr->numa_scan_period = task_scan_start(curr);
3515		curr->node_stamp += period;
3516
3517		if (!time_before(jiffies, curr->mm->numa_next_scan))
3518			task_work_add(curr, work, TWA_RESUME);
3519	}
3520}
3521
3522static void update_scan_period(struct task_struct *p, int new_cpu)
3523{
3524	int src_nid = cpu_to_node(task_cpu(p));
3525	int dst_nid = cpu_to_node(new_cpu);
3526
3527	if (!static_branch_likely(&sched_numa_balancing))
3528		return;
3529
3530	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
3531		return;
3532
3533	if (src_nid == dst_nid)
3534		return;
3535
3536	/*
3537	 * Allow resets if faults have been trapped before one scan
3538	 * has completed. This is most likely due to a new task that
3539	 * is pulled cross-node due to wakeups or load balancing.
3540	 */
3541	if (p->numa_scan_seq) {
3542		/*
3543		 * Avoid scan adjustments if moving to the preferred
3544		 * node or if the task was not previously running on
3545		 * the preferred node.
3546		 */
3547		if (dst_nid == p->numa_preferred_nid ||
3548		    (p->numa_preferred_nid != NUMA_NO_NODE &&
3549			src_nid != p->numa_preferred_nid))
3550			return;
3551	}
3552
3553	p->numa_scan_period = task_scan_start(p);
3554}
3555
3556#else
3557static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3558{
3559}
3560
3561static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
3562{
3563}
3564
3565static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
3566{
3567}
3568
3569static inline void update_scan_period(struct task_struct *p, int new_cpu)
3570{
3571}
3572
3573#endif /* CONFIG_NUMA_BALANCING */
3574
3575static void
3576account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3577{
3578	update_load_add(&cfs_rq->load, se->load.weight);
3579#ifdef CONFIG_SMP
3580	if (entity_is_task(se)) {
3581		struct rq *rq = rq_of(cfs_rq);
3582
3583		account_numa_enqueue(rq, task_of(se));
3584		list_add(&se->group_node, &rq->cfs_tasks);
3585	}
3586#endif
3587	cfs_rq->nr_running++;
3588	if (se_is_idle(se))
3589		cfs_rq->idle_nr_running++;
3590}
3591
3592static void
3593account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3594{
3595	update_load_sub(&cfs_rq->load, se->load.weight);
3596#ifdef CONFIG_SMP
3597	if (entity_is_task(se)) {
3598		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
3599		list_del_init(&se->group_node);
3600	}
3601#endif
3602	cfs_rq->nr_running--;
3603	if (se_is_idle(se))
3604		cfs_rq->idle_nr_running--;
3605}
3606
3607/*
3608 * Signed add and clamp on underflow.
3609 *
3610 * Explicitly do a load-store to ensure the intermediate value never hits
3611 * memory. This allows lockless observations without ever seeing the negative
3612 * values.
3613 */
3614#define add_positive(_ptr, _val) do {                           \
3615	typeof(_ptr) ptr = (_ptr);                              \
3616	typeof(_val) val = (_val);                              \
3617	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
3618								\
3619	res = var + val;                                        \
3620								\
3621	if (val < 0 && res > var)                               \
3622		res = 0;                                        \
3623								\
3624	WRITE_ONCE(*ptr, res);                                  \
3625} while (0)
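/*
 * For instance, add_positive() on an unsigned long holding 3 with a signed
 * delta of -7 would wrap res to a huge value; the (val < 0 && res > var)
 * check catches that and stores 0 instead.
 */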
3626
3627/*
3628 * Unsigned subtract and clamp on underflow.
3629 *
3630 * Explicitly do a load-store to ensure the intermediate value never hits
3631 * memory. This allows lockless observations without ever seeing the negative
3632 * values.
3633 */
3634#define sub_positive(_ptr, _val) do {				\
3635	typeof(_ptr) ptr = (_ptr);				\
3636	typeof(*ptr) val = (_val);				\
3637	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
3638	res = var - val;					\
3639	if (res > var)						\
3640		res = 0;					\
3641	WRITE_ONCE(*ptr, res);					\
3642} while (0)
3643
3644/*
3645 * Remove and clamp on negative, from a local variable.
3646 *
3647 * A variant of sub_positive(), which does not use explicit load-store
3648 * and is thus optimized for local variable updates.
3649 */
3650#define lsub_positive(_ptr, _val) do {				\
3651	typeof(_ptr) ptr = (_ptr);				\
3652	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
3653} while (0)
3654
3655#ifdef CONFIG_SMP
3656static inline void
3657enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3658{
3659	cfs_rq->avg.load_avg += se->avg.load_avg;
3660	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3661}
3662
3663static inline void
3664dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3665{
3666	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3667	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
3668	/* See update_cfs_rq_load_avg() */
3669	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3670					  cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
3671}
3672#else
3673static inline void
3674enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3675static inline void
3676dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3677#endif
3678
3679static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
3680			   unsigned long weight)
3681{
3682	unsigned long old_weight = se->load.weight;
3683	s64 vlag, vslice;
3684
3685	/*
3686	 * VRUNTIME
3687	 * --------
3688	 *
3689	 * COROLLARY #1: The virtual runtime of the entity needs to be
	 * adjusted if we re-weight at a !0-lag point.
	 *
	 * Proof: For contradiction, assume this is not true, so we can
	 * re-weight without changing vruntime at a !0-lag point.
3694	 *
3695	 *             Weight	VRuntime   Avg-VRuntime
3696	 *     before    w          v            V
3697	 *      after    w'         v'           V'
3698	 *
3699	 * Since lag needs to be preserved through re-weight:
3700	 *
3701	 *	lag = (V - v)*w = (V'- v')*w', where v = v'
3702	 *	==>	V' = (V - v)*w/w' + v		(1)
3703	 *
3704	 * Let W be the total weight of the entities before reweight,
3705	 * since V' is the new weighted average of entities:
3706	 *
3707	 *	V' = (WV + w'v - wv) / (W + w' - w)	(2)
3708	 *
3709	 * by using (1) & (2) we obtain:
3710	 *
3711	 *	(WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
3712	 *	==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
3713	 *	==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
3714	 *	==>	(V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
3715	 *
	 * Since we are re-weighting at a !0-lag point, which means V != v,
	 * we can simplify (3):
3718	 *
3719	 *	==>	W / (W + w' - w) = w / w'
3720	 *	==>	Ww' = Ww + ww' - ww
3721	 *	==>	W * (w' - w) = w * (w' - w)
3722	 *	==>	W = w	(re-weight indicates w' != w)
3723	 *
	 * So the cfs_rq contains only one entity, hence the vruntime of
	 * the entity @v should always be equal to the cfs_rq's weighted
	 * average vruntime @V, which means we would always re-weight at
	 * the 0-lag point, thus breaching the assumption. Proof completed.
3728	 *
3729	 *
3730	 * COROLLARY #2: Re-weight does NOT affect weighted average
3731	 * vruntime of all the entities.
3732	 *
3733	 * Proof: According to corollary #1, Eq. (1) should be:
3734	 *
3735	 *	(V - v)*w = (V' - v')*w'
3736	 *	==>    v' = V' - (V - v)*w/w'		(4)
3737	 *
3738	 * According to the weighted average formula, we have:
3739	 *
3740	 *	V' = (WV - wv + w'v') / (W - w + w')
3741	 *	   = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
3742	 *	   = (WV - wv + w'V' - Vw + wv) / (W - w + w')
3743	 *	   = (WV + w'V' - Vw) / (W - w + w')
3744	 *
3745	 *	==>  V'*(W - w + w') = WV + w'V' - Vw
3746	 *	==>	V' * (W - w) = (W - w) * V	(5)
3747	 *
	 * If the entity is the only one in the cfs_rq, then reweight always
	 * occurs at the 0-lag point, so V won't change. Otherwise there are
	 * other entities, hence W != w, and Eq. (5) turns into V' = V. So V
	 * won't change in either case, proof done.
3752	 *
3753	 *
3754	 * So according to corollary #1 & #2, the effect of re-weight
3755	 * on vruntime should be:
3756	 *
3757	 *	v' = V' - (V - v) * w / w'		(4)
3758	 *	   = V  - (V - v) * w / w'
3759	 *	   = V  - vl * w / w'
3760	 *	   = V  - vl'
3761	 */
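	/*
	 * Worked example (illustrative numbers only): with V = 100, v = 80,
	 * w = 2 and w' = 4 we have vl = V - v = 20, so the code below scales
	 * it to vl' = vl * w / w' = 10 and sets v' = V - vl' = 90, preserving
	 * lag = (V - v)*w = (V - v')*w' = 40.
	 */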
3762	if (avruntime != se->vruntime) {
3763		vlag = entity_lag(avruntime, se);
3764		vlag = div_s64(vlag * old_weight, weight);
3765		se->vruntime = avruntime - vlag;
3766	}
3767
3768	/*
3769	 * DEADLINE
3770	 * --------
3771	 *
3772	 * When the weight changes, the virtual time slope changes and
3773	 * we should adjust the relative virtual deadline accordingly.
3774	 *
3775	 *	d' = v' + (d - v)*w/w'
3776	 *	   = V' - (V - v)*w/w' + (d - v)*w/w'
3777	 *	   = V  - (V - v)*w/w' + (d - v)*w/w'
3778	 *	   = V  + (d - V)*w/w'
3779	 */
3780	vslice = (s64)(se->deadline - avruntime);
3781	vslice = div_s64(vslice * old_weight, weight);
3782	se->deadline = avruntime + vslice;
3783}
3784
3785static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
3786			    unsigned long weight)
3787{
3788	bool curr = cfs_rq->curr == se;
3789	u64 avruntime;
3790
3791	if (se->on_rq) {
3792		/* commit outstanding execution time */
3793		update_curr(cfs_rq);
3794		avruntime = avg_vruntime(cfs_rq);
3795		if (!curr)
3796			__dequeue_entity(cfs_rq, se);
3797		update_load_sub(&cfs_rq->load, se->load.weight);
3798	}
3799	dequeue_load_avg(cfs_rq, se);
3800
3801	if (se->on_rq) {
3802		reweight_eevdf(se, avruntime, weight);
3803	} else {
3804		/*
3805		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
3806		 * we need to scale se->vlag when w_i changes.
3807		 */
3808		se->vlag = div_s64(se->vlag * se->load.weight, weight);
3809	}
3810
3811	update_load_set(&se->load, weight);
3812
3813#ifdef CONFIG_SMP
3814	do {
3815		u32 divider = get_pelt_divider(&se->avg);
3816
3817		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
3818	} while (0);
3819#endif
3820
3821	enqueue_load_avg(cfs_rq, se);
3822	if (se->on_rq) {
3823		update_load_add(&cfs_rq->load, se->load.weight);
3824		if (!curr)
3825			__enqueue_entity(cfs_rq, se);
3826
3827		/*
3828		 * The entity's vruntime has been adjusted, so let's check
		 * whether the rq-wide min_vruntime needs to be updated too. Since
3830		 * the calculations above require stable min_vruntime rather
3831		 * than up-to-date one, we do the update at the end of the
3832		 * reweight process.
3833		 */
3834		update_min_vruntime(cfs_rq);
3835	}
3836}
3837
3838void reweight_task(struct task_struct *p, int prio)
3839{
3840	struct sched_entity *se = &p->se;
3841	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3842	struct load_weight *load = &se->load;
3843	unsigned long weight = scale_load(sched_prio_to_weight[prio]);
3844
3845	reweight_entity(cfs_rq, se, weight);
3846	load->inv_weight = sched_prio_to_wmult[prio];
3847}
3848
3849static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3850
3851#ifdef CONFIG_FAIR_GROUP_SCHED
3852#ifdef CONFIG_SMP
3853/*
3854 * All this does is approximate the hierarchical proportion which includes that
3855 * global sum we all love to hate.
3856 *
 * That is, the weight of a group entity is the proportional share of the
3858 * group weight based on the group runqueue weights. That is:
3859 *
3860 *                     tg->weight * grq->load.weight
3861 *   ge->load.weight = -----------------------------               (1)
3862 *                       \Sum grq->load.weight
3863 *
 * Now, because that sum is prohibitively expensive to compute (been there,
 * done that), we approximate it with this average stuff. The average moves
 * slower and therefore the approximation is cheaper and more stable.
3867 *
3868 * So instead of the above, we substitute:
3869 *
3870 *   grq->load.weight -> grq->avg.load_avg                         (2)
3871 *
3872 * which yields the following:
3873 *
3874 *                     tg->weight * grq->avg.load_avg
3875 *   ge->load.weight = ------------------------------              (3)
3876 *                             tg->load_avg
3877 *
3878 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
3879 *
3880 * That is shares_avg, and it is right (given the approximation (2)).
3881 *
3882 * The problem with it is that because the average is slow -- it was designed
3883 * to be exactly that of course -- this leads to transients in boundary
 * conditions. Specifically, the case where the group was idle and we start
 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
 * yielding bad latency etc..
3887 *
3888 * Now, in that special case (1) reduces to:
3889 *
3890 *                     tg->weight * grq->load.weight
3891 *   ge->load.weight = ----------------------------- = tg->weight   (4)
 *                         grq->load.weight
3893 *
3894 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
3895 *
3896 * So what we do is modify our approximation (3) to approach (4) in the (near)
3897 * UP case, like:
3898 *
3899 *   ge->load.weight =
3900 *
3901 *              tg->weight * grq->load.weight
3902 *     ---------------------------------------------------         (5)
3903 *     tg->load_avg - grq->avg.load_avg + grq->load.weight
3904 *
3905 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
3906 * we need to use grq->avg.load_avg as its lower bound, which then gives:
3907 *
3908 *
3909 *                     tg->weight * grq->load.weight
3910 *   ge->load.weight = -----------------------------		   (6)
3911 *                             tg_load_avg'
3912 *
3913 * Where:
3914 *
3915 *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
3916 *                  max(grq->load.weight, grq->avg.load_avg)
3917 *
3918 * And that is shares_weight and is icky. In the (near) UP case it approaches
3919 * (4) while in the normal case it approaches (3). It consistently
3920 * overestimates the ge->load.weight and therefore:
3921 *
3922 *   \Sum ge->load.weight >= tg->weight
3923 *
3924 * hence icky!
3925 */
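/*
 * Worked example (illustrative numbers only): tg->weight = 1024, two CPUs,
 * this CPU has grq->load.weight = 1024 but grq->avg.load_avg = 512 (a task
 * just woke up), the other CPU contributes 512, so tg->load_avg = 1024.
 * shares_avg (3) gives 1024*512/1024 = 512, while shares_weight (6) uses
 * tg_load_avg' = 1024 - 512 + max(1024, 512) = 1536 and gives
 * 1024*1024/1536 ~= 682, reacting faster to the newly woken task.
 */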
3926static long calc_group_shares(struct cfs_rq *cfs_rq)
3927{
3928	long tg_weight, tg_shares, load, shares;
3929	struct task_group *tg = cfs_rq->tg;
3930
3931	tg_shares = READ_ONCE(tg->shares);
3932
3933	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3934
3935	tg_weight = atomic_long_read(&tg->load_avg);
3936
3937	/* Ensure tg_weight >= load */
3938	tg_weight -= cfs_rq->tg_load_avg_contrib;
3939	tg_weight += load;
3940
3941	shares = (tg_shares * load);
3942	if (tg_weight)
3943		shares /= tg_weight;
3944
3945	/*
3946	 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3947	 * of a group with small tg->shares value. It is a floor value which is
3948	 * assigned as a minimum load.weight to the sched_entity representing
3949	 * the group on a CPU.
3950	 *
3951	 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3952	 * on an 8-core system with 8 tasks each runnable on one CPU shares has
3953	 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3954	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3955	 * instead of 0.
3956	 */
3957	return clamp_t(long, shares, MIN_SHARES, tg_shares);
3958}
3959#endif /* CONFIG_SMP */
3960
3961/*
3962 * Recomputes the group entity based on the current state of its group
3963 * runqueue.
3964 */
3965static void update_cfs_group(struct sched_entity *se)
3966{
3967	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3968	long shares;
3969
3970	if (!gcfs_rq)
3971		return;
3972
3973	if (throttled_hierarchy(gcfs_rq))
3974		return;
3975
3976#ifndef CONFIG_SMP
3977	shares = READ_ONCE(gcfs_rq->tg->shares);
3978#else
3979	shares = calc_group_shares(gcfs_rq);
3980#endif
3981	if (unlikely(se->load.weight != shares))
3982		reweight_entity(cfs_rq_of(se), se, shares);
3983}
3984
3985#else /* CONFIG_FAIR_GROUP_SCHED */
3986static inline void update_cfs_group(struct sched_entity *se)
3987{
3988}
3989#endif /* CONFIG_FAIR_GROUP_SCHED */
3990
3991static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3992{
3993	struct rq *rq = rq_of(cfs_rq);
3994
3995	if (&rq->cfs == cfs_rq) {
3996		/*
3997		 * There are a few boundary cases this might miss but it should
3998		 * get called often enough that that should (hopefully) not be
3999		 * a real problem.
4000		 *
4001		 * It will not get called when we go idle, because the idle
4002		 * thread is a different class (!fair), nor will the utilization
4003		 * number include things like RT tasks.
4004		 *
4005		 * As is, the util number is not freq-invariant (we'd have to
4006		 * implement arch_scale_freq_capacity() for that).
4007		 *
4008		 * See cpu_util_cfs().
4009		 */
4010		cpufreq_update_util(rq, flags);
4011	}
4012}
4013
4014#ifdef CONFIG_SMP
4015static inline bool load_avg_is_decayed(struct sched_avg *sa)
4016{
4017	if (sa->load_sum)
4018		return false;
4019
4020	if (sa->util_sum)
4021		return false;
4022
4023	if (sa->runnable_sum)
4024		return false;
4025
4026	/*
4027	 * _avg must be null when _sum are null because _avg = _sum / divider
4028	 * Make sure that rounding and/or propagation of PELT values never
4029	 * break this.
4030	 */
4031	SCHED_WARN_ON(sa->load_avg ||
4032		      sa->util_avg ||
4033		      sa->runnable_avg);
4034
4035	return true;
4036}
4037
4038static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4039{
4040	return u64_u32_load_copy(cfs_rq->avg.last_update_time,
4041				 cfs_rq->last_update_time_copy);
4042}
4043#ifdef CONFIG_FAIR_GROUP_SCHED
4044/*
4045 * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
4046 * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
4047 * bottom-up, we only have to test whether the cfs_rq before us on the list
4048 * is our child.
 * If cfs_rq is not on the list, test whether a child needs to be added to
 * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
4051 */
4052static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
4053{
4054	struct cfs_rq *prev_cfs_rq;
4055	struct list_head *prev;
4056
4057	if (cfs_rq->on_list) {
4058		prev = cfs_rq->leaf_cfs_rq_list.prev;
4059	} else {
4060		struct rq *rq = rq_of(cfs_rq);
4061
4062		prev = rq->tmp_alone_branch;
4063	}
4064
4065	prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
4066
4067	return (prev_cfs_rq->tg->parent == cfs_rq->tg);
4068}
4069
4070static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4071{
4072	if (cfs_rq->load.weight)
4073		return false;
4074
4075	if (!load_avg_is_decayed(&cfs_rq->avg))
4076		return false;
4077
4078	if (child_cfs_rq_on_list(cfs_rq))
4079		return false;
4080
4081	return true;
4082}
4083
4084/**
4085 * update_tg_load_avg - update the tg's load avg
4086 * @cfs_rq: the cfs_rq whose avg changed
4087 *
4088 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
4089 * However, because tg->load_avg is a global value there are performance
4090 * considerations.
4091 *
4092 * In order to avoid having to look at the other cfs_rq's, we use a
4093 * differential update where we store the last value we propagated. This in
4094 * turn allows skipping updates if the differential is 'small'.
4095 *
4096 * Updating tg's load_avg is necessary before update_cfs_share().
4097 */
4098static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
4099{
4100	long delta;
4101	u64 now;
4102
4103	/*
4104	 * No need to update load_avg for root_task_group as it is not used.
4105	 */
4106	if (cfs_rq->tg == &root_task_group)
4107		return;
4108
4109	/* rq has been offline and doesn't contribute to the share anymore: */
4110	if (!cpu_active(cpu_of(rq_of(cfs_rq))))
4111		return;
4112
4113	/*
4114	 * For migration heavy workloads, access to tg->load_avg can be
	 * unbounded. Limit the update rate to at most once per ms.
4116	 */
4117	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4118	if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
4119		return;
4120
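	/*
	 * Illustrative numbers: with tg_load_avg_contrib = 6400 the 1/64
	 * threshold below is 100, so a new cfs_rq->avg.load_avg of 6464
	 * (delta 64) is not propagated, while 6528 (delta 128) is.
	 */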
4121	delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
4122	if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
4123		atomic_long_add(delta, &cfs_rq->tg->load_avg);
4124		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
4125		cfs_rq->last_update_tg_load_avg = now;
4126	}
4127}
4128
4129static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
4130{
4131	long delta;
4132	u64 now;
4133
4134	/*
4135	 * No need to update load_avg for root_task_group, as it is not used.
4136	 */
4137	if (cfs_rq->tg == &root_task_group)
4138		return;
4139
4140	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4141	delta = 0 - cfs_rq->tg_load_avg_contrib;
4142	atomic_long_add(delta, &cfs_rq->tg->load_avg);
4143	cfs_rq->tg_load_avg_contrib = 0;
4144	cfs_rq->last_update_tg_load_avg = now;
4145}
4146
4147/* CPU offline callback: */
4148static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
4149{
4150	struct task_group *tg;
4151
4152	lockdep_assert_rq_held(rq);
4153
4154	/*
4155	 * The rq clock has already been updated in
4156	 * set_rq_offline(), so we should skip updating
4157	 * the rq clock again in unthrottle_cfs_rq().
4158	 */
4159	rq_clock_start_loop_update(rq);
4160
4161	rcu_read_lock();
4162	list_for_each_entry_rcu(tg, &task_groups, list) {
4163		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4164
4165		clear_tg_load_avg(cfs_rq);
4166	}
4167	rcu_read_unlock();
4168
4169	rq_clock_stop_loop_update(rq);
4170}
4171
4172/*
4173 * Called within set_task_rq() right before setting a task's CPU. The
4174 * caller only guarantees p->pi_lock is held; no other assumptions,
4175 * including the state of rq->lock, should be made.
4176 */
4177void set_task_rq_fair(struct sched_entity *se,
4178		      struct cfs_rq *prev, struct cfs_rq *next)
4179{
4180	u64 p_last_update_time;
4181	u64 n_last_update_time;
4182
4183	if (!sched_feat(ATTACH_AGE_LOAD))
4184		return;
4185
4186	/*
	 * We are supposed to update the task to "current" time, so that it's up
	 * to date and ready to go to the new CPU/cfs_rq. But we have difficulty
	 * in getting what the current time is, so simply throw away the
	 * out-of-date time. This will result in the wakee task being less
	 * decayed, but giving the wakee more load doesn't sound bad.
4192	 */
4193	if (!(se->avg.last_update_time && prev))
4194		return;
4195
4196	p_last_update_time = cfs_rq_last_update_time(prev);
4197	n_last_update_time = cfs_rq_last_update_time(next);
4198
4199	__update_load_avg_blocked_se(p_last_update_time, se);
4200	se->avg.last_update_time = n_last_update_time;
4201}
4202
4203/*
4204 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
4205 * propagate its contribution. The key to this propagation is the invariant
4206 * that for each group:
4207 *
4208 *   ge->avg == grq->avg						(1)
4209 *
4210 * _IFF_ we look at the pure running and runnable sums. Because they
4211 * represent the very same entity, just at different points in the hierarchy.
4212 *
4213 * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
4214 * and simply copies the running/runnable sum over (but still wrong, because
4215 * the group entity and group rq do not have their PELT windows aligned).
4216 *
4217 * However, update_tg_cfs_load() is more complex. So we have:
4218 *
4219 *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg		(2)
4220 *
4221 * And since, like util, the runnable part should be directly transferable,
 * the following would _appear_ to be the straightforward approach:
4223 *
4224 *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg	(3)
4225 *
4226 * And per (1) we have:
4227 *
4228 *   ge->avg.runnable_avg == grq->avg.runnable_avg
4229 *
4230 * Which gives:
4231 *
4232 *                      ge->load.weight * grq->avg.load_avg
4233 *   ge->avg.load_avg = -----------------------------------		(4)
4234 *                               grq->load.weight
4235 *
4236 * Except that is wrong!
4237 *
 * Because, while for entities historical weight is not important and we
 * really only care about the future and can therefore consider a pure
 * runnable sum, runqueues can NOT do this.
4241 *
4242 * We specifically want runqueues to have a load_avg that includes
4243 * historical weights. Those represent the blocked load, the load we expect
4244 * to (shortly) return to us. This only works by keeping the weights as
4245 * integral part of the sum. We therefore cannot decompose as per (3).
4246 *
4247 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
4248 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
4249 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
4250 * runnable section of these tasks overlap (or not). If they were to perfectly
4251 * align the rq as a whole would be runnable 2/3 of the time. If however we
4252 * always have at least 1 runnable task, the rq as a whole is always runnable.
4253 *
4254 * So we'll have to approximate.. :/
4255 *
4256 * Given the constraint:
4257 *
4258 *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
4259 *
4260 * We can construct a rule that adds runnable to a rq by assuming minimal
4261 * overlap.
4262 *
4263 * On removal, we'll assume each task is equally runnable; which yields:
4264 *
4265 *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
4266 *
4267 * XXX: only do this for the part of runnable > running ?
4268 *
4269 */
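/*
 * Worked example of the two rules above (illustrative numbers only): adding
 * 0.4 * LOAD_AVG_MAX of runnable to a group entity that already has
 * load_sum = 0.5 * LOAD_AVG_MAX yields 0.9 * LOAD_AVG_MAX under the
 * minimal-overlap assumption (clipped at LOAD_AVG_MAX). On removal, a grq
 * with a (scaled-down) load.weight of 2 and load_sum = LOAD_AVG_MAX is
 * assumed to have an unweighted runnable_sum of LOAD_AVG_MAX / 2.
 */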
4270static inline void
4271update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4272{
4273	long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
4274	u32 new_sum, divider;
4275
4276	/* Nothing to update */
4277	if (!delta_avg)
4278		return;
4279
4280	/*
4281	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4282	 * See ___update_load_avg() for details.
4283	 */
4284	divider = get_pelt_divider(&cfs_rq->avg);
4287	/* Set new sched_entity's utilization */
4288	se->avg.util_avg = gcfs_rq->avg.util_avg;
4289	new_sum = se->avg.util_avg * divider;
4290	delta_sum = (long)new_sum - (long)se->avg.util_sum;
4291	se->avg.util_sum = new_sum;
4292
4293	/* Update parent cfs_rq utilization */
4294	add_positive(&cfs_rq->avg.util_avg, delta_avg);
4295	add_positive(&cfs_rq->avg.util_sum, delta_sum);
4296
4297	/* See update_cfs_rq_load_avg() */
4298	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4299					  cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4300}
4301
4302static inline void
4303update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4304{
4305	long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
4306	u32 new_sum, divider;
4307
4308	/* Nothing to update */
4309	if (!delta_avg)
4310		return;
4311
4312	/*
4313	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4314	 * See ___update_load_avg() for details.
4315	 */
4316	divider = get_pelt_divider(&cfs_rq->avg);
4317
4318	/* Set new sched_entity's runnable */
4319	se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
4320	new_sum = se->avg.runnable_avg * divider;
4321	delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
4322	se->avg.runnable_sum = new_sum;
4323
4324	/* Update parent cfs_rq runnable */
4325	add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
4326	add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
4327	/* See update_cfs_rq_load_avg() */
4328	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4329					      cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4330}
4331
4332static inline void
4333update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4334{
4335	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
4336	unsigned long load_avg;
4337	u64 load_sum = 0;
4338	s64 delta_sum;
4339	u32 divider;
4340
4341	if (!runnable_sum)
4342		return;
4343
4344	gcfs_rq->prop_runnable_sum = 0;
4345
4346	/*
4347	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4348	 * See ___update_load_avg() for details.
4349	 */
4350	divider = get_pelt_divider(&cfs_rq->avg);
4351
4352	if (runnable_sum >= 0) {
4353		/*
4354		 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
4355		 * the CPU is saturated running == runnable.
4356		 */
4357		runnable_sum += se->avg.load_sum;
4358		runnable_sum = min_t(long, runnable_sum, divider);
4359	} else {
4360		/*
4361		 * Estimate the new unweighted runnable_sum of the gcfs_rq by
4362		 * assuming all tasks are equally runnable.
4363		 */
4364		if (scale_load_down(gcfs_rq->load.weight)) {
4365			load_sum = div_u64(gcfs_rq->avg.load_sum,
4366				scale_load_down(gcfs_rq->load.weight));
4367		}
4368
4369		/* But make sure to not inflate se's runnable */
4370		runnable_sum = min(se->avg.load_sum, load_sum);
4371	}
4372
4373	/*
4374	 * runnable_sum can't be lower than running_sum
4375	 * Rescale running sum to be in the same range as runnable sum
4376	 * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT]
4377	 * runnable_sum is in [0 : LOAD_AVG_MAX]
4378	 */
4379	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
4380	runnable_sum = max(runnable_sum, running_sum);
4381
4382	load_sum = se_weight(se) * runnable_sum;
4383	load_avg = div_u64(load_sum, divider);
4384
4385	delta_avg = load_avg - se->avg.load_avg;
4386	if (!delta_avg)
4387		return;
4388
4389	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
4390
4391	se->avg.load_sum = runnable_sum;
4392	se->avg.load_avg = load_avg;
4393	add_positive(&cfs_rq->avg.load_avg, delta_avg);
4394	add_positive(&cfs_rq->avg.load_sum, delta_sum);
4395	/* See update_cfs_rq_load_avg() */
4396	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
4397					  cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
4398}
4399
4400static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
4401{
4402	cfs_rq->propagate = 1;
4403	cfs_rq->prop_runnable_sum += runnable_sum;
4404}
4405
4406/* Update task and its cfs_rq load average */
4407static inline int propagate_entity_load_avg(struct sched_entity *se)
4408{
4409	struct cfs_rq *cfs_rq, *gcfs_rq;
4410
4411	if (entity_is_task(se))
4412		return 0;
4413
4414	gcfs_rq = group_cfs_rq(se);
4415	if (!gcfs_rq->propagate)
4416		return 0;
4417
4418	gcfs_rq->propagate = 0;
4419
4420	cfs_rq = cfs_rq_of(se);
4421
4422	add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
4423
4424	update_tg_cfs_util(cfs_rq, se, gcfs_rq);
4425	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
4426	update_tg_cfs_load(cfs_rq, se, gcfs_rq);
4427
4428	trace_pelt_cfs_tp(cfs_rq);
4429	trace_pelt_se_tp(se);
4430
4431	return 1;
4432}
4433
4434/*
4435 * Check if we need to update the load and the utilization of a blocked
4436 * group_entity:
4437 */
4438static inline bool skip_blocked_update(struct sched_entity *se)
4439{
4440	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4441
4442	/*
	 * If the sched_entity still has non-zero load or utilization, we have
	 * to decay it:
4445	 */
4446	if (se->avg.load_avg || se->avg.util_avg)
4447		return false;
4448
4449	/*
4450	 * If there is a pending propagation, we have to update the load and
4451	 * the utilization of the sched_entity:
4452	 */
4453	if (gcfs_rq->propagate)
4454		return false;
4455
4456	/*
	 * Otherwise, the load and the utilization of the sched_entity are
	 * already zero and there is no pending propagation, so it would be a
	 * waste of time to try to decay it:
4460	 */
4461	return true;
4462}
4463
4464#else /* CONFIG_FAIR_GROUP_SCHED */
4465
4466static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
4467
4468static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
4469
4470static inline int propagate_entity_load_avg(struct sched_entity *se)
4471{
4472	return 0;
4473}
4474
4475static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
4476
4477#endif /* CONFIG_FAIR_GROUP_SCHED */
4478
4479#ifdef CONFIG_NO_HZ_COMMON
4480static inline void migrate_se_pelt_lag(struct sched_entity *se)
4481{
4482	u64 throttled = 0, now, lut;
4483	struct cfs_rq *cfs_rq;
4484	struct rq *rq;
4485	bool is_idle;
4486
4487	if (load_avg_is_decayed(&se->avg))
4488		return;
4489
4490	cfs_rq = cfs_rq_of(se);
4491	rq = rq_of(cfs_rq);
4492
4493	rcu_read_lock();
4494	is_idle = is_idle_task(rcu_dereference(rq->curr));
4495	rcu_read_unlock();
4496
4497	/*
4498	 * The lag estimation comes with a cost we don't want to pay all the
	 * time. Hence, limit it to the case where the source CPU is idle and
	 * we know we are at the greatest risk of having an outdated clock.
4501	 */
4502	if (!is_idle)
4503		return;
4504
4505	/*
4506	 * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
4507	 *
4508	 *   last_update_time (the cfs_rq's last_update_time)
4509	 *	= cfs_rq_clock_pelt()@cfs_rq_idle
4510	 *      = rq_clock_pelt()@cfs_rq_idle
4511	 *        - cfs->throttled_clock_pelt_time@cfs_rq_idle
4512	 *
4513	 *   cfs_idle_lag (delta between rq's update and cfs_rq's update)
4514	 *      = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
4515	 *
4516	 *   rq_idle_lag (delta between now and rq's update)
4517	 *      = sched_clock_cpu() - rq_clock()@rq_idle
4518	 *
4519	 * We can then write:
4520	 *
4521	 *    now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
4522	 *          sched_clock_cpu() - rq_clock()@rq_idle
4523	 * Where:
4524	 *      rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
4525	 *      rq_clock()@rq_idle      is rq->clock_idle
4526	 *      cfs->throttled_clock_pelt_time@cfs_rq_idle
4527	 *                              is cfs_rq->throttled_pelt_idle
4528	 */
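	/*
	 * In other words, the estimate built below is:
	 *
	 *    now ~= rq->clock_pelt_idle - cfs_rq->throttled_pelt_idle +
	 *           (sched_clock_cpu() - rq->clock_idle)
	 *
	 * clamped from below by cfs_rq->avg.last_update_time in case that is
	 * already more recent.
	 */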
4529
4530#ifdef CONFIG_CFS_BANDWIDTH
4531	throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
4532	/* The clock has been stopped for throttling */
4533	if (throttled == U64_MAX)
4534		return;
4535#endif
4536	now = u64_u32_load(rq->clock_pelt_idle);
4537	/*
	 * Paired with _update_idle_rq_clock_pelt(). It ensures that, in the
	 * worst case, we observe the old clock_pelt_idle value together with
	 * the new clock_idle, which leads to an underestimation. The opposite
	 * would lead to an overestimation.
4542	 */
4543	smp_rmb();
4544	lut = cfs_rq_last_update_time(cfs_rq);
4545
4546	now -= throttled;
4547	if (now < lut)
4548		/*
4549		 * cfs_rq->avg.last_update_time is more recent than our
4550		 * estimation, let's use it.
4551		 */
4552		now = lut;
4553	else
4554		now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
4555
4556	__update_load_avg_blocked_se(now, se);
4557}
4558#else
4559static void migrate_se_pelt_lag(struct sched_entity *se) {}
4560#endif
4561
4562/**
4563 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
4564 * @now: current time, as per cfs_rq_clock_pelt()
4565 * @cfs_rq: cfs_rq to update
4566 *
4567 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
4568 * avg. The immediate corollary is that all (fair) tasks must be attached.
4569 *
4570 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
4571 *
4572 * Return: true if the load decayed or we removed load.
4573 *
4574 * Since both these conditions indicate a changed cfs_rq->avg.load we should
4575 * call update_tg_load_avg() when this function returns true.
4576 */
4577static inline int
4578update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
4579{
4580	unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
4581	struct sched_avg *sa = &cfs_rq->avg;
4582	int decayed = 0;
4583
4584	if (cfs_rq->removed.nr) {
4585		unsigned long r;
4586		u32 divider = get_pelt_divider(&cfs_rq->avg);
4587
4588		raw_spin_lock(&cfs_rq->removed.lock);
4589		swap(cfs_rq->removed.util_avg, removed_util);
4590		swap(cfs_rq->removed.load_avg, removed_load);
4591		swap(cfs_rq->removed.runnable_avg, removed_runnable);
4592		cfs_rq->removed.nr = 0;
4593		raw_spin_unlock(&cfs_rq->removed.lock);
4594
4595		r = removed_load;
4596		sub_positive(&sa->load_avg, r);
4597		sub_positive(&sa->load_sum, r * divider);
4598		/* See sa->util_sum below */
4599		sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
4600
4601		r = removed_util;
4602		sub_positive(&sa->util_avg, r);
4603		sub_positive(&sa->util_sum, r * divider);
4604		/*
		 * Because of rounding, se->util_sum might end up being +1 more than
		 * cfs->util_sum. Although this is not a problem by itself, detaching
		 * a lot of tasks with this rounding problem between 2 updates of
		 * util_avg (~1ms) can make cfs->util_sum become null whereas
		 * cfs->util_avg is not.
4610		 * Check that util_sum is still above its lower bound for the new
4611		 * util_avg. Given that period_contrib might have moved since the last
4612		 * sync, we are only sure that util_sum must be above or equal to
4613		 *    util_avg * minimum possible divider
4614		 */
4615		sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
4616
4617		r = removed_runnable;
4618		sub_positive(&sa->runnable_avg, r);
4619		sub_positive(&sa->runnable_sum, r * divider);
4620		/* See sa->util_sum above */
4621		sa->runnable_sum = max_t(u32, sa->runnable_sum,
4622					      sa->runnable_avg * PELT_MIN_DIVIDER);
4623
4624		/*
4625		 * removed_runnable is the unweighted version of removed_load so we
4626		 * can use it to estimate removed_load_sum.
4627		 */
4628		add_tg_cfs_propagate(cfs_rq,
4629			-(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
4630
4631		decayed = 1;
4632	}
4633
4634	decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
4635	u64_u32_store_copy(sa->last_update_time,
4636			   cfs_rq->last_update_time_copy,
4637			   sa->last_update_time);
4638	return decayed;
4639}
4640
4641/**
4642 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
4643 * @cfs_rq: cfs_rq to attach to
4644 * @se: sched_entity to attach
4645 *
4646 * Must call update_cfs_rq_load_avg() before this, since we rely on
4647 * cfs_rq->avg.last_update_time being current.
4648 */
4649static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4650{
4651	/*
4652	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4653	 * See ___update_load_avg() for details.
4654	 */
4655	u32 divider = get_pelt_divider(&cfs_rq->avg);
4656
4657	/*
4658	 * When we attach the @se to the @cfs_rq, we must align the decay
4659	 * window because without that, really weird and wonderful things can
4660	 * happen.
4661	 *
4662	 * XXX illustrate
4663	 */
4664	se->avg.last_update_time = cfs_rq->avg.last_update_time;
4665	se->avg.period_contrib = cfs_rq->avg.period_contrib;
4666
4667	/*
4668	 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
4669	 * period_contrib. This isn't strictly correct, but since we're
4670	 * entirely outside of the PELT hierarchy, nobody cares if we truncate
4671	 * _sum a little.
4672	 */
4673	se->avg.util_sum = se->avg.util_avg * divider;
4674
4675	se->avg.runnable_sum = se->avg.runnable_avg * divider;
4676
4677	se->avg.load_sum = se->avg.load_avg * divider;
4678	if (se_weight(se) < se->avg.load_sum)
4679		se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
4680	else
4681		se->avg.load_sum = 1;
4682
4683	enqueue_load_avg(cfs_rq, se);
4684	cfs_rq->avg.util_avg += se->avg.util_avg;
4685	cfs_rq->avg.util_sum += se->avg.util_sum;
4686	cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
4687	cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
4688
4689	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
4690
4691	cfs_rq_util_change(cfs_rq, 0);
4692
4693	trace_pelt_cfs_tp(cfs_rq);
4694}
4695
4696/**
4697 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
4698 * @cfs_rq: cfs_rq to detach from
4699 * @se: sched_entity to detach
4700 *
4701 * Must call update_cfs_rq_load_avg() before this, since we rely on
4702 * cfs_rq->avg.last_update_time being current.
4703 */
4704static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4705{
4706	dequeue_load_avg(cfs_rq, se);
4707	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
4708	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
4709	/* See update_cfs_rq_load_avg() */
4710	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4711					  cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4712
4713	sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
4714	sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
4715	/* See update_cfs_rq_load_avg() */
4716	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4717					      cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4718
4719	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
4720
4721	cfs_rq_util_change(cfs_rq, 0);
4722
4723	trace_pelt_cfs_tp(cfs_rq);
4724}
4725
4726/*
4727 * Optional action to be done while updating the load average
4728 */
4729#define UPDATE_TG	0x1
4730#define SKIP_AGE_LOAD	0x2
4731#define DO_ATTACH	0x4
4732#define DO_DETACH	0x8
4733
4734/* Update task and its cfs_rq load average */
4735static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4736{
4737	u64 now = cfs_rq_clock_pelt(cfs_rq);
4738	int decayed;
4739
4740	/*
	 * Track the task load average for carrying it to the new CPU after
	 * migration, and track the group sched_entity load average for
	 * task_h_load calculations during migration.
4743	 */
4744	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
4745		__update_load_avg_se(now, cfs_rq, se);
4746
4747	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
4748	decayed |= propagate_entity_load_avg(se);
4749
4750	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
4751
4752		/*
4753		 * DO_ATTACH means we're here from enqueue_entity().
4754		 * !last_update_time means we've passed through
4755		 * migrate_task_rq_fair() indicating we migrated.
4756		 *
4757		 * IOW we're enqueueing a task on a new CPU.
4758		 */
4759		attach_entity_load_avg(cfs_rq, se);
4760		update_tg_load_avg(cfs_rq);
4761
4762	} else if (flags & DO_DETACH) {
4763		/*
4764		 * DO_DETACH means we're here from dequeue_entity()
4765		 * and we are migrating task out of the CPU.
4766		 */
4767		detach_entity_load_avg(cfs_rq, se);
4768		update_tg_load_avg(cfs_rq);
4769	} else if (decayed) {
4770		cfs_rq_util_change(cfs_rq, 0);
4771
4772		if (flags & UPDATE_TG)
4773			update_tg_load_avg(cfs_rq);
4774	}
4775}
4776
4777/*
4778 * Synchronize entity load avg of dequeued entity without locking
4779 * the previous rq.
4780 */
4781static void sync_entity_load_avg(struct sched_entity *se)
4782{
4783	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4784	u64 last_update_time;
4785
4786	last_update_time = cfs_rq_last_update_time(cfs_rq);
4787	__update_load_avg_blocked_se(last_update_time, se);
4788}
4789
4790/*
4791 * Task first catches up with cfs_rq, and then subtract
4792 * itself from the cfs_rq (task must be off the queue now).
4793 */
4794static void remove_entity_load_avg(struct sched_entity *se)
4795{
4796	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4797	unsigned long flags;
4798
4799	/*
4800	 * tasks cannot exit without having gone through wake_up_new_task() ->
4801	 * enqueue_task_fair() which will have added things to the cfs_rq,
4802	 * so we can remove unconditionally.
4803	 */
4804
4805	sync_entity_load_avg(se);
4806
4807	raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
4808	++cfs_rq->removed.nr;
4809	cfs_rq->removed.util_avg	+= se->avg.util_avg;
4810	cfs_rq->removed.load_avg	+= se->avg.load_avg;
4811	cfs_rq->removed.runnable_avg	+= se->avg.runnable_avg;
4812	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
4813}
4814
4815static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
4816{
4817	return cfs_rq->avg.runnable_avg;
4818}
4819
4820static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4821{
4822	return cfs_rq->avg.load_avg;
4823}
4824
4825static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
4826
4827static inline unsigned long task_util(struct task_struct *p)
4828{
4829	return READ_ONCE(p->se.avg.util_avg);
4830}
4831
4832static inline unsigned long task_runnable(struct task_struct *p)
4833{
4834	return READ_ONCE(p->se.avg.runnable_avg);
4835}
4836
4837static inline unsigned long _task_util_est(struct task_struct *p)
4838{
4839	return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
4840}
4841
4842static inline unsigned long task_util_est(struct task_struct *p)
4843{
4844	return max(task_util(p), _task_util_est(p));
4845}
4846
4847static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
4848				    struct task_struct *p)
4849{
4850	unsigned int enqueued;
4851
4852	if (!sched_feat(UTIL_EST))
4853		return;
4854
4855	/* Update root cfs_rq's estimated utilization */
4856	enqueued  = cfs_rq->avg.util_est;
4857	enqueued += _task_util_est(p);
4858	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4859
4860	trace_sched_util_est_cfs_tp(cfs_rq);
4861}
4862
4863static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
4864				    struct task_struct *p)
4865{
4866	unsigned int enqueued;
4867
4868	if (!sched_feat(UTIL_EST))
4869		return;
4870
4871	/* Update root cfs_rq's estimated utilization */
4872	enqueued  = cfs_rq->avg.util_est;
4873	enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
4874	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4875
4876	trace_sched_util_est_cfs_tp(cfs_rq);
4877}
4878
4879#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
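/* With SCHED_CAPACITY_SCALE = 1024 this margin is 10, i.e. roughly 1% of capacity. */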
4880
4881static inline void util_est_update(struct cfs_rq *cfs_rq,
4882				   struct task_struct *p,
4883				   bool task_sleep)
4884{
4885	unsigned int ewma, dequeued, last_ewma_diff;
4886
4887	if (!sched_feat(UTIL_EST))
4888		return;
4889
4890	/*
4891	 * Skip update of task's estimated utilization when the task has not
4892	 * yet completed an activation, e.g. being migrated.
4893	 */
4894	if (!task_sleep)
4895		return;
4896
4897	/* Get current estimate of utilization */
4898	ewma = READ_ONCE(p->se.avg.util_est);
4899
4900	/*
4901	 * If the PELT values haven't changed since enqueue time,
4902	 * skip the util_est update.
4903	 */
4904	if (ewma & UTIL_AVG_UNCHANGED)
4905		return;
4906
4907	/* Get utilization at dequeue */
4908	dequeued = task_util(p);
4909
4910	/*
4911	 * Reset EWMA on utilization increases, the moving average is used only
4912	 * to smooth utilization decreases.
4913	 */
4914	if (ewma <= dequeued) {
4915		ewma = dequeued;
4916		goto done;
4917	}
4918
4919	/*
	 * Skip the update of the task's estimated utilization when it is
	 * already within ~1% of its last activation value.
4922	 */
4923	last_ewma_diff = ewma - dequeued;
4924	if (last_ewma_diff < UTIL_EST_MARGIN)
4925		goto done;
4926
4927	/*
	 * To avoid overestimation of the actual task utilization, skip updates
	 * if we cannot guarantee that there is idle time on this CPU.
4930	 */
4931	if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
4932		return;
4933
4934	/*
	 * To avoid underestimating task utilization, skip EWMA updates if we
	 * cannot guarantee that the thread got all the CPU time it wanted.
4937	 */
4938	if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
4939		goto done;
4942	/*
4943	 * Update Task's estimated utilization
4944	 *
4945	 * When *p completes an activation we can consolidate another sample
4946	 * of the task size. This is done by using this value to update the
4947	 * Exponential Weighted Moving Average (EWMA):
4948	 *
4949	 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
4950	 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
4951	 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
4952	 *          = w * (      -last_ewma_diff           ) +     ewma(t-1)
4953	 *          = w * (-last_ewma_diff +  ewma(t-1) / w)
4954	 *
4955	 * Where 'w' is the weight of new samples, which is configured to be
4956	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4957	 */
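	/*
	 * E.g. (illustrative numbers): ewma(t-1) = 400 and dequeued = 300
	 * give last_ewma_diff = 100, so the shift sequence below computes
	 * ((400 << 2) - 100) >> 2 = 375, i.e. 0.25*300 + 0.75*400.
	 */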
4958	ewma <<= UTIL_EST_WEIGHT_SHIFT;
4959	ewma  -= last_ewma_diff;
4960	ewma >>= UTIL_EST_WEIGHT_SHIFT;
4961done:
4962	ewma |= UTIL_AVG_UNCHANGED;
4963	WRITE_ONCE(p->se.avg.util_est, ewma);
4964
4965	trace_sched_util_est_se_tp(&p->se);
4966}
4967
4968static inline unsigned long get_actual_cpu_capacity(int cpu)
4969{
4970	unsigned long capacity = arch_scale_cpu_capacity(cpu);
4971
4972	capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
4973
4974	return capacity;
4975}
4976
4977static inline int util_fits_cpu(unsigned long util,
4978				unsigned long uclamp_min,
4979				unsigned long uclamp_max,
4980				int cpu)
4981{
4982	unsigned long capacity = capacity_of(cpu);
4983	unsigned long capacity_orig;
4984	bool fits, uclamp_max_fits;
4985
4986	/*
4987	 * Check if the real util fits without any uclamp boost/cap applied.
4988	 */
4989	fits = fits_capacity(util, capacity);
4990
4991	if (!uclamp_is_used())
4992		return fits;
4993
4994	/*
4995	 * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
4996	 * uclamp_max. We only care about capacity pressure (by using
4997	 * capacity_of()) for comparing against the real util.
4998	 *
4999	 * If a task is boosted to 1024 for example, we don't want a tiny
5000	 * pressure to skew the check whether it fits a CPU or not.
5001	 *
5002	 * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
5003	 * should fit a little cpu even if there's some pressure.
5004	 *
5005	 * Only exception is for HW or cpufreq pressure since it has a direct impact
5006	 * on available OPP of the system.
5007	 *
5008	 * We honour it for uclamp_min only as a drop in performance level
5009	 * could result in not getting the requested minimum performance level.
5010	 *
5011	 * For uclamp_max, we can tolerate a drop in performance level as the
5012	 * goal is to cap the task. So it's okay if it's getting less.
5013	 */
5014	capacity_orig = arch_scale_cpu_capacity(cpu);
5015
5016	/*
5017	 * We want to force a task to fit a cpu as implied by uclamp_max.
5018	 * But we do have some corner cases to cater for..
5019	 *
5020	 *
5021	 *                                 C=z
5022	 *   |                             ___
5023	 *   |                  C=y       |   |
5024	 *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _  uclamp_max
5025	 *   |      C=x        |   |      |   |
5026	 *   |      ___        |   |      |   |
5027	 *   |     |   |       |   |      |   |    (util somewhere in this region)
5028	 *   |     |   |       |   |      |   |
5029	 *   |     |   |       |   |      |   |
5030	 *   +----------------------------------------
5031	 *         CPU0        CPU1       CPU2
5032	 *
5033	 *   In the above example if a task is capped to a specific performance
5034	 *   point, y, then when:
5035	 *
5036	 *   * util = 80% of x then it does not fit on CPU0 and should migrate
5037	 *     to CPU1
5038	 *   * util = 80% of y then it is forced to fit on CPU1 to honour
5039	 *     uclamp_max request.
5040	 *
5041	 *   which is what we're enforcing here. A task always fits if
5042	 *   uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
	 *   the normal upmigration rules should still apply.
5044	 *
5045	 *   Only exception is when we are on max capacity, then we need to be
5046	 *   careful not to block overutilized state. This is so because:
5047	 *
5048	 *     1. There's no concept of capping at max_capacity! We can't go
5049	 *        beyond this performance level anyway.
5050	 *     2. The system is being saturated when we're operating near
	 *        max capacity, so it doesn't make sense to block overutilized.
5052	 */
5053	uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
5054	uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
5055	fits = fits || uclamp_max_fits;
5056
5057	/*
5058	 *
5059	 *                                 C=z
5060	 *   |                             ___       (region a, capped, util >= uclamp_max)
5061	 *   |                  C=y       |   |
5062	 *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
5063	 *   |      C=x        |   |      |   |
5064	 *   |      ___        |   |      |   |      (region b, uclamp_min <= util <= uclamp_max)
5065	 *   |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
5066	 *   |     |   |       |   |      |   |
5067	 *   |     |   |       |   |      |   |      (region c, boosted, util < uclamp_min)
5068	 *   +----------------------------------------
5069	 *         CPU0        CPU1       CPU2
5070	 *
5071	 * a) If util > uclamp_max, then we're capped, we don't care about
5072	 *    actual fitness value here. We only care if uclamp_max fits
5073	 *    capacity without taking margin/pressure into account.
5074	 *    See comment above.
5075	 *
5076	 * b) If uclamp_min <= util <= uclamp_max, then the normal
5077	 *    fits_capacity() rules apply. Except we need to ensure that we
5078	 *    enforce we remain within uclamp_max, see comment above.
5079	 *
5080	 * c) If util < uclamp_min, then we are boosted. Same as (b) but we
5081	 *    need to take into account the boosted value fits the CPU without
5082	 *    taking margin/pressure into account.
5083	 *
5084	 * Cases (a) and (b) are handled in the 'fits' variable already. We
5085	 * just need to consider an extra check for case (c) after ensuring we
5086	 * handle the case uclamp_min > uclamp_max.
5087	 */
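	/*
	 * E.g. for case (c) (illustrative numbers): util = 100 on a CPU with
	 * capacity_orig = 512 fits, but with uclamp_min = 600 and an actual
	 * capacity (after pressure) below 600 we return -1 so the caller can
	 * keep looking for a better-performing CPU.
	 */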
5088	uclamp_min = min(uclamp_min, uclamp_max);
5089	if (fits && (util < uclamp_min) &&
5090	    (uclamp_min > get_actual_cpu_capacity(cpu)))
5091		return -1;
5092
5093	return fits;
5094}
5095
5096static inline int task_fits_cpu(struct task_struct *p, int cpu)
5097{
5098	unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
5099	unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
5100	unsigned long util = task_util_est(p);
5101	/*
5102	 * Return true only if the cpu fully fits the task requirements, which
5103	 * include the utilization but also the performance hints.
5104	 */
5105	return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
5106}
5107
5108static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
5109{
5110	int cpu = cpu_of(rq);
5111
5112	if (!sched_asym_cpucap_active())
5113		return;
5114
5115	/*
	 * Does affinity allow us to go somewhere higher? Or are we on the
	 * biggest available CPU already? Or do we fit into this CPU?
5118	 */
5119	if (!p || (p->nr_cpus_allowed == 1) ||
5120	    (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
5121	    task_fits_cpu(p, cpu)) {
5122
5123		rq->misfit_task_load = 0;
5124		return;
5125	}
5126
5127	/*
5128	 * Make sure that misfit_task_load will not be null even if
5129	 * task_h_load() returns 0.
5130	 */
5131	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
5132}
5133
5134#else /* CONFIG_SMP */
5135
5136static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
5137{
5138	return !cfs_rq->nr_running;
5139}
5140
5141#define UPDATE_TG	0x0
5142#define SKIP_AGE_LOAD	0x0
5143#define DO_ATTACH	0x0
5144#define DO_DETACH	0x0
5145
5146static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
5147{
5148	cfs_rq_util_change(cfs_rq, 0);
5149}
5150
5151static inline void remove_entity_load_avg(struct sched_entity *se) {}
5152
5153static inline void
5154attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
5155static inline void
5156detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
5157
5158static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
5159{
5160	return 0;
5161}
5162
5163static inline void
5164util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
5165
5166static inline void
5167util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
5168
5169static inline void
5170util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
5171		bool task_sleep) {}
5172static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
5173
5174#endif /* CONFIG_SMP */
5175
5176static void
5177place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5178{
5179	u64 vslice, vruntime = avg_vruntime(cfs_rq);
5180	s64 lag = 0;
5181
5182	se->slice = sysctl_sched_base_slice;
5183	vslice = calc_delta_fair(se->slice, se);
5184
5185	/*
5186	 * Due to how V is constructed as the weighted average of entities,
5187	 * adding tasks with positive lag, or removing tasks with negative lag
	 * will move 'time' backwards; this can screw around with the lag of
	 * other tasks.
5190	 *
5191	 * EEVDF: placement strategy #1 / #2
5192	 */
5193	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
5194		struct sched_entity *curr = cfs_rq->curr;
5195		unsigned long load;
5196
5197		lag = se->vlag;
5198
5199		/*
5200		 * If we want to place a task and preserve lag, we have to
5201		 * consider the effect of the new entity on the weighted
5202		 * average and compensate for this, otherwise lag can quickly
5203		 * evaporate.
5204		 *
5205		 * Lag is defined as:
5206		 *
5207		 *   lag_i = S - s_i = w_i * (V - v_i)
5208		 *
5209		 * To avoid the 'w_i' term all over the place, we only track
5210		 * the virtual lag:
5211		 *
5212		 *   vl_i = V - v_i <=> v_i = V - vl_i
5213		 *
5214		 * And we take V to be the weighted average of all v:
5215		 *
5216		 *   V = (\Sum w_j*v_j) / W
5217		 *
5218		 * Where W is: \Sum w_j
5219		 *
5220		 * Then, the weighted average after adding an entity with lag
5221		 * vl_i is given by:
5222		 *
5223		 *   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
5224		 *      = (W*V + w_i*(V - vl_i)) / (W + w_i)
5225		 *      = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
5226		 *      = (V*(W + w_i) - w_i*l) / (W + w_i)
5227		 *      = V - w_i*vl_i / (W + w_i)
5228		 *
5229		 * And the actual lag after adding an entity with vl_i is:
5230		 *
5231		 *   vl'_i = V' - v_i
5232		 *         = V - w_i*vl_i / (W + w_i) - (V - vl_i)
5233		 *         = vl_i - w_i*vl_i / (W + w_i)
5234		 *
5235		 * Which is strictly less than vl_i. So in order to preserve lag
5236		 * we should inflate the lag before placement such that the
5237		 * effective lag after placement comes out right.
5238		 *
5239		 * As such, invert the above relation for vl'_i to get the vl_i
5240		 * we need to use such that the lag after placement is the lag
5241		 * we computed before dequeue.
5242		 *
5243		 *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
5244		 *         = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
5245		 *
5246		 *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
5247		 *                   = W*vl_i
5248		 *
5249		 *   vl_i = (W + w_i)*vl'_i / W
5250		 */
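	/*
	 * E.g. (illustrative numbers): with W = 2048 and w_i = 1024, a stored
	 * vl'_i of 6 is inflated below to vl_i = 3072*6/2048 = 9; after the
	 * entity is added, V moves by w_i*vl_i/(W + w_i) = 3 and the
	 * effective lag is back to 9 - 3 = 6, as intended.
	 */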
5251		load = cfs_rq->avg_load;
5252		if (curr && curr->on_rq)
5253			load += scale_load_down(curr->load.weight);
5254
5255		lag *= load + scale_load_down(se->load.weight);
5256		if (WARN_ON_ONCE(!load))
5257			load = 1;
5258		lag = div_s64(lag, load);
5259	}
5260
5261	se->vruntime = vruntime - lag;
5262
5263	/*
	 * When joining the competition, the existing tasks will be, on
	 * average, halfway through their slice; as such, start tasks off
	 * with half a slice to ease them into the competition.
5267	 */
5268	if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
5269		vslice /= 2;
5270
5271	/*
5272	 * EEVDF: vd_i = ve_i + r_i/w_i
5273	 */
5274	se->deadline = se->vruntime + vslice;
5275}
5276
5277static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
5278static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
5279
5280static inline bool cfs_bandwidth_used(void);
5281
5282static void
5283enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5284{
5285	bool curr = cfs_rq->curr == se;
5286
5287	/*
5288	 * If we're the current task, we must renormalise before calling
5289	 * update_curr().
5290	 */
5291	if (curr)
5292		place_entity(cfs_rq, se, flags);
5293
5294	update_curr(cfs_rq);
5295
5296	/*
5297	 * When enqueuing a sched_entity, we must:
5298	 *   - Update loads to have both entity and cfs_rq synced with now.
5299	 *   - For group_entity, update its runnable_weight to reflect the new
5300	 *     h_nr_running of its group cfs_rq.
5301	 *   - For group_entity, update its weight to reflect the new share of
5302	 *     its group cfs_rq
5303	 *   - Add its new weight to cfs_rq->load.weight
5304	 */
5305	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
5306	se_update_runnable(se);
5307	/*
5308	 * XXX update_load_avg() above will have attached us to the pelt sum;
5309	 * but update_cfs_group() here will re-adjust the weight and have to
5310	 * undo/redo all that. Seems wasteful.
5311	 */
5312	update_cfs_group(se);
5313
5314	/*
	 * XXX now that the entity has been re-weighted, and its lag adjusted,
5316	 * we can place the entity.
5317	 */
5318	if (!curr)
5319		place_entity(cfs_rq, se, flags);
5320
5321	account_entity_enqueue(cfs_rq, se);
5322
5323	/* Entity has migrated, no longer consider this task hot */
5324	if (flags & ENQUEUE_MIGRATED)
5325		se->exec_start = 0;
5326
5327	check_schedstat_required();
5328	update_stats_enqueue_fair(cfs_rq, se, flags);
5329	if (!curr)
5330		__enqueue_entity(cfs_rq, se);
5331	se->on_rq = 1;
5332
5333	if (cfs_rq->nr_running == 1) {
5334		check_enqueue_throttle(cfs_rq);
5335		if (!throttled_hierarchy(cfs_rq)) {
5336			list_add_leaf_cfs_rq(cfs_rq);
5337		} else {
5338#ifdef CONFIG_CFS_BANDWIDTH
5339			struct rq *rq = rq_of(cfs_rq);
5340
5341			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
5342				cfs_rq->throttled_clock = rq_clock(rq);
5343			if (!cfs_rq->throttled_clock_self)
5344				cfs_rq->throttled_clock_self = rq_clock(rq);
5345#endif
5346		}
5347	}
5348}
5349
5350static void __clear_buddies_next(struct sched_entity *se)
5351{
5352	for_each_sched_entity(se) {
5353		struct cfs_rq *cfs_rq = cfs_rq_of(se);
5354		if (cfs_rq->next != se)
5355			break;
5356
5357		cfs_rq->next = NULL;
5358	}
5359}
5360
5361static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
5362{
5363	if (cfs_rq->next == se)
5364		__clear_buddies_next(se);
5365}
5366
5367static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5368
5369static void
5370dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5371{
5372	int action = UPDATE_TG;
5373
5374	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
5375		action |= DO_DETACH;
5376
5377	/*
5378	 * Update run-time statistics of the 'current'.
5379	 */
5380	update_curr(cfs_rq);
5381
5382	/*
5383	 * When dequeuing a sched_entity, we must:
5384	 *   - Update loads to have both entity and cfs_rq synced with now.
5385	 *   - For group_entity, update its runnable_weight to reflect the new
5386	 *     h_nr_running of its group cfs_rq.
5387	 *   - Subtract its previous weight from cfs_rq->load.weight.
5388	 *   - For group entity, update its weight to reflect the new share
5389	 *     of its group cfs_rq.
5390	 */
5391	update_load_avg(cfs_rq, se, action);
5392	se_update_runnable(se);
5393
5394	update_stats_dequeue_fair(cfs_rq, se, flags);
5395
5396	clear_buddies(cfs_rq, se);
5397
5398	update_entity_lag(cfs_rq, se);
5399	if (se != cfs_rq->curr)
5400		__dequeue_entity(cfs_rq, se);
5401	se->on_rq = 0;
5402	account_entity_dequeue(cfs_rq, se);
5403
5404	/* return excess runtime on last dequeue */
5405	return_cfs_rq_runtime(cfs_rq);
5406
5407	update_cfs_group(se);
5408
5409	/*
5410	 * Now advance min_vruntime if @se was the entity holding it back,
5411	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
5412	 * put back on, and if we advance min_vruntime, we'll be placed back
5413	 * further than we started -- i.e. we'll be penalized.
5414	 */
5415	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
5416		update_min_vruntime(cfs_rq);
5417
5418	if (cfs_rq->nr_running == 0)
5419		update_idle_cfs_rq_clock_pelt(cfs_rq);
5420}
5421
5422static void
5423set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
5424{
5425	clear_buddies(cfs_rq, se);
5426
5427	/* 'current' is not kept within the tree. */
5428	if (se->on_rq) {
5429		/*
		 * Any task has to be enqueued before it gets to execute on
5431		 * a CPU. So account for the time it spent waiting on the
5432		 * runqueue.
5433		 */
5434		update_stats_wait_end_fair(cfs_rq, se);
5435		__dequeue_entity(cfs_rq, se);
5436		update_load_avg(cfs_rq, se, UPDATE_TG);
5437		/*
5438		 * HACK, stash a copy of deadline at the point of pick in vlag,
5439		 * which isn't used until dequeue.
5440		 */
5441		se->vlag = se->deadline;
5442	}
5443
5444	update_stats_curr_start(cfs_rq, se);
5445	cfs_rq->curr = se;
5446
5447	/*
5448	 * Track our maximum slice length, if the CPU's load is at
5449	 * least twice that of our own weight (i.e. don't track it
5450	 * when there are only lesser-weight tasks around):
5451	 */
5452	if (schedstat_enabled() &&
5453	    rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
5454		struct sched_statistics *stats;
5455
5456		stats = __schedstats_from_se(se);
5457		__schedstat_set(stats->slice_max,
5458				max((u64)stats->slice_max,
5459				    se->sum_exec_runtime - se->prev_sum_exec_runtime));
5460	}
5461
5462	se->prev_sum_exec_runtime = se->sum_exec_runtime;
5463}
5464
5465/*
5466 * Pick the next process, keeping these things in mind, in this order:
5467 * 1) keep things fair between processes/task groups
5468 * 2) pick the "next" process, since someone really wants that to run
5469 * 3) pick the "last" process, for cache locality
5470 * 4) do not run the "skip" process, if something else is available
5471 */
5472static struct sched_entity *
5473pick_next_entity(struct cfs_rq *cfs_rq)
5474{
5475	/*
5476	 * Enabling NEXT_BUDDY will affect latency but not fairness.
5477	 */
5478	if (sched_feat(NEXT_BUDDY) &&
5479	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
5480		return cfs_rq->next;
5481
5482	return pick_eevdf(cfs_rq);
5483}
5484
5485static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5486
5487static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
5488{
5489	/*
5490	 * If still on the runqueue then deactivate_task()
5491	 * was not called and update_curr() has to be done:
5492	 */
5493	if (prev->on_rq)
5494		update_curr(cfs_rq);
5495
5496	/* throttle cfs_rqs exceeding runtime */
5497	check_cfs_rq_runtime(cfs_rq);
5498
5499	if (prev->on_rq) {
5500		update_stats_wait_start_fair(cfs_rq, prev);
5501		/* Put 'current' back into the tree. */
5502		__enqueue_entity(cfs_rq, prev);
5503		/* in !on_rq case, update occurred at dequeue */
5504		update_load_avg(cfs_rq, prev, 0);
5505	}
5506	cfs_rq->curr = NULL;
5507}
5508
5509static void
5510entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
5511{
5512	/*
5513	 * Update run-time statistics of the 'current'.
5514	 */
5515	update_curr(cfs_rq);
5516
5517	/*
5518	 * Ensure that runnable average is periodically updated.
5519	 */
5520	update_load_avg(cfs_rq, curr, UPDATE_TG);
5521	update_cfs_group(curr);
5522
5523#ifdef CONFIG_SCHED_HRTICK
5524	/*
5525	 * queued ticks are scheduled to match the slice, so don't bother
5526	 * validating it and just reschedule.
5527	 */
5528	if (queued) {
5529		resched_curr(rq_of(cfs_rq));
5530		return;
5531	}
5532	/*
5533	 * don't let the period tick interfere with the hrtick preemption
5534	 */
5535	if (!sched_feat(DOUBLE_TICK) &&
5536			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
5537		return;
5538#endif
5539}
5540
5541
5542/**************************************************
5543 * CFS bandwidth control machinery
5544 */
5545
5546#ifdef CONFIG_CFS_BANDWIDTH
5547
5548#ifdef CONFIG_JUMP_LABEL
5549static struct static_key __cfs_bandwidth_used;
5550
5551static inline bool cfs_bandwidth_used(void)
5552{
5553	return static_key_false(&__cfs_bandwidth_used);
5554}
5555
5556void cfs_bandwidth_usage_inc(void)
5557{
5558	static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
5559}
5560
5561void cfs_bandwidth_usage_dec(void)
5562{
5563	static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
5564}
5565#else /* CONFIG_JUMP_LABEL */
5566static bool cfs_bandwidth_used(void)
5567{
5568	return true;
5569}
5570
5571void cfs_bandwidth_usage_inc(void) {}
5572void cfs_bandwidth_usage_dec(void) {}
5573#endif /* CONFIG_JUMP_LABEL */
5574
5575/*
5576 * default period for cfs group bandwidth.
5577 * default: 0.1s, units: nanoseconds
5578 */
5579static inline u64 default_cfs_period(void)
5580{
5581	return 100000000ULL;
5582}
5583
5584static inline u64 sched_cfs_bandwidth_slice(void)
5585{
5586	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
5587}
5588
5589/*
5590 * Replenish runtime according to assigned quota. We use sched_clock_cpu
5591 * directly instead of rq->clock to avoid adding additional synchronization
5592 * around rq->lock.
5593 *
5594 * requires cfs_b->lock
5595 */
5596void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
5597{
5598	s64 runtime;
5599
5600	if (unlikely(cfs_b->quota == RUNTIME_INF))
5601		return;
5602
5603	cfs_b->runtime += cfs_b->quota;
5604	runtime = cfs_b->runtime_snap - cfs_b->runtime;
5605	if (runtime > 0) {
5606		cfs_b->burst_time += runtime;
5607		cfs_b->nr_burst++;
5608	}
5609
5610	cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
5611	cfs_b->runtime_snap = cfs_b->runtime;
5612}
5613
5614static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5615{
5616	return &tg->cfs_bandwidth;
5617}
5618
5619/* returns 0 on failure to allocate runtime */
5620static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
5621				   struct cfs_rq *cfs_rq, u64 target_runtime)
5622{
5623	u64 min_amount, amount = 0;
5624
5625	lockdep_assert_held(&cfs_b->lock);
5626
5627	/* note: this is a positive sum as runtime_remaining <= 0 */
5628	min_amount = target_runtime - cfs_rq->runtime_remaining;
5629
5630	if (cfs_b->quota == RUNTIME_INF)
5631		amount = min_amount;
5632	else {
5633		start_cfs_bandwidth(cfs_b);
5634
5635		if (cfs_b->runtime > 0) {
5636			amount = min(cfs_b->runtime, min_amount);
5637			cfs_b->runtime -= amount;
5638			cfs_b->idle = 0;
5639		}
5640	}
5641
5642	cfs_rq->runtime_remaining += amount;
5643
5644	return cfs_rq->runtime_remaining > 0;
5645}
5646
5647/* returns 0 on failure to allocate runtime */
5648static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5649{
5650	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5651	int ret;
5652
5653	raw_spin_lock(&cfs_b->lock);
5654	ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
5655	raw_spin_unlock(&cfs_b->lock);
5656
5657	return ret;
5658}
5659
5660static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5661{
5662	/* dock delta_exec before expiring quota (as it could span periods) */
5663	cfs_rq->runtime_remaining -= delta_exec;
5664
5665	if (likely(cfs_rq->runtime_remaining > 0))
5666		return;
5667
5668	if (cfs_rq->throttled)
5669		return;
5670	/*
5671	 * if we're unable to extend our runtime we resched so that the active
5672	 * hierarchy can be throttled
5673	 */
5674	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
5675		resched_curr(rq_of(cfs_rq));
5676}
5677
5678static __always_inline
5679void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5680{
5681	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
5682		return;
5683
5684	__account_cfs_rq_runtime(cfs_rq, delta_exec);
5685}
5686
5687static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5688{
5689	return cfs_bandwidth_used() && cfs_rq->throttled;
5690}
5691
5692/* check whether cfs_rq, or any parent, is throttled */
5693static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5694{
5695	return cfs_bandwidth_used() && cfs_rq->throttle_count;
5696}
5697
5698/*
5699 * Ensure that neither of the group entities corresponding to src_cpu or
5700 * dest_cpu are members of a throttled hierarchy when performing group
5701 * load-balance operations.
5702 */
5703static inline int throttled_lb_pair(struct task_group *tg,
5704				    int src_cpu, int dest_cpu)
5705{
5706	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
5707
5708	src_cfs_rq = tg->cfs_rq[src_cpu];
5709	dest_cfs_rq = tg->cfs_rq[dest_cpu];
5710
5711	return throttled_hierarchy(src_cfs_rq) ||
5712	       throttled_hierarchy(dest_cfs_rq);
5713}
5714
5715static int tg_unthrottle_up(struct task_group *tg, void *data)
5716{
5717	struct rq *rq = data;
5718	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5719
5720	cfs_rq->throttle_count--;
5721	if (!cfs_rq->throttle_count) {
5722		cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
5723					     cfs_rq->throttled_clock_pelt;
5724
5725		/* Add cfs_rq with load or one or more already running entities to the list */
5726		if (!cfs_rq_is_decayed(cfs_rq))
5727			list_add_leaf_cfs_rq(cfs_rq);
5728
5729		if (cfs_rq->throttled_clock_self) {
5730			u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
5731
5732			cfs_rq->throttled_clock_self = 0;
5733
5734			if (SCHED_WARN_ON((s64)delta < 0))
5735				delta = 0;
5736
5737			cfs_rq->throttled_clock_self_time += delta;
5738		}
5739	}
5740
5741	return 0;
5742}
5743
5744static int tg_throttle_down(struct task_group *tg, void *data)
5745{
5746	struct rq *rq = data;
5747	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5748
5749	/* group is entering throttled state, stop time */
5750	if (!cfs_rq->throttle_count) {
5751		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
5752		list_del_leaf_cfs_rq(cfs_rq);
5753
5754		SCHED_WARN_ON(cfs_rq->throttled_clock_self);
5755		if (cfs_rq->nr_running)
5756			cfs_rq->throttled_clock_self = rq_clock(rq);
5757	}
5758	cfs_rq->throttle_count++;
5759
5760	return 0;
5761}
5762
5763static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
5764{
5765	struct rq *rq = rq_of(cfs_rq);
5766	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5767	struct sched_entity *se;
5768	long task_delta, idle_task_delta, dequeue = 1;
5769
5770	raw_spin_lock(&cfs_b->lock);
5771	/* This will start the period timer if necessary */
5772	if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
5773		/*
5774		 * We have raced with bandwidth becoming available, and if we
5775		 * actually throttled the timer might not unthrottle us for an
5776		 * entire period. We additionally needed to make sure that any
5777		 * subsequent check_cfs_rq_runtime calls agree not to throttle
5778		 * us, as we may commit to do cfs put_prev+pick_next, so we ask
5779		 * for 1ns of runtime rather than just check cfs_b.
5780		 */
5781		dequeue = 0;
5782	} else {
5783		list_add_tail_rcu(&cfs_rq->throttled_list,
5784				  &cfs_b->throttled_cfs_rq);
5785	}
5786	raw_spin_unlock(&cfs_b->lock);
5787
5788	if (!dequeue)
5789		return false;  /* Throttle no longer required. */
5790
5791	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
5792
5793	/* freeze hierarchy runnable averages while throttled */
5794	rcu_read_lock();
5795	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
5796	rcu_read_unlock();
5797
5798	task_delta = cfs_rq->h_nr_running;
5799	idle_task_delta = cfs_rq->idle_h_nr_running;
5800	for_each_sched_entity(se) {
5801		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5802		/* throttled entity or throttle-on-deactivate */
5803		if (!se->on_rq)
5804			goto done;
5805
5806		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
5807
5808		if (cfs_rq_is_idle(group_cfs_rq(se)))
5809			idle_task_delta = cfs_rq->h_nr_running;
5810
5811		qcfs_rq->h_nr_running -= task_delta;
5812		qcfs_rq->idle_h_nr_running -= idle_task_delta;
5813
5814		if (qcfs_rq->load.weight) {
5815			/* Avoid re-evaluating load for this entity: */
5816			se = parent_entity(se);
5817			break;
5818		}
5819	}
5820
5821	for_each_sched_entity(se) {
5822		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5823		/* throttled entity or throttle-on-deactivate */
5824		if (!se->on_rq)
5825			goto done;
5826
5827		update_load_avg(qcfs_rq, se, 0);
5828		se_update_runnable(se);
5829
5830		if (cfs_rq_is_idle(group_cfs_rq(se)))
5831			idle_task_delta = cfs_rq->h_nr_running;
5832
5833		qcfs_rq->h_nr_running -= task_delta;
5834		qcfs_rq->idle_h_nr_running -= idle_task_delta;
5835	}
5836
	/* At this point se is NULL and we are at root level */
5838	sub_nr_running(rq, task_delta);
5839
5840done:
5841	/*
5842	 * Note: distribution will already see us throttled via the
5843	 * throttled-list.  rq->lock protects completion.
5844	 */
5845	cfs_rq->throttled = 1;
5846	SCHED_WARN_ON(cfs_rq->throttled_clock);
5847	if (cfs_rq->nr_running)
5848		cfs_rq->throttled_clock = rq_clock(rq);
5849	return true;
5850}
5851
5852void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
5853{
5854	struct rq *rq = rq_of(cfs_rq);
5855	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5856	struct sched_entity *se;
5857	long task_delta, idle_task_delta;
5858
5859	se = cfs_rq->tg->se[cpu_of(rq)];
5860
5861	cfs_rq->throttled = 0;
5862
5863	update_rq_clock(rq);
5864
5865	raw_spin_lock(&cfs_b->lock);
5866	if (cfs_rq->throttled_clock) {
5867		cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
5868		cfs_rq->throttled_clock = 0;
5869	}
5870	list_del_rcu(&cfs_rq->throttled_list);
5871	raw_spin_unlock(&cfs_b->lock);
5872
5873	/* update hierarchical throttle state */
5874	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
5875
5876	if (!cfs_rq->load.weight) {
5877		if (!cfs_rq->on_list)
5878			return;
5879		/*
5880		 * Nothing to run but something to decay (on_list)?
5881		 * Complete the branch.
5882		 */
5883		for_each_sched_entity(se) {
5884			if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
5885				break;
5886		}
5887		goto unthrottle_throttle;
5888	}
5889
5890	task_delta = cfs_rq->h_nr_running;
5891	idle_task_delta = cfs_rq->idle_h_nr_running;
5892	for_each_sched_entity(se) {
5893		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5894
5895		if (se->on_rq)
5896			break;
5897		enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
5898
5899		if (cfs_rq_is_idle(group_cfs_rq(se)))
5900			idle_task_delta = cfs_rq->h_nr_running;
5901
5902		qcfs_rq->h_nr_running += task_delta;
5903		qcfs_rq->idle_h_nr_running += idle_task_delta;
5904
5905		/* end evaluation on encountering a throttled cfs_rq */
5906		if (cfs_rq_throttled(qcfs_rq))
5907			goto unthrottle_throttle;
5908	}
5909
5910	for_each_sched_entity(se) {
5911		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5912
5913		update_load_avg(qcfs_rq, se, UPDATE_TG);
5914		se_update_runnable(se);
5915
5916		if (cfs_rq_is_idle(group_cfs_rq(se)))
5917			idle_task_delta = cfs_rq->h_nr_running;
5918
5919		qcfs_rq->h_nr_running += task_delta;
5920		qcfs_rq->idle_h_nr_running += idle_task_delta;
5921
5922		/* end evaluation on encountering a throttled cfs_rq */
5923		if (cfs_rq_throttled(qcfs_rq))
5924			goto unthrottle_throttle;
5925	}
5926
	/* At this point se is NULL and we are at root level */
5928	add_nr_running(rq, task_delta);
5929
5930unthrottle_throttle:
5931	assert_list_leaf_cfs_rq(rq);
5932
	/* Determine whether we need to wake up a potentially idle CPU: */
5934	if (rq->curr == rq->idle && rq->cfs.nr_running)
5935		resched_curr(rq);
5936}
5937
5938#ifdef CONFIG_SMP
5939static void __cfsb_csd_unthrottle(void *arg)
5940{
5941	struct cfs_rq *cursor, *tmp;
5942	struct rq *rq = arg;
5943	struct rq_flags rf;
5944
5945	rq_lock(rq, &rf);
5946
5947	/*
	 * Iterating over the list can trigger several calls to
	 * update_rq_clock() in unthrottle_cfs_rq().
	 * Do it once here and skip the subsequent ones.
5951	 */
5952	update_rq_clock(rq);
5953	rq_clock_start_loop_update(rq);
5954
5955	/*
	 * Since we hold the rq lock we're safe from concurrent manipulation of
	 * the CSD list. However, this RCU critical section annotates the
	 * fact that we pair with sched_free_group_rcu(), so that we cannot
	 * race with a group being freed in the window between removing it
	 * from the list and advancing to the next entry in the list.
5961	 */
5962	rcu_read_lock();
5963
5964	list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
5965				 throttled_csd_list) {
5966		list_del_init(&cursor->throttled_csd_list);
5967
5968		if (cfs_rq_throttled(cursor))
5969			unthrottle_cfs_rq(cursor);
5970	}
5971
5972	rcu_read_unlock();
5973
5974	rq_clock_stop_loop_update(rq);
5975	rq_unlock(rq, &rf);
5976}
5977
5978static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
5979{
5980	struct rq *rq = rq_of(cfs_rq);
5981	bool first;
5982
5983	if (rq == this_rq()) {
5984		unthrottle_cfs_rq(cfs_rq);
5985		return;
5986	}
5987
5988	/* Already enqueued */
5989	if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
5990		return;
5991
5992	first = list_empty(&rq->cfsb_csd_list);
5993	list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
5994	if (first)
5995		smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
5996}
5997#else
5998static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
5999{
6000	unthrottle_cfs_rq(cfs_rq);
6001}
6002#endif
6003
6004static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
6005{
6006	lockdep_assert_rq_held(rq_of(cfs_rq));
6007
6008	if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
6009	    cfs_rq->runtime_remaining <= 0))
6010		return;
6011
6012	__unthrottle_cfs_rq_async(cfs_rq);
6013}
6014
6015static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
6016{
6017	int this_cpu = smp_processor_id();
6018	u64 runtime, remaining = 1;
6019	bool throttled = false;
6020	struct cfs_rq *cfs_rq, *tmp;
6021	struct rq_flags rf;
6022	struct rq *rq;
6023	LIST_HEAD(local_unthrottle);
6024
6025	rcu_read_lock();
6026	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
6027				throttled_list) {
6028		rq = rq_of(cfs_rq);
6029
6030		if (!remaining) {
6031			throttled = true;
6032			break;
6033		}
6034
6035		rq_lock_irqsave(rq, &rf);
6036		if (!cfs_rq_throttled(cfs_rq))
6037			goto next;
6038
6039		/* Already queued for async unthrottle */
6040		if (!list_empty(&cfs_rq->throttled_csd_list))
6041			goto next;
6042
6043		/* By the above checks, this should never be true */
6044		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
6045
6046		raw_spin_lock(&cfs_b->lock);
6047		runtime = -cfs_rq->runtime_remaining + 1;
6048		if (runtime > cfs_b->runtime)
6049			runtime = cfs_b->runtime;
6050		cfs_b->runtime -= runtime;
6051		remaining = cfs_b->runtime;
6052		raw_spin_unlock(&cfs_b->lock);
6053
6054		cfs_rq->runtime_remaining += runtime;
6055
6056		/* we check whether we're throttled above */
6057		if (cfs_rq->runtime_remaining > 0) {
6058			if (cpu_of(rq) != this_cpu) {
6059				unthrottle_cfs_rq_async(cfs_rq);
6060			} else {
6061				/*
6062				 * We currently only expect to be unthrottling
6063				 * a single cfs_rq locally.
6064				 */
6065				SCHED_WARN_ON(!list_empty(&local_unthrottle));
6066				list_add_tail(&cfs_rq->throttled_csd_list,
6067					      &local_unthrottle);
6068			}
6069		} else {
6070			throttled = true;
6071		}
6072
6073next:
6074		rq_unlock_irqrestore(rq, &rf);
6075	}
6076
6077	list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
6078				 throttled_csd_list) {
6079		struct rq *rq = rq_of(cfs_rq);
6080
6081		rq_lock_irqsave(rq, &rf);
6082
6083		list_del_init(&cfs_rq->throttled_csd_list);
6084
6085		if (cfs_rq_throttled(cfs_rq))
6086			unthrottle_cfs_rq(cfs_rq);
6087
6088		rq_unlock_irqrestore(rq, &rf);
6089	}
6090	SCHED_WARN_ON(!list_empty(&local_unthrottle));
6091
6092	rcu_read_unlock();
6093
6094	return throttled;
6095}
6096
6097/*
6098 * Responsible for refilling a task_group's bandwidth and unthrottling its
6099 * cfs_rqs as appropriate. If there has been no activity within the last
6100 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
6101 * used to track this state.
6102 */
6103static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
6104{
6105	int throttled;
6106
6107	/* no need to continue the timer with no bandwidth constraint */
6108	if (cfs_b->quota == RUNTIME_INF)
6109		goto out_deactivate;
6110
6111	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
6112	cfs_b->nr_periods += overrun;
6113
6114	/* Refill extra burst quota even if cfs_b->idle */
6115	__refill_cfs_bandwidth_runtime(cfs_b);
6116
6117	/*
6118	 * idle depends on !throttled (for the case of a large deficit), and if
6119	 * we're going inactive then everything else can be deferred
6120	 */
6121	if (cfs_b->idle && !throttled)
6122		goto out_deactivate;
6123
6124	if (!throttled) {
6125		/* mark as potentially idle for the upcoming period */
6126		cfs_b->idle = 1;
6127		return 0;
6128	}
6129
6130	/* account preceding periods in which throttling occurred */
6131	cfs_b->nr_throttled += overrun;
6132
6133	/*
6134	 * This check is repeated as we release cfs_b->lock while we unthrottle.
6135	 */
6136	while (throttled && cfs_b->runtime > 0) {
6137		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6138		/* we can't nest cfs_b->lock while distributing bandwidth */
6139		throttled = distribute_cfs_runtime(cfs_b);
6140		raw_spin_lock_irqsave(&cfs_b->lock, flags);
6141	}
6142
6143	/*
6144	 * While we are ensured activity in the period following an
6145	 * unthrottle, this also covers the case in which the new bandwidth is
6146	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
6147	 * timer to remain active while there are any throttled entities.)
6148	 */
6149	cfs_b->idle = 0;
6150
6151	return 0;
6152
6153out_deactivate:
6154	return 1;
6155}
6156
6157/* a cfs_rq won't donate quota below this amount */
6158static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
6159/* minimum remaining period time to redistribute slack quota */
6160static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
6161/* how long we wait to gather additional slack before distributing */
6162static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
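
/*
 * Taken together (illustrative reading): the deferred slack distribution is
 * only armed when at least cfs_bandwidth_slack_period +
 * min_bandwidth_expiration = 7ms of the period remain, so the slack timer
 * never races a quota refresh that is about to happen anyway.
 */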
6163
6164/*
6165 * Are we near the end of the current quota period?
6166 *
6167 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
6168 * hrtimer base being cleared by hrtimer_start. In the case of
6169 * migrate_hrtimers, base is never cleared, so we are fine.
6170 */
6171static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
6172{
6173	struct hrtimer *refresh_timer = &cfs_b->period_timer;
6174	s64 remaining;
6175
	/* if the callback is running, a quota refresh is already occurring */
6177	if (hrtimer_callback_running(refresh_timer))
6178		return 1;
6179
6180	/* is a quota refresh about to occur? */
6181	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
6182	if (remaining < (s64)min_expire)
6183		return 1;
6184
6185	return 0;
6186}
6187
6188static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
6189{
6190	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
6191
6192	/* if there's a quota refresh soon don't bother with slack */
6193	if (runtime_refresh_within(cfs_b, min_left))
6194		return;
6195
	/* don't push forward an existing deferred unthrottle */
6197	if (cfs_b->slack_started)
6198		return;
6199	cfs_b->slack_started = true;
6200
6201	hrtimer_start(&cfs_b->slack_timer,
6202			ns_to_ktime(cfs_bandwidth_slack_period),
6203			HRTIMER_MODE_REL);
6204}
6205
6206/* we know any runtime found here is valid as update_curr() precedes return */
6207static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6208{
6209	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6210	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
6211
6212	if (slack_runtime <= 0)
6213		return;
6214
6215	raw_spin_lock(&cfs_b->lock);
6216	if (cfs_b->quota != RUNTIME_INF) {
6217		cfs_b->runtime += slack_runtime;
6218
6219		/* we are under rq->lock, defer unthrottling using a timer */
6220		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
6221		    !list_empty(&cfs_b->throttled_cfs_rq))
6222			start_cfs_slack_bandwidth(cfs_b);
6223	}
6224	raw_spin_unlock(&cfs_b->lock);
6225
6226	/* even if it's not valid for return we don't want to try again */
6227	cfs_rq->runtime_remaining -= slack_runtime;
6228}
6229
6230static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6231{
6232	if (!cfs_bandwidth_used())
6233		return;
6234
6235	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
6236		return;
6237
6238	__return_cfs_rq_runtime(cfs_rq);
6239}
6240
6241/*
6242 * This is done with a timer (instead of inline with bandwidth return) since
6243 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
6244 */
6245static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
6246{
6247	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
6248	unsigned long flags;
6249
6250	/* confirm we're still not at a refresh boundary */
6251	raw_spin_lock_irqsave(&cfs_b->lock, flags);
6252	cfs_b->slack_started = false;
6253
6254	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
6255		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6256		return;
6257	}
6258
6259	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
6260		runtime = cfs_b->runtime;
6261
6262	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6263
6264	if (!runtime)
6265		return;
6266
6267	distribute_cfs_runtime(cfs_b);
6268}
6269
6270/*
6271 * When a group wakes up we want to make sure that its quota is not already
6272 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 * runtime, as update_curr() throttling cannot trigger until it is on-rq.
6274 */
6275static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
6276{
6277	if (!cfs_bandwidth_used())
6278		return;
6279
6280	/* an active group must be handled by the update_curr()->put() path */
6281	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
6282		return;
6283
6284	/* ensure the group is not already throttled */
6285	if (cfs_rq_throttled(cfs_rq))
6286		return;
6287
6288	/* update runtime allocation */
6289	account_cfs_rq_runtime(cfs_rq, 0);
6290	if (cfs_rq->runtime_remaining <= 0)
6291		throttle_cfs_rq(cfs_rq);
6292}
6293
6294static void sync_throttle(struct task_group *tg, int cpu)
6295{
6296	struct cfs_rq *pcfs_rq, *cfs_rq;
6297
6298	if (!cfs_bandwidth_used())
6299		return;
6300
6301	if (!tg->parent)
6302		return;
6303
6304	cfs_rq = tg->cfs_rq[cpu];
6305	pcfs_rq = tg->parent->cfs_rq[cpu];
6306
6307	cfs_rq->throttle_count = pcfs_rq->throttle_count;
6308	cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
6309}
6310
6311/* conditionally throttle active cfs_rq's from put_prev_entity() */
6312static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6313{
6314	if (!cfs_bandwidth_used())
6315		return false;
6316
6317	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
6318		return false;
6319
6320	/*
	 * It's possible for a throttled entity to be forced into a running
	 * state (e.g. set_curr_task); in that case we're finished.
6323	 */
6324	if (cfs_rq_throttled(cfs_rq))
6325		return true;
6326
6327	return throttle_cfs_rq(cfs_rq);
6328}
6329
6330static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
6331{
6332	struct cfs_bandwidth *cfs_b =
6333		container_of(timer, struct cfs_bandwidth, slack_timer);
6334
6335	do_sched_cfs_slack_timer(cfs_b);
6336
6337	return HRTIMER_NORESTART;
6338}
6339
6340extern const u64 max_cfs_quota_period;
6341
6342static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
6343{
6344	struct cfs_bandwidth *cfs_b =
6345		container_of(timer, struct cfs_bandwidth, period_timer);
6346	unsigned long flags;
6347	int overrun;
6348	int idle = 0;
6349	int count = 0;
6350
6351	raw_spin_lock_irqsave(&cfs_b->lock, flags);
6352	for (;;) {
6353		overrun = hrtimer_forward_now(timer, cfs_b->period);
6354		if (!overrun)
6355			break;
6356
6357		idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
6358
6359		if (++count > 3) {
6360			u64 new, old = ktime_to_ns(cfs_b->period);
6361
6362			/*
6363			 * Grow period by a factor of 2 to avoid losing precision.
6364			 * Precision loss in the quota/period ratio can cause __cfs_schedulable
6365			 * to fail.
6366			 */
6367			new = old * 2;
6368			if (new < max_cfs_quota_period) {
6369				cfs_b->period = ns_to_ktime(new);
6370				cfs_b->quota *= 2;
6371				cfs_b->burst *= 2;
6372
6373				pr_warn_ratelimited(
6374	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6375					smp_processor_id(),
6376					div_u64(new, NSEC_PER_USEC),
6377					div_u64(cfs_b->quota, NSEC_PER_USEC));
6378			} else {
6379				pr_warn_ratelimited(
6380	"cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6381					smp_processor_id(),
6382					div_u64(old, NSEC_PER_USEC),
6383					div_u64(cfs_b->quota, NSEC_PER_USEC));
6384			}
6385
6386			/* reset count so we don't come right back in here */
6387			count = 0;
6388		}
6389	}
6390	if (idle)
6391		cfs_b->period_active = 0;
6392	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6393
6394	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
6395}
6396
6397void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
6398{
6399	raw_spin_lock_init(&cfs_b->lock);
6400	cfs_b->runtime = 0;
6401	cfs_b->quota = RUNTIME_INF;
6402	cfs_b->period = ns_to_ktime(default_cfs_period());
6403	cfs_b->burst = 0;
6404	cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
6405
6406	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
6407	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
6408	cfs_b->period_timer.function = sched_cfs_period_timer;
6409
6410	/* Add a random offset so that timers interleave */
6411	hrtimer_set_expires(&cfs_b->period_timer,
6412			    get_random_u32_below(cfs_b->period));
6413	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6414	cfs_b->slack_timer.function = sched_cfs_slack_timer;
6415	cfs_b->slack_started = false;
6416}
6417
6418static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6419{
6420	cfs_rq->runtime_enabled = 0;
6421	INIT_LIST_HEAD(&cfs_rq->throttled_list);
6422	INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
6423}
6424
6425void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6426{
6427	lockdep_assert_held(&cfs_b->lock);
6428
6429	if (cfs_b->period_active)
6430		return;
6431
6432	cfs_b->period_active = 1;
6433	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
6434	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
6435}
6436
6437static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6438{
6439	int __maybe_unused i;
6440
6441	/* init_cfs_bandwidth() was not called */
6442	if (!cfs_b->throttled_cfs_rq.next)
6443		return;
6444
6445	hrtimer_cancel(&cfs_b->period_timer);
6446	hrtimer_cancel(&cfs_b->slack_timer);
6447
6448	/*
6449	 * It is possible that we still have some cfs_rq's pending on a CSD
6450	 * list, though this race is very rare. In order for this to occur, we
6451	 * must have raced with the last task leaving the group while there
6452	 * exist throttled cfs_rq(s), and the period_timer must have queued the
6453	 * CSD item but the remote cpu has not yet processed it. To handle this,
6454	 * we can simply flush all pending CSD work inline here. We're
6455	 * guaranteed at this point that no additional cfs_rq of this group can
6456	 * join a CSD list.
6457	 */
6458#ifdef CONFIG_SMP
6459	for_each_possible_cpu(i) {
6460		struct rq *rq = cpu_rq(i);
6461		unsigned long flags;
6462
6463		if (list_empty(&rq->cfsb_csd_list))
6464			continue;
6465
6466		local_irq_save(flags);
6467		__cfsb_csd_unthrottle(rq);
6468		local_irq_restore(flags);
6469	}
6470#endif
6471}
6472
6473/*
6474 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
6475 *
6476 * The race is harmless, since modifying bandwidth settings of unhooked group
6477 * bits doesn't do much.
6478 */
6479
6480/* cpu online callback */
6481static void __maybe_unused update_runtime_enabled(struct rq *rq)
6482{
6483	struct task_group *tg;
6484
6485	lockdep_assert_rq_held(rq);
6486
6487	rcu_read_lock();
6488	list_for_each_entry_rcu(tg, &task_groups, list) {
6489		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6490		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6491
6492		raw_spin_lock(&cfs_b->lock);
6493		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
6494		raw_spin_unlock(&cfs_b->lock);
6495	}
6496	rcu_read_unlock();
6497}
6498
6499/* cpu offline callback */
6500static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
6501{
6502	struct task_group *tg;
6503
6504	lockdep_assert_rq_held(rq);
6505
6506	/*
	 * The rq clock has already been updated in set_rq_offline(),
	 * so we should skip updating the rq clock again in
	 * unthrottle_cfs_rq().
6510	 */
6511	rq_clock_start_loop_update(rq);
6512
6513	rcu_read_lock();
6514	list_for_each_entry_rcu(tg, &task_groups, list) {
6515		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6516
6517		if (!cfs_rq->runtime_enabled)
6518			continue;
6519
6520		/*
6521		 * clock_task is not advancing so we just need to make sure
6522		 * there's some valid quota amount
6523		 */
6524		cfs_rq->runtime_remaining = 1;
6525		/*
6526		 * Offline rq is schedulable till CPU is completely disabled
6527		 * in take_cpu_down(), so we prevent new cfs throttling here.
6528		 */
6529		cfs_rq->runtime_enabled = 0;
6530
6531		if (cfs_rq_throttled(cfs_rq))
6532			unthrottle_cfs_rq(cfs_rq);
6533	}
6534	rcu_read_unlock();
6535
6536	rq_clock_stop_loop_update(rq);
6537}
6538
6539bool cfs_task_bw_constrained(struct task_struct *p)
6540{
6541	struct cfs_rq *cfs_rq = task_cfs_rq(p);
6542
6543	if (!cfs_bandwidth_used())
6544		return false;
6545
6546	if (cfs_rq->runtime_enabled ||
6547	    tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
6548		return true;
6549
6550	return false;
6551}
6552
6553#ifdef CONFIG_NO_HZ_FULL
6554/* called from pick_next_task_fair() */
6555static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
6556{
6557	int cpu = cpu_of(rq);
6558
6559	if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
6560		return;
6561
6562	if (!tick_nohz_full_cpu(cpu))
6563		return;
6564
6565	if (rq->nr_running != 1)
6566		return;
6567
6568	/*
	 *  We know there is only one task runnable and we've just picked it. The
	 *  normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we would
	 *  otherwise be able to stop the tick. We just need to check if we are
	 *  using bandwidth control.
6573	 */
6574	if (cfs_task_bw_constrained(p))
6575		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
6576}
6577#endif
6578
6579#else /* CONFIG_CFS_BANDWIDTH */
6580
6581static inline bool cfs_bandwidth_used(void)
6582{
6583	return false;
6584}
6585
6586static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
6587static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
6588static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
6589static inline void sync_throttle(struct task_group *tg, int cpu) {}
6590static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6591
6592static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
6593{
6594	return 0;
6595}
6596
6597static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
6598{
6599	return 0;
6600}
6601
6602static inline int throttled_lb_pair(struct task_group *tg,
6603				    int src_cpu, int dest_cpu)
6604{
6605	return 0;
6606}
6607
6608#ifdef CONFIG_FAIR_GROUP_SCHED
6609void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
6610static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6611#endif
6612
6613static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
6614{
6615	return NULL;
6616}
6617static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
6618static inline void update_runtime_enabled(struct rq *rq) {}
6619static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6620#ifdef CONFIG_CGROUP_SCHED
6621bool cfs_task_bw_constrained(struct task_struct *p)
6622{
6623	return false;
6624}
6625#endif
6626#endif /* CONFIG_CFS_BANDWIDTH */
6627
6628#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
6629static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
6630#endif
6631
6632/**************************************************
6633 * CFS operations on tasks:
6634 */
6635
6636#ifdef CONFIG_SCHED_HRTICK
6637static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
6638{
6639	struct sched_entity *se = &p->se;
6640
6641	SCHED_WARN_ON(task_rq(p) != rq);
6642
6643	if (rq->cfs.h_nr_running > 1) {
6644		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
6645		u64 slice = se->slice;
6646		s64 delta = slice - ran;
6647
6648		if (delta < 0) {
6649			if (task_current(rq, p))
6650				resched_curr(rq);
6651			return;
6652		}
6653		hrtick_start(rq, delta);
6654	}
6655}
6656
6657/*
6658 * called from enqueue/dequeue and updates the hrtick when the
6659 * current task is from our class and nr_running is low enough
6660 * to matter.
6661 */
6662static void hrtick_update(struct rq *rq)
6663{
6664	struct task_struct *curr = rq->curr;
6665
6666	if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
6667		return;
6668
6669	hrtick_start_fair(rq, curr);
6670}
6671#else /* !CONFIG_SCHED_HRTICK */
6672static inline void
6673hrtick_start_fair(struct rq *rq, struct task_struct *p)
6674{
6675}
6676
6677static inline void hrtick_update(struct rq *rq)
6678{
6679}
6680#endif
6681
6682#ifdef CONFIG_SMP
6683static inline bool cpu_overutilized(int cpu)
6684{
	unsigned long rq_util_min, rq_util_max;
6686
6687	if (!sched_energy_enabled())
6688		return false;
6689
6690	rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
6691	rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
6692
6693	/* Return true only if the utilization doesn't fit CPU's capacity */
6694	return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
6695}
6696
6697/*
 * The overutilized value makes sense only if EAS is enabled.
6699 */
6700static inline bool is_rd_overutilized(struct root_domain *rd)
6701{
6702	return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
6703}
6704
6705static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
6706{
6707	if (!sched_energy_enabled())
6708		return;
6709
6710	WRITE_ONCE(rd->overutilized, flag);
6711	trace_sched_overutilized_tp(rd, flag);
6712}
6713
6714static inline void check_update_overutilized_status(struct rq *rq)
6715{
6716	/*
	 * The overutilized field is used for load-balancing decisions only
	 * if the energy-aware scheduler is being used.
6719	 */
6720
6721	if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
6722		set_rd_overutilized(rq->rd, 1);
6723}
6724#else
6725static inline void check_update_overutilized_status(struct rq *rq) { }
6726#endif
6727
6728/* Runqueue only has SCHED_IDLE tasks enqueued */
6729static int sched_idle_rq(struct rq *rq)
6730{
6731	return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
6732			rq->nr_running);
6733}
6734
6735#ifdef CONFIG_SMP
6736static int sched_idle_cpu(int cpu)
6737{
6738	return sched_idle_rq(cpu_rq(cpu));
6739}
6740#endif
6741
6742/*
6743 * The enqueue_task method is called before nr_running is
6744 * increased. Here we update the fair scheduling stats and
6745 * then put the task into the rbtree:
6746 */
6747static void
6748enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
6749{
6750	struct cfs_rq *cfs_rq;
6751	struct sched_entity *se = &p->se;
6752	int idle_h_nr_running = task_has_idle_policy(p);
6753	int task_new = !(flags & ENQUEUE_WAKEUP);
6754
6755	/*
6756	 * The code below (indirectly) updates schedutil which looks at
6757	 * the cfs_rq utilization to select a frequency.
6758	 * Let's add the task's estimated utilization to the cfs_rq's
6759	 * estimated utilization, before we update schedutil.
6760	 */
6761	util_est_enqueue(&rq->cfs, p);
6762
6763	/*
6764	 * If in_iowait is set, the code below may not trigger any cpufreq
6765	 * utilization updates, so do it here explicitly with the IOWAIT flag
6766	 * passed.
6767	 */
6768	if (p->in_iowait)
6769		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
6770
6771	for_each_sched_entity(se) {
6772		if (se->on_rq)
6773			break;
6774		cfs_rq = cfs_rq_of(se);
6775		enqueue_entity(cfs_rq, se, flags);
6776
6777		cfs_rq->h_nr_running++;
6778		cfs_rq->idle_h_nr_running += idle_h_nr_running;
6779
6780		if (cfs_rq_is_idle(cfs_rq))
6781			idle_h_nr_running = 1;
6782
6783		/* end evaluation on encountering a throttled cfs_rq */
6784		if (cfs_rq_throttled(cfs_rq))
6785			goto enqueue_throttle;
6786
6787		flags = ENQUEUE_WAKEUP;
6788	}
6789
6790	for_each_sched_entity(se) {
6791		cfs_rq = cfs_rq_of(se);
6792
6793		update_load_avg(cfs_rq, se, UPDATE_TG);
6794		se_update_runnable(se);
6795		update_cfs_group(se);
6796
6797		cfs_rq->h_nr_running++;
6798		cfs_rq->idle_h_nr_running += idle_h_nr_running;
6799
6800		if (cfs_rq_is_idle(cfs_rq))
6801			idle_h_nr_running = 1;
6802
6803		/* end evaluation on encountering a throttled cfs_rq */
6804		if (cfs_rq_throttled(cfs_rq))
6805			goto enqueue_throttle;
6806	}
6807
	/* At this point se is NULL and we are at root level */
6809	add_nr_running(rq, 1);
6810
6811	/*
6812	 * Since new tasks are assigned an initial util_avg equal to
6813	 * half of the spare capacity of their CPU, tiny tasks have the
6814	 * ability to cross the overutilized threshold, which will
6815	 * result in the load balancer ruining all the task placement
6816	 * done by EAS. As a way to mitigate that effect, do not account
6817	 * for the first enqueue operation of new tasks during the
6818	 * overutilized flag detection.
6819	 *
6820	 * A better way of solving this problem would be to wait for
6821	 * the PELT signals of tasks to converge before taking them
6822	 * into account, but that is not straightforward to implement,
6823	 * and the following generally works well enough in practice.
6824	 */
6825	if (!task_new)
6826		check_update_overutilized_status(rq);
6827
6828enqueue_throttle:
6829	assert_list_leaf_cfs_rq(rq);
6830
6831	hrtick_update(rq);
6832}
6833
6834static void set_next_buddy(struct sched_entity *se);
6835
6836/*
6837 * The dequeue_task method is called before nr_running is
6838 * decreased. We remove the task from the rbtree and
6839 * update the fair scheduling stats:
6840 */
6841static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
6842{
6843	struct cfs_rq *cfs_rq;
6844	struct sched_entity *se = &p->se;
6845	int task_sleep = flags & DEQUEUE_SLEEP;
6846	int idle_h_nr_running = task_has_idle_policy(p);
6847	bool was_sched_idle = sched_idle_rq(rq);
6848
6849	util_est_dequeue(&rq->cfs, p);
6850
6851	for_each_sched_entity(se) {
6852		cfs_rq = cfs_rq_of(se);
6853		dequeue_entity(cfs_rq, se, flags);
6854
6855		cfs_rq->h_nr_running--;
6856		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
6857
6858		if (cfs_rq_is_idle(cfs_rq))
6859			idle_h_nr_running = 1;
6860
6861		/* end evaluation on encountering a throttled cfs_rq */
6862		if (cfs_rq_throttled(cfs_rq))
6863			goto dequeue_throttle;
6864
6865		/* Don't dequeue parent if it has other entities besides us */
6866		if (cfs_rq->load.weight) {
6867			/* Avoid re-evaluating load for this entity: */
6868			se = parent_entity(se);
6869			/*
6870			 * Bias pick_next to pick a task from this cfs_rq, as
6871			 * p is sleeping when it is within its sched_slice.
6872			 */
6873			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
6874				set_next_buddy(se);
6875			break;
6876		}
6877		flags |= DEQUEUE_SLEEP;
6878	}
6879
6880	for_each_sched_entity(se) {
6881		cfs_rq = cfs_rq_of(se);
6882
6883		update_load_avg(cfs_rq, se, UPDATE_TG);
6884		se_update_runnable(se);
6885		update_cfs_group(se);
6886
6887		cfs_rq->h_nr_running--;
6888		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
6889
6890		if (cfs_rq_is_idle(cfs_rq))
6891			idle_h_nr_running = 1;
6892
6893		/* end evaluation on encountering a throttled cfs_rq */
6894		if (cfs_rq_throttled(cfs_rq))
6895			goto dequeue_throttle;
6896
6897	}
6898
	/* At this point se is NULL and we are at root level */
6900	sub_nr_running(rq, 1);
6901
6902	/* balance early to pull high priority tasks */
6903	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
6904		rq->next_balance = jiffies;
6905
6906dequeue_throttle:
6907	util_est_update(&rq->cfs, p, task_sleep);
6908	hrtick_update(rq);
6909}
6910
6911#ifdef CONFIG_SMP
6912
6913/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
6914static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
6915static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
6916static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
6917
6918#ifdef CONFIG_NO_HZ_COMMON
6919
6920static struct {
6921	cpumask_var_t idle_cpus_mask;
6922	atomic_t nr_cpus;
	int has_blocked;		/* Idle CPUs have blocked load */
6924	int needs_update;		/* Newly idle CPUs need their next_balance collated */
6925	unsigned long next_balance;     /* in jiffy units */
6926	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
6927} nohz ____cacheline_aligned;
6928
6929#endif /* CONFIG_NO_HZ_COMMON */
6930
6931static unsigned long cpu_load(struct rq *rq)
6932{
6933	return cfs_rq_load_avg(&rq->cfs);
6934}
6935
6936/*
6937 * cpu_load_without - compute CPU load without any contributions from *p
 * @cpu: the CPU whose load is requested
 * @p: the task whose load should be discounted
6940 *
6941 * The load of a CPU is defined by the load of tasks currently enqueued on that
6942 * CPU as well as tasks which are currently sleeping after an execution on that
6943 * CPU.
6944 *
6945 * This method returns the load of the specified CPU by discounting the load of
6946 * the specified task, whenever the task is currently contributing to the CPU
6947 * load.
6948 */
6949static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
6950{
6951	struct cfs_rq *cfs_rq;
6952	unsigned int load;
6953
6954	/* Task has no contribution or is new */
6955	if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6956		return cpu_load(rq);
6957
6958	cfs_rq = &rq->cfs;
6959	load = READ_ONCE(cfs_rq->avg.load_avg);
6960
	/* Discount task's load from CPU's load */
6962	lsub_positive(&load, task_h_load(p));
6963
6964	return load;
6965}
6966
6967static unsigned long cpu_runnable(struct rq *rq)
6968{
6969	return cfs_rq_runnable_avg(&rq->cfs);
6970}
6971
6972static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
6973{
6974	struct cfs_rq *cfs_rq;
6975	unsigned int runnable;
6976
6977	/* Task has no contribution or is new */
6978	if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6979		return cpu_runnable(rq);
6980
6981	cfs_rq = &rq->cfs;
6982	runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
6983
6984	/* Discount task's runnable from CPU's runnable */
6985	lsub_positive(&runnable, p->se.avg.runnable_avg);
6986
6987	return runnable;
6988}
6989
6990static unsigned long capacity_of(int cpu)
6991{
6992	return cpu_rq(cpu)->cpu_capacity;
6993}
6994
6995static void record_wakee(struct task_struct *p)
6996{
6997	/*
	 * Only decay a single time; tasks that have less than 1 wakeup per
6999	 * jiffy will not have built up many flips.
7000	 */
7001	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
7002		current->wakee_flips >>= 1;
7003		current->wakee_flip_decay_ts = jiffies;
7004	}
7005
7006	if (current->last_wakee != p) {
7007		current->last_wakee = p;
7008		current->wakee_flips++;
7009	}
7010}
7011
7012/*
7013 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
7014 *
7015 * A waker of many should wake a different task than the one last awakened
7016 * at a frequency roughly N times higher than one of its wakees.
7017 *
7018 * In order to determine whether we should let the load spread vs consolidating
7019 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
 * partner, and a factor of llc_size higher frequency in the other.
7021 *
7022 * With both conditions met, we can be relatively sure that the relationship is
7023 * non-monogamous, with partner count exceeding socket size.
7024 *
7025 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
7026 * whatever is irrelevant, spread criteria is apparent partner count exceeds
7027 * socket size.
7028 */
7029static int wake_wide(struct task_struct *p)
7030{
7031	unsigned int master = current->wakee_flips;
7032	unsigned int slave = p->wakee_flips;
7033	int factor = __this_cpu_read(sd_llc_size);
7034
7035	if (master < slave)
7036		swap(master, slave);
7037	if (slave < factor || master < slave * factor)
7038		return 0;
7039	return 1;
7040}
7041
7042/*
7043 * The purpose of wake_affine() is to quickly determine on which CPU we can run
7044 * soonest. For the purpose of speed we only consider the waking and previous
7045 * CPU.
7046 *
 * wake_affine_idle() - only considers 'now', it checks if the waking CPU is
 *			cache-affine and is (or will be) idle.
7049 *
7050 * wake_affine_weight() - considers the weight to reflect the average
7051 *			  scheduling latency of the CPUs. This seems to work
7052 *			  for the overloaded case.
7053 */
7054static int
7055wake_affine_idle(int this_cpu, int prev_cpu, int sync)
7056{
7057	/*
7058	 * If this_cpu is idle, it implies the wakeup is from interrupt
7059	 * context. Only allow the move if cache is shared. Otherwise an
7060	 * interrupt intensive workload could force all tasks onto one
7061	 * node depending on the IO topology or IRQ affinity settings.
7062	 *
7063	 * If the prev_cpu is idle and cache affine then avoid a migration.
7064	 * There is no guarantee that the cache hot data from an interrupt
7065	 * is more important than cache hot data on the prev_cpu and from
7066	 * a cpufreq perspective, it's better to have higher utilisation
7067	 * on one CPU.
7068	 */
7069	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
7070		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
7071
7072	if (sync && cpu_rq(this_cpu)->nr_running == 1)
7073		return this_cpu;
7074
7075	if (available_idle_cpu(prev_cpu))
7076		return prev_cpu;
7077
7078	return nr_cpumask_bits;
7079}
7080
7081static int
7082wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
7083		   int this_cpu, int prev_cpu, int sync)
7084{
7085	s64 this_eff_load, prev_eff_load;
7086	unsigned long task_load;
7087
7088	this_eff_load = cpu_load(cpu_rq(this_cpu));
7089
7090	if (sync) {
7091		unsigned long current_load = task_h_load(current);
7092
7093		if (current_load > this_eff_load)
7094			return this_cpu;
7095
7096		this_eff_load -= current_load;
7097	}
7098
7099	task_load = task_h_load(p);
7100
7101	this_eff_load += task_load;
7102	if (sched_feat(WA_BIAS))
7103		this_eff_load *= 100;
7104	this_eff_load *= capacity_of(prev_cpu);
7105
7106	prev_eff_load = cpu_load(cpu_rq(prev_cpu));
7107	prev_eff_load -= task_load;
7108	if (sched_feat(WA_BIAS))
7109		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
7110	prev_eff_load *= capacity_of(this_cpu);
7111
7112	/*
7113	 * If sync, adjust the weight of prev_eff_load such that if
7114	 * prev_eff == this_eff that select_idle_sibling() will consider
7115	 * stacking the wakee on top of the waker if no other CPU is
7116	 * idle.
7117	 */
7118	if (sync)
7119		prev_eff_load += 1;
7120
7121	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
7122}
7123
7124static int wake_affine(struct sched_domain *sd, struct task_struct *p,
7125		       int this_cpu, int prev_cpu, int sync)
7126{
7127	int target = nr_cpumask_bits;
7128
7129	if (sched_feat(WA_IDLE))
7130		target = wake_affine_idle(this_cpu, prev_cpu, sync);
7131
7132	if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
7133		target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
7134
7135	schedstat_inc(p->stats.nr_wakeups_affine_attempts);
7136	if (target != this_cpu)
7137		return prev_cpu;
7138
7139	schedstat_inc(sd->ttwu_move_affine);
7140	schedstat_inc(p->stats.nr_wakeups_affine);
7141	return target;
7142}
7143
7144static struct sched_group *
7145sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
7146
7147/*
7148 * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
7149 */
7150static int
7151sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
7152{
7153	unsigned long load, min_load = ULONG_MAX;
7154	unsigned int min_exit_latency = UINT_MAX;
7155	u64 latest_idle_timestamp = 0;
7156	int least_loaded_cpu = this_cpu;
7157	int shallowest_idle_cpu = -1;
7158	int i;
7159
7160	/* Check if we have any choice: */
7161	if (group->group_weight == 1)
7162		return cpumask_first(sched_group_span(group));
7163
7164	/* Traverse only the allowed CPUs */
7165	for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
7166		struct rq *rq = cpu_rq(i);
7167
7168		if (!sched_core_cookie_match(rq, p))
7169			continue;
7170
7171		if (sched_idle_cpu(i))
7172			return i;
7173
7174		if (available_idle_cpu(i)) {
7175			struct cpuidle_state *idle = idle_get_state(rq);
7176			if (idle && idle->exit_latency < min_exit_latency) {
7177				/*
7178				 * We give priority to a CPU whose idle state
7179				 * has the smallest exit latency irrespective
7180				 * of any idle timestamp.
7181				 */
7182				min_exit_latency = idle->exit_latency;
7183				latest_idle_timestamp = rq->idle_stamp;
7184				shallowest_idle_cpu = i;
7185			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
7186				   rq->idle_stamp > latest_idle_timestamp) {
7187				/*
7188				 * If equal or no active idle state, then
7189				 * the most recently idled CPU might have
7190				 * a warmer cache.
7191				 */
7192				latest_idle_timestamp = rq->idle_stamp;
7193				shallowest_idle_cpu = i;
7194			}
7195		} else if (shallowest_idle_cpu == -1) {
7196			load = cpu_load(cpu_rq(i));
7197			if (load < min_load) {
7198				min_load = load;
7199				least_loaded_cpu = i;
7200			}
7201		}
7202	}
7203
7204	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
7205}
7206
7207static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
7208				  int cpu, int prev_cpu, int sd_flag)
7209{
7210	int new_cpu = cpu;
7211
7212	if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
7213		return prev_cpu;
7214
7215	/*
7216	 * We need task's util for cpu_util_without, sync it up to
7217	 * prev_cpu's last_update_time.
7218	 */
7219	if (!(sd_flag & SD_BALANCE_FORK))
7220		sync_entity_load_avg(&p->se);
7221
7222	while (sd) {
7223		struct sched_group *group;
7224		struct sched_domain *tmp;
7225		int weight;
7226
7227		if (!(sd->flags & sd_flag)) {
7228			sd = sd->child;
7229			continue;
7230		}
7231
7232		group = sched_balance_find_dst_group(sd, p, cpu);
7233		if (!group) {
7234			sd = sd->child;
7235			continue;
7236		}
7237
7238		new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu);
7239		if (new_cpu == cpu) {
7240			/* Now try balancing at a lower domain level of 'cpu': */
7241			sd = sd->child;
7242			continue;
7243		}
7244
7245		/* Now try balancing at a lower domain level of 'new_cpu': */
7246		cpu = new_cpu;
7247		weight = sd->span_weight;
7248		sd = NULL;
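		/*
		 * Re-walk new_cpu's domains bottom-up and keep the largest one
		 * that is still smaller than the domain we just balanced in
		 * and that has the requested balance flag set.
		 */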
7249		for_each_domain(cpu, tmp) {
7250			if (weight <= tmp->span_weight)
7251				break;
7252			if (tmp->flags & sd_flag)
7253				sd = tmp;
7254		}
7255	}
7256
7257	return new_cpu;
7258}
7259
7260static inline int __select_idle_cpu(int cpu, struct task_struct *p)
7261{
7262	if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
7263	    sched_cpu_cookie_match(cpu_rq(cpu), p))
7264		return cpu;
7265
7266	return -1;
7267}
7268
7269#ifdef CONFIG_SCHED_SMT
7270DEFINE_STATIC_KEY_FALSE(sched_smt_present);
7271EXPORT_SYMBOL_GPL(sched_smt_present);
7272
7273static inline void set_idle_cores(int cpu, int val)
7274{
7275	struct sched_domain_shared *sds;
7276
7277	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7278	if (sds)
7279		WRITE_ONCE(sds->has_idle_cores, val);
7280}
7281
7282static inline bool test_idle_cores(int cpu)
7283{
7284	struct sched_domain_shared *sds;
7285
7286	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7287	if (sds)
7288		return READ_ONCE(sds->has_idle_cores);
7289
7290	return false;
7291}
7292
7293/*
7294 * Scans the local SMT mask to see if the entire core is idle, and records this
7295 * information in sd_llc_shared->has_idle_cores.
7296 *
7297 * Since SMT siblings share all cache levels, inspecting this limited remote
7298 * state should be fairly cheap.
7299 */
7300void __update_idle_core(struct rq *rq)
7301{
7302	int core = cpu_of(rq);
7303	int cpu;
7304
7305	rcu_read_lock();
7306	if (test_idle_cores(core))
7307		goto unlock;
7308
7309	for_each_cpu(cpu, cpu_smt_mask(core)) {
7310		if (cpu == core)
7311			continue;
7312
7313		if (!available_idle_cpu(cpu))
7314			goto unlock;
7315	}
7316
7317	set_idle_cores(core, 1);
7318unlock:
7319	rcu_read_unlock();
7320}
7321
7322/*
7323 * Scan the entire LLC domain for idle cores; this dynamically switches off if
7324 * there are no idle cores left in the system; tracked through
7325 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
7326 */
7327static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
7328{
7329	bool idle = true;
7330	int cpu;
7331
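	/*
	 * Walk @core's SMT siblings: any busy sibling means the core is not
	 * idle. Meanwhile remember in *idle_cpu the first sibling in @cpus
	 * that is idle, or busy but running only SCHED_IDLE tasks, as a
	 * fallback for the caller.
	 */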
7332	for_each_cpu(cpu, cpu_smt_mask(core)) {
7333		if (!available_idle_cpu(cpu)) {
7334			idle = false;
7335			if (*idle_cpu == -1) {
7336				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
7337					*idle_cpu = cpu;
7338					break;
7339				}
7340				continue;
7341			}
7342			break;
7343		}
7344		if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
7345			*idle_cpu = cpu;
7346	}
7347
7348	if (idle)
7349		return core;
7350
7351	cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
7352	return -1;
7353}
7354
7355/*
7356 * Scan the local SMT mask for idle CPUs.
7357 */
7358static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
7359{
7360	int cpu;
7361
7362	for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
7363		if (cpu == target)
7364			continue;
7365		/*
7366		 * Check if the CPU is in the LLC scheduling domain of @target.
7367		 * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
7368		 */
7369		if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
7370			continue;
7371		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
7372			return cpu;
7373	}
7374
7375	return -1;
7376}
7377
7378#else /* CONFIG_SCHED_SMT */
7379
7380static inline void set_idle_cores(int cpu, int val)
7381{
7382}
7383
7384static inline bool test_idle_cores(int cpu)
7385{
7386	return false;
7387}
7388
7389static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
7390{
7391	return __select_idle_cpu(core, p);
7392}
7393
7394static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
7395{
7396	return -1;
7397}
7398
7399#endif /* CONFIG_SCHED_SMT */
7400
7401/*
7402 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
7403 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
7404 * average idle time for this rq (as found in rq->avg_idle).
7405 */
7406static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
7407{
7408	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7409	int i, cpu, idle_cpu = -1, nr = INT_MAX;
7410	struct sched_domain_shared *sd_share;
7411
7412	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
7413
7414	if (sched_feat(SIS_UTIL)) {
7415		sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
7416		if (sd_share) {
			/* because "--nr <= 0" is the condition to stop the scan */
			nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
			/* an overloaded LLC is unlikely to have an idle CPU/core */
7420			if (nr == 1)
7421				return -1;
7422		}
7423	}
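	/*
	 * Illustrative example (assumed value): with nr_idle_scan == 4, nr is
	 * 5 and the per-CPU scans below probe at most four CPUs before
	 * "--nr <= 0" aborts them; the idle-core scan is not throttled by nr.
	 */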
7424
7425	if (static_branch_unlikely(&sched_cluster_active)) {
7426		struct sched_group *sg = sd->groups;
7427
7428		if (sg->flags & SD_CLUSTER) {
7429			for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
7430				if (!cpumask_test_cpu(cpu, cpus))
7431					continue;
7432
7433				if (has_idle_core) {
7434					i = select_idle_core(p, cpu, cpus, &idle_cpu);
7435					if ((unsigned int)i < nr_cpumask_bits)
7436						return i;
7437				} else {
7438					if (--nr <= 0)
7439						return -1;
7440					idle_cpu = __select_idle_cpu(cpu, p);
7441					if ((unsigned int)idle_cpu < nr_cpumask_bits)
7442						return idle_cpu;
7443				}
7444			}
7445			cpumask_andnot(cpus, cpus, sched_group_span(sg));
7446		}
7447	}
7448
7449	for_each_cpu_wrap(cpu, cpus, target + 1) {
7450		if (has_idle_core) {
7451			i = select_idle_core(p, cpu, cpus, &idle_cpu);
7452			if ((unsigned int)i < nr_cpumask_bits)
7453				return i;
7454
7455		} else {
7456			if (--nr <= 0)
7457				return -1;
7458			idle_cpu = __select_idle_cpu(cpu, p);
7459			if ((unsigned int)idle_cpu < nr_cpumask_bits)
7460				break;
7461		}
7462	}
7463
7464	if (has_idle_core)
7465		set_idle_cores(target, false);
7466
7467	return idle_cpu;
7468}
7469
7470/*
7471 * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
7472 * the task fits. If no CPU is big enough, but there are idle ones, try to
7473 * maximize capacity.
7474 */
7475static int
7476select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
7477{
7478	unsigned long task_util, util_min, util_max, best_cap = 0;
7479	int fits, best_fits = 0;
7480	int cpu, best_cpu = -1;
7481	struct cpumask *cpus;
7482
7483	cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7484	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
7485
7486	task_util = task_util_est(p);
7487	util_min = uclamp_eff_value(p, UCLAMP_MIN);
7488	util_max = uclamp_eff_value(p, UCLAMP_MAX);
7489
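	/*
	 * util_fits_cpu() is tri-state here: a positive value means the CPU
	 * satisfies both the utilization and the uclamp hints, zero means it
	 * does not fit, and a negative value means only the uclamp_min
	 * performance hint cannot be met.
	 */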
7490	for_each_cpu_wrap(cpu, cpus, target) {
7491		unsigned long cpu_cap = capacity_of(cpu);
7492
7493		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
7494			continue;
7495
7496		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
7497
7498		/* This CPU fits with all requirements */
7499		if (fits > 0)
7500			return cpu;
7501		/*
7502		 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
7503		 * Look for the CPU with best capacity.
7504		 */
7505		else if (fits < 0)
7506			cpu_cap = get_actual_cpu_capacity(cpu);
7507
7508		/*
7509		 * First, select CPU which fits better (-1 being better than 0).
7510		 * Then, select the one with best capacity at same level.
7511		 */
7512		if ((fits < best_fits) ||
7513		    ((fits == best_fits) && (cpu_cap > best_cap))) {
7514			best_cap = cpu_cap;
7515			best_cpu = cpu;
7516			best_fits = fits;
7517		}
7518	}
7519
7520	return best_cpu;
7521}
7522
7523static inline bool asym_fits_cpu(unsigned long util,
7524				 unsigned long util_min,
7525				 unsigned long util_max,
7526				 int cpu)
7527{
7528	if (sched_asym_cpucap_active())
7529		/*
7530		 * Return true only if the cpu fully fits the task requirements
7531		 * which include the utilization and the performance hints.
7532		 */
7533		return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
7534
7535	return true;
7536}
7537
7538/*
7539 * Try and locate an idle core/thread in the LLC cache domain.
7540 */
7541static int select_idle_sibling(struct task_struct *p, int prev, int target)
7542{
7543	bool has_idle_core = false;
7544	struct sched_domain *sd;
7545	unsigned long task_util, util_min, util_max;
7546	int i, recent_used_cpu, prev_aff = -1;
7547
7548	/*
	 * On asymmetric systems, update the task utilization because we will
	 * check that the task fits the CPU's capacity.
7551	 */
7552	if (sched_asym_cpucap_active()) {
7553		sync_entity_load_avg(&p->se);
7554		task_util = task_util_est(p);
7555		util_min = uclamp_eff_value(p, UCLAMP_MIN);
7556		util_max = uclamp_eff_value(p, UCLAMP_MAX);
7557	}
7558
7559	/*
	 * The per-cpu select_rq_mask usage below requires IRQs to be disabled.
7561	 */
7562	lockdep_assert_irqs_disabled();
7563
7564	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
7565	    asym_fits_cpu(task_util, util_min, util_max, target))
7566		return target;
7567
7568	/*
7569	 * If the previous CPU is cache affine and idle, don't be stupid:
7570	 */
7571	if (prev != target && cpus_share_cache(prev, target) &&
7572	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
7573	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
7574
7575		if (!static_branch_unlikely(&sched_cluster_active) ||
7576		    cpus_share_resources(prev, target))
7577			return prev;
7578
7579		prev_aff = prev;
7580	}
7581
7582	/*
	 * Allow a per-cpu kthread to stack with the wakee if the
	 * kworker thread and the task's previous CPU are the same.
	 * The assumption is that the wakee queued work for the
	 * per-cpu kthread, that work is now complete, and the wakeup
	 * is essentially a sync wakeup. An obvious example of this
	 * pattern is IO completions.
7589	 */
7590	if (is_per_cpu_kthread(current) &&
7591	    in_task() &&
7592	    prev == smp_processor_id() &&
7593	    this_rq()->nr_running <= 1 &&
7594	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
7595		return prev;
7596	}
7597
7598	/* Check a recently used CPU as a potential idle candidate: */
7599	recent_used_cpu = p->recent_used_cpu;
7600	p->recent_used_cpu = prev;
7601	if (recent_used_cpu != prev &&
7602	    recent_used_cpu != target &&
7603	    cpus_share_cache(recent_used_cpu, target) &&
7604	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
7605	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
7606	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
7607
7608		if (!static_branch_unlikely(&sched_cluster_active) ||
7609		    cpus_share_resources(recent_used_cpu, target))
7610			return recent_used_cpu;
7611
7612	} else {
7613		recent_used_cpu = -1;
7614	}
7615
7616	/*
7617	 * For asymmetric CPU capacity systems, our domain of interest is
7618	 * sd_asym_cpucapacity rather than sd_llc.
7619	 */
7620	if (sched_asym_cpucap_active()) {
7621		sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
7622		/*
7623		 * On an asymmetric CPU capacity system where an exclusive
7624		 * cpuset defines a symmetric island (i.e. one unique
7625		 * capacity_orig value through the cpuset), the key will be set
7626		 * but the CPUs within that cpuset will not have a domain with
7627		 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
7628		 * capacity path.
7629		 */
7630		if (sd) {
7631			i = select_idle_capacity(p, sd, target);
7632			return ((unsigned)i < nr_cpumask_bits) ? i : target;
7633		}
7634	}
7635
7636	sd = rcu_dereference(per_cpu(sd_llc, target));
7637	if (!sd)
7638		return target;
7639
7640	if (sched_smt_active()) {
7641		has_idle_core = test_idle_cores(target);
7642
7643		if (!has_idle_core && cpus_share_cache(prev, target)) {
7644			i = select_idle_smt(p, sd, prev);
7645			if ((unsigned int)i < nr_cpumask_bits)
7646				return i;
7647		}
7648	}
7649
7650	i = select_idle_cpu(p, sd, has_idle_core, target);
7651	if ((unsigned)i < nr_cpumask_bits)
7652		return i;
7653
7654	/*
	 * For cluster machines which have a lower-level shared cache, such as
	 * an L2 or LLC tag, we try to find an idle CPU in the target's
	 * cluster first. But prev_cpu or recent_used_cpu may also be good
	 * candidates; use them if select_idle_cpu() found no idle CPU.
7659	 */
7660	if ((unsigned int)prev_aff < nr_cpumask_bits)
7661		return prev_aff;
7662	if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
7663		return recent_used_cpu;
7664
7665	return target;
7666}
7667
7668/**
7669 * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
7670 * @cpu: the CPU to get the utilization for
7671 * @p: task for which the CPU utilization should be predicted or NULL
7672 * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
7673 * @boost: 1 to enable boosting, otherwise 0
7674 *
7675 * The unit of the return value must be the same as the one of CPU capacity
7676 * so that CPU utilization can be compared with CPU capacity.
7677 *
7678 * CPU utilization is the sum of running time of runnable tasks plus the
7679 * recent utilization of currently non-runnable tasks on that CPU.
7680 * It represents the amount of CPU capacity currently used by CFS tasks in
7681 * the range [0..max CPU capacity] with max CPU capacity being the CPU
7682 * capacity at f_max.
7683 *
7684 * The estimated CPU utilization is defined as the maximum between CPU
7685 * utilization and sum of the estimated utilization of the currently
7686 * runnable tasks on that CPU. It preserves a utilization "snapshot" of
7687 * previously-executed tasks, which helps better deduce how busy a CPU will
7688 * be when a long-sleeping task wakes up. The contribution to CPU utilization
7689 * of such a task would be significantly decayed at this point of time.
7690 *
7691 * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
7692 * CPU contention for CFS tasks can be detected by CPU runnable > CPU
7693 * utilization. Boosting is implemented in cpu_util() so that internal
7694 * users (e.g. EAS) can use it next to external users (e.g. schedutil),
 * the latter via cpu_util_cfs_boost().
7696 *
7697 * CPU utilization can be higher than the current CPU capacity
7698 * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
7699 * of rounding errors as well as task migrations or wakeups of new tasks.
7700 * CPU utilization has to be capped to fit into the [0..max CPU capacity]
7701 * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
7702 * could be seen as over-utilized even though CPU1 has 20% of spare CPU
7703 * capacity. CPU utilization is allowed to overshoot current CPU capacity
7704 * though since this is useful for predicting the CPU capacity required
7705 * after task migrations (scheduler-driven DVFS).
7706 *
7707 * Return: (Boosted) (estimated) utilization for the specified CPU.
7708 */
7709static unsigned long
7710cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
7711{
7712	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
7713	unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
7714	unsigned long runnable;
7715
7716	if (boost) {
7717		runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
7718		util = max(util, runnable);
7719	}
7720
7721	/*
7722	 * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
7723	 * contribution. If @p migrates from another CPU to @cpu add its
7724	 * contribution. In all the other cases @cpu is not impacted by the
7725	 * migration so its util_avg is already correct.
7726	 */
7727	if (p && task_cpu(p) == cpu && dst_cpu != cpu)
7728		lsub_positive(&util, task_util(p));
7729	else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
7730		util += task_util(p);
7731
7732	if (sched_feat(UTIL_EST)) {
7733		unsigned long util_est;
7734
7735		util_est = READ_ONCE(cfs_rq->avg.util_est);
7736
7737		/*
7738		 * During wake-up @p isn't enqueued yet and doesn't contribute
7739		 * to any cpu_rq(cpu)->cfs.avg.util_est.
7740		 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
7741		 * has been enqueued.
7742		 *
7743		 * During exec (@dst_cpu = -1) @p is enqueued and does
7744		 * contribute to cpu_rq(cpu)->cfs.util_est.
7745		 * Remove it to "simulate" cpu_util without @p's contribution.
7746		 *
7747		 * Despite the task_on_rq_queued(@p) check there is still a
7748		 * small window for a possible race when an exec
7749		 * select_task_rq_fair() races with LB's detach_task().
7750		 *
7751		 *   detach_task()
7752		 *     deactivate_task()
7753		 *       p->on_rq = TASK_ON_RQ_MIGRATING;
7754		 *       -------------------------------- A
7755		 *       dequeue_task()                    \
7756		 *         dequeue_task_fair()              + Race Time
7757		 *           util_est_dequeue()            /
7758		 *       -------------------------------- B
7759		 *
7760		 * The additional check "current == p" is required to further
7761		 * reduce the race window.
7762		 */
7763		if (dst_cpu == cpu)
7764			util_est += _task_util_est(p);
7765		else if (p && unlikely(task_on_rq_queued(p) || current == p))
7766			lsub_positive(&util_est, _task_util_est(p));
7767
7768		util = max(util, util_est);
7769	}
7770
7771	return min(util, arch_scale_cpu_capacity(cpu));
7772}
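/*
 * Worked example with illustrative numbers (not from the source): take
 * util_avg = 300, runnable_avg = 450, util_est = 350 and UTIL_EST enabled
 * on a CPU of capacity 1024. The unboosted estimate is max(300, 350) = 350,
 * while the boosted one is max(max(300, 450), 350) = 450, reflecting the
 * contention signalled by runnable > util.
 */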
7773
7774unsigned long cpu_util_cfs(int cpu)
7775{
7776	return cpu_util(cpu, NULL, -1, 0);
7777}
7778
7779unsigned long cpu_util_cfs_boost(int cpu)
7780{
7781	return cpu_util(cpu, NULL, -1, 1);
7782}
7783
7784/*
7785 * cpu_util_without: compute cpu utilization without any contributions from *p
7786 * @cpu: the CPU which utilization is requested
7787 * @p: the task which utilization should be discounted
7788 *
7789 * The utilization of a CPU is defined by the utilization of tasks currently
7790 * enqueued on that CPU as well as tasks which are currently sleeping after an
7791 * execution on that CPU.
7792 *
7793 * This method returns the utilization of the specified CPU by discounting the
7794 * utilization of the specified task, whenever the task is currently
7795 * contributing to the CPU utilization.
7796 */
7797static unsigned long cpu_util_without(int cpu, struct task_struct *p)
7798{
7799	/* Task has no contribution or is new */
7800	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
7801		p = NULL;
7802
7803	return cpu_util(cpu, p, -1, 0);
7804}
7805
7806/*
7807 * energy_env - Utilization landscape for energy estimation.
7808 * @task_busy_time: Utilization contribution by the task for which we test the
7809 *                  placement. Given by eenv_task_busy_time().
7810 * @pd_busy_time:   Utilization of the whole perf domain without the task
7811 *                  contribution. Given by eenv_pd_busy_time().
7812 * @cpu_cap:        Maximum CPU capacity for the perf domain.
7813 * @pd_cap:         Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
7814 */
7815struct energy_env {
7816	unsigned long task_busy_time;
7817	unsigned long pd_busy_time;
7818	unsigned long cpu_cap;
7819	unsigned long pd_cap;
7820};
7821
7822/*
 * Compute the task busy time for compute_energy(). This time cannot be
 * injected directly into effective_cpu_util() because of the IRQ scaling.
 * The latter only makes sense on the CPU where the task has most recently
 * run.
7827 */
7828static inline void eenv_task_busy_time(struct energy_env *eenv,
7829				       struct task_struct *p, int prev_cpu)
7830{
7831	unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
7832	unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
7833
7834	if (unlikely(irq >= max_cap))
7835		busy_time = max_cap;
7836	else
7837		busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
7838
7839	eenv->task_busy_time = busy_time;
7840}
7841
7842/*
7843 * Compute the perf_domain (PD) busy time for compute_energy(). Based on the
7844 * utilization for each @pd_cpus, it however doesn't take into account
7845 * clamping since the ratio (utilization / cpu_capacity) is already enough to
7846 * scale the EM reported power consumption at the (eventually clamped)
7847 * cpu_capacity.
7848 *
7849 * The contribution of the task @p for which we want to estimate the
7850 * energy cost is removed (by cpu_util()) and must be calculated
7851 * separately (see eenv_task_busy_time). This ensures:
7852 *
7853 *   - A stable PD utilization, no matter which CPU of that PD we want to place
7854 *     the task on.
7855 *
7856 *   - A fair comparison between CPUs as the task contribution (task_util())
7857 *     will always be the same no matter which CPU utilization we rely on
7858 *     (util_avg or util_est).
7859 *
7860 * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
7861 * exceed @eenv->pd_cap.
7862 */
7863static inline void eenv_pd_busy_time(struct energy_env *eenv,
7864				     struct cpumask *pd_cpus,
7865				     struct task_struct *p)
7866{
7867	unsigned long busy_time = 0;
7868	int cpu;
7869
7870	for_each_cpu(cpu, pd_cpus) {
7871		unsigned long util = cpu_util(cpu, p, -1, 0);
7872
7873		busy_time += effective_cpu_util(cpu, util, NULL, NULL);
7874	}
7875
7876	eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
7877}
7878
7879/*
7880 * Compute the maximum utilization for compute_energy() when the task @p
7881 * is placed on the cpu @dst_cpu.
7882 *
7883 * Returns the maximum utilization among @eenv->cpus. This utilization can't
7884 * exceed @eenv->cpu_cap.
7885 */
7886static inline unsigned long
7887eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
7888		 struct task_struct *p, int dst_cpu)
7889{
7890	unsigned long max_util = 0;
7891	int cpu;
7892
7893	for_each_cpu(cpu, pd_cpus) {
7894		struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
7895		unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
7896		unsigned long eff_util, min, max;
7897
7898		/*
7899		 * Performance domain frequency: utilization clamping
7900		 * must be considered since it affects the selection
7901		 * of the performance domain frequency.
7902		 * NOTE: in case RT tasks are running, by default the min
7903		 * utilization can be max OPP.
7904		 */
7905		eff_util = effective_cpu_util(cpu, util, &min, &max);
7906
7907		/* Task's uclamp can modify min and max value */
7908		if (tsk && uclamp_is_used()) {
7909			min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
7910
7911			/*
7912			 * If there is no active max uclamp constraint,
7913			 * directly use task's one, otherwise keep max.
7914			 */
7915			if (uclamp_rq_is_idle(cpu_rq(cpu)))
7916				max = uclamp_eff_value(p, UCLAMP_MAX);
7917			else
7918				max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
7919		}
7920
7921		eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
7922		max_util = max(max_util, eff_util);
7923	}
7924
7925	return min(max_util, eenv->cpu_cap);
7926}
7927
7928/*
7929 * compute_energy(): Use the Energy Model to estimate the energy that @pd would
7930 * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
7931 * contribution is ignored.
7932 */
7933static inline unsigned long
7934compute_energy(struct energy_env *eenv, struct perf_domain *pd,
7935	       struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
7936{
7937	unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
7938	unsigned long busy_time = eenv->pd_busy_time;
7939	unsigned long energy;
7940
7941	if (dst_cpu >= 0)
7942		busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
7943
7944	energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
7945
7946	trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
7947
7948	return energy;
7949}
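/*
 * Worked example with assumed numbers (illustrative only): if one perf
 * domain reports 1000 energy units without @p (dst_cpu < 0), 1250 units
 * with @p placed on CPU2 and 1400 units with @p on CPU3, then the deltas
 * weighed by find_energy_efficient_cpu() below are 250 vs. 400, and CPU2
 * is the more energy-efficient candidate.
 */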
7950
7951/*
7952 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
7953 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
7954 * spare capacity in each performance domain and uses it as a potential
7955 * candidate to execute the task. Then, it uses the Energy Model to figure
7956 * out which of the CPU candidates is the most energy-efficient.
7957 *
7958 * The rationale for this heuristic is as follows. In a performance domain,
7959 * all the most energy efficient CPU candidates (according to the Energy
7960 * Model) are those for which we'll request a low frequency. When there are
7961 * several CPUs for which the frequency request will be the same, we don't
7962 * have enough data to break the tie between them, because the Energy Model
7963 * only includes active power costs. With this model, if we assume that
7964 * frequency requests follow utilization (e.g. using schedutil), the CPU with
7965 * the maximum spare capacity in a performance domain is guaranteed to be among
7966 * the best candidates of the performance domain.
7967 *
7968 * In practice, it could be preferable from an energy standpoint to pack
7969 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
7970 * but that could also hurt our chances to go cluster idle, and we have no
7971 * ways to tell with the current Energy Model if this is actually a good
7972 * idea or not. So, find_energy_efficient_cpu() basically favors
7973 * cluster-packing, and spreading inside a cluster. That should at least be
7974 * a good thing for latency, and this is consistent with the idea that most
7975 * of the energy savings of EAS come from the asymmetry of the system, and
7976 * not so much from breaking the tie between identical CPUs. That's also the
7977 * reason why EAS is enabled in the topology code only for systems where
7978 * SD_ASYM_CPUCAPACITY is set.
7979 *
7980 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
7981 * they don't have any useful utilization data yet and it's not possible to
7982 * forecast their impact on energy consumption. Consequently, they will be
7983 * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
7984 * to be energy-inefficient in some use-cases. The alternative would be to
7985 * bias new tasks towards specific types of CPUs first, or to try to infer
7986 * their util_avg from the parent task, but those heuristics could hurt
7987 * other use-cases too. So, until someone finds a better way to solve this,
7988 * let's keep things simple by re-using the existing slow path.
7989 */
7990static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
7991{
7992	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7993	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
7994	unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
7995	unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
7996	struct root_domain *rd = this_rq()->rd;
7997	int cpu, best_energy_cpu, target = -1;
7998	int prev_fits = -1, best_fits = -1;
7999	unsigned long best_actual_cap = 0;
8000	unsigned long prev_actual_cap = 0;
8001	struct sched_domain *sd;
8002	struct perf_domain *pd;
8003	struct energy_env eenv;
8004
8005	rcu_read_lock();
8006	pd = rcu_dereference(rd->pd);
8007	if (!pd)
8008		goto unlock;
8009
8010	/*
8011	 * Energy-aware wake-up happens on the lowest sched_domain starting
8012	 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
8013	 */
8014	sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
8015	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
8016		sd = sd->parent;
8017	if (!sd)
8018		goto unlock;
8019
8020	target = prev_cpu;
8021
8022	sync_entity_load_avg(&p->se);
8023	if (!task_util_est(p) && p_util_min == 0)
8024		goto unlock;
8025
8026	eenv_task_busy_time(&eenv, p, prev_cpu);
8027
8028	for (; pd; pd = pd->next) {
8029		unsigned long util_min = p_util_min, util_max = p_util_max;
8030		unsigned long cpu_cap, cpu_actual_cap, util;
8031		long prev_spare_cap = -1, max_spare_cap = -1;
8032		unsigned long rq_util_min, rq_util_max;
8033		unsigned long cur_delta, base_energy;
8034		int max_spare_cap_cpu = -1;
8035		int fits, max_fits = -1;
8036
8037		cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
8038
8039		if (cpumask_empty(cpus))
8040			continue;
8041
8042		/* Account external pressure for the energy estimation */
8043		cpu = cpumask_first(cpus);
8044		cpu_actual_cap = get_actual_cpu_capacity(cpu);
8045
8046		eenv.cpu_cap = cpu_actual_cap;
8047		eenv.pd_cap = 0;
8048
8049		for_each_cpu(cpu, cpus) {
8050			struct rq *rq = cpu_rq(cpu);
8051
8052			eenv.pd_cap += cpu_actual_cap;
8053
8054			if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
8055				continue;
8056
8057			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
8058				continue;
8059
8060			util = cpu_util(cpu, p, cpu, 0);
8061			cpu_cap = capacity_of(cpu);
8062
8063			/*
8064			 * Skip CPUs that cannot satisfy the capacity request.
8065			 * IOW, placing the task there would make the CPU
8066			 * overutilized. Take uclamp into account to see how
8067			 * much capacity we can get out of the CPU; this is
8068			 * aligned with sched_cpu_util().
8069			 */
8070			if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
8071				/*
8072				 * Open code uclamp_rq_util_with() except for
8073				 * the clamp() part. I.e.: apply max aggregation
8074				 * only. util_fits_cpu() logic requires to
8075				 * operate on non clamped util but must use the
8076				 * max-aggregated uclamp_{min, max}.
8077				 */
8078				rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
8079				rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
8080
8081				util_min = max(rq_util_min, p_util_min);
8082				util_max = max(rq_util_max, p_util_max);
8083			}
8084
8085			fits = util_fits_cpu(util, util_min, util_max, cpu);
8086			if (!fits)
8087				continue;
8088
8089			lsub_positive(&cpu_cap, util);
8090
8091			if (cpu == prev_cpu) {
8092				/* Always use prev_cpu as a candidate. */
8093				prev_spare_cap = cpu_cap;
8094				prev_fits = fits;
8095			} else if ((fits > max_fits) ||
8096				   ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
8097				/*
8098				 * Find the CPU with the maximum spare capacity
8099				 * among the remaining CPUs in the performance
8100				 * domain.
8101				 */
8102				max_spare_cap = cpu_cap;
8103				max_spare_cap_cpu = cpu;
8104				max_fits = fits;
8105			}
8106		}
8107
8108		if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
8109			continue;
8110
8111		eenv_pd_busy_time(&eenv, cpus, p);
8112		/* Compute the 'base' energy of the pd, without @p */
8113		base_energy = compute_energy(&eenv, pd, cpus, p, -1);
8114
8115		/* Evaluate the energy impact of using prev_cpu. */
8116		if (prev_spare_cap > -1) {
8117			prev_delta = compute_energy(&eenv, pd, cpus, p,
8118						    prev_cpu);
8119			/* CPU utilization has changed */
8120			if (prev_delta < base_energy)
8121				goto unlock;
8122			prev_delta -= base_energy;
8123			prev_actual_cap = cpu_actual_cap;
8124			best_delta = min(best_delta, prev_delta);
8125		}
8126
8127		/* Evaluate the energy impact of using max_spare_cap_cpu. */
8128		if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
8129			/* Current best energy cpu fits better */
8130			if (max_fits < best_fits)
8131				continue;
8132
8133			/*
8134			 * Both don't fit performance hint (i.e. uclamp_min)
8135			 * but best energy cpu has better capacity.
8136			 */
8137			if ((max_fits < 0) &&
8138			    (cpu_actual_cap <= best_actual_cap))
8139				continue;
8140
8141			cur_delta = compute_energy(&eenv, pd, cpus, p,
8142						   max_spare_cap_cpu);
8143			/* CPU utilization has changed */
8144			if (cur_delta < base_energy)
8145				goto unlock;
8146			cur_delta -= base_energy;
8147
8148			/*
8149			 * Both fit for the task but best energy cpu has lower
8150			 * energy impact.
8151			 */
8152			if ((max_fits > 0) && (best_fits > 0) &&
8153			    (cur_delta >= best_delta))
8154				continue;
8155
8156			best_delta = cur_delta;
8157			best_energy_cpu = max_spare_cap_cpu;
8158			best_fits = max_fits;
8159			best_actual_cap = cpu_actual_cap;
8160		}
8161	}
8162	rcu_read_unlock();
8163
8164	if ((best_fits > prev_fits) ||
8165	    ((best_fits > 0) && (best_delta < prev_delta)) ||
8166	    ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
8167		target = best_energy_cpu;
8168
8169	return target;
8170
8171unlock:
8172	rcu_read_unlock();
8173
8174	return target;
8175}
8176
8177/*
8178 * select_task_rq_fair: Select target runqueue for the waking task in domains
8179 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
8180 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
8181 *
8182 * Balances load by selecting the idlest CPU in the idlest group, or under
8183 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
8184 *
8185 * Returns the target CPU number.
8186 */
8187static int
8188select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
8189{
8190	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
8191	struct sched_domain *tmp, *sd = NULL;
8192	int cpu = smp_processor_id();
8193	int new_cpu = prev_cpu;
8194	int want_affine = 0;
8195	/* SD_flags and WF_flags share the first nibble */
8196	int sd_flag = wake_flags & 0xF;
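	/*
	 * sd_flag now holds e.g. SD_BALANCE_FORK for a WF_FORK wakeup or
	 * SD_BALANCE_EXEC for a WF_EXEC one, and is matched against
	 * sd->flags in the domain walk below.
	 */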
8197
8198	/*
8199	 * required for stable ->cpus_allowed
8200	 */
8201	lockdep_assert_held(&p->pi_lock);
8202	if (wake_flags & WF_TTWU) {
8203		record_wakee(p);
8204
8205		if ((wake_flags & WF_CURRENT_CPU) &&
8206		    cpumask_test_cpu(cpu, p->cpus_ptr))
8207			return cpu;
8208
8209		if (!is_rd_overutilized(this_rq()->rd)) {
8210			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
8211			if (new_cpu >= 0)
8212				return new_cpu;
8213			new_cpu = prev_cpu;
8214		}
8215
8216		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
8217	}
8218
8219	rcu_read_lock();
8220	for_each_domain(cpu, tmp) {
8221		/*
8222		 * If both 'cpu' and 'prev_cpu' are part of this domain,
8223		 * cpu is a valid SD_WAKE_AFFINE target.
8224		 */
8225		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
8226		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
8227			if (cpu != prev_cpu)
8228				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
8229
8230			sd = NULL; /* Prefer wake_affine over balance flags */
8231			break;
8232		}
8233
8234		/*
8235		 * Usually only true for WF_EXEC and WF_FORK, as sched_domains
8236		 * usually do not have SD_BALANCE_WAKE set. That means wakeup
8237		 * will usually go to the fast path.
8238		 */
8239		if (tmp->flags & sd_flag)
8240			sd = tmp;
8241		else if (!want_affine)
8242			break;
8243	}
8244
8245	if (unlikely(sd)) {
8246		/* Slow path */
8247		new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
8248	} else if (wake_flags & WF_TTWU) { /* XXX always ? */
8249		/* Fast path */
8250		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
8251	}
8252	rcu_read_unlock();
8253
8254	return new_cpu;
8255}
8256
8257/*
8258 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
8259 * cfs_rq_of(p) references at time of call are still valid and identify the
8260 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
8261 */
8262static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
8263{
8264	struct sched_entity *se = &p->se;
8265
8266	if (!task_on_rq_migrating(p)) {
8267		remove_entity_load_avg(se);
8268
8269		/*
8270		 * Here, the task's PELT values have been updated according to
8271		 * the current rq's clock. But if that clock hasn't been
8272		 * updated in a while, a substantial idle time will be missed,
8273		 * leading to an inflation after wake-up on the new rq.
8274		 *
8275		 * Estimate the missing time from the cfs_rq last_update_time
8276		 * and update sched_avg to improve the PELT continuity after
8277		 * migration.
8278		 */
8279		migrate_se_pelt_lag(se);
8280	}
8281
8282	/* Tell new CPU we are migrated */
8283	se->avg.last_update_time = 0;
8284
8285	update_scan_period(p, new_cpu);
8286}
8287
8288static void task_dead_fair(struct task_struct *p)
8289{
8290	remove_entity_load_avg(&p->se);
8291}
8292
8293/*
8294 * Set the max capacity the task is allowed to run at for misfit detection.
8295 */
8296static void set_task_max_allowed_capacity(struct task_struct *p)
8297{
8298	struct asym_cap_data *entry;
8299
8300	if (!sched_asym_cpucap_active())
8301		return;
8302
8303	rcu_read_lock();
8304	list_for_each_entry_rcu(entry, &asym_cap_list, link) {
8305		cpumask_t *cpumask;
8306
8307		cpumask = cpu_capacity_span(entry);
8308		if (!cpumask_intersects(p->cpus_ptr, cpumask))
8309			continue;
8310
8311		p->max_allowed_capacity = entry->capacity;
8312		break;
8313	}
8314	rcu_read_unlock();
8315}
8316
8317static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
8318{
8319	set_cpus_allowed_common(p, ctx);
8320	set_task_max_allowed_capacity(p);
8321}
8322
8323static int
8324balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
8325{
8326	if (rq->nr_running)
8327		return 1;
8328
8329	return sched_balance_newidle(rq, rf) != 0;
8330}
8331#else
8332static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
8333#endif /* CONFIG_SMP */
8334
8335static void set_next_buddy(struct sched_entity *se)
8336{
8337	for_each_sched_entity(se) {
8338		if (SCHED_WARN_ON(!se->on_rq))
8339			return;
8340		if (se_is_idle(se))
8341			return;
8342		cfs_rq_of(se)->next = se;
8343	}
8344}
8345
8346/*
8347 * Preempt the current task with a newly woken task if needed:
8348 */
8349static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
8350{
8351	struct task_struct *curr = rq->curr;
8352	struct sched_entity *se = &curr->se, *pse = &p->se;
8353	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8354	int cse_is_idle, pse_is_idle;
8355
8356	if (unlikely(se == pse))
8357		return;
8358
8359	/*
8360	 * This is possible from callers such as attach_tasks(), in which we
8361	 * unconditionally wakeup_preempt() after an enqueue (which may have
	 * led to a throttle).  This both saves work and prevents false
8363	 * next-buddy nomination below.
8364	 */
8365	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
8366		return;
8367
8368	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
8369		set_next_buddy(pse);
8370	}
8371
8372	/*
8373	 * We can come here with TIF_NEED_RESCHED already set from new task
8374	 * wake up path.
8375	 *
8376	 * Note: this also catches the edge-case of curr being in a throttled
8377	 * group (e.g. via set_curr_task), since update_curr() (in the
8378	 * enqueue of curr) will have resulted in resched being set.  This
8379	 * prevents us from potentially nominating it as a false LAST_BUDDY
8380	 * below.
8381	 */
8382	if (test_tsk_need_resched(curr))
8383		return;
8384
8385	/* Idle tasks are by definition preempted by non-idle tasks. */
8386	if (unlikely(task_has_idle_policy(curr)) &&
8387	    likely(!task_has_idle_policy(p)))
8388		goto preempt;
8389
8390	/*
8391	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
8392	 * is driven by the tick):
8393	 */
8394	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
8395		return;
8396
8397	find_matching_se(&se, &pse);
8398	WARN_ON_ONCE(!pse);
8399
8400	cse_is_idle = se_is_idle(se);
8401	pse_is_idle = se_is_idle(pse);
8402
8403	/*
8404	 * Preempt an idle group in favor of a non-idle group (and don't preempt
8405	 * in the inverse case).
8406	 */
8407	if (cse_is_idle && !pse_is_idle)
8408		goto preempt;
8409	if (cse_is_idle != pse_is_idle)
8410		return;
8411
8412	cfs_rq = cfs_rq_of(se);
8413	update_curr(cfs_rq);
8414
8415	/*
8416	 * XXX pick_eevdf(cfs_rq) != se ?
8417	 */
8418	if (pick_eevdf(cfs_rq) == pse)
8419		goto preempt;
8420
8421	return;
8422
8423preempt:
8424	resched_curr(rq);
8425}
8426
8427#ifdef CONFIG_SMP
8428static struct task_struct *pick_task_fair(struct rq *rq)
8429{
8430	struct sched_entity *se;
8431	struct cfs_rq *cfs_rq;
8432
8433again:
8434	cfs_rq = &rq->cfs;
8435	if (!cfs_rq->nr_running)
8436		return NULL;
8437
8438	do {
8439		struct sched_entity *curr = cfs_rq->curr;
8440
8441		/* When we pick for a remote RQ, we'll not have done put_prev_entity() */
8442		if (curr) {
8443			if (curr->on_rq)
8444				update_curr(cfs_rq);
8445			else
8446				curr = NULL;
8447
8448			if (unlikely(check_cfs_rq_runtime(cfs_rq)))
8449				goto again;
8450		}
8451
8452		se = pick_next_entity(cfs_rq);
8453		cfs_rq = group_cfs_rq(se);
8454	} while (cfs_rq);
8455
8456	return task_of(se);
8457}
8458#endif
8459
8460struct task_struct *
8461pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
8462{
8463	struct cfs_rq *cfs_rq = &rq->cfs;
8464	struct sched_entity *se;
8465	struct task_struct *p;
8466	int new_tasks;
8467
8468again:
8469	if (!sched_fair_runnable(rq))
8470		goto idle;
8471
8472#ifdef CONFIG_FAIR_GROUP_SCHED
8473	if (!prev || prev->sched_class != &fair_sched_class)
8474		goto simple;
8475
8476	/*
8477	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
8478	 * likely that a next task is from the same cgroup as the current.
8479	 *
8480	 * Therefore attempt to avoid putting and setting the entire cgroup
8481	 * hierarchy, only change the part that actually changes.
8482	 */
8483
8484	do {
8485		struct sched_entity *curr = cfs_rq->curr;
8486
8487		/*
8488		 * Since we got here without doing put_prev_entity() we also
8489		 * have to consider cfs_rq->curr. If it is still a runnable
8490		 * entity, update_curr() will update its vruntime, otherwise
8491		 * forget we've ever seen it.
8492		 */
8493		if (curr) {
8494			if (curr->on_rq)
8495				update_curr(cfs_rq);
8496			else
8497				curr = NULL;
8498
8499			/*
8500			 * This call to check_cfs_rq_runtime() will do the
8501			 * throttle and dequeue its entity in the parent(s).
8502			 * Therefore the nr_running test will indeed
8503			 * be correct.
8504			 */
8505			if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
8506				cfs_rq = &rq->cfs;
8507
8508				if (!cfs_rq->nr_running)
8509					goto idle;
8510
8511				goto simple;
8512			}
8513		}
8514
8515		se = pick_next_entity(cfs_rq);
8516		cfs_rq = group_cfs_rq(se);
8517	} while (cfs_rq);
8518
8519	p = task_of(se);
8520
8521	/*
	 * Since we haven't yet done put_prev_entity(), if the selected task
	 * is different from the one we started out with, try to touch the
	 * smallest possible number of cfs_rqs.
8525	 */
8526	if (prev != p) {
8527		struct sched_entity *pse = &prev->se;
8528
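		/*
		 * Walk prev's and next's entity hierarchies upwards, putting
		 * and setting entities level by level until both point into
		 * the same cfs_rq; only that common cfs_rq then needs the
		 * final put_prev_entity()/set_next_entity() below.
		 */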
8529		while (!(cfs_rq = is_same_group(se, pse))) {
8530			int se_depth = se->depth;
8531			int pse_depth = pse->depth;
8532
8533			if (se_depth <= pse_depth) {
8534				put_prev_entity(cfs_rq_of(pse), pse);
8535				pse = parent_entity(pse);
8536			}
8537			if (se_depth >= pse_depth) {
8538				set_next_entity(cfs_rq_of(se), se);
8539				se = parent_entity(se);
8540			}
8541		}
8542
8543		put_prev_entity(cfs_rq, pse);
8544		set_next_entity(cfs_rq, se);
8545	}
8546
8547	goto done;
8548simple:
8549#endif
8550	if (prev)
8551		put_prev_task(rq, prev);
8552
8553	do {
8554		se = pick_next_entity(cfs_rq);
8555		set_next_entity(cfs_rq, se);
8556		cfs_rq = group_cfs_rq(se);
8557	} while (cfs_rq);
8558
8559	p = task_of(se);
8560
8561done: __maybe_unused;
8562#ifdef CONFIG_SMP
8563	/*
8564	 * Move the next running task to the front of
	 * the list, so that our cfs_tasks list becomes an
	 * MRU-ordered one.
8567	 */
8568	list_move(&p->se.group_node, &rq->cfs_tasks);
8569#endif
8570
8571	if (hrtick_enabled_fair(rq))
8572		hrtick_start_fair(rq, p);
8573
8574	update_misfit_status(p, rq);
8575	sched_fair_update_stop_tick(rq, p);
8576
8577	return p;
8578
8579idle:
8580	if (!rf)
8581		return NULL;
8582
8583	new_tasks = sched_balance_newidle(rq, rf);
8584
8585	/*
8586	 * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
8587	 * possible for any higher priority task to appear. In that case we
8588	 * must re-start the pick_next_entity() loop.
8589	 */
8590	if (new_tasks < 0)
8591		return RETRY_TASK;
8592
8593	if (new_tasks > 0)
8594		goto again;
8595
8596	/*
8597	 * rq is about to be idle, check if we need to update the
8598	 * lost_idle_time of clock_pelt
8599	 */
8600	update_idle_rq_clock_pelt(rq);
8601
8602	return NULL;
8603}
8604
8605static struct task_struct *__pick_next_task_fair(struct rq *rq)
8606{
8607	return pick_next_task_fair(rq, NULL, NULL);
8608}
8609
8610/*
8611 * Account for a descheduled task:
8612 */
8613static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
8614{
8615	struct sched_entity *se = &prev->se;
8616	struct cfs_rq *cfs_rq;
8617
8618	for_each_sched_entity(se) {
8619		cfs_rq = cfs_rq_of(se);
8620		put_prev_entity(cfs_rq, se);
8621	}
8622}
8623
8624/*
8625 * sched_yield() is very simple
8626 */
8627static void yield_task_fair(struct rq *rq)
8628{
8629	struct task_struct *curr = rq->curr;
8630	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8631	struct sched_entity *se = &curr->se;
8632
8633	/*
8634	 * Are we the only task in the tree?
8635	 */
8636	if (unlikely(rq->nr_running == 1))
8637		return;
8638
8639	clear_buddies(cfs_rq, se);
8640
8641	update_rq_clock(rq);
8642	/*
8643	 * Update run-time statistics of the 'current'.
8644	 */
8645	update_curr(cfs_rq);
8646	/*
8647	 * Tell update_rq_clock() that we've just updated,
	 * so we don't do a microscopic update in schedule()
8649	 * and double the fastpath cost.
8650	 */
8651	rq_clock_skip_update(rq);
8652
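	/*
	 * Push this entity's virtual deadline one (weight-scaled) slice into
	 * the future so that EEVDF de-prioritizes it for a while.
	 */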
8653	se->deadline += calc_delta_fair(se->slice, se);
8654}
8655
8656static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
8657{
8658	struct sched_entity *se = &p->se;
8659
8660	/* throttled hierarchies are not runnable */
8661	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
8662		return false;
8663
8664	/* Tell the scheduler that we'd really like se to run next. */
8665	set_next_buddy(se);
8666
8667	yield_task_fair(rq);
8668
8669	return true;
8670}
8671
8672#ifdef CONFIG_SMP
8673/**************************************************
8674 * Fair scheduling class load-balancing methods.
8675 *
8676 * BASICS
8677 *
8678 * The purpose of load-balancing is to achieve the same basic fairness the
8679 * per-CPU scheduler provides, namely provide a proportional amount of compute
8680 * time to each task. This is expressed in the following equation:
8681 *
8682 *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
8683 *
8684 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
8685 * W_i,0 is defined as:
8686 *
8687 *   W_i,0 = \Sum_j w_i,j                                             (2)
8688 *
8689 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
8690 * is derived from the nice value as per sched_prio_to_weight[].
8691 *
8692 * The weight average is an exponential decay average of the instantaneous
8693 * weight:
8694 *
8695 *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
8696 *
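 * For example (illustrative), with n = 2 equation (3) reduces to
 * W'_i,2 = 3/4 * W_i,2 + 1/4 * W_i,0, i.e. a quarter of the updated
 * average comes from the instantaneous weight.
 *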
 * C_i is the compute capacity of CPU i; typically it is the
8698 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
8699 * can also include other factors [XXX].
8700 *
8701 * To achieve this balance we define a measure of imbalance which follows
8702 * directly from (1):
8703 *
8704 *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
8705 *
 * We then move tasks around to minimize the imbalance. In the continuous
8707 * function space it is obvious this converges, in the discrete case we get
8708 * a few fun cases generally called infeasible weight scenarios.
8709 *
8710 * [XXX expand on:
8711 *     - infeasible weights;
8712 *     - local vs global optima in the discrete case. ]
8713 *
8714 *
8715 * SCHED DOMAINS
8716 *
8717 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
8718 * for all i,j solution, we create a tree of CPUs that follows the hardware
8719 * topology where each level pairs two lower groups (or better). This results
8720 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
8721 * tree to only the first of the previous level and we decrease the frequency
8722 * of load-balance at each level inv. proportional to the number of CPUs in
8723 * the groups.
8724 *
8725 * This yields:
8726 *
8727 *     log_2 n     1     n
8728 *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
8729 *     i = 0      2^i   2^i
8730 *                               `- size of each group
8731 *         |         |     `- number of CPUs doing load-balance
8732 *         |         `- freq
8733 *         `- sum over all levels
8734 *
8735 * Coupled with a limit on how many tasks we can migrate every balance pass,
8736 * this makes (5) the runtime complexity of the balancer.
8737 *
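 * For example (illustrative), with n = 64 the per-level terms of (5) are
 * 64 + 32 + 16 + 8 + 4 + 2 + 1 = 127 < 2n, which is where the O(n) bound
 * comes from.
 *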
8738 * An important property here is that each CPU is still (indirectly) connected
8739 * to every other CPU in at most O(log n) steps:
8740 *
8741 * The adjacency matrix of the resulting graph is given by:
8742 *
8743 *             log_2 n
8744 *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
8745 *             k = 0
8746 *
8747 * And you'll find that:
8748 *
8749 *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
8750 *
8751 * Showing there's indeed a path between every CPU in at most O(log n) steps.
8752 * The task movement gives a factor of O(m), giving a convergence complexity
8753 * of:
8754 *
8755 *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
8756 *
8757 *
8758 * WORK CONSERVING
8759 *
8760 * In order to avoid CPUs going idle while there's still work to do, new idle
8761 * balancing is more aggressive and has the newly idle CPU iterate up the domain
8762 * tree itself instead of relying on other CPUs to bring it work.
8763 *
8764 * This adds some complexity to both (5) and (8) but it reduces the total idle
8765 * time.
8766 *
8767 * [XXX more?]
8768 *
8769 *
8770 * CGROUPS
8771 *
8772 * Cgroups make a horror show out of (2), instead of a simple sum we get:
8773 *
8774 *                                s_k,i
8775 *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
8776 *                                 S_k
8777 *
8778 * Where
8779 *
8780 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
8781 *
8782 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
8783 *
 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
8785 * property.
8786 *
8787 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
8788 *      rewrite all of this once again.]
8789 */
8790
8791static unsigned long __read_mostly max_load_balance_interval = HZ/10;
8792
8793enum fbq_type { regular, remote, all };
8794
8795/*
8796 * 'group_type' describes the group of CPUs at the moment of load balancing.
8797 *
8798 * The enum is ordered by pulling priority, with the group with lowest priority
8799 * first so the group_type can simply be compared when selecting the busiest
8800 * group. See update_sd_pick_busiest().
8801 */
8802enum group_type {
8803	/* The group has spare capacity that can be used to run more tasks.  */
8804	group_has_spare = 0,
8805	/*
8806	 * The group is fully used and the tasks don't compete for more CPU
8807	 * cycles. Nevertheless, some tasks might wait before running.
8808	 */
8809	group_fully_busy,
8810	/*
8811	 * One task doesn't fit with CPU's capacity and must be migrated to a
8812	 * more powerful CPU.
8813	 */
8814	group_misfit_task,
8815	/*
	 * Balance an SMT group that's fully busy. Can benefit from migrating
	 * a task on an SMT core with a busy sibling to another CPU on an
	 * idle core.
8818	 */
8819	group_smt_balance,
8820	/*
8821	 * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
8822	 * and the task should be migrated to it instead of running on the
8823	 * current CPU.
8824	 */
8825	group_asym_packing,
8826	/*
8827	 * The tasks' affinity constraints previously prevented the scheduler
8828	 * from balancing the load across the system.
8829	 */
8830	group_imbalanced,
8831	/*
8832	 * The CPU is overloaded and can't provide expected CPU cycles to all
8833	 * tasks.
8834	 */
8835	group_overloaded
8836};
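/*
 * For example, given the ordering above a group classified group_overloaded
 * compares greater than, and is thus preferred as busiest over, a group that
 * is merely group_fully_busy.
 */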
8837
8838enum migration_type {
8839	migrate_load = 0,
8840	migrate_util,
8841	migrate_task,
8842	migrate_misfit
8843};
8844
8845#define LBF_ALL_PINNED	0x01
8846#define LBF_NEED_BREAK	0x02
8847#define LBF_DST_PINNED  0x04
8848#define LBF_SOME_PINNED	0x08
8849#define LBF_ACTIVE_LB	0x10
8850
8851struct lb_env {
8852	struct sched_domain	*sd;
8853
8854	struct rq		*src_rq;
8855	int			src_cpu;
8856
8857	int			dst_cpu;
8858	struct rq		*dst_rq;
8859
8860	struct cpumask		*dst_grpmask;
8861	int			new_dst_cpu;
8862	enum cpu_idle_type	idle;
8863	long			imbalance;
8864	/* The set of CPUs under consideration for load-balancing */
8865	struct cpumask		*cpus;
8866
8867	unsigned int		flags;
8868
8869	unsigned int		loop;
8870	unsigned int		loop_break;
8871	unsigned int		loop_max;
8872
8873	enum fbq_type		fbq_type;
8874	enum migration_type	migration_type;
8875	struct list_head	tasks;
8876};
8877
8878/*
8879 * Is this task likely cache-hot:
8880 */
8881static int task_hot(struct task_struct *p, struct lb_env *env)
8882{
8883	s64 delta;
8884
8885	lockdep_assert_rq_held(env->src_rq);
8886
8887	if (p->sched_class != &fair_sched_class)
8888		return 0;
8889
8890	if (unlikely(task_has_idle_policy(p)))
8891		return 0;
8892
8893	/* SMT siblings share cache */
8894	if (env->sd->flags & SD_SHARE_CPUCAPACITY)
8895		return 0;
8896
8897	/*
8898	 * Buddy candidates are cache hot:
8899	 */
8900	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
8901	    (&p->se == cfs_rq_of(&p->se)->next))
8902		return 1;
8903
8904	if (sysctl_sched_migration_cost == -1)
8905		return 1;
8906
8907	/*
	 * Don't migrate a task if its cookie does not match
	 * the destination CPU's core cookie.
8910	 */
8911	if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
8912		return 1;
8913
8914	if (sysctl_sched_migration_cost == 0)
8915		return 0;
8916
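	/*
	 * Otherwise the task is considered cache hot if it has run on the
	 * source rq within the last sysctl_sched_migration_cost nanoseconds.
	 */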
8917	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
8918
8919	return delta < (s64)sysctl_sched_migration_cost;
8920}
8921
8922#ifdef CONFIG_NUMA_BALANCING
8923/*
 * Returns 1 if task migration degrades locality.
 * Returns 0 if task migration improves locality, i.e. migration is preferred.
 * Returns -1 if task migration is not affected by locality.
8927 */
8928static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
8929{
8930	struct numa_group *numa_group = rcu_dereference(p->numa_group);
8931	unsigned long src_weight, dst_weight;
8932	int src_nid, dst_nid, dist;
8933
8934	if (!static_branch_likely(&sched_numa_balancing))
8935		return -1;
8936
8937	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
8938		return -1;
8939
8940	src_nid = cpu_to_node(env->src_cpu);
8941	dst_nid = cpu_to_node(env->dst_cpu);
8942
8943	if (src_nid == dst_nid)
8944		return -1;
8945
8946	/* Migrating away from the preferred node is always bad. */
8947	if (src_nid == p->numa_preferred_nid) {
8948		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
8949			return 1;
8950		else
8951			return -1;
8952	}
8953
8954	/* Encourage migration to the preferred node. */
8955	if (dst_nid == p->numa_preferred_nid)
8956		return 0;
8957
8958	/* Leaving a core idle is often worse than degrading locality. */
8959	if (env->idle == CPU_IDLE)
8960		return -1;
8961
8962	dist = node_distance(src_nid, dst_nid);
8963	if (numa_group) {
8964		src_weight = group_weight(p, src_nid, dist);
8965		dst_weight = group_weight(p, dst_nid, dist);
8966	} else {
8967		src_weight = task_weight(p, src_nid, dist);
8968		dst_weight = task_weight(p, dst_nid, dist);
8969	}
8970
8971	return dst_weight < src_weight;
8972}
8973
8974#else
8975static inline int migrate_degrades_locality(struct task_struct *p,
8976					     struct lb_env *env)
8977{
8978	return -1;
8979}
8980#endif
8981
8982/*
8983 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
8984 */
8985static
8986int can_migrate_task(struct task_struct *p, struct lb_env *env)
8987{
8988	int tsk_cache_hot;
8989
8990	lockdep_assert_rq_held(env->src_rq);
8991
8992	/*
	 * We do not migrate tasks that:
	 * 1) are throttled (see throttled_lb_pair()), or
	 * 2) cannot be migrated to this CPU due to cpus_ptr, or
	 * 3) are running (obviously), or
	 * 4) are cache-hot on their current CPU.
8998	 */
8999	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
9000		return 0;
9001
9002	/* Disregard percpu kthreads; they are where they need to be. */
9003	if (kthread_is_per_cpu(p))
9004		return 0;
9005
9006	if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
9007		int cpu;
9008
9009		schedstat_inc(p->stats.nr_failed_migrations_affine);
9010
9011		env->flags |= LBF_SOME_PINNED;
9012
9013		/*
9014		 * Remember if this task can be migrated to any other CPU in
9015		 * our sched_group. We may want to revisit it if we couldn't
9016		 * meet load balance goals by pulling other tasks on src_cpu.
9017		 *
9018		 * Avoid computing new_dst_cpu
9019		 * - for NEWLY_IDLE
9020		 * - if we have already computed one in current iteration
9021		 * - if it's an active balance
9022		 */
9023		if (env->idle == CPU_NEWLY_IDLE ||
9024		    env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
9025			return 0;
9026
9027		/* Prevent re-selecting dst_cpu via env's CPUs: */
9028		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
9029			if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
9030				env->flags |= LBF_DST_PINNED;
9031				env->new_dst_cpu = cpu;
9032				break;
9033			}
9034		}
9035
9036		return 0;
9037	}
9038
9039	/* Record that we found at least one task that could run on dst_cpu */
9040	env->flags &= ~LBF_ALL_PINNED;
9041
9042	if (task_on_cpu(env->src_rq, p)) {
9043		schedstat_inc(p->stats.nr_failed_migrations_running);
9044		return 0;
9045	}
9046
9047	/*
9048	 * Aggressive migration if:
9049	 * 1) active balance
9050	 * 2) destination numa is preferred
9051	 * 3) task is cache cold, or
9052	 * 4) too many balance attempts have failed.
9053	 */
9054	if (env->flags & LBF_ACTIVE_LB)
9055		return 1;
9056
9057	tsk_cache_hot = migrate_degrades_locality(p, env);
9058	if (tsk_cache_hot == -1)
9059		tsk_cache_hot = task_hot(p, env);
9060
9061	if (tsk_cache_hot <= 0 ||
9062	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
9063		if (tsk_cache_hot == 1) {
9064			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
9065			schedstat_inc(p->stats.nr_forced_migrations);
9066		}
9067		return 1;
9068	}
9069
9070	schedstat_inc(p->stats.nr_failed_migrations_hot);
9071	return 0;
9072}
9073
9074/*
9075 * detach_task() -- detach the task for the migration specified in env
9076 */
9077static void detach_task(struct task_struct *p, struct lb_env *env)
9078{
9079	lockdep_assert_rq_held(env->src_rq);
9080
9081	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
9082	set_task_cpu(p, env->dst_cpu);
9083}
9084
9085/*
9086 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
9087 * part of active balancing operations within "domain".
9088 *
9089 * Returns a task if successful and NULL otherwise.
9090 */
9091static struct task_struct *detach_one_task(struct lb_env *env)
9092{
9093	struct task_struct *p;
9094
9095	lockdep_assert_rq_held(env->src_rq);
9096
9097	list_for_each_entry_reverse(p,
9098			&env->src_rq->cfs_tasks, se.group_node) {
9099		if (!can_migrate_task(p, env))
9100			continue;
9101
9102		detach_task(p, env);
9103
9104		/*
9105		 * Right now, this is only the second place where
9106		 * lb_gained[env->idle] is updated (other is detach_tasks)
9107		 * so we can safely collect stats here rather than
9108		 * inside detach_tasks().
9109		 */
9110		schedstat_inc(env->sd->lb_gained[env->idle]);
9111		return p;
9112	}
9113	return NULL;
9114}
9115
9116/*
9117 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
9118 * busiest_rq, as part of a balancing operation within domain "sd".
9119 *
9120 * Returns number of detached tasks if successful and 0 otherwise.
9121 */
9122static int detach_tasks(struct lb_env *env)
9123{
9124	struct list_head *tasks = &env->src_rq->cfs_tasks;
9125	unsigned long util, load;
9126	struct task_struct *p;
9127	int detached = 0;
9128
9129	lockdep_assert_rq_held(env->src_rq);
9130
9131	/*
9132	 * Source run queue has been emptied by another CPU, clear
9133	 * LBF_ALL_PINNED flag as we will not test any task.
9134	 */
9135	if (env->src_rq->nr_running <= 1) {
9136		env->flags &= ~LBF_ALL_PINNED;
9137		return 0;
9138	}
9139
9140	if (env->imbalance <= 0)
9141		return 0;
9142
9143	while (!list_empty(tasks)) {
9144		/*
9145		 * We don't want to steal all the tasks, otherwise we may be treated
9146		 * likewise, which could at worst lead to a livelock.
9147		 */
9148		if (env->idle && env->src_rq->nr_running <= 1)
9149			break;
9150
9151		env->loop++;
9152		/*
9153		 * We've more or less seen every task there is, call it quits
9154		 * unless we haven't found any movable task yet.
9155		 */
9156		if (env->loop > env->loop_max &&
9157		    !(env->flags & LBF_ALL_PINNED))
9158			break;
9159
9160		/* take a breather every nr_migrate tasks */
9161		if (env->loop > env->loop_break) {
9162			env->loop_break += SCHED_NR_MIGRATE_BREAK;
9163			env->flags |= LBF_NEED_BREAK;
9164			break;
9165		}
9166
9167		p = list_last_entry(tasks, struct task_struct, se.group_node);
9168
9169		if (!can_migrate_task(p, env))
9170			goto next;
9171
9172		switch (env->migration_type) {
9173		case migrate_load:
9174			/*
9175			 * Depending on the number of CPUs and tasks and the
9176			 * cgroup hierarchy, task_h_load() can return a zero
9177			 * value. Make sure that env->imbalance decreases,
9178			 * otherwise detach_tasks() will stop only after
9179			 * detaching up to loop_max tasks.
9180			 */
9181			load = max_t(unsigned long, task_h_load(p), 1);
9182
9183			if (sched_feat(LB_MIN) &&
9184			    load < 16 && !env->sd->nr_balance_failed)
9185				goto next;
9186
9187			/*
9188			 * Make sure that we don't migrate too much load.
9189			 * Nevertheless, let's relax the constraint if the
9190			 * scheduler fails to find a good waiting task to
9191			 * migrate.
9192			 */
9193			if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
9194				goto next;
9195
9196			env->imbalance -= load;
9197			break;
9198
9199		case migrate_util:
9200			util = task_util_est(p);
9201
9202			if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
9203				goto next;
9204
9205			env->imbalance -= util;
9206			break;
9207
9208		case migrate_task:
9209			env->imbalance--;
9210			break;
9211
9212		case migrate_misfit:
9213			/* This is not a misfit task */
9214			if (task_fits_cpu(p, env->src_cpu))
9215				goto next;
9216
9217			env->imbalance = 0;
9218			break;
9219		}
9220
9221		detach_task(p, env);
9222		list_add(&p->se.group_node, &env->tasks);
9223
9224		detached++;
9225
9226#ifdef CONFIG_PREEMPTION
9227		/*
9228		 * NEWIDLE balancing is a source of latency, so preemptible
9229		 * kernels will stop after the first task is detached to minimize
9230		 * the critical section.
9231		 */
9232		if (env->idle == CPU_NEWLY_IDLE)
9233			break;
9234#endif
9235
9236		/*
9237		 * We only want to steal up to the prescribed amount of
9238		 * load/util/tasks.
9239		 */
9240		if (env->imbalance <= 0)
9241			break;
9242
9243		continue;
9244next:
9245		list_move(&p->se.group_node, tasks);
9246	}
9247
9248	/*
9249	 * Right now, this is one of only two places we collect this stat
9250	 * so we can safely collect detach_one_task() stats here rather
9251	 * than inside detach_one_task().
9252	 */
9253	schedstat_add(env->sd->lb_gained[env->idle], detached);
9254
9255	return detached;
9256}
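
/*
 * Worked example of the migrate_load limit in detach_tasks() above
 * (illustrative numbers, assuming shr_bound() is a bounded right shift):
 * with env->imbalance = 300, a candidate task with task_h_load() = 1000
 * is skipped while nr_balance_failed == 0, since 1000 > 300. After two
 * failed balance rounds, shr_bound(1000, 2) = 250 <= 300, so the
 * constraint is relaxed and the task may be detached.
 */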
9257
9258/*
9259 * attach_task() -- attach the task detached by detach_task() to its new rq.
9260 */
9261static void attach_task(struct rq *rq, struct task_struct *p)
9262{
9263	lockdep_assert_rq_held(rq);
9264
9265	WARN_ON_ONCE(task_rq(p) != rq);
9266	activate_task(rq, p, ENQUEUE_NOCLOCK);
9267	wakeup_preempt(rq, p, 0);
9268}
9269
9270/*
9271 * attach_one_task() -- attaches the task returned from detach_one_task() to
9272 * its new rq.
9273 */
9274static void attach_one_task(struct rq *rq, struct task_struct *p)
9275{
9276	struct rq_flags rf;
9277
9278	rq_lock(rq, &rf);
9279	update_rq_clock(rq);
9280	attach_task(rq, p);
9281	rq_unlock(rq, &rf);
9282}
9283
9284/*
9285 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
9286 * new rq.
9287 */
9288static void attach_tasks(struct lb_env *env)
9289{
9290	struct list_head *tasks = &env->tasks;
9291	struct task_struct *p;
9292	struct rq_flags rf;
9293
9294	rq_lock(env->dst_rq, &rf);
9295	update_rq_clock(env->dst_rq);
9296
9297	while (!list_empty(tasks)) {
9298		p = list_first_entry(tasks, struct task_struct, se.group_node);
9299		list_del_init(&p->se.group_node);
9300
9301		attach_task(env->dst_rq, p);
9302	}
9303
9304	rq_unlock(env->dst_rq, &rf);
9305}
9306
9307#ifdef CONFIG_NO_HZ_COMMON
9308static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
9309{
9310	if (cfs_rq->avg.load_avg)
9311		return true;
9312
9313	if (cfs_rq->avg.util_avg)
9314		return true;
9315
9316	return false;
9317}
9318
9319static inline bool others_have_blocked(struct rq *rq)
9320{
9321	if (cpu_util_rt(rq))
9322		return true;
9323
9324	if (cpu_util_dl(rq))
9325		return true;
9326
9327	if (hw_load_avg(rq))
9328		return true;
9329
9330	if (cpu_util_irq(rq))
9331		return true;
9332
9333	return false;
9334}
9335
9336static inline void update_blocked_load_tick(struct rq *rq)
9337{
9338	WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
9339}
9340
9341static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
9342{
9343	if (!has_blocked)
9344		rq->has_blocked_load = 0;
9345}
9346#else
9347static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
9348static inline bool others_have_blocked(struct rq *rq) { return false; }
9349static inline void update_blocked_load_tick(struct rq *rq) {}
9350static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
9351#endif
9352
9353static bool __update_blocked_others(struct rq *rq, bool *done)
9354{
9355	const struct sched_class *curr_class;
9356	u64 now = rq_clock_pelt(rq);
9357	unsigned long hw_pressure;
9358	bool decayed;
9359
9360	/*
9361	 * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
9362	 * DL and IRQ signals have been updated before updating CFS.
9363	 */
9364	curr_class = rq->curr->sched_class;
9365
9366	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
9367
9368	decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
9369		  update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
9370		  update_hw_load_avg(now, rq, hw_pressure) |
9371		  update_irq_load_avg(rq, 0);
9372
9373	if (others_have_blocked(rq))
9374		*done = false;
9375
9376	return decayed;
9377}
9378
9379#ifdef CONFIG_FAIR_GROUP_SCHED
9380
9381static bool __update_blocked_fair(struct rq *rq, bool *done)
9382{
9383	struct cfs_rq *cfs_rq, *pos;
9384	bool decayed = false;
9385	int cpu = cpu_of(rq);
9386
9387	/*
9388	 * Iterates the task_group tree in a bottom up fashion, see
9389	 * list_add_leaf_cfs_rq() for details.
9390	 */
9391	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
9392		struct sched_entity *se;
9393
9394		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
9395			update_tg_load_avg(cfs_rq);
9396
9397			if (cfs_rq->nr_running == 0)
9398				update_idle_cfs_rq_clock_pelt(cfs_rq);
9399
9400			if (cfs_rq == &rq->cfs)
9401				decayed = true;
9402		}
9403
9404		/* Propagate pending load changes to the parent, if any: */
9405		se = cfs_rq->tg->se[cpu];
9406		if (se && !skip_blocked_update(se))
9407			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
9408
9409		/*
9410		 * There can be a lot of idle CPU cgroups.  Don't let fully
9411		 * decayed cfs_rqs linger on the list.
9412		 */
9413		if (cfs_rq_is_decayed(cfs_rq))
9414			list_del_leaf_cfs_rq(cfs_rq);
9415
9416		/* Don't need periodic decay once load/util_avg are zero */
9417		if (cfs_rq_has_blocked(cfs_rq))
9418			*done = false;
9419	}
9420
9421	return decayed;
9422}
9423
9424/*
9425 * Compute the hierarchical load factor for cfs_rq and all its ancestors.
9426 * This needs to be done in a top-down fashion because the load of a child
9427 * group is a fraction of its parent's load.
9428 */
9429static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9430{
9431	struct rq *rq = rq_of(cfs_rq);
9432	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
9433	unsigned long now = jiffies;
9434	unsigned long load;
9435
9436	if (cfs_rq->last_h_load_update == now)
9437		return;
9438
9439	WRITE_ONCE(cfs_rq->h_load_next, NULL);
9440	for_each_sched_entity(se) {
9441		cfs_rq = cfs_rq_of(se);
9442		WRITE_ONCE(cfs_rq->h_load_next, se);
9443		if (cfs_rq->last_h_load_update == now)
9444			break;
9445	}
9446
9447	if (!se) {
9448		cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
9449		cfs_rq->last_h_load_update = now;
9450	}
9451
9452	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
9453		load = cfs_rq->h_load;
9454		load = div64_ul(load * se->avg.load_avg,
9455			cfs_rq_load_avg(cfs_rq) + 1);
9456		cfs_rq = group_cfs_rq(se);
9457		cfs_rq->h_load = load;
9458		cfs_rq->last_h_load_update = now;
9459	}
9460}
9461
9462static unsigned long task_h_load(struct task_struct *p)
9463{
9464	struct cfs_rq *cfs_rq = task_cfs_rq(p);
9465
9466	update_cfs_rq_h_load(cfs_rq);
9467	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
9468			cfs_rq_load_avg(cfs_rq) + 1);
9469}
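
/*
 * Illustration of the hierarchical load scaling above (invented numbers):
 * if the task's cfs_rq has h_load = 600 and cfs_rq_load_avg() = 1024, a
 * task with se.avg.load_avg = 512 yields
 *
 *	512 * 600 / (1024 + 1) ~= 299
 *
 * i.e. roughly half of the group's hierarchical load, matching the task's
 * share of the group's load_avg.
 */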
9470#else
9471static bool __update_blocked_fair(struct rq *rq, bool *done)
9472{
9473	struct cfs_rq *cfs_rq = &rq->cfs;
9474	bool decayed;
9475
9476	decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
9477	if (cfs_rq_has_blocked(cfs_rq))
9478		*done = false;
9479
9480	return decayed;
9481}
9482
9483static unsigned long task_h_load(struct task_struct *p)
9484{
9485	return p->se.avg.load_avg;
9486}
9487#endif
9488
9489static void sched_balance_update_blocked_averages(int cpu)
9490{
9491	bool decayed = false, done = true;
9492	struct rq *rq = cpu_rq(cpu);
9493	struct rq_flags rf;
9494
9495	rq_lock_irqsave(rq, &rf);
9496	update_blocked_load_tick(rq);
9497	update_rq_clock(rq);
9498
9499	decayed |= __update_blocked_others(rq, &done);
9500	decayed |= __update_blocked_fair(rq, &done);
9501
9502	update_blocked_load_status(rq, !done);
9503	if (decayed)
9504		cpufreq_update_util(rq, 0);
9505	rq_unlock_irqrestore(rq, &rf);
9506}
9507
9508/********** Helpers for sched_balance_find_src_group ************************/
9509
9510/*
9511 * sg_lb_stats - stats of a sched_group required for load-balancing:
9512 */
9513struct sg_lb_stats {
9514	unsigned long avg_load;			/* Avg load            over the CPUs of the group */
9515	unsigned long group_load;		/* Total load          over the CPUs of the group */
9516	unsigned long group_capacity;		/* Capacity            over the CPUs of the group */
9517	unsigned long group_util;		/* Total utilization   over the CPUs of the group */
9518	unsigned long group_runnable;		/* Total runnable time over the CPUs of the group */
9519	unsigned int sum_nr_running;		/* Nr of all tasks running in the group */
9520	unsigned int sum_h_nr_running;		/* Nr of CFS tasks running in the group */
9521	unsigned int idle_cpus;                 /* Nr of idle CPUs         in the group */
9522	unsigned int group_weight;
9523	enum group_type group_type;
9524	unsigned int group_asym_packing;	/* Tasks should be moved to preferred CPU */
9525	unsigned int group_smt_balance;		/* Task on busy SMT be moved */
9526	unsigned long group_misfit_task_load;	/* A CPU has a task too big for its capacity */
9527#ifdef CONFIG_NUMA_BALANCING
9528	unsigned int nr_numa_running;
9529	unsigned int nr_preferred_running;
9530#endif
9531};
9532
9533/*
9534 * sd_lb_stats - stats of a sched_domain required for load-balancing:
9535 */
9536struct sd_lb_stats {
9537	struct sched_group *busiest;		/* Busiest group in this sd */
9538	struct sched_group *local;		/* Local group in this sd */
9539	unsigned long total_load;		/* Total load of all groups in sd */
9540	unsigned long total_capacity;		/* Total capacity of all groups in sd */
9541	unsigned long avg_load;			/* Average load across all groups in sd */
9542	unsigned int prefer_sibling;		/* Tasks should go to sibling first */
9543
9544	struct sg_lb_stats busiest_stat;	/* Statistics of the busiest group */
9545	struct sg_lb_stats local_stat;		/* Statistics of the local group */
9546};
9547
9548static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
9549{
9550	/*
9551	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
9552	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
9553	 * We must however set busiest_stat::group_type and
9554	 * busiest_stat::idle_cpus to the worst busiest group because
9555	 * update_sd_pick_busiest() reads these before assignment.
9556	 */
9557	*sds = (struct sd_lb_stats){
9558		.busiest = NULL,
9559		.local = NULL,
9560		.total_load = 0UL,
9561		.total_capacity = 0UL,
9562		.busiest_stat = {
9563			.idle_cpus = UINT_MAX,
9564			.group_type = group_has_spare,
9565		},
9566	};
9567}
9568
9569static unsigned long scale_rt_capacity(int cpu)
9570{
9571	unsigned long max = get_actual_cpu_capacity(cpu);
9572	struct rq *rq = cpu_rq(cpu);
9573	unsigned long used, free;
9574	unsigned long irq;
9575
9576	irq = cpu_util_irq(rq);
9577
9578	if (unlikely(irq >= max))
9579		return 1;
9580
9581	/*
9582	 * avg_rt.util_avg and avg_dl.util_avg track binary signals
9583	 * (running and not running) with weights 0 and 1024 respectively.
9584	 */
9585	used = cpu_util_rt(rq);
9586	used += cpu_util_dl(rq);
9587
9588	if (unlikely(used >= max))
9589		return 1;
9590
9591	free = max - used;
9592
9593	return scale_irq_capacity(free, irq, max);
9594}
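
/*
 * Example of the scaling above (illustrative values; the exact result of
 * scale_irq_capacity() depends on CONFIG_HAVE_SCHED_AVG_IRQ): with
 * max = 1024, rt+dl utilization = 200 and irq = 100, the capacity left
 * for CFS is roughly
 *
 *	free = 1024 - 200 = 824
 *	824 * (1024 - 100) / 1024 ~= 743
 *
 * i.e. about 27% of the CPU is reserved for RT/DL/IRQ activity.
 */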
9595
9596static void update_cpu_capacity(struct sched_domain *sd, int cpu)
9597{
9598	unsigned long capacity = scale_rt_capacity(cpu);
9599	struct sched_group *sdg = sd->groups;
9600
9601	if (!capacity)
9602		capacity = 1;
9603
9604	cpu_rq(cpu)->cpu_capacity = capacity;
9605	trace_sched_cpu_capacity_tp(cpu_rq(cpu));
9606
9607	sdg->sgc->capacity = capacity;
9608	sdg->sgc->min_capacity = capacity;
9609	sdg->sgc->max_capacity = capacity;
9610}
9611
9612void update_group_capacity(struct sched_domain *sd, int cpu)
9613{
9614	struct sched_domain *child = sd->child;
9615	struct sched_group *group, *sdg = sd->groups;
9616	unsigned long capacity, min_capacity, max_capacity;
9617	unsigned long interval;
9618
9619	interval = msecs_to_jiffies(sd->balance_interval);
9620	interval = clamp(interval, 1UL, max_load_balance_interval);
9621	sdg->sgc->next_update = jiffies + interval;
9622
9623	if (!child) {
9624		update_cpu_capacity(sd, cpu);
9625		return;
9626	}
9627
9628	capacity = 0;
9629	min_capacity = ULONG_MAX;
9630	max_capacity = 0;
9631
9632	if (child->flags & SD_OVERLAP) {
9633		/*
9634		 * SD_OVERLAP domains cannot assume that child groups
9635		 * span the current group.
9636		 */
9637
9638		for_each_cpu(cpu, sched_group_span(sdg)) {
9639			unsigned long cpu_cap = capacity_of(cpu);
9640
9641			capacity += cpu_cap;
9642			min_capacity = min(cpu_cap, min_capacity);
9643			max_capacity = max(cpu_cap, max_capacity);
9644		}
9645	} else  {
9646		/*
9647		 * !SD_OVERLAP domains can assume that child groups
9648		 * span the current group.
9649		 */
9650
9651		group = child->groups;
9652		do {
9653			struct sched_group_capacity *sgc = group->sgc;
9654
9655			capacity += sgc->capacity;
9656			min_capacity = min(sgc->min_capacity, min_capacity);
9657			max_capacity = max(sgc->max_capacity, max_capacity);
9658			group = group->next;
9659		} while (group != child->groups);
9660	}
9661
9662	sdg->sgc->capacity = capacity;
9663	sdg->sgc->min_capacity = min_capacity;
9664	sdg->sgc->max_capacity = max_capacity;
9665}
9666
9667/*
9668 * Check whether the capacity of the rq has been noticeably reduced by side
9669 * activity. The imbalance_pct is used for the threshold.
9670 * Return true if the capacity is reduced.
9671 */
9672static inline int
9673check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
9674{
9675	return ((rq->cpu_capacity * sd->imbalance_pct) <
9676				(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
9677}
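
/*
 * Illustration (default values, not a definition): with imbalance_pct = 117
 * and an original capacity of 1024, the check above reports reduced
 * capacity once cpu_capacity drops below 1024 * 100 / 117 ~= 875, i.e.
 * when more than roughly 15% of the CPU is eaten by RT/DL/IRQ side
 * activity.
 */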
9678
9679/* Check if the rq has a misfit task */
9680static inline bool check_misfit_status(struct rq *rq)
9681{
9682	return rq->misfit_task_load;
9683}
9684
9685/*
9686 * Group imbalance indicates (and tries to solve) the problem where balancing
9687 * groups is inadequate due to ->cpus_ptr constraints.
9688 *
9689 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
9690 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
9691 * Something like:
9692 *
9693 *	{ 0 1 2 3 } { 4 5 6 7 }
9694 *	        *     * * *
9695 *
9696 * If we were to balance group-wise we'd place two tasks in the first group and
9697 * two tasks in the second group. Clearly this is undesired as it will overload
9698 * cpu 3 and leave one of the CPUs in the second group unused.
9699 *
9700 * The current solution to this issue is detecting the skew in the first group
9701 * by noticing the lower domain failed to reach balance and had difficulty
9702 * moving tasks due to affinity constraints.
9703 *
9704 * When this is detected, the group becomes a candidate for busiest; see
9705 * update_sd_pick_busiest(). Both calculate_imbalance() and
9706 * sched_balance_find_src_group() then relax some of the usual balance conditions
9707 * to allow it to create an effective group imbalance.
9708 *
9709 * This is a somewhat tricky proposition since the next run might not find the
9710 * group imbalance and decide the groups need to be balanced again. A most
9711 * subtle and fragile situation.
9712 */
9713
9714static inline int sg_imbalanced(struct sched_group *group)
9715{
9716	return group->sgc->imbalance;
9717}
9718
9719/*
9720 * group_has_capacity returns true if the group has spare capacity that could
9721 * be used by some tasks.
9722 * We consider that a group has spare capacity if the number of tasks is
9723 * smaller than the number of CPUs or if the utilization is lower than the
9724 * available capacity for CFS tasks.
9725 * For the latter, we use a threshold to stabilize the state, to take into
9726 * account the variance of the tasks' load and to return true if the available
9727 * capacity is meaningful for the load balancer.
9728 * As an example, an available capacity of 1% can appear but it doesn't
9729 * provide any benefit to the load balancer.
9730 */
9731static inline bool
9732group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
9733{
9734	if (sgs->sum_nr_running < sgs->group_weight)
9735		return true;
9736
9737	if ((sgs->group_capacity * imbalance_pct) <
9738			(sgs->group_runnable * 100))
9739		return false;
9740
9741	if ((sgs->group_capacity * 100) >
9742			(sgs->group_util * imbalance_pct))
9743		return true;
9744
9745	return false;
9746}
9747
9748/*
9749 *  group_is_overloaded returns true if the group has more tasks than it can
9750 *  handle.
9751 *  group_is_overloaded is not equivalent to !group_has_capacity because a group
9752 *  with exactly the right number of tasks has no more spare capacity but is not
9753 *  overloaded either, so both group_has_capacity and group_is_overloaded return
9754 *  false.
9755 */
9756static inline bool
9757group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
9758{
9759	if (sgs->sum_nr_running <= sgs->group_weight)
9760		return false;
9761
9762	if ((sgs->group_capacity * 100) <
9763			(sgs->group_util * imbalance_pct))
9764		return true;
9765
9766	if ((sgs->group_capacity * imbalance_pct) <
9767			(sgs->group_runnable * 100))
9768		return true;
9769
9770	return false;
9771}
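
/*
 * Worked example (illustrative numbers): for a 2-CPU group with
 * group_capacity = 2048 and imbalance_pct = 117, the group is overloaded
 * once it runs more than 2 tasks and either group_util exceeds
 * 2048 * 100 / 117 ~= 1750 or group_runnable exceeds
 * 2048 * 117 / 100 ~= 2396. Below both thresholds, group_has_capacity()
 * still reports spare capacity.
 */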
9772
9773static inline enum
9774group_type group_classify(unsigned int imbalance_pct,
9775			  struct sched_group *group,
9776			  struct sg_lb_stats *sgs)
9777{
9778	if (group_is_overloaded(imbalance_pct, sgs))
9779		return group_overloaded;
9780
9781	if (sg_imbalanced(group))
9782		return group_imbalanced;
9783
9784	if (sgs->group_asym_packing)
9785		return group_asym_packing;
9786
9787	if (sgs->group_smt_balance)
9788		return group_smt_balance;
9789
9790	if (sgs->group_misfit_task_load)
9791		return group_misfit_task;
9792
9793	if (!group_has_capacity(imbalance_pct, sgs))
9794		return group_fully_busy;
9795
9796	return group_has_spare;
9797}
9798
9799/**
9800 * sched_use_asym_prio - Check whether asym_packing priority must be used
9801 * @sd:		The scheduling domain of the load balancing
9802 * @cpu:	A CPU
9803 *
9804 * Always use CPU priority when balancing load between SMT siblings. When
9805 * balancing load between cores, it is not sufficient that @cpu is idle. Only
9806 * use CPU priority if the whole core is idle.
9807 *
9808 * Returns: True if the priority of @cpu must be followed. False otherwise.
9809 */
9810static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
9811{
9812	if (!(sd->flags & SD_ASYM_PACKING))
9813		return false;
9814
9815	if (!sched_smt_active())
9816		return true;
9817
9818	return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
9819}
9820
9821static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
9822{
9823	/*
9824	 * First check if @dst_cpu can do asym_packing load balance. Only do it
9825	 * if it has higher priority than @src_cpu.
9826	 */
9827	return sched_use_asym_prio(sd, dst_cpu) &&
9828		sched_asym_prefer(dst_cpu, src_cpu);
9829}
9830
9831/**
9832 * sched_group_asym - Check if the destination CPU can do asym_packing balance
9833 * @env:	The load balancing environment
9834 * @sgs:	Load-balancing statistics of the candidate busiest group
9835 * @group:	The candidate busiest group
9836 *
9837 * @env::dst_cpu can do asym_packing if it has higher priority than the
9838 * preferred CPU of @group.
9839 *
9840 * Return: true if @env::dst_cpu can do asym_packing load balance. False
9841 * otherwise.
9842 */
9843static inline bool
9844sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
9845{
9846	/*
9847	 * CPU priorities do not make sense for SMT cores with more than one
9848	 * busy sibling.
9849	 */
9850	if ((group->flags & SD_SHARE_CPUCAPACITY) &&
9851	    (sgs->group_weight - sgs->idle_cpus != 1))
9852		return false;
9853
9854	return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
9855}
9856
9857/* One group has more than one SMT CPU while the other group does not */
9858static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
9859				    struct sched_group *sg2)
9860{
9861	if (!sg1 || !sg2)
9862		return false;
9863
9864	return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
9865		(sg2->flags & SD_SHARE_CPUCAPACITY);
9866}
9867
9868static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
9869			       struct sched_group *group)
9870{
9871	if (!env->idle)
9872		return false;
9873
9874	/*
9875	 * For an SMT source group, it is better to move a task
9876	 * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
9877	 * Note that if a group has only a single SMT sibling, SD_SHARE_CPUCAPACITY
9878	 * will not be set.
9879	 */
9880	if (group->flags & SD_SHARE_CPUCAPACITY &&
9881	    sgs->sum_h_nr_running > 1)
9882		return true;
9883
9884	return false;
9885}
9886
9887static inline long sibling_imbalance(struct lb_env *env,
9888				    struct sd_lb_stats *sds,
9889				    struct sg_lb_stats *busiest,
9890				    struct sg_lb_stats *local)
9891{
9892	int ncores_busiest, ncores_local;
9893	long imbalance;
9894
9895	if (!env->idle || !busiest->sum_nr_running)
9896		return 0;
9897
9898	ncores_busiest = sds->busiest->cores;
9899	ncores_local = sds->local->cores;
9900
9901	if (ncores_busiest == ncores_local) {
9902		imbalance = busiest->sum_nr_running;
9903		lsub_positive(&imbalance, local->sum_nr_running);
9904		return imbalance;
9905	}
9906
9907	/* Balance such that nr_running/ncores ratio are same on both groups */
9908	imbalance = ncores_local * busiest->sum_nr_running;
9909	lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
9910	/* Normalize imbalance and do rounding on normalization */
9911	imbalance = 2 * imbalance + ncores_local + ncores_busiest;
9912	imbalance /= ncores_local + ncores_busiest;
9913
9914	/* Take advantage of resource in an empty sched group */
9915	if (imbalance <= 1 && local->sum_nr_running == 0 &&
9916	    busiest->sum_nr_running > 1)
9917		imbalance = 2;
9918
9919	return imbalance;
9920}
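
/*
 * Worked example of the ratio balancing above (illustrative numbers):
 * with ncores_local = 2, ncores_busiest = 4, local->sum_nr_running = 1
 * and busiest->sum_nr_running = 6:
 *
 *	imbalance = 2 * 6 - 4 * 1 = 8
 *	imbalance = (2 * 8 + 2 + 4) / (2 + 4) = 3
 *
 * calculate_imbalance() later halves this, so one task is moved, leaving
 * 5 tasks on 4 cores and 2 tasks on 2 cores, i.e. roughly equal
 * nr_running/ncores ratios.
 */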
9921
9922static inline bool
9923sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
9924{
9925	/*
9926	 * When there is more than 1 task, the group_overloaded case already
9927	 * takes care of CPUs with reduced capacity.
9928	 */
9929	if (rq->cfs.h_nr_running != 1)
9930		return false;
9931
9932	return check_cpu_capacity(rq, sd);
9933}
9934
9935/**
9936 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
9937 * @env: The load balancing environment.
9938 * @sds: Load-balancing data with statistics of the local group.
9939 * @group: sched_group whose statistics are to be updated.
9940 * @sgs: variable to hold the statistics for this group.
9941 * @sg_overloaded: sched_group is overloaded
9942 * @sg_overutilized: sched_group is overutilized
9943 */
9944static inline void update_sg_lb_stats(struct lb_env *env,
9945				      struct sd_lb_stats *sds,
9946				      struct sched_group *group,
9947				      struct sg_lb_stats *sgs,
9948				      bool *sg_overloaded,
9949				      bool *sg_overutilized)
9950{
9951	int i, nr_running, local_group;
9952
9953	memset(sgs, 0, sizeof(*sgs));
9954
9955	local_group = group == sds->local;
9956
9957	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9958		struct rq *rq = cpu_rq(i);
9959		unsigned long load = cpu_load(rq);
9960
9961		sgs->group_load += load;
9962		sgs->group_util += cpu_util_cfs(i);
9963		sgs->group_runnable += cpu_runnable(rq);
9964		sgs->sum_h_nr_running += rq->cfs.h_nr_running;
9965
9966		nr_running = rq->nr_running;
9967		sgs->sum_nr_running += nr_running;
9968
9969		if (nr_running > 1)
9970			*sg_overloaded = 1;
9971
9972		if (cpu_overutilized(i))
9973			*sg_overutilized = 1;
9974
9975#ifdef CONFIG_NUMA_BALANCING
9976		sgs->nr_numa_running += rq->nr_numa_running;
9977		sgs->nr_preferred_running += rq->nr_preferred_running;
9978#endif
9979		/*
9980		 * No need to call idle_cpu() if nr_running is not 0
9981		 */
9982		if (!nr_running && idle_cpu(i)) {
9983			sgs->idle_cpus++;
9984			/* Idle cpu can't have misfit task */
9985			continue;
9986		}
9987
9988		if (local_group)
9989			continue;
9990
9991		if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
9992			/* Check for a misfit task on the cpu */
9993			if (sgs->group_misfit_task_load < rq->misfit_task_load) {
9994				sgs->group_misfit_task_load = rq->misfit_task_load;
9995				*sg_overloaded = 1;
9996			}
9997		} else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
9998			/* Check for a task running on a CPU with reduced capacity */
9999			if (sgs->group_misfit_task_load < load)
10000				sgs->group_misfit_task_load = load;
10001		}
10002	}
10003
10004	sgs->group_capacity = group->sgc->capacity;
10005
10006	sgs->group_weight = group->group_weight;
10007
10008	/* Check if dst CPU is idle and preferred to this group */
10009	if (!local_group && env->idle && sgs->sum_h_nr_running &&
10010	    sched_group_asym(env, sgs, group))
10011		sgs->group_asym_packing = 1;
10012
10013	/* Check for loaded SMT group to be balanced to dst CPU */
10014	if (!local_group && smt_balance(env, sgs, group))
10015		sgs->group_smt_balance = 1;
10016
10017	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
10018
10019	/* Computing avg_load makes sense only when group is overloaded */
10020	if (sgs->group_type == group_overloaded)
10021		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10022				sgs->group_capacity;
10023}
10024
10025/**
10026 * update_sd_pick_busiest - return 1 on busiest group
10027 * @env: The load balancing environment.
10028 * @sds: sched_domain statistics
10029 * @sg: sched_group candidate to be checked for being the busiest
10030 * @sgs: sched_group statistics
10031 *
10032 * Determine if @sg is a busier group than the previously selected
10033 * busiest group.
10034 *
10035 * Return: %true if @sg is a busier group than the previously selected
10036 * busiest group. %false otherwise.
10037 */
10038static bool update_sd_pick_busiest(struct lb_env *env,
10039				   struct sd_lb_stats *sds,
10040				   struct sched_group *sg,
10041				   struct sg_lb_stats *sgs)
10042{
10043	struct sg_lb_stats *busiest = &sds->busiest_stat;
10044
10045	/* Make sure that there is at least one task to pull */
10046	if (!sgs->sum_h_nr_running)
10047		return false;
10048
10049	/*
10050	 * Don't try to pull misfit tasks we can't help.
10051	 * We can use max_capacity here as reduction in capacity on some
10052	 * CPUs in the group should either be possible to resolve
10053	 * internally or be covered by avg_load imbalance (eventually).
10054	 */
10055	if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10056	    (sgs->group_type == group_misfit_task) &&
10057	    (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
10058	     sds->local_stat.group_type != group_has_spare))
10059		return false;
10060
10061	if (sgs->group_type > busiest->group_type)
10062		return true;
10063
10064	if (sgs->group_type < busiest->group_type)
10065		return false;
10066
10067	/*
10068	 * The candidate and the current busiest group are the same type of
10069	 * group. Let's check which one is the busiest according to the type.
10070	 */
10071
10072	switch (sgs->group_type) {
10073	case group_overloaded:
10074		/* Select the overloaded group with highest avg_load. */
10075		return sgs->avg_load > busiest->avg_load;
10076
10077	case group_imbalanced:
10078		/*
10079		 * Select the 1st imbalanced group as we don't have any way to
10080		 * prefer one over another.
10081		 */
10082		return false;
10083
10084	case group_asym_packing:
10085		/* Prefer to move work away from the lowest-priority CPU */
10086		return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
10087
10088	case group_misfit_task:
10089		/*
10090		 * If we have more than one misfit sg go with the biggest
10091		 * misfit.
10092		 */
10093		return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
10094
10095	case group_smt_balance:
10096		/*
10097		 * Check if either SMT group has spare CPUs, to choose
10098		 * between the has_spare and fully_busy handling.
10099		 */
10100		if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
10101			goto has_spare;
10102
10103		fallthrough;
10104
10105	case group_fully_busy:
10106		/*
10107		 * Select the fully busy group with the highest avg_load. In
10108		 * theory, there is no need to pull tasks from such a group
10109		 * because tasks have all the compute capacity they need, but
10110		 * we can still improve the overall throughput by reducing
10111		 * contention when accessing shared HW resources.
10112		 *
10113		 * XXX for now avg_load is not computed and always 0 so we
10114		 * select the 1st one, except if @sg is composed of SMT
10115		 * siblings.
10116		 */
10117
10118		if (sgs->avg_load < busiest->avg_load)
10119			return false;
10120
10121		if (sgs->avg_load == busiest->avg_load) {
10122			/*
10123			 * SMT sched groups need more help than non-SMT groups.
10124			 * If @sg happens to also be SMT, either choice is good.
10125			 */
10126			if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
10127				return false;
10128		}
10129
10130		break;
10131
10132	case group_has_spare:
10133		/*
10134		 * Do not pick sg with SMT CPUs over sg with pure CPUs,
10135		 * as we do not want to pull a task off an SMT core that has
10136		 * only one task and leave the core idle.
10137		 */
10138		if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
10139			if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
10140				return false;
10141			else
10142				return true;
10143		}
10144has_spare:
10145
10146		/*
10147		 * Select the non-overloaded group with the lowest number of idle
10148		 * CPUs and the highest number of running tasks. We could also
10149		 * compare the spare capacity, which is more stable, but a group
10150		 * can end up with less spare capacity yet more idle CPUs, which
10151		 * means less opportunity to pull tasks.
10152		 */
10153		if (sgs->idle_cpus > busiest->idle_cpus)
10154			return false;
10155		else if ((sgs->idle_cpus == busiest->idle_cpus) &&
10156			 (sgs->sum_nr_running <= busiest->sum_nr_running))
10157			return false;
10158
10159		break;
10160	}
10161
10162	/*
10163	 * Candidate sg has no more than one task per CPU and has higher
10164	 * per-CPU capacity. Migrating tasks to less capable CPUs may harm
10165	 * throughput. Maximize throughput, power/energy consequences are not
10166	 * considered.
10167	 */
10168	if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10169	    (sgs->group_type <= group_fully_busy) &&
10170	    (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
10171		return false;
10172
10173	return true;
10174}
10175
10176#ifdef CONFIG_NUMA_BALANCING
10177static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10178{
10179	if (sgs->sum_h_nr_running > sgs->nr_numa_running)
10180		return regular;
10181	if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
10182		return remote;
10183	return all;
10184}
10185
10186static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10187{
10188	if (rq->nr_running > rq->nr_numa_running)
10189		return regular;
10190	if (rq->nr_running > rq->nr_preferred_running)
10191		return remote;
10192	return all;
10193}
10194#else
10195static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10196{
10197	return all;
10198}
10199
10200static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10201{
10202	return regular;
10203}
10204#endif /* CONFIG_NUMA_BALANCING */
10205
10206
10207struct sg_lb_stats;
10208
10209/*
10210 * task_running_on_cpu - return 1 if @p is running on @cpu.
10211 */
10212
10213static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
10214{
10215	/* Task has no contribution or is new */
10216	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
10217		return 0;
10218
10219	if (task_on_rq_queued(p))
10220		return 1;
10221
10222	return 0;
10223}
10224
10225/**
10226 * idle_cpu_without - would a given CPU be idle without p ?
10227 * @cpu: the processor on which idleness is tested.
10228 * @p: task which should be ignored.
10229 *
10230 * Return: 1 if the CPU would be idle. 0 otherwise.
10231 */
10232static int idle_cpu_without(int cpu, struct task_struct *p)
10233{
10234	struct rq *rq = cpu_rq(cpu);
10235
10236	if (rq->curr != rq->idle && rq->curr != p)
10237		return 0;
10238
10239	/*
10240	 * rq->nr_running can't be used; an updated version without the
10241	 * impact of p on cpu must be used instead. The updated nr_running
10242	 * must be computed and tested before calling idle_cpu_without().
10243	 */
10244
10245	if (rq->ttwu_pending)
10246		return 0;
10247
10248	return 1;
10249}
10250
10251/*
10252 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
10253 * @sd: The sched_domain level to look for idlest group.
10254 * @group: sched_group whose statistics are to be updated.
10255 * @sgs: variable to hold the statistics for this group.
10256 * @p: The task for which we look for the idlest group/CPU.
10257 */
10258static inline void update_sg_wakeup_stats(struct sched_domain *sd,
10259					  struct sched_group *group,
10260					  struct sg_lb_stats *sgs,
10261					  struct task_struct *p)
10262{
10263	int i, nr_running;
10264
10265	memset(sgs, 0, sizeof(*sgs));
10266
10267	/* Assume that task can't fit any CPU of the group */
10268	if (sd->flags & SD_ASYM_CPUCAPACITY)
10269		sgs->group_misfit_task_load = 1;
10270
10271	for_each_cpu(i, sched_group_span(group)) {
10272		struct rq *rq = cpu_rq(i);
10273		unsigned int local;
10274
10275		sgs->group_load += cpu_load_without(rq, p);
10276		sgs->group_util += cpu_util_without(i, p);
10277		sgs->group_runnable += cpu_runnable_without(rq, p);
10278		local = task_running_on_cpu(i, p);
10279		sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
10280
10281		nr_running = rq->nr_running - local;
10282		sgs->sum_nr_running += nr_running;
10283
10284		/*
10285		 * No need to call idle_cpu_without() if nr_running is not 0
10286		 */
10287		if (!nr_running && idle_cpu_without(i, p))
10288			sgs->idle_cpus++;
10289
10290		/* Check if task fits in the CPU */
10291		if (sd->flags & SD_ASYM_CPUCAPACITY &&
10292		    sgs->group_misfit_task_load &&
10293		    task_fits_cpu(p, i))
10294			sgs->group_misfit_task_load = 0;
10295
10296	}
10297
10298	sgs->group_capacity = group->sgc->capacity;
10299
10300	sgs->group_weight = group->group_weight;
10301
10302	sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
10303
10304	/*
10305	 * Computing avg_load makes sense only when group is fully busy or
10306	 * overloaded
10307	 */
10308	if (sgs->group_type == group_fully_busy ||
10309		sgs->group_type == group_overloaded)
10310		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10311				sgs->group_capacity;
10312}
10313
10314static bool update_pick_idlest(struct sched_group *idlest,
10315			       struct sg_lb_stats *idlest_sgs,
10316			       struct sched_group *group,
10317			       struct sg_lb_stats *sgs)
10318{
10319	if (sgs->group_type < idlest_sgs->group_type)
10320		return true;
10321
10322	if (sgs->group_type > idlest_sgs->group_type)
10323		return false;
10324
10325	/*
10326	 * The candidate and the current idlest group are the same type of
10327	 * group. Let's check which one is the idlest according to the type.
10328	 */
10329
10330	switch (sgs->group_type) {
10331	case group_overloaded:
10332	case group_fully_busy:
10333		/* Select the group with lowest avg_load. */
10334		if (idlest_sgs->avg_load <= sgs->avg_load)
10335			return false;
10336		break;
10337
10338	case group_imbalanced:
10339	case group_asym_packing:
10340	case group_smt_balance:
10341		/* Those types are not used in the slow wakeup path */
10342		return false;
10343
10344	case group_misfit_task:
10345		/* Select group with the highest max capacity */
10346		if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
10347			return false;
10348		break;
10349
10350	case group_has_spare:
10351		/* Select group with most idle CPUs */
10352		if (idlest_sgs->idle_cpus > sgs->idle_cpus)
10353			return false;
10354
10355		/* Select group with lowest group_util */
10356		if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
10357			idlest_sgs->group_util <= sgs->group_util)
10358			return false;
10359
10360		break;
10361	}
10362
10363	return true;
10364}
10365
10366/*
10367 * sched_balance_find_dst_group() finds and returns the least busy CPU group within the
10368 * domain.
10369 *
10370 * Assumes p is allowed on at least one CPU in sd.
10371 */
10372static struct sched_group *
10373sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
10374{
10375	struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
10376	struct sg_lb_stats local_sgs, tmp_sgs;
10377	struct sg_lb_stats *sgs;
10378	unsigned long imbalance;
10379	struct sg_lb_stats idlest_sgs = {
10380			.avg_load = UINT_MAX,
10381			.group_type = group_overloaded,
10382	};
10383
10384	do {
10385		int local_group;
10386
10387		/* Skip over this group if it has no CPUs allowed */
10388		if (!cpumask_intersects(sched_group_span(group),
10389					p->cpus_ptr))
10390			continue;
10391
10392		/* Skip over this group if no cookie matched */
10393		if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
10394			continue;
10395
10396		local_group = cpumask_test_cpu(this_cpu,
10397					       sched_group_span(group));
10398
10399		if (local_group) {
10400			sgs = &local_sgs;
10401			local = group;
10402		} else {
10403			sgs = &tmp_sgs;
10404		}
10405
10406		update_sg_wakeup_stats(sd, group, sgs, p);
10407
10408		if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
10409			idlest = group;
10410			idlest_sgs = *sgs;
10411		}
10412
10413	} while (group = group->next, group != sd->groups);
10414
10415
10416	/* There is no idlest group to push tasks to */
10417	if (!idlest)
10418		return NULL;
10419
10420	/* The local group has been skipped because of CPU affinity */
10421	if (!local)
10422		return idlest;
10423
10424	/*
10425	 * If the local group is idler than the selected idlest group
10426	 * don't try and push the task.
10427	 */
10428	if (local_sgs.group_type < idlest_sgs.group_type)
10429		return NULL;
10430
10431	/*
10432	 * If the local group is busier than the selected idlest group
10433	 * try and push the task.
10434	 */
10435	if (local_sgs.group_type > idlest_sgs.group_type)
10436		return idlest;
10437
10438	switch (local_sgs.group_type) {
10439	case group_overloaded:
10440	case group_fully_busy:
10441
10442		/* Calculate allowed imbalance based on load */
10443		imbalance = scale_load_down(NICE_0_LOAD) *
10444				(sd->imbalance_pct-100) / 100;
10445
10446		/*
10447		 * When comparing groups across NUMA domains, it's possible for
10448		 * the local domain to be very lightly loaded relative to the
10449		 * remote domains but "imbalance" skews the comparison making
10450		 * remote CPUs look much more favourable. When considering
10451		 * cross-domain, add imbalance to the load on the remote node
10452		 * and consider staying local.
10453		 */
10454
10455		if ((sd->flags & SD_NUMA) &&
10456		    ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
10457			return NULL;
10458
10459		/*
10460		 * If the local group is less loaded than the selected
10461		 * idlest group don't try and push any tasks.
10462		 */
10463		if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
10464			return NULL;
10465
10466		if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
10467			return NULL;
10468		break;
10469
10470	case group_imbalanced:
10471	case group_asym_packing:
10472	case group_smt_balance:
10473		/* Those types are not used in the slow wakeup path */
10474		return NULL;
10475
10476	case group_misfit_task:
10477		/* Select group with the highest max capacity */
10478		if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
10479			return NULL;
10480		break;
10481
10482	case group_has_spare:
10483#ifdef CONFIG_NUMA
10484		if (sd->flags & SD_NUMA) {
10485			int imb_numa_nr = sd->imb_numa_nr;
10486#ifdef CONFIG_NUMA_BALANCING
10487			int idlest_cpu;
10488			/*
10489			 * If there is spare capacity at NUMA, try to select
10490			 * the preferred node
10491			 */
10492			if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
10493				return NULL;
10494
10495			idlest_cpu = cpumask_first(sched_group_span(idlest));
10496			if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
10497				return idlest;
10498#endif /* CONFIG_NUMA_BALANCING */
10499			/*
10500			 * Otherwise, keep the task close to the wakeup source
10501			 * and improve locality if the number of running tasks
10502			 * would remain below threshold where an imbalance is
10503			 * allowed while accounting for the possibility the
10504			 * task is pinned to a subset of CPUs. If there is a
10505			 * real need of migration, periodic load balance will
10506			 * take care of it.
10507			 */
10508			if (p->nr_cpus_allowed != NR_CPUS) {
10509				struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
10510
10511				cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
10512				imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
10513			}
10514
10515			imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
10516			if (!adjust_numa_imbalance(imbalance,
10517						   local_sgs.sum_nr_running + 1,
10518						   imb_numa_nr)) {
10519				return NULL;
10520			}
10521		}
10522#endif /* CONFIG_NUMA */
10523
10524		/*
10525		 * Select the group with the highest number of idle CPUs. We could
10526		 * also compare the utilization, which is more stable, but a group
10527		 * can end up with less spare capacity yet more idle CPUs, which
10528		 * means more opportunity to run tasks.
10529		 */
10530		if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
10531			return NULL;
10532		break;
10533	}
10534
10535	return idlest;
10536}
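
/*
 * Illustration of the load margin used above (assuming imbalance_pct = 117
 * and scale_load_down(NICE_0_LOAD) = 1024): the allowed imbalance is
 * 1024 * (117 - 100) / 100 ~= 174, so for SD_NUMA domains the task stays
 * local unless idlest_sgs.avg_load + 174 is still below
 * local_sgs.avg_load, biasing wakeups toward the local node.
 */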
10537
10538static void update_idle_cpu_scan(struct lb_env *env,
10539				 unsigned long sum_util)
10540{
10541	struct sched_domain_shared *sd_share;
10542	int llc_weight, pct;
10543	u64 x, y, tmp;
10544	/*
10545	 * Update the number of CPUs to scan in LLC domain, which could
10546	 * be used as a hint in select_idle_cpu(). The update of sd_share
10547	 * could be expensive because it is within a shared cache line.
10548	 * So the write of this hint only occurs during periodic load
10549	 * balancing, rather than CPU_NEWLY_IDLE, because the latter
10550	 * can fire way more frequently than the former.
10551	 */
10552	if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
10553		return;
10554
10555	llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
10556	if (env->sd->span_weight != llc_weight)
10557		return;
10558
10559	sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
10560	if (!sd_share)
10561		return;
10562
10563	/*
10564	 * The number of CPUs to search drops as sum_util increases, when
10565	 * sum_util hits 85% or above, the scan stops.
10566	 * The reason to choose 85% as the threshold is because this is the
10567	 * imbalance_pct(117) when a LLC sched group is overloaded.
10568	 *
10569	 * let y = SCHED_CAPACITY_SCALE - p * x^2                       [1]
10570	 * and y'= y / SCHED_CAPACITY_SCALE
10571	 *
10572	 * x is the ratio of sum_util compared to the CPU capacity:
10573	 * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
10574	 * y' is the ratio of CPUs to be scanned in the LLC domain,
10575	 * and the number of CPUs to scan is calculated by:
10576	 *
10577	 * nr_scan = llc_weight * y'                                    [2]
10578	 *
10579	 * When x hits the threshold of overloaded, AKA, when
10580	 * x = 100 / pct, y drops to 0. According to [1],
10581	 * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
10582	 *
10583	 * Scale x by SCHED_CAPACITY_SCALE:
10584	 * x' = sum_util / llc_weight;                                  [3]
10585	 *
10586	 * and finally [1] becomes:
10587	 * y = SCHED_CAPACITY_SCALE -
10588	 *     x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE)            [4]
10589	 *
10590	 */
10591	/* equation [3] */
10592	x = sum_util;
10593	do_div(x, llc_weight);
10594
10595	/* equation [4] */
10596	pct = env->sd->imbalance_pct;
10597	tmp = x * x * pct * pct;
10598	do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
10599	tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
10600	y = SCHED_CAPACITY_SCALE - tmp;
10601
10602	/* equation [2] */
10603	y *= llc_weight;
10604	do_div(y, SCHED_CAPACITY_SCALE);
10605	if ((int)y != sd_share->nr_idle_scan)
10606		WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
10607}
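
/*
 * Worked example of equations [2]-[4] above (illustrative numbers): with
 * llc_weight = 16, imbalance_pct = 117 and sum_util = 8192 (the LLC about
 * half utilized):
 *
 *	x'      = 8192 / 16 = 512
 *	tmp     = 512^2 * 117^2 / (10000 * 1024) ~= 350
 *	y       = 1024 - 350 = 674
 *	nr_scan = 674 * 16 / 1024 ~= 10
 *
 * so select_idle_cpu() is hinted to scan about 10 of the 16 CPUs. The
 * hint drops to 0 as sum_util approaches ~85% of the LLC capacity.
 */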
10608
10609/**
10610 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
10611 * @env: The load balancing environment.
10612 * @sds: variable to hold the statistics for this sched_domain.
10613 */
10614
10615static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
10616{
10617	struct sched_group *sg = env->sd->groups;
10618	struct sg_lb_stats *local = &sds->local_stat;
10619	struct sg_lb_stats tmp_sgs;
10620	unsigned long sum_util = 0;
10621	bool sg_overloaded = 0, sg_overutilized = 0;
10622
10623	do {
10624		struct sg_lb_stats *sgs = &tmp_sgs;
10625		int local_group;
10626
10627		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
10628		if (local_group) {
10629			sds->local = sg;
10630			sgs = local;
10631
10632			if (env->idle != CPU_NEWLY_IDLE ||
10633			    time_after_eq(jiffies, sg->sgc->next_update))
10634				update_group_capacity(env->sd, env->dst_cpu);
10635		}
10636
10637		update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
10638
10639		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
10640			sds->busiest = sg;
10641			sds->busiest_stat = *sgs;
10642		}
10643
10644		/* Now, start updating sd_lb_stats */
10645		sds->total_load += sgs->group_load;
10646		sds->total_capacity += sgs->group_capacity;
10647
10648		sum_util += sgs->group_util;
10649		sg = sg->next;
10650	} while (sg != env->sd->groups);
10651
10652	/*
10653	 * Indicate that the child domain of the busiest group prefers tasks
10654	 * go to a child's sibling domains first. NB the flags of a sched group
10655	 * are those of the child domain.
10656	 */
10657	if (sds->busiest)
10658		sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
10659
10660
10661	if (env->sd->flags & SD_NUMA)
10662		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
10663
10664	if (!env->sd->parent) {
10665		/* update overload indicator if we are at root domain */
10666		set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
10667
10668		/* Update over-utilization (tipping point, U >= 0) indicator */
10669		set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
10670	} else if (sg_overutilized) {
10671		set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
10672	}
10673
10674	update_idle_cpu_scan(env, sum_util);
10675}
10676
10677/**
10678 * calculate_imbalance - Calculate the amount of imbalance present within the
10679 *			 groups of a given sched_domain during load balance.
10680 * @env: load balance environment
10681 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
10682 */
10683static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
10684{
10685	struct sg_lb_stats *local, *busiest;
10686
10687	local = &sds->local_stat;
10688	busiest = &sds->busiest_stat;
10689
10690	if (busiest->group_type == group_misfit_task) {
10691		if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
10692			/* Set imbalance to allow misfit tasks to be balanced. */
10693			env->migration_type = migrate_misfit;
10694			env->imbalance = 1;
10695		} else {
10696			/*
10697			 * Set load imbalance to allow moving task from cpu
10698			 * with reduced capacity.
10699			 */
10700			env->migration_type = migrate_load;
10701			env->imbalance = busiest->group_misfit_task_load;
10702		}
10703		return;
10704	}
10705
10706	if (busiest->group_type == group_asym_packing) {
10707		/*
10708		 * In case of asym packing, we will try to migrate all load to
10709		 * the preferred CPU.
10710		 */
10711		env->migration_type = migrate_task;
10712		env->imbalance = busiest->sum_h_nr_running;
10713		return;
10714	}
10715
10716	if (busiest->group_type == group_smt_balance) {
10717		/* Reduce number of tasks sharing CPU capacity */
10718		env->migration_type = migrate_task;
10719		env->imbalance = 1;
10720		return;
10721	}
10722
10723	if (busiest->group_type == group_imbalanced) {
10724		/*
10725		 * In the group_imb case we cannot rely on group-wide averages
10726		 * to ensure CPU-load equilibrium, try to move any task to fix
10727		 * the imbalance. The next load balance will take care of
10728		 * balancing back the system.
10729		 */
10730		env->migration_type = migrate_task;
10731		env->imbalance = 1;
10732		return;
10733	}
10734
10735	/*
10736	 * Try to use spare capacity of local group without overloading it or
10737	 * emptying busiest.
10738	 */
10739	if (local->group_type == group_has_spare) {
10740		if ((busiest->group_type > group_fully_busy) &&
10741		    !(env->sd->flags & SD_SHARE_LLC)) {
10742			/*
10743			 * If busiest is overloaded, try to fill spare
10744			 * capacity. This might end up creating spare capacity
10745			 * in busiest or busiest still being overloaded but
10746			 * there is no simple way to directly compute the
10747			 * amount of load to migrate in order to balance the
10748			 * system.
10749			 */
10750			env->migration_type = migrate_util;
10751			env->imbalance = max(local->group_capacity, local->group_util) -
10752					 local->group_util;
10753
10754			/*
10755			 * In some cases, the group's utilization is max or even
10756			 * higher than capacity because of migrations but the
10757			 * local CPU is (newly) idle. There is at least one
10758			 * waiting task in this overloaded busiest group. Let's
10759			 * try to pull it.
10760			 */
10761			if (env->idle && env->imbalance == 0) {
10762				env->migration_type = migrate_task;
10763				env->imbalance = 1;
10764			}
10765
10766			return;
10767		}
10768
10769		if (busiest->group_weight == 1 || sds->prefer_sibling) {
10770			/*
10771			 * When prefer sibling, evenly spread running tasks on
10772			 * groups.
10773			 */
10774			env->migration_type = migrate_task;
10775			env->imbalance = sibling_imbalance(env, sds, busiest, local);
10776		} else {
10777
10778			/*
10779			 * If there is no overload, we just want to even the number of
10780			 * idle CPUs.
10781			 */
10782			env->migration_type = migrate_task;
10783			env->imbalance = max_t(long, 0,
10784					       (local->idle_cpus - busiest->idle_cpus));
10785		}
10786
10787#ifdef CONFIG_NUMA
10788		/* Consider allowing a small imbalance between NUMA groups */
10789		if (env->sd->flags & SD_NUMA) {
10790			env->imbalance = adjust_numa_imbalance(env->imbalance,
10791							       local->sum_nr_running + 1,
10792							       env->sd->imb_numa_nr);
10793		}
10794#endif
10795
10796		/* Number of tasks to move to restore balance */
10797		env->imbalance >>= 1;
10798
10799		return;
10800	}
10801
10802	/*
10803	 * Local is fully busy but has to take more load to relieve the
10804	 * busiest group
10805	 */
10806	if (local->group_type < group_overloaded) {
10807		/*
10808		 * Local will become overloaded so the avg_load metrics are
10809		 * finally needed.
10810		 */
10811
10812		local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
10813				  local->group_capacity;
10814
10815		/*
10816		 * If the local group is more loaded than the selected
10817		 * busiest group don't try to pull any tasks.
10818		 */
10819		if (local->avg_load >= busiest->avg_load) {
10820			env->imbalance = 0;
10821			return;
10822		}
10823
10824		sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
10825				sds->total_capacity;
10826
10827		/*
10828		 * If the local group is more loaded than the average system
10829		 * load, don't try to pull any tasks.
10830		 */
10831		if (local->avg_load >= sds->avg_load) {
10832			env->imbalance = 0;
10833			return;
10834		}
10835
10836	}
10837
10838	/*
 * Both groups are or will become overloaded and we're trying to get all
 * the CPUs to the average load, so we don't want to push ourselves
10841	 * above the average load, nor do we wish to reduce the max loaded CPU
10842	 * below the average load. At the same time, we also don't want to
10843	 * reduce the group load below the group capacity. Thus we look for
10844	 * the minimum possible imbalance.
10845	 */
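	/*
	 * For example (illustrative numbers): with SCHED_CAPACITY_SCALE = 1024,
	 * avg loads of 1536 (busiest), 1024 (domain) and 512 (local), and both
	 * group capacities at 1024, both terms evaluate to 512 * 1024 and the
	 * resulting imbalance is 512.
	 */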
10846	env->migration_type = migrate_load;
10847	env->imbalance = min(
10848		(busiest->avg_load - sds->avg_load) * busiest->group_capacity,
10849		(sds->avg_load - local->avg_load) * local->group_capacity
10850	) / SCHED_CAPACITY_SCALE;
10851}
10852
10853/******* sched_balance_find_src_group() helpers end here *********************/
10854
10855/*
10856 * Decision matrix according to the local and busiest group type:
10857 *
10858 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
10859 * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
10860 * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
10861 * misfit_task      force     N/A        N/A    N/A  N/A        N/A
10862 * asym_packing     force     force      N/A    N/A  force      force
10863 * imbalanced       force     force      N/A    N/A  force      force
10864 * overloaded       force     force      N/A    N/A  force      avg_load
10865 *
10866 * N/A :      Not Applicable because already filtered while updating
10867 *            statistics.
10868 * balanced : The system is balanced for these 2 groups.
10869 * force :    Calculate the imbalance as load migration is probably needed.
10870 * avg_load : Only if imbalance is significant enough.
10871 * nr_idle :  dst_cpu is not busy and the number of idle CPUs is quite
10872 *            different in groups.
10873 */
10874
10875/**
10876 * sched_balance_find_src_group - Returns the busiest group within the sched_domain
10877 * if there is an imbalance.
10878 * @env: The load balancing environment.
10879 *
10880 * Also calculates the amount of runnable load which should be moved
10881 * to restore balance.
10882 *
 * Return: The busiest group if an imbalance exists, NULL otherwise.
10884 */
10885static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
10886{
10887	struct sg_lb_stats *local, *busiest;
10888	struct sd_lb_stats sds;
10889
10890	init_sd_lb_stats(&sds);
10891
10892	/*
10893	 * Compute the various statistics relevant for load balancing at
10894	 * this level.
10895	 */
10896	update_sd_lb_stats(env, &sds);
10897
10898	/* There is no busy sibling group to pull tasks from */
10899	if (!sds.busiest)
10900		goto out_balanced;
10901
10902	busiest = &sds.busiest_stat;
10903
10904	/* Misfit tasks should be dealt with regardless of the avg load */
10905	if (busiest->group_type == group_misfit_task)
10906		goto force_balance;
10907
10908	if (!is_rd_overutilized(env->dst_rq->rd) &&
10909	    rcu_dereference(env->dst_rq->rd->pd))
10910		goto out_balanced;
10911
10912	/* ASYM feature bypasses nice load balance check */
10913	if (busiest->group_type == group_asym_packing)
10914		goto force_balance;
10915
10916	/*
10917	 * If the busiest group is imbalanced the below checks don't
10918	 * work because they assume all things are equal, which typically
10919	 * isn't true due to cpus_ptr constraints and the like.
10920	 */
10921	if (busiest->group_type == group_imbalanced)
10922		goto force_balance;
10923
10924	local = &sds.local_stat;
10925	/*
10926	 * If the local group is busier than the selected busiest group
10927	 * don't try and pull any tasks.
10928	 */
10929	if (local->group_type > busiest->group_type)
10930		goto out_balanced;
10931
10932	/*
10933	 * When groups are overloaded, use the avg_load to ensure fairness
10934	 * between tasks.
10935	 */
10936	if (local->group_type == group_overloaded) {
10937		/*
10938		 * If the local group is more loaded than the selected
10939		 * busiest group don't try to pull any tasks.
10940		 */
10941		if (local->avg_load >= busiest->avg_load)
10942			goto out_balanced;
10943
10944		/* XXX broken for overlapping NUMA groups */
10945		sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
10946				sds.total_capacity;
10947
10948		/*
10949		 * Don't pull any tasks if this group is already above the
10950		 * domain average load.
10951		 */
10952		if (local->avg_load >= sds.avg_load)
10953			goto out_balanced;
10954
10955		/*
10956		 * If the busiest group is more loaded, use imbalance_pct to be
10957		 * conservative.
10958		 */
10959		if (100 * busiest->avg_load <=
10960				env->sd->imbalance_pct * local->avg_load)
10961			goto out_balanced;
10962	}
10963
10964	/*
10965	 * Try to move all excess tasks to a sibling domain of the busiest
10966	 * group's child domain.
10967	 */
10968	if (sds.prefer_sibling && local->group_type == group_has_spare &&
10969	    sibling_imbalance(env, &sds, busiest, local) > 1)
10970		goto force_balance;
10971
10972	if (busiest->group_type != group_overloaded) {
10973		if (!env->idle) {
10974			/*
10975			 * If the busiest group is not overloaded (and as a
10976			 * result the local one too) but this CPU is already
10977			 * busy, let another idle CPU try to pull task.
10978			 */
10979			goto out_balanced;
10980		}
10981
10982		if (busiest->group_type == group_smt_balance &&
10983		    smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
10984			/* Let non SMT CPU pull from SMT CPU sharing with sibling */
10985			goto force_balance;
10986		}
10987
10988		if (busiest->group_weight > 1 &&
10989		    local->idle_cpus <= (busiest->idle_cpus + 1)) {
10990			/*
10991			 * If the busiest group is not overloaded
10992			 * and there is no imbalance between this and busiest
10993			 * group wrt idle CPUs, it is balanced. The imbalance
			 * becomes significant only if the diff is greater than 1;
			 * otherwise we might end up just moving the imbalance
			 * to another group. Of course this applies only if
10997			 * there is more than 1 CPU per group.
10998			 */
10999			goto out_balanced;
11000		}
11001
11002		if (busiest->sum_h_nr_running == 1) {
11003			/*
11004			 * busiest doesn't have any tasks waiting to run
11005			 */
11006			goto out_balanced;
11007		}
11008	}
11009
11010force_balance:
11011	/* Looks like there is an imbalance. Compute it */
11012	calculate_imbalance(env, &sds);
11013	return env->imbalance ? sds.busiest : NULL;
11014
11015out_balanced:
11016	env->imbalance = 0;
11017	return NULL;
11018}
11019
11020/*
11021 * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
11022 */
11023static struct rq *sched_balance_find_src_rq(struct lb_env *env,
11024				     struct sched_group *group)
11025{
11026	struct rq *busiest = NULL, *rq;
11027	unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
11028	unsigned int busiest_nr = 0;
11029	int i;
11030
11031	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
11032		unsigned long capacity, load, util;
11033		unsigned int nr_running;
11034		enum fbq_type rt;
11035
11036		rq = cpu_rq(i);
11037		rt = fbq_classify_rq(rq);
11038
11039		/*
11040		 * We classify groups/runqueues into three groups:
11041		 *  - regular: there are !numa tasks
11042		 *  - remote:  there are numa tasks that run on the 'wrong' node
11043		 *  - all:     there is no distinction
11044		 *
11045		 * In order to avoid migrating ideally placed numa tasks,
		 * ignore those when there are better options.
11047		 *
11048		 * If we ignore the actual busiest queue to migrate another
11049		 * task, the next balance pass can still reduce the busiest
11050		 * queue by moving tasks around inside the node.
11051		 *
11052		 * If we cannot move enough load due to this classification
11053		 * the next pass will adjust the group classification and
11054		 * allow migration of more tasks.
11055		 *
11056		 * Both cases only affect the total convergence complexity.
11057		 */
11058		if (rt > env->fbq_type)
11059			continue;
11060
11061		nr_running = rq->cfs.h_nr_running;
11062		if (!nr_running)
11063			continue;
11064
11065		capacity = capacity_of(i);
11066
11067		/*
11068		 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
11069		 * eventually lead to active_balancing high->low capacity.
11070		 * Higher per-CPU capacity is considered better than balancing
11071		 * average load.
11072		 */
11073		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
11074		    !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
11075		    nr_running == 1)
11076			continue;
11077
11078		/*
11079		 * Make sure we only pull tasks from a CPU of lower priority
11080		 * when balancing between SMT siblings.
11081		 *
11082		 * If balancing between cores, let lower priority CPUs help
11083		 * SMT cores with more than one busy sibling.
11084		 */
11085		if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1)
11086			continue;
11087
11088		switch (env->migration_type) {
11089		case migrate_load:
11090			/*
11091			 * When comparing with load imbalance, use cpu_load()
11092			 * which is not scaled with the CPU capacity.
11093			 */
11094			load = cpu_load(rq);
11095
11096			if (nr_running == 1 && load > env->imbalance &&
11097			    !check_cpu_capacity(rq, env->sd))
11098				break;
11099
11100			/*
11101			 * For the load comparisons with the other CPUs,
11102			 * consider the cpu_load() scaled with the CPU
11103			 * capacity, so that the load can be moved away
11104			 * from the CPU that is potentially running at a
11105			 * lower capacity.
11106			 *
11107			 * Thus we're looking for max(load_i / capacity_i),
			 * and cross-multiplying to get rid of the division
			 * works out to:
11110			 * load_i * capacity_j > load_j * capacity_i;
11111			 * where j is our previous maximum.
11112			 */
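			/*
			 * For example (illustrative numbers): load = 600 on a
			 * capacity-512 CPU is busier than a previous maximum of
			 * load = 800 on a capacity-1024 CPU, since
			 * 600 * 1024 > 800 * 512 (i.e. 600/512 > 800/1024).
			 */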
11113			if (load * busiest_capacity > busiest_load * capacity) {
11114				busiest_load = load;
11115				busiest_capacity = capacity;
11116				busiest = rq;
11117			}
11118			break;
11119
11120		case migrate_util:
11121			util = cpu_util_cfs_boost(i);
11122
11123			/*
11124			 * Don't try to pull utilization from a CPU with one
11125			 * running task. Whatever its utilization, we will fail
			 * to detach the task.
11127			 */
11128			if (nr_running <= 1)
11129				continue;
11130
11131			if (busiest_util < util) {
11132				busiest_util = util;
11133				busiest = rq;
11134			}
11135			break;
11136
11137		case migrate_task:
11138			if (busiest_nr < nr_running) {
11139				busiest_nr = nr_running;
11140				busiest = rq;
11141			}
11142			break;
11143
11144		case migrate_misfit:
11145			/*
11146			 * For ASYM_CPUCAPACITY domains with misfit tasks we
11147			 * simply seek the "biggest" misfit task.
11148			 */
11149			if (rq->misfit_task_load > busiest_load) {
11150				busiest_load = rq->misfit_task_load;
11151				busiest = rq;
11152			}
11153
11154			break;
11155
11156		}
11157	}
11158
11159	return busiest;
11160}
11161
11162/*
 * Max backoff if we encounter pinned tasks. The exact value is pretty
 * arbitrary; anything works so long as it is large enough.
11165 */
11166#define MAX_PINNED_INTERVAL	512
11167
11168static inline bool
11169asym_active_balance(struct lb_env *env)
11170{
11171	/*
11172	 * ASYM_PACKING needs to force migrate tasks from busy but lower
11173	 * priority CPUs in order to pack all tasks in the highest priority
	 * CPUs. When done between cores, do it only if the whole core
	 * is idle.
11176	 *
11177	 * If @env::src_cpu is an SMT core with busy siblings, let
11178	 * the lower priority @env::dst_cpu help it. Do not follow
11179	 * CPU priority.
11180	 */
11181	return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
11182	       (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
11183		!sched_use_asym_prio(env->sd, env->src_cpu));
11184}
11185
11186static inline bool
11187imbalanced_active_balance(struct lb_env *env)
11188{
11189	struct sched_domain *sd = env->sd;
11190
11191	/*
	 * The imbalanced case includes the case of pinned tasks preventing a
	 * fair distribution of the load on the system, but also the case where
	 * threads cannot be evenly distributed on a system with spare capacity.
11195	 */
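	/*
	 * For example, with cache_nice_tries = 1 the threshold is 3, so active
	 * balancing is only allowed once more than 3 consecutive balance
	 * attempts have failed.
	 */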
11196	if ((env->migration_type == migrate_task) &&
11197	    (sd->nr_balance_failed > sd->cache_nice_tries+2))
11198		return 1;
11199
11200	return 0;
11201}
11202
11203static int need_active_balance(struct lb_env *env)
11204{
11205	struct sched_domain *sd = env->sd;
11206
11207	if (asym_active_balance(env))
11208		return 1;
11209
11210	if (imbalanced_active_balance(env))
11211		return 1;
11212
11213	/*
	 * The dst_cpu is idle and the src CPU has only 1 CFS task.
	 * It's worth migrating the task if the src_cpu's capacity is reduced
	 * because of other sched_class tasks or IRQs, provided more capacity
	 * stays available on dst_cpu.
11218	 */
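	/*
	 * For example (illustrative numbers): with imbalance_pct = 117,
	 * capacity_of(src_cpu) = 800 and capacity_of(dst_cpu) = 1024,
	 * 800 * 117 = 93600 < 1024 * 100 = 102400, so the single task is
	 * worth moving to the less-pressured dst_cpu.
	 */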
11219	if (env->idle &&
11220	    (env->src_rq->cfs.h_nr_running == 1)) {
11221		if ((check_cpu_capacity(env->src_rq, sd)) &&
11222		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
11223			return 1;
11224	}
11225
11226	if (env->migration_type == migrate_misfit)
11227		return 1;
11228
11229	return 0;
11230}
11231
11232static int active_load_balance_cpu_stop(void *data);
11233
11234static int should_we_balance(struct lb_env *env)
11235{
11236	struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
11237	struct sched_group *sg = env->sd->groups;
11238	int cpu, idle_smt = -1;
11239
11240	/*
	 * Ensure the balancing environment is consistent; an inconsistency
	 * can happen when the softirq triggers 'during' hotplug.
11243	 */
11244	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
11245		return 0;
11246
11247	/*
11248	 * In the newly idle case, we will allow all the CPUs
11249	 * to do the newly idle load balance.
11250	 *
11251	 * However, we bail out if we already have tasks or a wakeup pending,
11252	 * to optimize wakeup latency.
11253	 */
11254	if (env->idle == CPU_NEWLY_IDLE) {
11255		if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
11256			return 0;
11257		return 1;
11258	}
11259
11260	cpumask_copy(swb_cpus, group_balance_mask(sg));
11261	/* Try to find first idle CPU */
11262	for_each_cpu_and(cpu, swb_cpus, env->cpus) {
11263		if (!idle_cpu(cpu))
11264			continue;
11265
11266		/*
11267		 * Don't balance to idle SMT in busy core right away when
11268		 * balancing cores, but remember the first idle SMT CPU for
11269		 * later consideration.  Find CPU on an idle core first.
11270		 */
11271		if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
11272			if (idle_smt == -1)
11273				idle_smt = cpu;
11274			/*
			 * If the core is not idle and the first idle SMT sibling
			 * has already been found, there is no need to check the
			 * other SMT siblings for idleness:
11278			 */
11279#ifdef CONFIG_SCHED_SMT
11280			cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
11281#endif
11282			continue;
11283		}
11284
11285		/*
11286		 * Are we the first idle core in a non-SMT domain or higher,
		 * or the first idle CPU in an SMT domain?
11288		 */
11289		return cpu == env->dst_cpu;
11290	}
11291
11292	/* Are we the first idle CPU with busy siblings? */
11293	if (idle_smt != -1)
11294		return idle_smt == env->dst_cpu;
11295
11296	/* Are we the first CPU of this group ? */
11297	return group_balance_cpu(sg) == env->dst_cpu;
11298}
11299
11300/*
11301 * Check this_cpu to ensure it is balanced within domain. Attempt to move
11302 * tasks if there is an imbalance.
11303 */
11304static int sched_balance_rq(int this_cpu, struct rq *this_rq,
11305			struct sched_domain *sd, enum cpu_idle_type idle,
11306			int *continue_balancing)
11307{
11308	int ld_moved, cur_ld_moved, active_balance = 0;
11309	struct sched_domain *sd_parent = sd->parent;
11310	struct sched_group *group;
11311	struct rq *busiest;
11312	struct rq_flags rf;
11313	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
11314	struct lb_env env = {
11315		.sd		= sd,
11316		.dst_cpu	= this_cpu,
11317		.dst_rq		= this_rq,
11318		.dst_grpmask    = group_balance_mask(sd->groups),
11319		.idle		= idle,
11320		.loop_break	= SCHED_NR_MIGRATE_BREAK,
11321		.cpus		= cpus,
11322		.fbq_type	= all,
11323		.tasks		= LIST_HEAD_INIT(env.tasks),
11324	};
11325
11326	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
11327
11328	schedstat_inc(sd->lb_count[idle]);
11329
11330redo:
11331	if (!should_we_balance(&env)) {
11332		*continue_balancing = 0;
11333		goto out_balanced;
11334	}
11335
11336	group = sched_balance_find_src_group(&env);
11337	if (!group) {
11338		schedstat_inc(sd->lb_nobusyg[idle]);
11339		goto out_balanced;
11340	}
11341
11342	busiest = sched_balance_find_src_rq(&env, group);
11343	if (!busiest) {
11344		schedstat_inc(sd->lb_nobusyq[idle]);
11345		goto out_balanced;
11346	}
11347
11348	WARN_ON_ONCE(busiest == env.dst_rq);
11349
11350	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
11351
11352	env.src_cpu = busiest->cpu;
11353	env.src_rq = busiest;
11354
11355	ld_moved = 0;
11356	/* Clear this flag as soon as we find a pullable task */
11357	env.flags |= LBF_ALL_PINNED;
11358	if (busiest->nr_running > 1) {
11359		/*
11360		 * Attempt to move tasks. If sched_balance_find_src_group has found
11361		 * an imbalance but busiest->nr_running <= 1, the group is
11362		 * still unbalanced. ld_moved simply stays zero, so it is
11363		 * correctly treated as an imbalance.
11364		 */
11365		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
11366
11367more_balance:
11368		rq_lock_irqsave(busiest, &rf);
11369		update_rq_clock(busiest);
11370
11371		/*
11372		 * cur_ld_moved - load moved in current iteration
11373		 * ld_moved     - cumulative load moved across iterations
11374		 */
11375		cur_ld_moved = detach_tasks(&env);
11376
11377		/*
		 * We've detached some tasks from busiest_rq. Every
		 * task is marked "TASK_ON_RQ_MIGRATING", so we can safely
		 * unlock busiest->lock and be sure that nobody can
		 * manipulate the tasks in parallel.
		 * See the task_rq_lock() family for the details.
11383		 */
11384
11385		rq_unlock(busiest, &rf);
11386
11387		if (cur_ld_moved) {
11388			attach_tasks(&env);
11389			ld_moved += cur_ld_moved;
11390		}
11391
11392		local_irq_restore(rf.flags);
11393
11394		if (env.flags & LBF_NEED_BREAK) {
11395			env.flags &= ~LBF_NEED_BREAK;
11396			/* Stop if we tried all running tasks */
11397			if (env.loop < busiest->nr_running)
11398				goto more_balance;
11399		}
11400
11401		/*
11402		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
11403		 * us and move them to an alternate dst_cpu in our sched_group
		 * where they can run. The upper limit on how many times we
		 * iterate on the same src_cpu depends on the number of CPUs in
		 * our sched_group.
		 *
		 * This changes load balance semantics a bit on who can move
		 * load to a given_cpu. In addition to the given_cpu itself
		 * (or an ilb_cpu acting on its behalf where given_cpu is
		 * nohz-idle), we now have balance_cpu in a position to move
		 * load to given_cpu. In rare situations, this may cause
		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
		 * _independently_ and at the _same_ time to move some load to
		 * given_cpu), causing excess load to be moved to given_cpu.
		 * This, however, should not happen often in practice, and
		 * subsequent load balance cycles should correct any excess
		 * load moved.
11419		 */
11420		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
11421
			/* Prevent re-selecting dst_cpu via env's CPUs */
11423			__cpumask_clear_cpu(env.dst_cpu, env.cpus);
11424
11425			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
11426			env.dst_cpu	 = env.new_dst_cpu;
11427			env.flags	&= ~LBF_DST_PINNED;
11428			env.loop	 = 0;
11429			env.loop_break	 = SCHED_NR_MIGRATE_BREAK;
11430
11431			/*
11432			 * Go back to "more_balance" rather than "redo" since we
11433			 * need to continue with same src_cpu.
11434			 */
11435			goto more_balance;
11436		}
11437
11438		/*
11439		 * We failed to reach balance because of affinity.
11440		 */
11441		if (sd_parent) {
11442			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11443
11444			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
11445				*group_imbalance = 1;
11446		}
11447
11448		/* All tasks on this runqueue were pinned by CPU affinity */
11449		if (unlikely(env.flags & LBF_ALL_PINNED)) {
11450			__cpumask_clear_cpu(cpu_of(busiest), cpus);
11451			/*
11452			 * Attempting to continue load balancing at the current
11453			 * sched_domain level only makes sense if there are
11454			 * active CPUs remaining as possible busiest CPUs to
11455			 * pull load from which are not contained within the
11456			 * destination group that is receiving any migrated
11457			 * load.
11458			 */
11459			if (!cpumask_subset(cpus, env.dst_grpmask)) {
11460				env.loop = 0;
11461				env.loop_break = SCHED_NR_MIGRATE_BREAK;
11462				goto redo;
11463			}
11464			goto out_all_pinned;
11465		}
11466	}
11467
11468	if (!ld_moved) {
11469		schedstat_inc(sd->lb_failed[idle]);
11470		/*
11471		 * Increment the failure counter only on periodic balance.
11472		 * We do not want newidle balance, which can be very
11473		 * frequent, pollute the failure counter causing
11474		 * excessive cache_hot migrations and active balances.
11475		 *
11476		 * Similarly for migration_misfit which is not related to
11477		 * load/util migration, don't pollute nr_balance_failed.
11478		 */
11479		if (idle != CPU_NEWLY_IDLE &&
11480		    env.migration_type != migrate_misfit)
11481			sd->nr_balance_failed++;
11482
11483		if (need_active_balance(&env)) {
11484			unsigned long flags;
11485
11486			raw_spin_rq_lock_irqsave(busiest, flags);
11487
11488			/*
			 * Don't kick active_load_balance_cpu_stop()
			 * if the curr task on the busiest CPU can't be
			 * moved to this_cpu:
11492			 */
11493			if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
11494				raw_spin_rq_unlock_irqrestore(busiest, flags);
11495				goto out_one_pinned;
11496			}
11497
11498			/* Record that we found at least one task that could run on this_cpu */
11499			env.flags &= ~LBF_ALL_PINNED;
11500
11501			/*
11502			 * ->active_balance synchronizes accesses to
11503			 * ->active_balance_work.  Once set, it's cleared
11504			 * only after active load balance is finished.
11505			 */
11506			if (!busiest->active_balance) {
11507				busiest->active_balance = 1;
11508				busiest->push_cpu = this_cpu;
11509				active_balance = 1;
11510			}
11511
11512			preempt_disable();
11513			raw_spin_rq_unlock_irqrestore(busiest, flags);
11514			if (active_balance) {
11515				stop_one_cpu_nowait(cpu_of(busiest),
11516					active_load_balance_cpu_stop, busiest,
11517					&busiest->active_balance_work);
11518			}
11519			preempt_enable();
11520		}
11521	} else {
11522		sd->nr_balance_failed = 0;
11523	}
11524
11525	if (likely(!active_balance) || need_active_balance(&env)) {
11526		/* We were unbalanced, so reset the balancing interval */
11527		sd->balance_interval = sd->min_interval;
11528	}
11529
11530	goto out;
11531
11532out_balanced:
11533	/*
11534	 * We reach balance although we may have faced some affinity
11535	 * constraints. Clear the imbalance flag only if other tasks got
11536	 * a chance to move and fix the imbalance.
11537	 */
11538	if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
11539		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11540
11541		if (*group_imbalance)
11542			*group_imbalance = 0;
11543	}
11544
11545out_all_pinned:
11546	/*
	 * We reach balance because all tasks are pinned at this level so
	 * we can't migrate them. Leave the imbalance flag set so the parent
	 * level can try to migrate them.
11550	 */
11551	schedstat_inc(sd->lb_balanced[idle]);
11552
11553	sd->nr_balance_failed = 0;
11554
11555out_one_pinned:
11556	ld_moved = 0;
11557
11558	/*
11559	 * sched_balance_newidle() disregards balance intervals, so we could
11560	 * repeatedly reach this code, which would lead to balance_interval
11561	 * skyrocketing in a short amount of time. Skip the balance_interval
11562	 * increase logic to avoid that.
11563	 *
	 * The same goes for misfit migration, which is not necessarily an
	 * indication that the system is busy and needs load balancing to back
	 * off to let it settle down.
11567	 */
11568	if (env.idle == CPU_NEWLY_IDLE ||
11569	    env.migration_type == migrate_misfit)
11570		goto out;
11571
11572	/* tune up the balancing interval */
11573	if ((env.flags & LBF_ALL_PINNED &&
11574	     sd->balance_interval < MAX_PINNED_INTERVAL) ||
11575	    sd->balance_interval < sd->max_interval)
11576		sd->balance_interval *= 2;
11577out:
11578	return ld_moved;
11579}
11580
11581static inline unsigned long
11582get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
11583{
11584	unsigned long interval = sd->balance_interval;
11585
11586	if (cpu_busy)
11587		interval *= sd->busy_factor;
11588
11589	/* scale ms to jiffies */
11590	interval = msecs_to_jiffies(interval);
11591
11592	/*
11593	 * Reduce likelihood of busy balancing at higher domains racing with
11594	 * balancing at lower domains by preventing their balancing periods
11595	 * from being multiples of each other.
11596	 */
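	/*
	 * For example (illustrative numbers): a 32 ms balance_interval with a
	 * busy_factor of 16 gives 512 ms; at HZ=250 that is 128 jiffies,
	 * reduced to 127 so it doesn't stay an exact multiple of lower-level
	 * periods.
	 */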
11597	if (cpu_busy)
11598		interval -= 1;
11599
11600	interval = clamp(interval, 1UL, max_load_balance_interval);
11601
11602	return interval;
11603}
11604
11605static inline void
11606update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
11607{
11608	unsigned long interval, next;
11609
11610	/* used by idle balance, so cpu_busy = 0 */
11611	interval = get_sd_balance_interval(sd, 0);
11612	next = sd->last_balance + interval;
11613
11614	if (time_after(*next_balance, next))
11615		*next_balance = next;
11616}
11617
11618/*
11619 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
11620 * running tasks off the busiest CPU onto idle CPUs. It requires at
11621 * least 1 task to be running on each physical CPU where possible, and
11622 * avoids physical / logical imbalances.
11623 */
11624static int active_load_balance_cpu_stop(void *data)
11625{
11626	struct rq *busiest_rq = data;
11627	int busiest_cpu = cpu_of(busiest_rq);
11628	int target_cpu = busiest_rq->push_cpu;
11629	struct rq *target_rq = cpu_rq(target_cpu);
11630	struct sched_domain *sd;
11631	struct task_struct *p = NULL;
11632	struct rq_flags rf;
11633
11634	rq_lock_irq(busiest_rq, &rf);
11635	/*
11636	 * Between queueing the stop-work and running it is a hole in which
11637	 * CPUs can become inactive. We should not move tasks from or to
11638	 * inactive CPUs.
11639	 */
11640	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
11641		goto out_unlock;
11642
11643	/* Make sure the requested CPU hasn't gone down in the meantime: */
11644	if (unlikely(busiest_cpu != smp_processor_id() ||
11645		     !busiest_rq->active_balance))
11646		goto out_unlock;
11647
11648	/* Is there any task to move? */
11649	if (busiest_rq->nr_running <= 1)
11650		goto out_unlock;
11651
11652	/*
11653	 * This condition is "impossible", if it occurs
11654	 * we need to fix it. Originally reported by
11655	 * Bjorn Helgaas on a 128-CPU setup.
11656	 */
11657	WARN_ON_ONCE(busiest_rq == target_rq);
11658
11659	/* Search for an sd spanning us and the target CPU. */
11660	rcu_read_lock();
11661	for_each_domain(target_cpu, sd) {
11662		if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
11663			break;
11664	}
11665
11666	if (likely(sd)) {
11667		struct lb_env env = {
11668			.sd		= sd,
11669			.dst_cpu	= target_cpu,
11670			.dst_rq		= target_rq,
11671			.src_cpu	= busiest_rq->cpu,
11672			.src_rq		= busiest_rq,
11673			.idle		= CPU_IDLE,
11674			.flags		= LBF_ACTIVE_LB,
11675		};
11676
11677		schedstat_inc(sd->alb_count);
11678		update_rq_clock(busiest_rq);
11679
11680		p = detach_one_task(&env);
11681		if (p) {
11682			schedstat_inc(sd->alb_pushed);
11683			/* Active balancing done, reset the failure counter. */
11684			sd->nr_balance_failed = 0;
11685		} else {
11686			schedstat_inc(sd->alb_failed);
11687		}
11688	}
11689	rcu_read_unlock();
11690out_unlock:
11691	busiest_rq->active_balance = 0;
11692	rq_unlock(busiest_rq, &rf);
11693
11694	if (p)
11695		attach_one_task(target_rq, p);
11696
11697	local_irq_enable();
11698
11699	return 0;
11700}
11701
11702/*
11703 * This flag serializes load-balancing passes over large domains
11704 * (above the NODE topology level) - only one load-balancing instance
11705 * may run at a time, to reduce overhead on very large systems with
11706 * lots of CPUs and large NUMA distances.
11707 *
11708 * - Note that load-balancing passes triggered while another one
11709 *   is executing are skipped and not re-tried.
11710 *
11711 * - Also note that this does not serialize rebalance_domains()
11712 *   execution, as non-SD_SERIALIZE domains will still be
11713 *   load-balanced in parallel.
11714 */
11715static atomic_t sched_balance_running = ATOMIC_INIT(0);
11716
11717/*
11718 * Scale the max sched_balance_rq interval with the number of CPUs in the system.
11719 * This trades load-balance latency on larger machines for less cross talk.
11720 */
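/*
 * For example, with HZ=250 and 64 online CPUs the cap works out to
 * 250 * 64 / 10 = 1600 jiffies (6.4 seconds).
 */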
11721void update_max_interval(void)
11722{
11723	max_load_balance_interval = HZ*num_online_cpus()/10;
11724}
11725
11726static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
11727{
11728	if (cost > sd->max_newidle_lb_cost) {
11729		/*
11730		 * Track max cost of a domain to make sure to not delay the
11731		 * next wakeup on the CPU.
11732		 */
11733		sd->max_newidle_lb_cost = cost;
11734		sd->last_decay_max_lb_cost = jiffies;
11735	} else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
11736		/*
11737		 * Decay the newidle max times by ~1% per second to ensure that
11738		 * it is not outdated and the current max cost is actually
11739		 * shorter.
11740		 */
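		/*
		 * For example, a max_newidle_lb_cost of 100000 ns decays to
		 * (100000 * 253) / 256 = 98828 ns, i.e. a reduction of roughly
		 * 1.2% each time this branch is taken.
		 */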
11741		sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
11742		sd->last_decay_max_lb_cost = jiffies;
11743
11744		return true;
11745	}
11746
11747	return false;
11748}
11749
11750/*
11751 * It checks each scheduling domain to see if it is due to be balanced,
11752 * and initiates a balancing operation if so.
11753 *
11754 * Balancing parameters are set up in init_sched_domains.
11755 */
11756static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
11757{
11758	int continue_balancing = 1;
11759	int cpu = rq->cpu;
11760	int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
11761	unsigned long interval;
11762	struct sched_domain *sd;
11763	/* Earliest time when we have to do rebalance again */
11764	unsigned long next_balance = jiffies + 60*HZ;
11765	int update_next_balance = 0;
11766	int need_serialize, need_decay = 0;
11767	u64 max_cost = 0;
11768
11769	rcu_read_lock();
11770	for_each_domain(cpu, sd) {
11771		/*
11772		 * Decay the newidle max times here because this is a regular
11773		 * visit to all the domains.
11774		 */
11775		need_decay = update_newidle_cost(sd, 0);
11776		max_cost += sd->max_newidle_lb_cost;
11777
11778		/*
11779		 * Stop the load balance at this level. There is another
11780		 * CPU in our sched group which is doing load balancing more
11781		 * actively.
11782		 */
11783		if (!continue_balancing) {
11784			if (need_decay)
11785				continue;
11786			break;
11787		}
11788
11789		interval = get_sd_balance_interval(sd, busy);
11790
11791		need_serialize = sd->flags & SD_SERIALIZE;
11792		if (need_serialize) {
11793			if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
11794				goto out;
11795		}
11796
11797		if (time_after_eq(jiffies, sd->last_balance + interval)) {
11798			if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
11799				/*
11800				 * The LBF_DST_PINNED logic could have changed
11801				 * env->dst_cpu, so we can't know our idle
11802				 * state even if we migrated tasks. Update it.
11803				 */
11804				idle = idle_cpu(cpu);
11805				busy = !idle && !sched_idle_cpu(cpu);
11806			}
11807			sd->last_balance = jiffies;
11808			interval = get_sd_balance_interval(sd, busy);
11809		}
11810		if (need_serialize)
11811			atomic_set_release(&sched_balance_running, 0);
11812out:
11813		if (time_after(next_balance, sd->last_balance + interval)) {
11814			next_balance = sd->last_balance + interval;
11815			update_next_balance = 1;
11816		}
11817	}
11818	if (need_decay) {
11819		/*
11820		 * Ensure the rq-wide value also decays but keep it at a
11821		 * reasonable floor to avoid funnies with rq->avg_idle.
11822		 */
11823		rq->max_idle_balance_cost =
11824			max((u64)sysctl_sched_migration_cost, max_cost);
11825	}
11826	rcu_read_unlock();
11827
11828	/*
11829	 * next_balance will be updated only when there is a need.
	 * When the CPU is attached to a null domain, for example, it will not
	 * be updated.
11832	 */
11833	if (likely(update_next_balance))
11834		rq->next_balance = next_balance;
11835
11836}
11837
11838static inline int on_null_domain(struct rq *rq)
11839{
11840	return unlikely(!rcu_dereference_sched(rq->sd));
11841}
11842
11843#ifdef CONFIG_NO_HZ_COMMON
11844/*
11845 * NOHZ idle load balancing (ILB) details:
11846 *
 * - When one of the busy CPUs notices that idle rebalancing may be
 *   needed, it kicks the idle load balancer, which then does idle
 *   load balancing for all the idle CPUs.
11850 *
11851 * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
11852 *   anywhere yet.
11853 */
11854static inline int find_new_ilb(void)
11855{
11856	const struct cpumask *hk_mask;
11857	int ilb_cpu;
11858
11859	hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
11860
11861	for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
11862
11863		if (ilb_cpu == smp_processor_id())
11864			continue;
11865
11866		if (idle_cpu(ilb_cpu))
11867			return ilb_cpu;
11868	}
11869
11870	return -1;
11871}
11872
11873/*
11874 * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
11875 * SMP function call (IPI).
11876 *
11877 * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
11878 */
11879static void kick_ilb(unsigned int flags)
11880{
11881	int ilb_cpu;
11882
11883	/*
	 * Increase nohz.next_balance only if a full ilb is triggered, but
	 * not if we only update stats.
11886	 */
11887	if (flags & NOHZ_BALANCE_KICK)
11888		nohz.next_balance = jiffies+1;
11889
11890	ilb_cpu = find_new_ilb();
11891	if (ilb_cpu < 0)
11892		return;
11893
11894	/*
11895	 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
11896	 * the first flag owns it; cleared by nohz_csd_func().
11897	 */
11898	flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
11899	if (flags & NOHZ_KICK_MASK)
11900		return;
11901
11902	/*
11903	 * This way we generate an IPI on the target CPU which
11904	 * is idle, and the softirq performing NOHZ idle load balancing
11905	 * will be run before returning from the IPI.
11906	 */
11907	smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
11908}
11909
11910/*
11911 * Current decision point for kicking the idle load balancer in the presence
11912 * of idle CPUs in the system.
11913 */
11914static void nohz_balancer_kick(struct rq *rq)
11915{
11916	unsigned long now = jiffies;
11917	struct sched_domain_shared *sds;
11918	struct sched_domain *sd;
11919	int nr_busy, i, cpu = rq->cpu;
11920	unsigned int flags = 0;
11921
11922	if (unlikely(rq->idle_balance))
11923		return;
11924
11925	/*
	 * We may have recently been in ticked or tickless idle mode. At the first
11927	 * busy tick after returning from idle, we will update the busy stats.
11928	 */
11929	nohz_balance_exit_idle(rq);
11930
11931	/*
11932	 * None are in tickless mode and hence no need for NOHZ idle load
11933	 * balancing:
11934	 */
11935	if (likely(!atomic_read(&nohz.nr_cpus)))
11936		return;
11937
11938	if (READ_ONCE(nohz.has_blocked) &&
11939	    time_after(now, READ_ONCE(nohz.next_blocked)))
11940		flags = NOHZ_STATS_KICK;
11941
11942	if (time_before(now, nohz.next_balance))
11943		goto out;
11944
11945	if (rq->nr_running >= 2) {
11946		flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11947		goto out;
11948	}
11949
11950	rcu_read_lock();
11951
11952	sd = rcu_dereference(rq->sd);
11953	if (sd) {
11954		/*
11955		 * If there's a runnable CFS task and the current CPU has reduced
11956		 * capacity, kick the ILB to see if there's a better CPU to run on:
11957		 */
11958		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
11959			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11960			goto unlock;
11961		}
11962	}
11963
11964	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
11965	if (sd) {
11966		/*
11967		 * When ASYM_PACKING; see if there's a more preferred CPU
11968		 * currently idle; in which case, kick the ILB to move tasks
11969		 * around.
11970		 *
11971		 * When balancing between cores, all the SMT siblings of the
11972		 * preferred CPU must be idle.
11973		 */
11974		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
11975			if (sched_asym(sd, i, cpu)) {
11976				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11977				goto unlock;
11978			}
11979		}
11980	}
11981
11982	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
11983	if (sd) {
11984		/*
11985		 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
11986		 * to run the misfit task on.
11987		 */
11988		if (check_misfit_status(rq)) {
11989			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
11990			goto unlock;
11991		}
11992
11993		/*
11994		 * For asymmetric systems, we do not want to nicely balance
11995		 * cache use, instead we want to embrace asymmetry and only
11996		 * ensure tasks have enough CPU capacity.
11997		 *
11998		 * Skip the LLC logic because it's not relevant in that case.
11999		 */
12000		goto unlock;
12001	}
12002
12003	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
12004	if (sds) {
12005		/*
12006		 * If there is an imbalance between LLC domains (IOW we could
12007		 * increase the overall cache utilization), we need a less-loaded LLC
12008		 * domain to pull some load from. Likewise, we may need to spread
12009		 * load within the current LLC domain (e.g. packed SMT cores but
12010		 * other CPUs are idle). We can't really know from here how busy
12011		 * the others are - so just get a NOHZ balance going if it looks
12012		 * like this LLC domain has tasks we could move.
12013		 */
12014		nr_busy = atomic_read(&sds->nr_busy_cpus);
12015		if (nr_busy > 1) {
12016			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
12017			goto unlock;
12018		}
12019	}
12020unlock:
12021	rcu_read_unlock();
12022out:
12023	if (READ_ONCE(nohz.needs_update))
12024		flags |= NOHZ_NEXT_KICK;
12025
12026	if (flags)
12027		kick_ilb(flags);
12028}
12029
12030static void set_cpu_sd_state_busy(int cpu)
12031{
12032	struct sched_domain *sd;
12033
12034	rcu_read_lock();
12035	sd = rcu_dereference(per_cpu(sd_llc, cpu));
12036
12037	if (!sd || !sd->nohz_idle)
12038		goto unlock;
12039	sd->nohz_idle = 0;
12040
12041	atomic_inc(&sd->shared->nr_busy_cpus);
12042unlock:
12043	rcu_read_unlock();
12044}
12045
12046void nohz_balance_exit_idle(struct rq *rq)
12047{
12048	SCHED_WARN_ON(rq != this_rq());
12049
12050	if (likely(!rq->nohz_tick_stopped))
12051		return;
12052
12053	rq->nohz_tick_stopped = 0;
12054	cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
12055	atomic_dec(&nohz.nr_cpus);
12056
12057	set_cpu_sd_state_busy(rq->cpu);
12058}
12059
12060static void set_cpu_sd_state_idle(int cpu)
12061{
12062	struct sched_domain *sd;
12063
12064	rcu_read_lock();
12065	sd = rcu_dereference(per_cpu(sd_llc, cpu));
12066
12067	if (!sd || sd->nohz_idle)
12068		goto unlock;
12069	sd->nohz_idle = 1;
12070
12071	atomic_dec(&sd->shared->nr_busy_cpus);
12072unlock:
12073	rcu_read_unlock();
12074}
12075
12076/*
12077 * This routine will record that the CPU is going idle with tick stopped.
12078 * This info will be used in performing idle load balancing in the future.
12079 */
12080void nohz_balance_enter_idle(int cpu)
12081{
12082	struct rq *rq = cpu_rq(cpu);
12083
12084	SCHED_WARN_ON(cpu != smp_processor_id());
12085
12086	/* If this CPU is going down, then nothing needs to be done: */
12087	if (!cpu_active(cpu))
12088		return;
12089
12090	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
12091	if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
12092		return;
12093
12094	/*
	 * This can be set safely without rq->lock held.
	 * If a clear happens, it will have evaluated the last additions,
	 * because rq->lock is held during the check and the clear.
12098	 */
12099	rq->has_blocked_load = 1;
12100
12101	/*
12102	 * The tick is still stopped but load could have been added in the
	 * meantime. We set the nohz.has_blocked flag to trigger a check of the
	 * *_avg. The CPU is already part of nohz.idle_cpus_mask, so the clear
	 * of nohz.has_blocked can only happen after checking the new load.
12106	 */
12107	if (rq->nohz_tick_stopped)
12108		goto out;
12109
12110	/* If we're a completely isolated CPU, we don't play: */
12111	if (on_null_domain(rq))
12112		return;
12113
12114	rq->nohz_tick_stopped = 1;
12115
12116	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
12117	atomic_inc(&nohz.nr_cpus);
12118
12119	/*
12120	 * Ensures that if nohz_idle_balance() fails to observe our
12121	 * @idle_cpus_mask store, it must observe the @has_blocked
12122	 * and @needs_update stores.
12123	 */
12124	smp_mb__after_atomic();
12125
12126	set_cpu_sd_state_idle(cpu);
12127
12128	WRITE_ONCE(nohz.needs_update, 1);
12129out:
12130	/*
	 * Each time a CPU enters idle, we assume that it has blocked load and
	 * enable the periodic update of the load of idle CPUs.
12133	 */
12134	WRITE_ONCE(nohz.has_blocked, 1);
12135}
12136
12137static bool update_nohz_stats(struct rq *rq)
12138{
12139	unsigned int cpu = rq->cpu;
12140
12141	if (!rq->has_blocked_load)
12142		return false;
12143
12144	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
12145		return false;
12146
12147	if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
12148		return true;
12149
12150	sched_balance_update_blocked_averages(cpu);
12151
12152	return rq->has_blocked_load;
12153}
12154
12155/*
12156 * Internal function that runs load balance for all idle CPUs. The load balance
 * can be a simple update of blocked load or a complete load balance with
 * task movement, depending on the flags.
12159 */
12160static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
12161{
12162	/* Earliest time when we have to do rebalance again */
12163	unsigned long now = jiffies;
12164	unsigned long next_balance = now + 60*HZ;
12165	bool has_blocked_load = false;
12166	int update_next_balance = 0;
12167	int this_cpu = this_rq->cpu;
12168	int balance_cpu;
12169	struct rq *rq;
12170
12171	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
12172
12173	/*
12174	 * We assume there will be no idle load after this update and clear
	 * the has_blocked flag. If a CPU enters idle in the meantime, it will
	 * set the has_blocked flag and trigger another update of idle load.
	 * Because a CPU that becomes idle is added to idle_cpus_mask before
	 * setting the flag, we are sure not to clear the flag while skipping
	 * the load check of an idle CPU.
12180	 *
12181	 * Same applies to idle_cpus_mask vs needs_update.
12182	 */
12183	if (flags & NOHZ_STATS_KICK)
12184		WRITE_ONCE(nohz.has_blocked, 0);
12185	if (flags & NOHZ_NEXT_KICK)
12186		WRITE_ONCE(nohz.needs_update, 0);
12187
12188	/*
12189	 * Ensures that if we miss the CPU, we must see the has_blocked
12190	 * store from nohz_balance_enter_idle().
12191	 */
12192	smp_mb();
12193
12194	/*
	 * Start with the next CPU after this_cpu so we will end with this_cpu
	 * and give other idle CPUs a chance to pull load.
12197	 */
12198	for_each_cpu_wrap(balance_cpu,  nohz.idle_cpus_mask, this_cpu+1) {
12199		if (!idle_cpu(balance_cpu))
12200			continue;
12201
12202		/*
12203		 * If this CPU gets work to do, stop the load balancing
		 * work being done for other CPUs. The next load
		 * balancing owner will pick it up.
12206		 */
12207		if (need_resched()) {
12208			if (flags & NOHZ_STATS_KICK)
12209				has_blocked_load = true;
12210			if (flags & NOHZ_NEXT_KICK)
12211				WRITE_ONCE(nohz.needs_update, 1);
12212			goto abort;
12213		}
12214
12215		rq = cpu_rq(balance_cpu);
12216
12217		if (flags & NOHZ_STATS_KICK)
12218			has_blocked_load |= update_nohz_stats(rq);
12219
12220		/*
		 * If the next balance of this rq is due, do the balance.
12223		 */
12224		if (time_after_eq(jiffies, rq->next_balance)) {
12225			struct rq_flags rf;
12226
12227			rq_lock_irqsave(rq, &rf);
12228			update_rq_clock(rq);
12229			rq_unlock_irqrestore(rq, &rf);
12230
12231			if (flags & NOHZ_BALANCE_KICK)
12232				sched_balance_domains(rq, CPU_IDLE);
12233		}
12234
12235		if (time_after(next_balance, rq->next_balance)) {
12236			next_balance = rq->next_balance;
12237			update_next_balance = 1;
12238		}
12239	}
12240
12241	/*
12242	 * next_balance will be updated only when there is a need.
	 * When the CPU is attached to a null domain, for example, it will not
	 * be updated.
12245	 */
12246	if (likely(update_next_balance))
12247		nohz.next_balance = next_balance;
12248
12249	if (flags & NOHZ_STATS_KICK)
12250		WRITE_ONCE(nohz.next_blocked,
12251			   now + msecs_to_jiffies(LOAD_AVG_PERIOD));
12252
12253abort:
12254	/* There is still blocked load, enable periodic update */
12255	if (has_blocked_load)
12256		WRITE_ONCE(nohz.has_blocked, 1);
12257}
12258
12259/*
12260 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
12261 * rebalancing for all the CPUs for whom scheduler ticks are stopped.
12262 */
12263static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12264{
12265	unsigned int flags = this_rq->nohz_idle_balance;
12266
12267	if (!flags)
12268		return false;
12269
12270	this_rq->nohz_idle_balance = 0;
12271
12272	if (idle != CPU_IDLE)
12273		return false;
12274
12275	_nohz_idle_balance(this_rq, flags);
12276
12277	return true;
12278}
12279
12280/*
12281 * Check if we need to directly run the ILB for updating blocked load before
12282 * entering idle state. Here we run ILB directly without issuing IPIs.
12283 *
 * Note that when this function is called, the tick may not yet be stopped on
 * this CPU. nohz.idle_cpus_mask is updated only when the tick is stopped and
 * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
 * don't align with CPUs entering/exiting idle, to avoid bottlenecks due to a
 * high idle entry/exit rate (usec). So it is possible that _nohz_idle_balance()
 * is called from this function on (this) CPU that's not yet in the mask. That's
 * OK because the goal of nohz_run_idle_balance() is to run the ILB only for
 * updating the blocked load of already idle CPUs, without waking up one of
 * those idle CPUs and outside the preempt disable / IRQ off phase of the local
 * CPU about to enter idle, because that phase can take a long time.
12294 */
12295void nohz_run_idle_balance(int cpu)
12296{
12297	unsigned int flags;
12298
12299	flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
12300
12301	/*
	 * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
	 * that would do the same update (i.e. with NOHZ_STATS_KICK set).
12304	 */
12305	if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
12306		_nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
12307}
12308
12309static void nohz_newidle_balance(struct rq *this_rq)
12310{
12311	int this_cpu = this_rq->cpu;
12312
12313	/*
12314	 * This CPU doesn't want to be disturbed by scheduler
12315	 * housekeeping
12316	 */
12317	if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
12318		return;
12319
	/* Will wake up very soon. No time for doing anything else */
12321	if (this_rq->avg_idle < sysctl_sched_migration_cost)
12322		return;
12323
	/* Don't need to update blocked load of idle CPUs */
12325	if (!READ_ONCE(nohz.has_blocked) ||
12326	    time_before(jiffies, READ_ONCE(nohz.next_blocked)))
12327		return;
12328
12329	/*
12330	 * Set the need to trigger ILB in order to update blocked load
12331	 * before entering idle state.
12332	 */
12333	atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
12334}
12335
12336#else /* !CONFIG_NO_HZ_COMMON */
12337static inline void nohz_balancer_kick(struct rq *rq) { }
12338
12339static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12340{
12341	return false;
12342}
12343
12344static inline void nohz_newidle_balance(struct rq *this_rq) { }
12345#endif /* CONFIG_NO_HZ_COMMON */
12346
12347/*
12348 * sched_balance_newidle is called by schedule() if this_cpu is about to become
12349 * idle. Attempts to pull tasks from other CPUs.
12350 *
12351 * Returns:
12352 *   < 0 - we released the lock and there are !fair tasks present
12353 *     0 - failed, no new tasks
12354 *   > 0 - success, new (fair) tasks present
12355 */
12356static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
12357{
12358	unsigned long next_balance = jiffies + HZ;
12359	int this_cpu = this_rq->cpu;
12360	int continue_balancing = 1;
12361	u64 t0, t1, curr_cost = 0;
12362	struct sched_domain *sd;
12363	int pulled_task = 0;
12364
12365	update_misfit_status(NULL, this_rq);
12366
12367	/*
12368	 * There is a task waiting to run. No need to search for one.
12369	 * Return 0; the task will be enqueued when switching to idle.
12370	 */
12371	if (this_rq->ttwu_pending)
12372		return 0;
12373
12374	/*
12375	 * We must set idle_stamp _before_ calling sched_balance_rq()
	 * for CPU_NEWLY_IDLE, such that we measure this duration
12377	 * as idle time.
12378	 */
12379	this_rq->idle_stamp = rq_clock(this_rq);
12380
12381	/*
12382	 * Do not pull tasks towards !active CPUs...
12383	 */
12384	if (!cpu_active(this_cpu))
12385		return 0;
12386
12387	/*
	 * This is OK because current is on_cpu, which avoids it being picked
	 * for load-balance; preemption/IRQs are still disabled, avoiding
	 * further scheduler activity on it; and we're being very careful to
	 * re-start the picking loop.
12392	 */
12393	rq_unpin_lock(this_rq, rf);
12394
12395	rcu_read_lock();
12396	sd = rcu_dereference_check_sched_domain(this_rq->sd);
12397
12398	if (!get_rd_overloaded(this_rq->rd) ||
12399	    (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
12400
12401		if (sd)
12402			update_next_balance(sd, &next_balance);
12403		rcu_read_unlock();
12404
12405		goto out;
12406	}
12407	rcu_read_unlock();
12408
12409	raw_spin_rq_unlock(this_rq);
12410
12411	t0 = sched_clock_cpu(this_cpu);
12412	sched_balance_update_blocked_averages(this_cpu);
12413
12414	rcu_read_lock();
12415	for_each_domain(this_cpu, sd) {
12416		u64 domain_cost;
12417
12418		update_next_balance(sd, &next_balance);
12419
12420		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
12421			break;
12422
12423		if (sd->flags & SD_BALANCE_NEWIDLE) {
12424
12425			pulled_task = sched_balance_rq(this_cpu, this_rq,
12426						   sd, CPU_NEWLY_IDLE,
12427						   &continue_balancing);
12428
12429			t1 = sched_clock_cpu(this_cpu);
12430			domain_cost = t1 - t0;
12431			update_newidle_cost(sd, domain_cost);
12432
12433			curr_cost += domain_cost;
12434			t0 = t1;
12435		}
12436
12437		/*
12438		 * Stop searching for tasks to pull if there are
12439		 * now runnable tasks on this rq.
12440		 */
12441		if (pulled_task || !continue_balancing)
12442			break;
12443	}
12444	rcu_read_unlock();
12445
12446	raw_spin_rq_lock(this_rq);
12447
12448	if (curr_cost > this_rq->max_idle_balance_cost)
12449		this_rq->max_idle_balance_cost = curr_cost;
12450
12451	/*
	 * While browsing the domains we released the rq lock, so a task could
	 * have been enqueued in the meantime. Since we're not going idle,
12454	 * pretend we pulled a task.
12455	 */
12456	if (this_rq->cfs.h_nr_running && !pulled_task)
12457		pulled_task = 1;
12458
12459	/* Is there a task of a high priority class? */
12460	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
12461		pulled_task = -1;
12462
12463out:
12464	/* Move the next balance forward */
12465	if (time_after(this_rq->next_balance, next_balance))
12466		this_rq->next_balance = next_balance;
12467
12468	if (pulled_task)
12469		this_rq->idle_stamp = 0;
12470	else
12471		nohz_newidle_balance(this_rq);
12472
12473	rq_repin_lock(this_rq, rf);
12474
12475	return pulled_task;
12476}
12477
12478/*
12479 * This softirq handler is triggered via SCHED_SOFTIRQ from two places:
12480 *
12481 * - directly from the local scheduler_tick() for periodic load balancing
12482 *
12483 * - indirectly from a remote scheduler_tick() for NOHZ idle balancing
12484 *   through the SMP cross-call nohz_csd_func()
12485 */
12486static __latent_entropy void sched_balance_softirq(struct softirq_action *h)
12487{
12488	struct rq *this_rq = this_rq();
12489	enum cpu_idle_type idle = this_rq->idle_balance;
12490	/*
12491	 * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
12492	 * balancing on behalf of the other idle CPUs whose ticks are
12493	 * stopped. Do nohz_idle_balance *before* sched_balance_domains to
12494	 * give the idle CPUs a chance to load balance. Else we may
12495	 * load balance only within the local sched_domain hierarchy
12496	 * and abort nohz_idle_balance altogether if we pull some load.
12497	 */
12498	if (nohz_idle_balance(this_rq, idle))
12499		return;
12500
12501	/* normal load balance */
12502	sched_balance_update_blocked_averages(this_rq->cpu);
12503	sched_balance_domains(this_rq, idle);
12504}
12505
12506/*
12507 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
12508 */
12509void sched_balance_trigger(struct rq *rq)
12510{
12511	/*
12512	 * Don't need to rebalance while attached to NULL domain or
12513	 * runqueue CPU is not active
12514	 */
12515	if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
12516		return;
12517
12518	if (time_after_eq(jiffies, rq->next_balance))
12519		raise_softirq(SCHED_SOFTIRQ);
12520
12521	nohz_balancer_kick(rq);
12522}
12523
12524static void rq_online_fair(struct rq *rq)
12525{
12526	update_sysctl();
12527
12528	update_runtime_enabled(rq);
12529}
12530
12531static void rq_offline_fair(struct rq *rq)
12532{
12533	update_sysctl();
12534
12535	/* Ensure any throttled groups are reachable by pick_next_task */
12536	unthrottle_offline_cfs_rqs(rq);
12537
12538	/* Ensure that we remove rq contribution to group share: */
12539	clear_tg_offline_cfs_rqs(rq);
12540}
12541
12542#endif /* CONFIG_SMP */
12543
12544#ifdef CONFIG_SCHED_CORE
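/*
 * __entity_slice_used - returns true if @se has consumed more than a
 * 1/min_nr_tasks share of its slice since it was last picked. For example,
 * with a 3 ms slice and min_nr_tasks = 2, this returns true once the entity
 * has run for more than 1.5 ms.
 */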
12545static inline bool
12546__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
12547{
12548	u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
12549	u64 slice = se->slice;
12550
12551	return (rtime * min_nr_tasks > slice);
12552}
12553
12554#define MIN_NR_TASKS_DURING_FORCEIDLE	2
12555static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
12556{
12557	if (!sched_core_enabled(rq))
12558		return;
12559
12560	/*
	 * If the runqueue has only one task which used up its slice and
	 * if the sibling is forced idle, then trigger schedule to
	 * give the forced idle task a chance.
12564	 *
12565	 * sched_slice() considers only this active rq and it gets the
12566	 * whole slice. But during force idle, we have siblings acting
12567	 * like a single runqueue and hence we need to consider runnable
12568	 * tasks on this CPU and the forced idle CPU. Ideally, we should
12569	 * go through the forced idle rq, but that would be a perf hit.
12570	 * We can assume that the forced idle CPU has at least
12571	 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
12572	 * if we need to give up the CPU.
12573	 */
12574	if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
12575	    __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
12576		resched_curr(rq);
12577}
12578
12579/*
12580 * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
12581 */
12582static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
12583			 bool forceidle)
12584{
12585	for_each_sched_entity(se) {
12586		struct cfs_rq *cfs_rq = cfs_rq_of(se);
12587
12588		if (forceidle) {
12589			if (cfs_rq->forceidle_seq == fi_seq)
12590				break;
12591			cfs_rq->forceidle_seq = fi_seq;
12592		}
12593
12594		cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
12595	}
12596}
12597
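/*
 * Called from the core scheduling pick path to snapshot the min_vruntime
 * of @p's cfs_rq hierarchy for the current force-idle sequence; these
 * snapshots are what cfs_prio_less() compares against.
 */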
12598void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
12599{
12600	struct sched_entity *se = &p->se;
12601
12602	if (p->sched_class != &fair_sched_class)
12603		return;
12604
12605	se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
12606}
12607
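/*
 * Core-wide priority comparison of two fair tasks running on sibling
 * runqueues.  Raw vruntimes of different runqueues are not comparable,
 * so each is first normalized against its cfs_rq's force-idle snapshot:
 *
 *   key(a) = sea->vruntime - cfs_rqa->min_vruntime_fi
 *   key(b) = seb->vruntime - cfs_rqb->min_vruntime_fi
 *
 * @a is considered of lower priority than @b iff key(a) - key(b) > 0,
 * which is exactly the delta computed at the end of this function.
 */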
12608bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
12609			bool in_fi)
12610{
12611	struct rq *rq = task_rq(a);
12612	const struct sched_entity *sea = &a->se;
12613	const struct sched_entity *seb = &b->se;
12614	struct cfs_rq *cfs_rqa;
12615	struct cfs_rq *cfs_rqb;
12616	s64 delta;
12617
12618	SCHED_WARN_ON(task_rq(b)->core != rq->core);
12619
12620#ifdef CONFIG_FAIR_GROUP_SCHED
12621	/*
12622	 * Find an se in the hierarchy for tasks a and b, such that the se's
12623	 * are immediate siblings.
12624	 */
12625	while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
12626		int sea_depth = sea->depth;
12627		int seb_depth = seb->depth;
12628
12629		if (sea_depth >= seb_depth)
12630			sea = parent_entity(sea);
12631		if (sea_depth <= seb_depth)
12632			seb = parent_entity(seb);
12633	}
12634
12635	se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
12636	se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
12637
12638	cfs_rqa = sea->cfs_rq;
12639	cfs_rqb = seb->cfs_rq;
12640#else
12641	cfs_rqa = &task_rq(a)->cfs;
12642	cfs_rqb = &task_rq(b)->cfs;
12643#endif
12644
12645	/*
12646	 * Find delta after normalizing se's vruntime with its cfs_rq's
12647	 * min_vruntime_fi, which would have been updated in prior calls
12648	 * to se_fi_update().
12649	 */
12650	delta = (s64)(sea->vruntime - seb->vruntime) +
12651		(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
12652
12653	return delta > 0;
12654}
12655
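/*
 * Tell the core scheduling code whether @p's cfs_rq on @cpu is throttled,
 * either directly or via a throttled ancestor.
 */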
12656static int task_is_throttled_fair(struct task_struct *p, int cpu)
12657{
12658	struct cfs_rq *cfs_rq;
12659
12660#ifdef CONFIG_FAIR_GROUP_SCHED
12661	cfs_rq = task_group(p)->cfs_rq[cpu];
12662#else
12663	cfs_rq = &cpu_rq(cpu)->cfs;
12664#endif
12665	return throttled_hierarchy(cfs_rq);
12666}
12667#else
12668static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
12669#endif
12670
12671/*
12672 * scheduler tick hitting a task of our scheduling class.
12673 *
12674 * NOTE: This function can be called remotely by the tick offload that
12675 * goes along full dynticks. Therefore no local assumption can be made
12676 * and everything must be accessed through the @rq and @curr passed in
12677 * parameters.
12678 */
12679static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
12680{
12681	struct cfs_rq *cfs_rq;
12682	struct sched_entity *se = &curr->se;
12683
12684	for_each_sched_entity(se) {
12685		cfs_rq = cfs_rq_of(se);
12686		entity_tick(cfs_rq, se, queued);
12687	}
12688
12689	if (static_branch_unlikely(&sched_numa_balancing))
12690		task_tick_numa(rq, curr);
12691
12692	update_misfit_status(curr, rq);
12693	check_update_overutilized_status(task_rq(curr));
12694
12695	task_tick_core(rq, curr);
12696}
12697
12698/*
12699 * Called on fork with the child task as the argument, from the parent's context:
12700 *  - child not yet on the tasklist
12701 *  - preemption disabled
12702 */
12703static void task_fork_fair(struct task_struct *p)
12704{
12705	struct sched_entity *se = &p->se, *curr;
12706	struct cfs_rq *cfs_rq;
12707	struct rq *rq = this_rq();
12708	struct rq_flags rf;
12709
12710	rq_lock(rq, &rf);
12711	update_rq_clock(rq);
12712
12713	set_task_max_allowed_capacity(p);
12714
12715	cfs_rq = task_cfs_rq(current);
12716	curr = cfs_rq->curr;
12717	if (curr)
12718		update_curr(cfs_rq);
12719	place_entity(cfs_rq, se, ENQUEUE_INITIAL);
12720	rq_unlock(rq, &rf);
12721}
12722
12723/*
12724 * Priority of the task has changed. Check to see if we preempt
12725 * the current task.
12726 */
12727static void
12728prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
12729{
12730	if (!task_on_rq_queued(p))
12731		return;
12732
12733	if (rq->cfs.nr_running == 1)
12734		return;
12735
12736	/*
12737	 * Reschedule if we are currently running on this runqueue and our
12738	 * priority decreased, or if we are not currently running on this
12739	 * runqueue and our priority is higher than the current task's.
12740	 */
12741	if (task_current(rq, p)) {
12742		if (p->prio > oldprio)
12743			resched_curr(rq);
12744	} else
12745		wakeup_preempt(rq, p, 0);
12746}
12747
12748#ifdef CONFIG_FAIR_GROUP_SCHED
12749/*
12750 * Propagate the changes of the sched_entity across the tg tree to make them
12751 * visible to the root.
12752 */
12753static void propagate_entity_cfs_rq(struct sched_entity *se)
12754{
12755	struct cfs_rq *cfs_rq = cfs_rq_of(se);
12756
12757	if (cfs_rq_throttled(cfs_rq))
12758		return;
12759
12760	if (!throttled_hierarchy(cfs_rq))
12761		list_add_leaf_cfs_rq(cfs_rq);
12762
12763	/* Start to propagate at parent */
12764	se = se->parent;
12765
12766	for_each_sched_entity(se) {
12767		cfs_rq = cfs_rq_of(se);
12768
12769		update_load_avg(cfs_rq, se, UPDATE_TG);
12770
12771		if (cfs_rq_throttled(cfs_rq))
12772			break;
12773
12774		if (!throttled_hierarchy(cfs_rq))
12775			list_add_leaf_cfs_rq(cfs_rq);
12776	}
12777}
12778#else
12779static void propagate_entity_cfs_rq(struct sched_entity *se) { }
12780#endif
12781
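/*
 * Remove @se's PELT contribution from its cfs_rq and propagate the change
 * up the group hierarchy; used when the entity leaves this cfs_rq, e.g.
 * on a class switch or a cgroup move.
 */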
12782static void detach_entity_cfs_rq(struct sched_entity *se)
12783{
12784	struct cfs_rq *cfs_rq = cfs_rq_of(se);
12785
12786#ifdef CONFIG_SMP
12787	/*
12788	 * Bail out in case the task's sched_avg hasn't been attached yet:
12789	 * - A forked task which hasn't been woken up by wake_up_new_task().
12790	 * - A task which has been woken up by try_to_wake_up() but is still
12791	 *   waiting to actually be woken up by sched_ttwu_pending().
12792	 */
12793	if (!se->avg.last_update_time)
12794		return;
12795#endif
12796
12797	/* Catch up with the cfs_rq and remove our load when we leave */
12798	update_load_avg(cfs_rq, se, 0);
12799	detach_entity_load_avg(cfs_rq, se);
12800	update_tg_load_avg(cfs_rq);
12801	propagate_entity_cfs_rq(se);
12802}
12803
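/*
 * Mirror of detach_entity_cfs_rq(): fold @se's load back into the cfs_rq
 * it is (re)joining and propagate the change up the group hierarchy.
 */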
12804static void attach_entity_cfs_rq(struct sched_entity *se)
12805{
12806	struct cfs_rq *cfs_rq = cfs_rq_of(se);
12807
12808	/* Synchronize entity with its cfs_rq */
12809	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
12810	attach_entity_load_avg(cfs_rq, se);
12811	update_tg_load_avg(cfs_rq);
12812	propagate_entity_cfs_rq(se);
12813}
12814
12815static void detach_task_cfs_rq(struct task_struct *p)
12816{
12817	struct sched_entity *se = &p->se;
12818
12819	detach_entity_cfs_rq(se);
12820}
12821
12822static void attach_task_cfs_rq(struct task_struct *p)
12823{
12824	struct sched_entity *se = &p->se;
12825
12826	attach_entity_cfs_rq(se);
12827}
12828
12829static void switched_from_fair(struct rq *rq, struct task_struct *p)
12830{
12831	detach_task_cfs_rq(p);
12832}
12833
12834static void switched_to_fair(struct rq *rq, struct task_struct *p)
12835{
12836	attach_task_cfs_rq(p);
12837
12838	set_task_max_allowed_capacity(p);
12839
12840	if (task_on_rq_queued(p)) {
12841		/*
12842		 * We were most likely switched from sched_rt, so
12843		 * kick off the schedule if running, otherwise just see
12844		 * if we can still preempt the current task.
12845		 */
12846		if (task_current(rq, p))
12847			resched_curr(rq);
12848		else
12849			wakeup_preempt(rq, p, 0);
12850	}
12851}
12852
12853/*
12854 * Account for a task changing its policy or group.  This routine is
12855 * mostly called to set the cfs_rq->curr field when a task migrates
12856 * between groups/classes.
12857 */
12858static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
12859{
12860	struct sched_entity *se = &p->se;
12861
12862#ifdef CONFIG_SMP
12863	if (task_on_rq_queued(p)) {
12864		/*
12865		 * Move the next running task to the front of the list, so that
12866		 * our cfs_tasks list becomes an MRU list.
12867		 */
12868		list_move(&se->group_node, &rq->cfs_tasks);
12869	}
12870#endif
12871
12872	for_each_sched_entity(se) {
12873		struct cfs_rq *cfs_rq = cfs_rq_of(se);
12874
12875		set_next_entity(cfs_rq, se);
12876		/* ensure bandwidth has been allocated on our new cfs_rq */
12877		account_cfs_rq_runtime(cfs_rq, 0);
12878	}
12879}
12880
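/*
 * Note: min_vruntime starts just below the u64 wrap point (-(1 << 20)),
 * presumably so that the unsigned wrap-around handling in the vruntime
 * comparisons gets exercised early rather than only after long uptimes.
 */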
12881void init_cfs_rq(struct cfs_rq *cfs_rq)
12882{
12883	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
12884	u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
12885#ifdef CONFIG_SMP
12886	raw_spin_lock_init(&cfs_rq->removed.lock);
12887#endif
12888}
12889
12890#ifdef CONFIG_FAIR_GROUP_SCHED
12891static void task_change_group_fair(struct task_struct *p)
12892{
12893	/*
12894	 * We can't detach or attach a forked task which hasn't been
12895	 * woken up by wake_up_new_task() yet.
12896	 */
12897	if (READ_ONCE(p->__state) == TASK_NEW)
12898		return;
12899
12900	detach_task_cfs_rq(p);
12901
12902#ifdef CONFIG_SMP
12903	/* Signal that the se's cfs_rq has changed -- it has been migrated */
12904	p->se.avg.last_update_time = 0;
12905#endif
12906	set_task_rq(p, task_cpu(p));
12907	attach_task_cfs_rq(p);
12908}
12909
12910void free_fair_sched_group(struct task_group *tg)
12911{
12912	int i;
12913
12914	for_each_possible_cpu(i) {
12915		if (tg->cfs_rq)
12916			kfree(tg->cfs_rq[i]);
12917		if (tg->se)
12918			kfree(tg->se[i]);
12919	}
12920
12921	kfree(tg->cfs_rq);
12922	kfree(tg->se);
12923}
12924
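/*
 * Allocate the per-CPU cfs_rq and group sched_entity arrays for a new
 * task group.  Returns 1 on success and 0 on failure; partially allocated
 * state is expected to be torn down via free_fair_sched_group().
 */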
12925int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
12926{
12927	struct sched_entity *se;
12928	struct cfs_rq *cfs_rq;
12929	int i;
12930
12931	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
12932	if (!tg->cfs_rq)
12933		goto err;
12934	tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
12935	if (!tg->se)
12936		goto err;
12937
12938	tg->shares = NICE_0_LOAD;
12939
12940	init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
12941
12942	for_each_possible_cpu(i) {
12943		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
12944				      GFP_KERNEL, cpu_to_node(i));
12945		if (!cfs_rq)
12946			goto err;
12947
12948		se = kzalloc_node(sizeof(struct sched_entity_stats),
12949				  GFP_KERNEL, cpu_to_node(i));
12950		if (!se)
12951			goto err_free_rq;
12952
12953		init_cfs_rq(cfs_rq);
12954		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
12955		init_entity_runnable_average(se);
12956	}
12957
12958	return 1;
12959
12960err_free_rq:
12961	kfree(cfs_rq);
12962err:
12963	return 0;
12964}
12965
12966void online_fair_sched_group(struct task_group *tg)
12967{
12968	struct sched_entity *se;
12969	struct rq_flags rf;
12970	struct rq *rq;
12971	int i;
12972
12973	for_each_possible_cpu(i) {
12974		rq = cpu_rq(i);
12975		se = tg->se[i];
12976		rq_lock_irq(rq, &rf);
12977		update_rq_clock(rq);
12978		attach_entity_cfs_rq(se);
12979		sync_throttle(tg, i);
12980		rq_unlock_irq(rq, &rf);
12981	}
12982}
12983
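/*
 * Per-CPU teardown for a task group that is going away: stop its bandwidth
 * machinery, queue removal of the group entities' remaining PELT load and
 * unlink its cfs_rqs from the leaf list.
 */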
12984void unregister_fair_sched_group(struct task_group *tg)
12985{
12986	unsigned long flags;
12987	struct rq *rq;
12988	int cpu;
12989
12990	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
12991
12992	for_each_possible_cpu(cpu) {
12993		if (tg->se[cpu])
12994			remove_entity_load_avg(tg->se[cpu]);
12995
12996		/*
12997		 * Only empty task groups can be destroyed, so we can speculatively
12998		 * check on_list without danger of it being re-added.
12999		 */
13000		if (!tg->cfs_rq[cpu]->on_list)
13001			continue;
13002
13003		rq = cpu_rq(cpu);
13004
13005		raw_spin_rq_lock_irqsave(rq, flags);
13006		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
13007		raw_spin_rq_unlock_irqrestore(rq, flags);
13008	}
13009}
13010
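/*
 * Wire one per-CPU cfs_rq and its group sched_entity into the hierarchy:
 * link them to the task group and the CPU's rq, and point the entity at
 * its parent's my_q so the group tree mirrors the cgroup tree on each CPU.
 */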
13011void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
13012			struct sched_entity *se, int cpu,
13013			struct sched_entity *parent)
13014{
13015	struct rq *rq = cpu_rq(cpu);
13016
13017	cfs_rq->tg = tg;
13018	cfs_rq->rq = rq;
13019	init_cfs_rq_runtime(cfs_rq);
13020
13021	tg->cfs_rq[cpu] = cfs_rq;
13022	tg->se[cpu] = se;
13023
13024	/* se could be NULL for root_task_group */
13025	if (!se)
13026		return;
13027
13028	if (!parent) {
13029		se->cfs_rq = &rq->cfs;
13030		se->depth = 0;
13031	} else {
13032		se->cfs_rq = parent->my_q;
13033		se->depth = parent->depth + 1;
13034	}
13035
13036	se->my_q = cfs_rq;
13037	/* guarantee group entities always have weight */
13038	update_load_set(&se->load, NICE_0_LOAD);
13039	se->parent = parent;
13040}
13041
13042static DEFINE_MUTEX(shares_mutex);
13043
13044static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
13045{
13046	int i;
13047
13048	lockdep_assert_held(&shares_mutex);
13049
13050	/*
13051	 * We can't change the weight of the root cgroup.
13052	 */
13053	if (!tg->se[0])
13054		return -EINVAL;
13055
13056	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
13057
13058	if (tg->shares == shares)
13059		return 0;
13060
13061	tg->shares = shares;
13062	for_each_possible_cpu(i) {
13063		struct rq *rq = cpu_rq(i);
13064		struct sched_entity *se = tg->se[i];
13065		struct rq_flags rf;
13066
13067		/* Propagate contribution to hierarchy */
13068		rq_lock_irqsave(rq, &rf);
13069		update_rq_clock(rq);
13070		for_each_sched_entity(se) {
13071			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
13072			update_cfs_group(se);
13073		}
13074		rq_unlock_irqrestore(rq, &rf);
13075	}
13076
13077	return 0;
13078}
13079
13080int sched_group_set_shares(struct task_group *tg, unsigned long shares)
13081{
13082	int ret;
13083
13084	mutex_lock(&shares_mutex);
13085	if (tg_is_idle(tg))
13086		ret = -EINVAL;
13087	else
13088		ret = __sched_group_set_shares(tg, shares);
13089	mutex_unlock(&shares_mutex);
13090
13091	return ret;
13092}
13093
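/*
 * Switch a task group between regular and SCHED_IDLE treatment: flip the
 * per-CPU cfs_rq->idle flags, fix up the idle_nr_running /
 * idle_h_nr_running accounting of the ancestors, and finally give the
 * group the matching (idle or default) weight.
 */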
13094int sched_group_set_idle(struct task_group *tg, long idle)
13095{
13096	int i;
13097
13098	if (tg == &root_task_group)
13099		return -EINVAL;
13100
13101	if (idle < 0 || idle > 1)
13102		return -EINVAL;
13103
13104	mutex_lock(&shares_mutex);
13105
13106	if (tg->idle == idle) {
13107		mutex_unlock(&shares_mutex);
13108		return 0;
13109	}
13110
13111	tg->idle = idle;
13112
13113	for_each_possible_cpu(i) {
13114		struct rq *rq = cpu_rq(i);
13115		struct sched_entity *se = tg->se[i];
13116		struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
13117		bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
13118		long idle_task_delta;
13119		struct rq_flags rf;
13120
13121		rq_lock_irqsave(rq, &rf);
13122
13123		grp_cfs_rq->idle = idle;
13124		if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
13125			goto next_cpu;
13126
13127		if (se->on_rq) {
13128			parent_cfs_rq = cfs_rq_of(se);
13129			if (cfs_rq_is_idle(grp_cfs_rq))
13130				parent_cfs_rq->idle_nr_running++;
13131			else
13132				parent_cfs_rq->idle_nr_running--;
13133		}
13134
13135		idle_task_delta = grp_cfs_rq->h_nr_running -
13136				  grp_cfs_rq->idle_h_nr_running;
13137		if (!cfs_rq_is_idle(grp_cfs_rq))
13138			idle_task_delta *= -1;
13139
13140		for_each_sched_entity(se) {
13141			struct cfs_rq *cfs_rq = cfs_rq_of(se);
13142
13143			if (!se->on_rq)
13144				break;
13145
13146			cfs_rq->idle_h_nr_running += idle_task_delta;
13147
13148			/* Already accounted at parent level and above. */
13149			if (cfs_rq_is_idle(cfs_rq))
13150				break;
13151		}
13152
13153next_cpu:
13154		rq_unlock_irqrestore(rq, &rf);
13155	}
13156
13157	/* Idle groups have minimum weight. */
13158	if (tg_is_idle(tg))
13159		__sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
13160	else
13161		__sched_group_set_shares(tg, NICE_0_LOAD);
13162
13163	mutex_unlock(&shares_mutex);
13164	return 0;
13165}
13166
13167#endif /* CONFIG_FAIR_GROUP_SCHED */
13168
13169
13170static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
13171{
13172	struct sched_entity *se = &task->se;
13173	unsigned int rr_interval = 0;
13174
13175	/*
13176	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
13177	 * idle runqueue:
13178	 */
13179	if (rq->cfs.load.weight)
13180		rr_interval = NS_TO_JIFFIES(se->slice);
13181
13182	return rr_interval;
13183}
13184
13185/*
13186 * All the scheduling class methods:
13187 */
13188DEFINE_SCHED_CLASS(fair) = {
13189
13190	.enqueue_task		= enqueue_task_fair,
13191	.dequeue_task		= dequeue_task_fair,
13192	.yield_task		= yield_task_fair,
13193	.yield_to_task		= yield_to_task_fair,
13194
13195	.wakeup_preempt		= check_preempt_wakeup_fair,
13196
13197	.pick_next_task		= __pick_next_task_fair,
13198	.put_prev_task		= put_prev_task_fair,
13199	.set_next_task          = set_next_task_fair,
13200
13201#ifdef CONFIG_SMP
13202	.balance		= balance_fair,
13203	.pick_task		= pick_task_fair,
13204	.select_task_rq		= select_task_rq_fair,
13205	.migrate_task_rq	= migrate_task_rq_fair,
13206
13207	.rq_online		= rq_online_fair,
13208	.rq_offline		= rq_offline_fair,
13209
13210	.task_dead		= task_dead_fair,
13211	.set_cpus_allowed	= set_cpus_allowed_fair,
13212#endif
13213
13214	.task_tick		= task_tick_fair,
13215	.task_fork		= task_fork_fair,
13216
13217	.prio_changed		= prio_changed_fair,
13218	.switched_from		= switched_from_fair,
13219	.switched_to		= switched_to_fair,
13220
13221	.get_rr_interval	= get_rr_interval_fair,
13222
13223	.update_curr		= update_curr_fair,
13224
13225#ifdef CONFIG_FAIR_GROUP_SCHED
13226	.task_change_group	= task_change_group_fair,
13227#endif
13228
13229#ifdef CONFIG_SCHED_CORE
13230	.task_is_throttled	= task_is_throttled_fair,
13231#endif
13232
13233#ifdef CONFIG_UCLAMP_TASK
13234	.uclamp_enabled		= 1,
13235#endif
13236};
13237
13238#ifdef CONFIG_SCHED_DEBUG
13239void print_cfs_stats(struct seq_file *m, int cpu)
13240{
13241	struct cfs_rq *cfs_rq, *pos;
13242
13243	rcu_read_lock();
13244	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
13245		print_cfs_rq(m, cpu, cfs_rq);
13246	rcu_read_unlock();
13247}
13248
13249#ifdef CONFIG_NUMA_BALANCING
13250void show_numa_stats(struct task_struct *p, struct seq_file *m)
13251{
13252	int node;
13253	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
13254	struct numa_group *ng;
13255
13256	rcu_read_lock();
13257	ng = rcu_dereference(p->numa_group);
13258	for_each_online_node(node) {
13259		if (p->numa_faults) {
13260			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
13261			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
13262		}
13263		if (ng) {
13264			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
13265			gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
13266		}
13267		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
13268	}
13269	rcu_read_unlock();
13270}
13271#endif /* CONFIG_NUMA_BALANCING */
13272#endif /* CONFIG_SCHED_DEBUG */
13273
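/*
 * Early init: allocate the per-CPU cpumasks used by load balancing and
 * wakeup CPU selection, register the SCHED_SOFTIRQ handler and set up the
 * NOHZ idle-balance bookkeeping.
 */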
13274__init void init_sched_fair_class(void)
13275{
13276#ifdef CONFIG_SMP
13277	int i;
13278
13279	for_each_possible_cpu(i) {
13280		zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
13281		zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
13282		zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
13283					GFP_KERNEL, cpu_to_node(i));
13284
13285#ifdef CONFIG_CFS_BANDWIDTH
13286		INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
13287		INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
13288#endif
13289	}
13290
13291	open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);
13292
13293#ifdef CONFIG_NO_HZ_COMMON
13294	nohz.next_balance = jiffies;
13295	nohz.next_blocked = jiffies;
13296	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
13297#endif
13298#endif /* SMP */
13299
13300}
13301