/* sys_tune.c revision 256281 */
/*
 * Copyright (c) 2010 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <linux/sched.h>
#include <linux/mutex.h>
#include <asm/atomic.h>

#include "mlx4.h"

#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)

/* Each CPU is put into a group.  In most cases, the group number is
 * equal to the CPU number of one of the CPUs in the group.  The
 * exception is group NR_CPUS which is the default group.  This is
 * protected by sys_tune_startup_mutex. */
DEFINE_PER_CPU(int, idle_cpu_group) = NR_CPUS;

/* For each group, a count of the number of CPUs in the group which
 * are known to be busy.  A busy CPU might be running the busy loop
 * below or general kernel code.  The count is decremented on entry to
 * the old pm_idle handler and incremented on exit.  The aim is to
 * avoid the count going to zero or negative.  This situation can
 * occur temporarily during module unload or CPU hot-plug but
 * normality will be restored when the affected CPUs next exit the
 * idle loop. */
static atomic_t busy_cpu_count[NR_CPUS+1];

/* A workqueue item to be executed to cause the CPU to exit from the
 * idle loop. */
DEFINE_PER_CPU(struct work_struct, sys_tune_cpu_work);

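/* Placeholder for recording per-CPU idle-state transitions; it expands
 * to a no-op in this build. */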
#define sys_tune_set_state(CPU,STATE) \
	do { } while(0)


/* A mutex to protect most of the module data structures. */
static DEFINE_MUTEX(sys_tune_startup_mutex);

/* The old pm_idle handler. */
static void (*old_pm_idle)(void) = NULL;

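/* Replacement pm_idle handler.  Busy-wait while this CPU is needed to
 * keep its group non-idle; once enough other CPUs in the group are
 * busy, hand over to the original handler so this CPU can truly idle. */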
static void sys_tune_pm_idle(void)
{
	atomic_t *busy_cpus_ptr;
	int busy_cpus;
	int cpu = smp_processor_id();

	busy_cpus_ptr = &(busy_cpu_count[per_cpu(idle_cpu_group, cpu)]);

	sys_tune_set_state(cpu, 2);

	local_irq_enable();
	while (!need_resched()) {
		busy_cpus = atomic_read(busy_cpus_ptr);

		/* If other CPUs in this group are busy then let this
		 * CPU go idle.  We mustn't let the number of busy
		 * CPUs drop below 1. */
		if ( busy_cpus > 1 &&
		     old_pm_idle != NULL &&
		     ( atomic_cmpxchg(busy_cpus_ptr, busy_cpus,
				      busy_cpus-1) == busy_cpus ) ) {
			local_irq_disable();
			sys_tune_set_state(cpu, 3);
			/* This check might not be necessary, but it
			 * seems safest to include it because there
			 * might be a kernel version which requires
			 * it. */
			if (need_resched())
				local_irq_enable();
			else
				old_pm_idle();
			/* This CPU is busy again. */
			sys_tune_set_state(cpu, 1);
			atomic_add(1, busy_cpus_ptr);
			return;
		}

		cpu_relax();
	}
	sys_tune_set_state(cpu, 0);
}


void sys_tune_work_func(struct work_struct *work)
{
	/* Do nothing.  Since this function is running in process
	 * context, the idle thread isn't running on this CPU. */
}


#ifdef CONFIG_SMP
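/* IPI callback: queue the per-CPU no-op work item so that the target
 * CPU drops out of the idle loop to run it. */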
static void sys_tune_smp_call(void *info)
{
	schedule_work(&get_cpu_var(sys_tune_cpu_work));
	put_cpu_var(sys_tune_cpu_work);
}
#endif


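/* Make every CPU leave the idle loop so that it re-reads its group
 * assignment and busy count. */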
#ifdef CONFIG_SMP
static void sys_tune_refresh(void)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
	on_each_cpu(&sys_tune_smp_call, NULL, 0, 1);
#else
	on_each_cpu(&sys_tune_smp_call, NULL, 1);
#endif
}
#else
static void sys_tune_refresh(void)
{
	/* The current thread is executing on the one and only CPU so
	 * the idle thread isn't running. */
}
#endif


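/* Return the group for a CPU: reuse the group already assigned to one
 * of its hyperthread siblings if there is one, otherwise use the CPU's
 * own number. */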
static int sys_tune_cpu_group(int cpu)
{
#ifdef CONFIG_SMP
	const cpumask_t *mask;
	int other_cpu;
	int group;

#if defined(topology_thread_cpumask) && defined(ST_HAVE_EXPORTED_CPU_SIBLING_MAP)
	/* Keep one hyperthread busy per core. */
	mask = topology_thread_cpumask(cpu);
#else
	return cpu;
#endif
	/* Iterate over the siblings, not over "cpu" itself, so that
	 * "cpu" is preserved for the fallback return below. */
	for_each_cpu_mask(other_cpu, *(mask)) {
		group = per_cpu(idle_cpu_group, other_cpu);
		if (group != NR_CPUS)
			return group;
	}
#endif

	return cpu;
}


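/* Assign a CPU to a group and account for it as busy. */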
static void sys_tune_add_cpu(int cpu)
{
	int group;

	/* Do nothing if this CPU has already been added. */
	if (per_cpu(idle_cpu_group, cpu) != NR_CPUS)
		return;

	group = sys_tune_cpu_group(cpu);
	per_cpu(idle_cpu_group, cpu) = group;
	atomic_inc(&(busy_cpu_count[group]));
}

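/* Remove a CPU from its group and return it to the default group. */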
static void sys_tune_del_cpu(int cpu)
{
	int group;

	if (per_cpu(idle_cpu_group, cpu) == NR_CPUS)
		return;

	group = per_cpu(idle_cpu_group, cpu);
	/* If the CPU was busy, this can cause the count to drop to
	 * zero.  To rectify this, we need to cause one of the other
	 * CPUs in the group to exit the idle loop.  If the CPU was
	 * not busy then this causes the contribution for this CPU to
	 * go to -1 which can cause the overall count to drop to zero
	 * or go negative.  To rectify this situation we need to cause
	 * this CPU to exit the idle loop. */
	atomic_dec(&(busy_cpu_count[group]));
	per_cpu(idle_cpu_group, cpu) = NR_CPUS;
}


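/* CPU hotplug notifier: keep the group assignments and busy counts in
 * step as CPUs come online or go offline. */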
static int sys_tune_cpu_notify(struct notifier_block *self,
			       unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action) {
#ifdef CPU_ONLINE_FROZEN
	case CPU_ONLINE_FROZEN:
#endif
	case CPU_ONLINE:
		mutex_lock(&sys_tune_startup_mutex);
		sys_tune_add_cpu(cpu);
		mutex_unlock(&sys_tune_startup_mutex);
		/* The CPU might have already entered the idle loop in
		 * the wrong group.  Make sure it exits the idle loop
		 * so that it picks up the correct group. */
		sys_tune_refresh();
		break;

#ifdef CPU_DEAD_FROZEN
	case CPU_DEAD_FROZEN:
#endif
	case CPU_DEAD:
		mutex_lock(&sys_tune_startup_mutex);
		sys_tune_del_cpu(cpu);
		mutex_unlock(&sys_tune_startup_mutex);
		/* The deleted CPU may have been the only busy CPU in
		 * the group.  Make sure one of the other CPUs in the
		 * group exits the idle loop. */
		sys_tune_refresh();
		break;
	}
	return NOTIFY_OK;
}


static struct notifier_block sys_tune_cpu_nb = {
	.notifier_call = sys_tune_cpu_notify,
};


static void sys_tune_ensure_init(void)
{
	BUG_ON(old_pm_idle != NULL);

	/* Atomically update pm_idle to &sys_tune_pm_idle.  The old value
	 * is stored in old_pm_idle before installing the new
	 * handler. */
	do {
		old_pm_idle = pm_idle;
	} while (cmpxchg(&pm_idle, old_pm_idle, &sys_tune_pm_idle) !=
		 old_pm_idle);
}
#endif

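/* Module teardown: restore the original pm_idle handler and make sure
 * no CPU is still running ours. */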
void sys_tune_fini(void)
{
#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
	void (*old)(void);
	int cpu;

	unregister_cpu_notifier(&sys_tune_cpu_nb);

	mutex_lock(&sys_tune_startup_mutex);

	old = cmpxchg(&pm_idle, &sys_tune_pm_idle, old_pm_idle);

	for_each_online_cpu(cpu)
		sys_tune_del_cpu(cpu);

	mutex_unlock(&sys_tune_startup_mutex);

	/* Our handler may still be executing on other CPUs.
	 * Schedule this thread on all CPUs to make sure all
	 * idle threads get interrupted. */
	sys_tune_refresh();

	/* Make sure the work item has finished executing on all CPUs.
	 * This in turn ensures that all idle threads have been
	 * interrupted. */
	flush_scheduled_work();
#endif /* CONFIG_X86 */
}

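/* Module init: set up the per-CPU work items, register for hotplug
 * notifications, assign each online CPU to a group, and install the
 * idle handler. */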
void sys_tune_init(void)
{
#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
	int cpu;

	for_each_possible_cpu(cpu) {
		INIT_WORK(&per_cpu(sys_tune_cpu_work, cpu),
			  sys_tune_work_func);
	}

	/* Start by registering the handler to ensure we don't miss
	 * any updates. */
	register_cpu_notifier(&sys_tune_cpu_nb);

	mutex_lock(&sys_tune_startup_mutex);

	for_each_online_cpu(cpu)
		sys_tune_add_cpu(cpu);

	sys_tune_ensure_init();

	mutex_unlock(&sys_tune_startup_mutex);

	/* Ensure our idle handler starts to run. */
	sys_tune_refresh();
#endif
}