/* sys_tune.c, revision 255932 */
/*
 * Copyright (c) 2010 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <linux/version.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/pm.h>
#include <asm/atomic.h>

#include "mlx4.h"

#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)

/* Each CPU is put into a group.  In most cases, the group number is
 * equal to the CPU number of one of the CPUs in the group.  The
 * exception is group NR_CPUS which is the default group.  This is
 * protected by sys_tune_startup_mutex. */
DEFINE_PER_CPU(int, idle_cpu_group) = NR_CPUS;

/* For each group, a count of the number of CPUs in the group which
 * are known to be busy.  A busy CPU might be running the busy loop
 * below or general kernel code.  The count is decremented on entry to
 * the old pm_idle handler and incremented on exit.  The aim is to
 * avoid the count going to zero or negative.  This situation can
 * occur temporarily during module unload or CPU hot-plug but
 * normality will be restored when the affected CPUs next exit the
 * idle loop. */
static atomic_t busy_cpu_count[NR_CPUS+1];

/* A workqueue item to be executed to cause the CPU to exit from the
 * idle loop. */
DEFINE_PER_CPU(struct work_struct, sys_tune_cpu_work);

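/* Hook for tracing idle-state transitions (the states 0-3 used
 * below); compiled to a no-op in this build. */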
#define sys_tune_set_state(CPU,STATE) \
	do { } while (0)

/* A mutex to protect most of the module data structures. */
static DEFINE_MUTEX(sys_tune_startup_mutex);

/* The old pm_idle handler. */
static void (*old_pm_idle)(void) = NULL;

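/* Replacement pm_idle handler.  Busy-wait on need_resched() so that
 * this CPU counts as busy.  If another CPU in the group is already
 * busy, give up this CPU's slot in busy_cpu_count and fall back to
 * the old (power-saving) idle handler instead. */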
static void sys_tune_pm_idle(void)
{
	atomic_t *busy_cpus_ptr;
	int busy_cpus;
	int cpu = smp_processor_id();

	busy_cpus_ptr = &(busy_cpu_count[per_cpu(idle_cpu_group, cpu)]);

	sys_tune_set_state(cpu, 2);

	local_irq_enable();
	while (!need_resched()) {
		busy_cpus = atomic_read(busy_cpus_ptr);

		/* If other CPUs in this group are busy then let this
		 * CPU go idle.  We mustn't let the number of busy
		 * CPUs drop below 1. */
		if (busy_cpus > 1 &&
		    old_pm_idle != NULL &&
		    atomic_cmpxchg(busy_cpus_ptr, busy_cpus,
				   busy_cpus - 1) == busy_cpus) {
			local_irq_disable();
			sys_tune_set_state(cpu, 3);
			/* This check might not be necessary, but it
			 * seems safest to include it because there
			 * might be a kernel version which requires
			 * it. */
			if (need_resched())
				local_irq_enable();
			else
				old_pm_idle();
			/* This CPU is busy again. */
			sys_tune_set_state(cpu, 1);
			atomic_add(1, busy_cpus_ptr);
			return;
		}

		cpu_relax();
	}
	sys_tune_set_state(cpu, 0);
}


void sys_tune_work_func(struct work_struct *work)
{
	/* Do nothing.  Since this function is running in process
	 * context, the idle thread isn't running on this CPU. */
}

#ifdef CONFIG_SMP
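/* Runs on each CPU via an IPI from on_each_cpu().  Queueing the
 * per-CPU work item wakes a worker thread, which forces this CPU out
 * of the idle loop. */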
static void sys_tune_smp_call(void *info)
{
	schedule_work(&get_cpu_var(sys_tune_cpu_work));
	put_cpu_var(sys_tune_cpu_work);
}
#endif

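/* Kick every CPU out of the idle loop so that each one re-reads its
 * group state and picks up a newly installed idle handler. */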
#ifdef CONFIG_SMP
static void sys_tune_refresh(void)
{
	/* The "retry" argument was dropped from on_each_cpu() in
	 * kernel 2.6.27. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27)
	on_each_cpu(&sys_tune_smp_call, NULL, 0, 1);
#else
	on_each_cpu(&sys_tune_smp_call, NULL, 1);
#endif
}
#else
static void sys_tune_refresh(void)
{
	/* The current thread is executing on the one and only CPU so
	 * the idle thread isn't running. */
}
#endif
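/* Pick the group for a CPU: reuse the group of a sibling hyperthread
 * if one has already been assigned, otherwise use the CPU's own
 * number. */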
static int sys_tune_cpu_group(int cpu)
{
#ifdef CONFIG_SMP
	const cpumask_t *mask;
	int other_cpu;
	int group;

#if defined(topology_thread_cpumask) && defined(ST_HAVE_EXPORTED_CPU_SIBLING_MAP)
	/* Keep one hyperthread busy per core. */
	mask = topology_thread_cpumask(cpu);
#else
	return cpu;
#endif
	for_each_cpu_mask(other_cpu, *(mask)) {
		group = per_cpu(idle_cpu_group, other_cpu);
		if (group != NR_CPUS)
			return group;
	}
#endif

	return cpu;
}

static void sys_tune_add_cpu(int cpu)
{
	int group;

	/* Do nothing if this CPU has already been added. */
	if (per_cpu(idle_cpu_group, cpu) != NR_CPUS)
		return;

	group = sys_tune_cpu_group(cpu);
	per_cpu(idle_cpu_group, cpu) = group;
	atomic_inc(&(busy_cpu_count[group]));
}

static void sys_tune_del_cpu(int cpu)
{
	int group;

	if (per_cpu(idle_cpu_group, cpu) == NR_CPUS)
		return;

	group = per_cpu(idle_cpu_group, cpu);
	/* If the CPU was busy, this can cause the count to drop to
	 * zero.  To rectify this, we need to cause one of the other
	 * CPUs in the group to exit the idle loop.  If the CPU was
	 * not busy then this causes the contribution for this CPU to
	 * go to -1 which can cause the overall count to drop to zero
	 * or go negative.  To rectify this situation we need to cause
	 * this CPU to exit the idle loop. */
	atomic_dec(&(busy_cpu_count[group]));
	per_cpu(idle_cpu_group, cpu) = NR_CPUS;
}

static int sys_tune_cpu_notify(struct notifier_block *self,
			       unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action) {
#ifdef CPU_ONLINE_FROZEN
	case CPU_ONLINE_FROZEN:
#endif
	case CPU_ONLINE:
		mutex_lock(&sys_tune_startup_mutex);
		sys_tune_add_cpu(cpu);
		mutex_unlock(&sys_tune_startup_mutex);
		/* The CPU might have already entered the idle loop in
		 * the wrong group.  Make sure it exits the idle loop
		 * so that it picks up the correct group. */
		sys_tune_refresh();
		break;

#ifdef CPU_DEAD_FROZEN
	case CPU_DEAD_FROZEN:
#endif
	case CPU_DEAD:
		mutex_lock(&sys_tune_startup_mutex);
		sys_tune_del_cpu(cpu);
		mutex_unlock(&sys_tune_startup_mutex);
		/* The deleted CPU may have been the only busy CPU in
		 * the group.  Make sure one of the other CPUs in the
		 * group exits the idle loop. */
		sys_tune_refresh();
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block sys_tune_cpu_nb = {
	.notifier_call = sys_tune_cpu_notify,
};

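/* Install sys_tune_pm_idle as the system idle handler, saving the
 * previous handler in old_pm_idle. */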
static void sys_tune_ensure_init(void)
{
	BUG_ON(old_pm_idle != NULL);

	/* Atomically update pm_idle to &sys_tune_pm_idle.  The old
	 * value is stored in old_pm_idle before installing the new
	 * handler. */
	do {
		old_pm_idle = pm_idle;
	} while (cmpxchg(&pm_idle, old_pm_idle, &sys_tune_pm_idle) !=
		 old_pm_idle);
}
#endif

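/* Module teardown: restore the old idle handler and make sure no CPU
 * is still running ours before returning. */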
void sys_tune_fini(void)
{
#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
	void (*old)(void);
	int cpu;

	unregister_cpu_notifier(&sys_tune_cpu_nb);

	mutex_lock(&sys_tune_startup_mutex);

	/* Restore the old handler.  If pm_idle no longer points at
	 * our handler, leave whatever is installed in place. */
	old = cmpxchg(&pm_idle, &sys_tune_pm_idle, old_pm_idle);

	for_each_online_cpu(cpu)
		sys_tune_del_cpu(cpu);

	mutex_unlock(&sys_tune_startup_mutex);

	/* Our handler may still be executing on other CPUs.
	 * Schedule this thread on all CPUs to make sure all
	 * idle threads get interrupted. */
	sys_tune_refresh();

	/* Make sure the work item has finished executing on all CPUs.
	 * This in turn ensures that all idle threads have been
	 * interrupted. */
	flush_scheduled_work();
#endif /* CONFIG_X86 && CONFIG_APM_MODULE */
}

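/* Module init: put every online CPU in a group and install the
 * busy-polling idle handler. */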
void sys_tune_init(void)
{
#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
	int cpu;

	for_each_possible_cpu(cpu) {
		INIT_WORK(&per_cpu(sys_tune_cpu_work, cpu),
			  sys_tune_work_func);
	}

	/* Start by registering the handler to ensure we don't miss
	 * any updates. */
	register_cpu_notifier(&sys_tune_cpu_nb);

	mutex_lock(&sys_tune_startup_mutex);

	for_each_online_cpu(cpu)
		sys_tune_add_cpu(cpu);

	sys_tune_ensure_init();

	mutex_unlock(&sys_tune_startup_mutex);

	/* Ensure our idle handler starts to run. */
	sys_tune_refresh();
#endif
}