/* sys_tune.c revision 255932 */
11541Srgrimes/* 21541Srgrimes * Copyright (c) 2010 Mellanox Technologies. All rights reserved. 31541Srgrimes * 41541Srgrimes * This software is available to you under a choice of one of two 51541Srgrimes * licenses. You may choose to be licensed under the terms of the GNU 61541Srgrimes * General Public License (GPL) Version 2, available from the file 71541Srgrimes * COPYING in the main directory of this source tree, or the 81541Srgrimes * OpenIB.org BSD license below: 91541Srgrimes * 101541Srgrimes * Redistribution and use in source and binary forms, with or 111541Srgrimes * without modification, are permitted provided that the following 121541Srgrimes * conditions are met: 131541Srgrimes * 141541Srgrimes * - Redistributions of source code must retain the above 151541Srgrimes * copyright notice, this list of conditions and the following 161541Srgrimes * disclaimer. 171541Srgrimes * 181541Srgrimes * - Redistributions in binary form must reproduce the above 191541Srgrimes * copyright notice, this list of conditions and the following 201541Srgrimes * disclaimer in the documentation and/or other materials 211541Srgrimes * provided with the distribution. 221541Srgrimes * 231541Srgrimes * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 241541Srgrimes * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 251541Srgrimes * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 261541Srgrimes * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 271541Srgrimes * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 281541Srgrimes * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 291541Srgrimes * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 301541Srgrimes * SOFTWARE. 
311541Srgrimes * 321541Srgrimes */ 331541Srgrimes 348876Srgrimes#include <linux/sched.h> 351541Srgrimes#include <linux/mutex.h> 361541Srgrimes#include <asm/atomic.h> 371541Srgrimes 381549Srgrimes#include "mlx4.h" 391541Srgrimes 401541Srgrimes#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE) 411541Srgrimes 421541Srgrimes 431541Srgrimes 447280Swollman/* Each CPU is put into a group. In most cases, the group number is 451541Srgrimes * equal to the CPU number of one of the CPUs in the group. The 461541Srgrimes * exception is group NR_CPUS which is the default group. This is 471541Srgrimes * protected by sys_tune_startup_mutex. */ 481541SrgrimesDEFINE_PER_CPU(int, idle_cpu_group) = NR_CPUS; 491541Srgrimes 501541Srgrimes/* For each group, a count of the number of CPUs in the group which 511541Srgrimes * are known to be busy. A busy CPU might be running the busy loop 521541Srgrimes * below or general kernel code. The count is decremented on entry to 531541Srgrimes * the old pm_idle handler and incremented on exit. The aim is to 546363Sphk * avoid the count going to zero or negative. This situation can 556363Sphk * occur temporarily during module unload or CPU hot-plug but 561541Srgrimes * normality will be restored when the affected CPUs next exit the 577280Swollman * idle loop. */ 587280Swollmanstatic atomic_t busy_cpu_count[NR_CPUS+1]; 597280Swollman 607280Swollman/* A workqueue item to be executed to cause the CPU to exit from the 617280Swollman * idle loop. */ 627280SwollmanDEFINE_PER_CPU(struct work_struct, sys_tune_cpu_work); 637280Swollman 647280Swollman#define sys_tune_set_state(CPU,STATE) \ 657280Swollman do { } while(0) 667280Swollman 677280Swollman 687280Swollman/* A mutex to protect most of the module datastructures. */ 691541Srgrimesstatic DEFINE_MUTEX(sys_tune_startup_mutex); 701541Srgrimes 711541Srgrimes/* The old pm_idle handler. 
*/ 721541Srgrimesstatic void (*old_pm_idle)(void) = NULL; 731541Srgrimes 741541Srgrimesstatic void sys_tune_pm_idle(void) 751541Srgrimes{ 761541Srgrimes atomic_t *busy_cpus_ptr; 771541Srgrimes int busy_cpus; 781541Srgrimes int cpu = smp_processor_id(); 791541Srgrimes 801541Srgrimes busy_cpus_ptr = &(busy_cpu_count[per_cpu(idle_cpu_group, cpu)]); 811541Srgrimes 821541Srgrimes sys_tune_set_state(cpu, 2); 831541Srgrimes 841541Srgrimes local_irq_enable(); 851541Srgrimes while (!need_resched()) { 861541Srgrimes busy_cpus = atomic_read(busy_cpus_ptr); 871541Srgrimes 881541Srgrimes /* If other CPUs in this group are busy then let this 891541Srgrimes * CPU go idle. We mustn't let the number of busy 901541Srgrimes * CPUs drop below 1. */ 911541Srgrimes if ( busy_cpus > 1 && 921541Srgrimes old_pm_idle != NULL && 931541Srgrimes ( atomic_cmpxchg(busy_cpus_ptr, busy_cpus, 941541Srgrimes busy_cpus-1) == busy_cpus ) ) { 951541Srgrimes local_irq_disable(); 961541Srgrimes sys_tune_set_state(cpu, 3); 971541Srgrimes /* This check might not be necessary, but it 981541Srgrimes * seems safest to include it because there 991541Srgrimes * might be a kernel version which requires 1001541Srgrimes * it. */ 1011541Srgrimes if (need_resched()) 1021541Srgrimes local_irq_enable(); 1031541Srgrimes else 1041541Srgrimes old_pm_idle(); 1051541Srgrimes /* This CPU is busy again. */ 1061541Srgrimes sys_tune_set_state(cpu, 1); 1071541Srgrimes atomic_add(1, busy_cpus_ptr); 1081541Srgrimes return; 1091541Srgrimes } 1101549Srgrimes 1111541Srgrimes cpu_relax(); 1121541Srgrimes } 1131541Srgrimes sys_tune_set_state(cpu, 0); 1141541Srgrimes} 1151541Srgrimes 1161541Srgrimes 1171541Srgrimesvoid sys_tune_work_func(struct work_struct *work) 1181541Srgrimes{ 1191541Srgrimes /* Do nothing. Since this function is running in process 1201541Srgrimes * context, the idle thread isn't running on this CPU. 
*/ 1211541Srgrimes} 1221541Srgrimes 1231541Srgrimes 1241541Srgrimes#ifdef CONFIG_SMP 1251541Srgrimesstatic void sys_tune_smp_call(void *info) 1261541Srgrimes{ 1271541Srgrimes schedule_work(&get_cpu_var(sys_tune_cpu_work)); 1281541Srgrimes put_cpu_var(sys_tune_cpu_work); 1291541Srgrimes} 1301541Srgrimes#endif 1311541Srgrimes 1321541Srgrimes 1331541Srgrimes#ifdef CONFIG_SMP 1341549Srgrimesstatic void sys_tune_refresh(void) 1351541Srgrimes{ 1361541Srgrimes#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) 1371541Srgrimes on_each_cpu(&sys_tune_smp_call, NULL, 0, 1); 1381541Srgrimes#else 1391541Srgrimes on_each_cpu(&sys_tune_smp_call, NULL, 1); 1401541Srgrimes#endif 1411541Srgrimes} 1421541Srgrimes#else 1431541Srgrimesstatic void sys_tune_refresh(void) 1441541Srgrimes{ 1451541Srgrimes /* The current thread is executing on the one and only CPU so 1461541Srgrimes * the idle thread isn't running. */ 1471541Srgrimes} 1481541Srgrimes#endif 1491541Srgrimes 1501541Srgrimes 1511541Srgrimes 1521541Srgrimesstatic int sys_tune_cpu_group(int cpu) 1531541Srgrimes{ 1541541Srgrimes#ifdef CONFIG_SMP 1551541Srgrimes const cpumask_t *mask; 1561541Srgrimes int other_cpu; 1571541Srgrimes int group; 1581541Srgrimes 1591541Srgrimes#if defined(topology_thread_cpumask) && defined(ST_HAVE_EXPORTED_CPU_SIBLING_MAP) 1601541Srgrimes /* Keep one hyperthread busy per core. */ 1611541Srgrimes mask = topology_thread_cpumask(cpu); 1624127Swollman#else 1631541Srgrimes return cpu; 1641541Srgrimes#endif 1651541Srgrimes for_each_cpu_mask(cpu, *(mask)) { 1661541Srgrimes group = per_cpu(idle_cpu_group, other_cpu); 1671541Srgrimes if (group != NR_CPUS) 1681541Srgrimes return group; 1691541Srgrimes } 1701541Srgrimes#endif 1711541Srgrimes 1721541Srgrimes return cpu; 1731541Srgrimes} 1741541Srgrimes 1751541Srgrimes 1761549Srgrimesstatic void sys_tune_add_cpu(int cpu) 1771541Srgrimes{ 1781541Srgrimes int group; 1791541Srgrimes 1801541Srgrimes /* Do nothing if this CPU has already been added. 
*/ 1811541Srgrimes if (per_cpu(idle_cpu_group, cpu) != NR_CPUS) 1821541Srgrimes return; 1831541Srgrimes 1841541Srgrimes group = sys_tune_cpu_group(cpu); 1851541Srgrimes per_cpu(idle_cpu_group, cpu) = group; 1861541Srgrimes atomic_inc(&(busy_cpu_count[group])); 1871541Srgrimes 1881541Srgrimes} 1891541Srgrimes 1901541Srgrimesstatic void sys_tune_del_cpu(int cpu) 1917280Swollman{ 1921541Srgrimes 1931541Srgrimes int group; 1941541Srgrimes 1951541Srgrimes if (per_cpu(idle_cpu_group, cpu) == NR_CPUS) 1961541Srgrimes return; 1971541Srgrimes 1981541Srgrimes group = per_cpu(idle_cpu_group, cpu); 1991541Srgrimes /* If the CPU was busy, this can cause the count to drop to 2001541Srgrimes * zero. To rectify this, we need to cause one of the other 2011541Srgrimes * CPUs in the group to exit the idle loop. If the CPU was 2021541Srgrimes * not busy then this causes the contribution for this CPU to 2031541Srgrimes * go to -1 which can cause the overall count to drop to zero 2041541Srgrimes * or go negative. To rectify this situation we need to cause 2058071Swollman * this CPU to exit the idle loop. */ 2068071Swollman atomic_dec(&(busy_cpu_count[group])); 2078071Swollman per_cpu(idle_cpu_group, cpu) = NR_CPUS; 2088071Swollman 2098071Swollman} 2108071Swollman 2118071Swollman 2128876Srgrimesstatic int sys_tune_cpu_notify(struct notifier_block *self, 2138071Swollman unsigned long action, void *hcpu) 2148071Swollman{ 2158071Swollman int cpu = (long)hcpu; 2168071Swollman 2178071Swollman switch(action) { 2181541Srgrimes#ifdef CPU_ONLINE_FROZEN 2191541Srgrimes case CPU_ONLINE_FROZEN: 2201541Srgrimes#endif 2211541Srgrimes case CPU_ONLINE: 2221541Srgrimes mutex_lock(&sys_tune_startup_mutex); 2231541Srgrimes sys_tune_add_cpu(cpu); 2241541Srgrimes mutex_unlock(&sys_tune_startup_mutex); 2251541Srgrimes /* The CPU might have already entered the idle loop in 2261541Srgrimes * the wrong group. Make sure it exits the idle loop 2271541Srgrimes * so that it picks up the correct group. 
*/ 2281541Srgrimes sys_tune_refresh(); 2291541Srgrimes break; 2301541Srgrimes 2311541Srgrimes#ifdef CPU_DEAD_FROZEN 2321541Srgrimes case CPU_DEAD_FROZEN: 2331541Srgrimes#endif 2341541Srgrimes case CPU_DEAD: 2351541Srgrimes mutex_lock(&sys_tune_startup_mutex); 2363311Sphk sys_tune_del_cpu(cpu); 2373311Sphk mutex_unlock(&sys_tune_startup_mutex); 2381541Srgrimes /* The deleted CPU may have been the only busy CPU in 2391541Srgrimes * the group. Make sure one of the other CPUs in the 2401541Srgrimes * group exits the idle loop. */ 2411541Srgrimes sys_tune_refresh(); 2421541Srgrimes break; 2431541Srgrimes } 2443311Sphk return NOTIFY_OK; 2453311Sphk} 2461541Srgrimes 2471541Srgrimes 2481541Srgrimesstatic struct notifier_block sys_tune_cpu_nb = { 2491541Srgrimes .notifier_call = sys_tune_cpu_notify, 2501541Srgrimes}; 2511541Srgrimes 2521541Srgrimes 2531541Srgrimesstatic void sys_tune_ensure_init(void) 2541541Srgrimes{ 2551541Srgrimes BUG_ON (old_pm_idle != NULL); 2561541Srgrimes 2571541Srgrimes /* Atomically update pm_idle to &sys_tune_pm_idle. The old value 2581541Srgrimes * is stored in old_pm_idle before installing the new 2591541Srgrimes * handler. 
*/ 2601541Srgrimes do { 2611541Srgrimes old_pm_idle = pm_idle; 2628090Spst } while (cmpxchg(&pm_idle, old_pm_idle, &sys_tune_pm_idle) != 2631541Srgrimes old_pm_idle); 2641541Srgrimes} 2651541Srgrimes#endif 2661541Srgrimes 2671541Srgrimesvoid sys_tune_fini(void) 2681541Srgrimes{ 2691541Srgrimes#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE) 2701541Srgrimes void (*old)(void); 2711541Srgrimes int cpu; 2721541Srgrimes 2731541Srgrimes unregister_cpu_notifier(&sys_tune_cpu_nb); 2741541Srgrimes 2751541Srgrimes mutex_lock(&sys_tune_startup_mutex); 2761541Srgrimes 2771541Srgrimes 2781541Srgrimes old = cmpxchg(&pm_idle, &sys_tune_pm_idle, old_pm_idle); 2791541Srgrimes 2801541Srgrimes for_each_online_cpu(cpu) 2811541Srgrimes sys_tune_del_cpu(cpu); 2821541Srgrimes 2831541Srgrimes mutex_unlock(&sys_tune_startup_mutex); 2841541Srgrimes 2851541Srgrimes /* Our handler may still be executing on other CPUs. 2861541Srgrimes * Schedule this thread on all CPUs to make sure all 2871541Srgrimes * idle threads get interrupted. */ 2881541Srgrimes sys_tune_refresh(); 2891541Srgrimes 2901541Srgrimes /* Make sure the work item has finished executing on all CPUs. 2911541Srgrimes * This in turn ensures that all idle threads have been 2921541Srgrimes * interrupted. */ 2931541Srgrimes flush_scheduled_work(); 2941541Srgrimes#endif /* CONFIG_X86 */ 2951541Srgrimes} 2961541Srgrimes 2971541Srgrimesvoid sys_tune_init(void) 2981541Srgrimes{ 2991541Srgrimes#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE) 3001541Srgrimes int cpu; 3011541Srgrimes 3021541Srgrimes for_each_possible_cpu(cpu) { 3031541Srgrimes INIT_WORK(&per_cpu(sys_tune_cpu_work, cpu), 3041541Srgrimes sys_tune_work_func); 3051541Srgrimes } 3061541Srgrimes 3071541Srgrimes /* Start by registering the handler to ensure we don't miss 3081541Srgrimes * any updates. 
*/ 3091541Srgrimes register_cpu_notifier(&sys_tune_cpu_nb); 3101541Srgrimes 3111541Srgrimes mutex_lock(&sys_tune_startup_mutex); 3121541Srgrimes 3131541Srgrimes for_each_online_cpu(cpu) 3141541Srgrimes sys_tune_add_cpu(cpu); 3151541Srgrimes 3161541Srgrimes sys_tune_ensure_init(); 3171541Srgrimes 3181541Srgrimes 3191541Srgrimes mutex_unlock(&sys_tune_startup_mutex); 3201541Srgrimes 3211541Srgrimes /* Ensure our idle handler starts to run. */ 3221541Srgrimes sys_tune_refresh(); 3231541Srgrimes#endif 3241541Srgrimes} 3251541Srgrimes 3261541Srgrimes