sys_tune.c revision 256281
/*
 * Copyright (c) 2010 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <linux/sched.h>
#include <linux/mutex.h>
#include <asm/atomic.h>

#include "mlx4.h"

#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)

/* Each CPU is put into a group.  In most cases, the group number is
 * equal to the CPU number of one of the CPUs in the group.  The
 * exception is group NR_CPUS, which is the default group.  This is
 * protected by sys_tune_startup_mutex. */
DEFINE_PER_CPU(int, idle_cpu_group) = NR_CPUS;

/* For each group, a count of the number of CPUs in the group which
 * are known to be busy.  A busy CPU might be running the busy loop
 * below or general kernel code.  The count is decremented on entry to
 * the old pm_idle handler and incremented on exit.  The aim is to
 * avoid the count going to zero or negative.  This situation can
 * occur temporarily during module unload or CPU hot-plug, but
 * normality will be restored when the affected CPUs next exit the
 * idle loop. */
static atomic_t busy_cpu_count[NR_CPUS + 1];

/* A workqueue item to be executed to cause the CPU to exit from the
 * idle loop. */
DEFINE_PER_CPU(struct work_struct, sys_tune_cpu_work);

#define sys_tune_set_state(CPU, STATE) \
	do { } while (0)


/* A mutex to protect most of the module data structures. */
static DEFINE_MUTEX(sys_tune_startup_mutex);

/* The old pm_idle handler. */
static void (*old_pm_idle)(void) = NULL;

static void sys_tune_pm_idle(void)
{
	atomic_t *busy_cpus_ptr;
	int busy_cpus;
	int cpu = smp_processor_id();

	busy_cpus_ptr = &(busy_cpu_count[per_cpu(idle_cpu_group, cpu)]);

	sys_tune_set_state(cpu, 2);

	local_irq_enable();
	while (!need_resched()) {
		busy_cpus = atomic_read(busy_cpus_ptr);

		/* If other CPUs in this group are busy then let this
		 * CPU go idle.  We mustn't let the number of busy
		 * CPUs drop below 1. */
		if (busy_cpus > 1 &&
		    old_pm_idle != NULL &&
		    (atomic_cmpxchg(busy_cpus_ptr, busy_cpus,
				    busy_cpus - 1) == busy_cpus)) {
			local_irq_disable();
			sys_tune_set_state(cpu, 3);
			/* This check might not be necessary, but it
			 * seems safest to include it because there
			 * might be a kernel version which requires
			 * it. */
			if (need_resched())
				local_irq_enable();
			else
				old_pm_idle();
			/* This CPU is busy again. */
			sys_tune_set_state(cpu, 1);
			atomic_add(1, busy_cpus_ptr);
			return;
		}

		cpu_relax();
	}
	sys_tune_set_state(cpu, 0);
}


void sys_tune_work_func(struct work_struct *work)
{
	/* Do nothing.  Since this function is running in process
	 * context, the idle thread isn't running on this CPU. */
}


#ifdef CONFIG_SMP
static void sys_tune_smp_call(void *info)
{
	schedule_work(&get_cpu_var(sys_tune_cpu_work));
	put_cpu_var(sys_tune_cpu_work);
}
#endif


#ifdef CONFIG_SMP
static void sys_tune_refresh(void)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
	on_each_cpu(&sys_tune_smp_call, NULL, 0, 1);
#else
	on_each_cpu(&sys_tune_smp_call, NULL, 1);
#endif
}
#else
static void sys_tune_refresh(void)
{
	/* The current thread is executing on the one and only CPU so
	 * the idle thread isn't running. */
}
#endif


static int sys_tune_cpu_group(int cpu)
{
#ifdef CONFIG_SMP
	const cpumask_t *mask;
	int other_cpu;
	int group;

#if defined(topology_thread_cpumask) && defined(ST_HAVE_EXPORTED_CPU_SIBLING_MAP)
	/* Keep one hyperthread busy per core. */
	mask = topology_thread_cpumask(cpu);
#else
	return cpu;
#endif
	/* If a sibling CPU already belongs to a group, join that group. */
	for_each_cpu_mask(other_cpu, *(mask)) {
		group = per_cpu(idle_cpu_group, other_cpu);
		if (group != NR_CPUS)
			return group;
	}
#endif

	return cpu;
}


static void sys_tune_add_cpu(int cpu)
{
	int group;

	/* Do nothing if this CPU has already been added. */
	if (per_cpu(idle_cpu_group, cpu) != NR_CPUS)
		return;

	group = sys_tune_cpu_group(cpu);
	per_cpu(idle_cpu_group, cpu) = group;
	atomic_inc(&(busy_cpu_count[group]));
}

static void sys_tune_del_cpu(int cpu)
{
	int group;

	if (per_cpu(idle_cpu_group, cpu) == NR_CPUS)
		return;

	group = per_cpu(idle_cpu_group, cpu);
	/* If the CPU was busy, this can cause the count to drop to
	 * zero.  To rectify this, we need to cause one of the other
	 * CPUs in the group to exit the idle loop.  If the CPU was
	 * not busy then this causes the contribution for this CPU to
	 * go to -1, which can cause the overall count to drop to zero
	 * or go negative.  To rectify this situation we need to cause
	 * this CPU to exit the idle loop. */
	atomic_dec(&(busy_cpu_count[group]));
	per_cpu(idle_cpu_group, cpu) = NR_CPUS;
}


static int sys_tune_cpu_notify(struct notifier_block *self,
			       unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action) {
#ifdef CPU_ONLINE_FROZEN
	case CPU_ONLINE_FROZEN:
#endif
	case CPU_ONLINE:
		mutex_lock(&sys_tune_startup_mutex);
		sys_tune_add_cpu(cpu);
		mutex_unlock(&sys_tune_startup_mutex);
		/* The CPU might have already entered the idle loop in
		 * the wrong group.  Make sure it exits the idle loop
		 * so that it picks up the correct group. */
		sys_tune_refresh();
		break;

#ifdef CPU_DEAD_FROZEN
	case CPU_DEAD_FROZEN:
#endif
	case CPU_DEAD:
		mutex_lock(&sys_tune_startup_mutex);
		sys_tune_del_cpu(cpu);
		mutex_unlock(&sys_tune_startup_mutex);
		/* The deleted CPU may have been the only busy CPU in
		 * the group.  Make sure one of the other CPUs in the
		 * group exits the idle loop. */
		sys_tune_refresh();
		break;
	}
	return NOTIFY_OK;
}


static struct notifier_block sys_tune_cpu_nb = {
	.notifier_call = sys_tune_cpu_notify,
};


static void sys_tune_ensure_init(void)
{
	BUG_ON(old_pm_idle != NULL);

	/* Atomically update pm_idle to &sys_tune_pm_idle.  The old value
	 * is stored in old_pm_idle before installing the new
	 * handler. */
	do {
		old_pm_idle = pm_idle;
	} while (cmpxchg(&pm_idle, old_pm_idle, &sys_tune_pm_idle) !=
		 old_pm_idle);
}
#endif

void sys_tune_fini(void)
{
#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
	void (*old)(void);
	int cpu;

	unregister_cpu_notifier(&sys_tune_cpu_nb);

	mutex_lock(&sys_tune_startup_mutex);

	/* Restore the old handler, but only if pm_idle still points at
	 * our handler; if somebody else has since replaced it, leave
	 * their handler in place. */
	old = cmpxchg(&pm_idle, &sys_tune_pm_idle, old_pm_idle);

	for_each_online_cpu(cpu)
		sys_tune_del_cpu(cpu);

	mutex_unlock(&sys_tune_startup_mutex);

	/* Our handler may still be executing on other CPUs.
	 * Schedule this thread on all CPUs to make sure all
	 * idle threads get interrupted. */
	sys_tune_refresh();

	/* Make sure the work item has finished executing on all CPUs.
	 * This in turn ensures that all idle threads have been
	 * interrupted. */
	flush_scheduled_work();
#endif /* CONFIG_X86 */
}

void sys_tune_init(void)
{
#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
	int cpu;

	for_each_possible_cpu(cpu) {
		INIT_WORK(&per_cpu(sys_tune_cpu_work, cpu),
			  sys_tune_work_func);
	}

	/* Start by registering the CPU notifier to ensure we don't miss
	 * any updates. */
	register_cpu_notifier(&sys_tune_cpu_nb);

	mutex_lock(&sys_tune_startup_mutex);

	for_each_online_cpu(cpu)
		sys_tune_add_cpu(cpu);

	sys_tune_ensure_init();

	mutex_unlock(&sys_tune_startup_mutex);

	/* Ensure our idle handler starts to run. */
	sys_tune_refresh();
#endif
}
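
For context, a minimal sketch of how a consumer could wire sys_tune_init() and sys_tune_fini() into a module's load and unload paths. The module parameter name (enable_sys_tune), the example_driver_* function names, and the gating around the calls are illustrative assumptions and are not taken from this file; only the two sys_tune entry points above are real.

/* Illustrative sketch only; names marked below are hypothetical. */
#include <linux/init.h>
#include <linux/module.h>

/* Prototypes repeated here so the sketch stands alone. */
extern void sys_tune_init(void);
extern void sys_tune_fini(void);

/* Hypothetical knob: leave the idle-loop tuning off unless requested. */
static int enable_sys_tune = 0;
module_param(enable_sys_tune, int, 0444);
MODULE_PARM_DESC(enable_sys_tune,
		 "Keep one CPU per sibling group out of the idle loop");

static int __init example_driver_init(void)
{
	if (enable_sys_tune)
		sys_tune_init();	/* installs the pm_idle hook and CPU notifier */
	return 0;
}

static void __exit example_driver_exit(void)
{
	if (enable_sys_tune)
		sys_tune_fini();	/* restores pm_idle and drains the work items */
}

module_init(example_driver_init);
module_exit(example_driver_exit);
MODULE_LICENSE("Dual BSD/GPL");

Gating the calls behind a default-off parameter mirrors the cautious design of the code itself: the pm_idle replacement changes system-wide idle behaviour, so a consumer would normally opt in explicitly.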