kern_clock.c revision 113355
1169689Skan/*- 2169689Skan * Copyright (c) 1982, 1986, 1991, 1993 3169689Skan * The Regents of the University of California. All rights reserved. 4169689Skan * (c) UNIX System Laboratories, Inc. 5169689Skan * All or some portions of this file are derived from material licensed 6169689Skan * to the University of California by American Telephone and Telegraph 7169689Skan * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8169689Skan * the permission of UNIX System Laboratories, Inc. 9169689Skan * 10169689Skan * Redistribution and use in source and binary forms, with or without 11169689Skan * modification, are permitted provided that the following conditions 12169689Skan * are met: 13169689Skan * 1. Redistributions of source code must retain the above copyright 14169689Skan * notice, this list of conditions and the following disclaimer. 15169689Skan * 2. Redistributions in binary form must reproduce the above copyright 16169689Skan * notice, this list of conditions and the following disclaimer in the 17169689Skan * documentation and/or other materials provided with the distribution. 18169689Skan * 3. All advertising materials mentioning features or use of this software 19169689Skan * must display the following acknowledgement: 20169689Skan * This product includes software developed by the University of 21169689Skan * California, Berkeley and its contributors. 22169689Skan * 4. Neither the name of the University nor the names of its contributors 23169689Skan * may be used to endorse or promote products derived from this software 24169689Skan * without specific prior written permission. 25169689Skan * 26169689Skan * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27169689Skan * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28169689Skan * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29169689Skan * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30169689Skan * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31169689Skan * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32169689Skan * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33169689Skan * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34169689Skan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35169689Skan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36169689Skan * SUCH DAMAGE. 37169689Skan * 38169689Skan * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 39169689Skan * $FreeBSD: head/sys/kern/kern_clock.c 113355 2003-04-11 03:39:07Z jeff $ 40169689Skan */ 41169689Skan 42169689Skan#include "opt_ntp.h" 43169689Skan 44169689Skan#include <sys/param.h> 45169689Skan#include <sys/systm.h> 46169689Skan#include <sys/callout.h> 47169689Skan#include <sys/kernel.h> 48169689Skan#include <sys/lock.h> 49169689Skan#include <sys/ktr.h> 50169689Skan#include <sys/mutex.h> 51169689Skan#include <sys/proc.h> 52169689Skan#include <sys/resource.h> 53169689Skan#include <sys/resourcevar.h> 54169689Skan#include <sys/sched.h> 55169689Skan#include <sys/signalvar.h> 56169689Skan#include <sys/smp.h> 57169689Skan#include <vm/vm.h> 58169689Skan#include <vm/pmap.h> 59169689Skan#include <vm/vm_map.h> 60169689Skan#include <sys/sysctl.h> 61169689Skan#include <sys/bus.h> 62169689Skan#include <sys/interrupt.h> 63169689Skan#include <sys/timetc.h> 64169689Skan 65169689Skan#include <machine/cpu.h> 66169689Skan#include <machine/limits.h> 67169689Skan 68169689Skan#ifdef GPROF 69169689Skan#include <sys/gmon.h> 70169689Skan#endif 71169689Skan 72169689Skan#ifdef DEVICE_POLLING 73169689Skanextern void hardclock_device_poll(void); 74169689Skan#endif /* DEVICE_POLLING */ 75169689Skan 76169689Skanstatic void initclocks(void *dummy); 77169689SkanSYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) 78169689Skan 79169689Skan/* Some of these don't belong here, but it's easiest to concentrate them. */ 80169689Skanlong cp_time[CPUSTATES]; 81169689Skan 82169689SkanSYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time), 83169689Skan "LU", "CPU time statistics"); 84169689Skan 85169689Skan/* 86169689Skan * Clock handling routines. 87169689Skan * 88169689Skan * This code is written to operate with two timers that run independently of 89169689Skan * each other. 90169689Skan * 91169689Skan * The main timer, running hz times per second, is used to trigger interval 92169689Skan * timers, timeouts and rescheduling as needed. 93169689Skan * 94169689Skan * The second timer handles kernel and user profiling, 95169689Skan * and does resource use estimation. If the second timer is programmable, 96169689Skan * it is randomized to avoid aliasing between the two clocks. For example, 97169689Skan * the randomization prevents an adversary from always giving up the cpu 98169689Skan * just before its quantum expires. Otherwise, it would never accumulate 99169689Skan * cpu ticks. The mean frequency of the second timer is stathz. 100169689Skan * 101169689Skan * If no second timer exists, stathz will be zero; in this case we drive 102169689Skan * profiling and statistics off the main clock. This WILL NOT be accurate; 103169689Skan * do not do it unless absolutely necessary. 104169689Skan * 105169689Skan * The statistics clock may (or may not) be run at a higher rate while 106169689Skan * profiling. This profile clock runs at profhz. We require that profhz 107169689Skan * be an integral multiple of stathz. 108169689Skan * 109169689Skan * If the statistics clock is running fast, it must be divided by the ratio 110169689Skan * profhz/stathz for statistics. (For profiling, every tick counts.) 111169689Skan * 112169689Skan * Time-of-day is maintained using a "timecounter", which may or may 113169689Skan * not be related to the hardware generating the above mentioned 114169689Skan * interrupts. 115169689Skan */ 116169689Skan 117169689Skanint stathz; 118169689Skanint profhz; 119169689Skanint profprocs; 120169689Skanint ticks; 121169689Skanint psratio; 122169689Skan 123169689Skan/* 124169689Skan * Initialize clock frequencies and start both clocks running. 125169689Skan */ 126169689Skan/* ARGSUSED*/ 127169689Skanstatic void 128169689Skaninitclocks(dummy) 129169689Skan void *dummy; 130169689Skan{ 131169689Skan register int i; 132169689Skan 133169689Skan /* 134169689Skan * Set divisors to 1 (normal case) and let the machine-specific 135169689Skan * code do its bit. 136169689Skan */ 137169689Skan cpu_initclocks(); 138169689Skan 139169689Skan /* 140169689Skan * Compute profhz/stathz, and fix profhz if needed. 141169689Skan */ 142169689Skan i = stathz ? stathz : hz; 143169689Skan if (profhz == 0) 144169689Skan profhz = i; 145169689Skan psratio = profhz / i; 146169689Skan} 147169689Skan 148169689Skan/* 149169689Skan * Each time the real-time timer fires, this function is called on all CPUs. 150169689Skan * Note that hardclock() calls hardclock_process() for the boot CPU, so only 151169689Skan * the other CPUs in the system need to call this function. 152169689Skan */ 153169689Skanvoid 154169689Skanhardclock_process(frame) 155169689Skan register struct clockframe *frame; 156169689Skan{ 157169689Skan struct pstats *pstats; 158169689Skan struct thread *td = curthread; 159169689Skan struct proc *p = td->td_proc; 160169689Skan 161169689Skan /* 162169689Skan * Run current process's virtual and profile time, as needed. 163169689Skan */ 164169689Skan mtx_lock_spin_flags(&sched_lock, MTX_QUIET); 165169689Skan if (p->p_flag & P_THREADED) { 166169689Skan /* XXXKSE What to do? */ 167169689Skan } else { 168169689Skan pstats = p->p_stats; 169169689Skan if (CLKF_USERMODE(frame) && 170169689Skan timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && 171169689Skan itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { 172169689Skan p->p_sflag |= PS_ALRMPEND; 173169689Skan td->td_flags |= TDF_ASTPENDING; 174169689Skan } 175169689Skan if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && 176169689Skan itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { 177169689Skan p->p_sflag |= PS_PROFPEND; 178169689Skan td->td_flags |= TDF_ASTPENDING; 179169689Skan } 180169689Skan } 181169689Skan mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); 182169689Skan} 183169689Skan 184169689Skan/* 185169689Skan * The real-time timer, interrupting hz times per second. 186169689Skan */ 187169689Skanvoid 188169689Skanhardclock(frame) 189169689Skan register struct clockframe *frame; 190169689Skan{ 191169689Skan int need_softclock = 0; 192169689Skan 193169689Skan CTR0(KTR_CLK, "hardclock fired"); 194169689Skan hardclock_process(frame); 195169689Skan 196169689Skan tc_ticktock(); 197169689Skan /* 198169689Skan * If no separate statistics clock is available, run it from here. 199169689Skan * 200169689Skan * XXX: this only works for UP 201169689Skan */ 202169689Skan if (stathz == 0) { 203169689Skan profclock(frame); 204169689Skan statclock(frame); 205169689Skan } 206169689Skan 207169689Skan#ifdef DEVICE_POLLING 208169689Skan hardclock_device_poll(); /* this is very short and quick */ 209169689Skan#endif /* DEVICE_POLLING */ 210169689Skan 211169689Skan /* 212169689Skan * Process callouts at a very low cpu priority, so we don't keep the 213169689Skan * relatively high clock interrupt priority any longer than necessary. 214169689Skan */ 215169689Skan mtx_lock_spin_flags(&callout_lock, MTX_QUIET); 216169689Skan ticks++; 217169689Skan if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { 218169689Skan need_softclock = 1; 219169689Skan } else if (softticks + 1 == ticks) 220169689Skan ++softticks; 221169689Skan mtx_unlock_spin_flags(&callout_lock, MTX_QUIET); 222169689Skan 223169689Skan /* 224169689Skan * swi_sched acquires sched_lock, so we don't want to call it with 225169689Skan * callout_lock held; incorrect locking order. 226169689Skan */ 227169689Skan if (need_softclock) 228169689Skan swi_sched(softclock_ih, 0); 229169689Skan} 230169689Skan 231169689Skan/* 232169689Skan * Compute number of ticks in the specified amount of time. 233169689Skan */ 234169689Skanint 235169689Skantvtohz(tv) 236169689Skan struct timeval *tv; 237169689Skan{ 238169689Skan register unsigned long ticks; 239169689Skan register long sec, usec; 240169689Skan 241169689Skan /* 242169689Skan * If the number of usecs in the whole seconds part of the time 243169689Skan * difference fits in a long, then the total number of usecs will 244169689Skan * fit in an unsigned long. Compute the total and convert it to 245169689Skan * ticks, rounding up and adding 1 to allow for the current tick 246169689Skan * to expire. Rounding also depends on unsigned long arithmetic 247169689Skan * to avoid overflow. 248169689Skan * 249169689Skan * Otherwise, if the number of ticks in the whole seconds part of 250169689Skan * the time difference fits in a long, then convert the parts to 251169689Skan * ticks separately and add, using similar rounding methods and 252169689Skan * overflow avoidance. This method would work in the previous 253169689Skan * case but it is slightly slower and assumes that hz is integral. 254169689Skan * 255169689Skan * Otherwise, round the time difference down to the maximum 256169689Skan * representable value. 257169689Skan * 258169689Skan * If ints have 32 bits, then the maximum value for any timeout in 259169689Skan * 10ms ticks is 248 days. 260169689Skan */ 261169689Skan sec = tv->tv_sec; 262169689Skan usec = tv->tv_usec; 263169689Skan if (usec < 0) { 264169689Skan sec--; 265169689Skan usec += 1000000; 266169689Skan } 267169689Skan if (sec < 0) { 268169689Skan#ifdef DIAGNOSTIC 269169689Skan if (usec > 0) { 270169689Skan sec++; 271169689Skan usec -= 1000000; 272169689Skan } 273169689Skan printf("tvotohz: negative time difference %ld sec %ld usec\n", 274169689Skan sec, usec); 275169689Skan#endif 276169689Skan ticks = 1; 277169689Skan } else if (sec <= LONG_MAX / 1000000) 278169689Skan ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) 279169689Skan / tick + 1; 280169689Skan else if (sec <= LONG_MAX / hz) 281169689Skan ticks = sec * hz 282169689Skan + ((unsigned long)usec + (tick - 1)) / tick + 1; 283169689Skan else 284169689Skan ticks = LONG_MAX; 285169689Skan if (ticks > INT_MAX) 286169689Skan ticks = INT_MAX; 287169689Skan return ((int)ticks); 288169689Skan} 289169689Skan 290169689Skan/* 291169689Skan * Start profiling on a process. 292169689Skan * 293169689Skan * Kernel profiling passes proc0 which never exits and hence 294169689Skan * keeps the profile clock running constantly. 295169689Skan */ 296169689Skanvoid 297169689Skanstartprofclock(p) 298169689Skan register struct proc *p; 299169689Skan{ 300169689Skan 301169689Skan /* 302169689Skan * XXX; Right now sched_lock protects statclock(), but perhaps 303169689Skan * it should be protected later on by a time_lock, which would 304169689Skan * cover psdiv, etc. as well. 305169689Skan */ 306169689Skan mtx_lock_spin(&sched_lock); 307169689Skan if (p->p_sflag & PS_STOPPROF) { 308169689Skan mtx_unlock_spin(&sched_lock); 309169689Skan return; 310169689Skan } 311169689Skan if ((p->p_sflag & PS_PROFIL) == 0) { 312169689Skan p->p_sflag |= PS_PROFIL; 313169689Skan if (++profprocs == 1) 314169689Skan cpu_startprofclock(); 315169689Skan } 316169689Skan mtx_unlock_spin(&sched_lock); 317169689Skan} 318169689Skan 319169689Skan/* 320169689Skan * Stop profiling on a process. 321169689Skan */ 322169689Skanvoid 323169689Skanstopprofclock(p) 324169689Skan register struct proc *p; 325169689Skan{ 326169689Skan 327169689Skan PROC_LOCK_ASSERT(p, MA_OWNED); 328169689Skanretry: 329169689Skan mtx_lock_spin(&sched_lock); 330169689Skan if (p->p_sflag & PS_PROFIL) { 331169689Skan if (p->p_profthreads) { 332169689Skan p->p_sflag |= PS_STOPPROF; 333169689Skan mtx_unlock_spin(&sched_lock); 334169689Skan msleep(&p->p_profthreads, &p->p_mtx, PPAUSE, 335169689Skan "stopprof", NULL); 336169689Skan goto retry; 337169689Skan } 338169689Skan p->p_sflag &= ~(PS_PROFIL|PS_STOPPROF); 339169689Skan if (--profprocs == 0) 340169689Skan cpu_stopprofclock(); 341169689Skan } 342169689Skan mtx_unlock_spin(&sched_lock); 343169689Skan} 344169689Skan 345169689Skan/* 346169689Skan * Statistics clock. Grab profile sample, and if divider reaches 0, 347169689Skan * do process and kernel statistics. Most of the statistics are only 348169689Skan * used by user-level statistics programs. The main exceptions are 349169689Skan * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu. 350169689Skan * This should be called by all active processors. 351169689Skan */ 352169689Skanvoid 353169689Skanstatclock(frame) 354169689Skan register struct clockframe *frame; 355169689Skan{ 356169689Skan struct pstats *pstats; 357169689Skan struct rusage *ru; 358169689Skan struct vmspace *vm; 359169689Skan struct thread *td; 360169689Skan struct kse *ke; 361169689Skan struct proc *p; 362169689Skan long rss; 363169689Skan 364169689Skan td = curthread; 365169689Skan p = td->td_proc; 366169689Skan 367169689Skan mtx_lock_spin_flags(&sched_lock, MTX_QUIET); 368169689Skan ke = td->td_kse; 369169689Skan if (CLKF_USERMODE(frame)) { 370169689Skan /* 371169689Skan * Charge the time as appropriate. 372169689Skan */ 373169689Skan if (p->p_flag & P_THREADED) 374169689Skan thread_statclock(1); 375169689Skan p->p_uticks++; 376169689Skan if (ke->ke_ksegrp->kg_nice > NZERO) 377169689Skan cp_time[CP_NICE]++; 378169689Skan else 379169689Skan cp_time[CP_USER]++; 380169689Skan } else { 381169689Skan /* 382169689Skan * Came from kernel mode, so we were: 383169689Skan * - handling an interrupt, 384169689Skan * - doing syscall or trap work on behalf of the current 385169689Skan * user process, or 386169689Skan * - spinning in the idle loop. 387169689Skan * Whichever it is, charge the time as appropriate. 388169689Skan * Note that we charge interrupts to the current process, 389169689Skan * regardless of whether they are ``for'' that process, 390169689Skan * so that we know how much of its real time was spent 391169689Skan * in ``non-process'' (i.e., interrupt) work. 392169689Skan */ 393169689Skan if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) { 394169689Skan p->p_iticks++; 395169689Skan cp_time[CP_INTR]++; 396169689Skan } else { 397169689Skan if (p->p_flag & P_THREADED) 398169689Skan thread_statclock(0); 399169689Skan td->td_sticks++; 400169689Skan p->p_sticks++; 401169689Skan if (p != PCPU_GET(idlethread)->td_proc) 402169689Skan cp_time[CP_SYS]++; 403169689Skan else 404169689Skan cp_time[CP_IDLE]++; 405169689Skan } 406169689Skan } 407169689Skan 408169689Skan sched_clock(ke); 409169689Skan 410169689Skan /* Update resource usage integrals and maximums. */ 411169689Skan if ((pstats = p->p_stats) != NULL && 412169689Skan (ru = &pstats->p_ru) != NULL && 413169689Skan (vm = p->p_vmspace) != NULL) { 414169689Skan ru->ru_ixrss += pgtok(vm->vm_tsize); 415169689Skan ru->ru_idrss += pgtok(vm->vm_dsize); 416169689Skan ru->ru_isrss += pgtok(vm->vm_ssize); 417169689Skan rss = pgtok(vmspace_resident_count(vm)); 418169689Skan if (ru->ru_maxrss < rss) 419169689Skan ru->ru_maxrss = rss; 420169689Skan } 421169689Skan mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); 422169689Skan} 423169689Skan 424169689Skanvoid 425169689Skanprofclock(frame) 426169689Skan register struct clockframe *frame; 427169689Skan{ 428169689Skan struct thread *td; 429169689Skan#ifdef GPROF 430169689Skan struct gmonparam *g; 431169689Skan int i; 432169689Skan#endif 433169689Skan 434169689Skan td = curthread; 435169689Skan if (CLKF_USERMODE(frame)) { 436169689Skan /* 437169689Skan * Came from user mode; CPU was in user state. 438169689Skan * If this process is being profiled, record the tick. 439169689Skan * if there is no related user location yet, don't 440169689Skan * bother trying to count it. 441169689Skan */ 442169689Skan td = curthread; 443169689Skan if (td->td_proc->p_sflag & PS_PROFIL) 444169689Skan addupc_intr(td, CLKF_PC(frame), 1); 445169689Skan } 446169689Skan#ifdef GPROF 447169689Skan else { 448169689Skan /* 449169689Skan * Kernel statistics are just like addupc_intr, only easier. 450169689Skan */ 451169689Skan g = &_gmonparam; 452169689Skan if (g->state == GMON_PROF_ON) { 453169689Skan i = CLKF_PC(frame) - g->lowpc; 454169689Skan if (i < g->textsize) { 455169689Skan i /= HISTFRACTION * sizeof(*g->kcount); 456169689Skan g->kcount[i]++; 457169689Skan } 458169689Skan } 459169689Skan } 460169689Skan#endif 461169689Skan} 462169689Skan 463169689Skan/* 464169689Skan * Return information about system clocks. 465169689Skan */ 466169689Skanstatic int 467169689Skansysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) 468169689Skan{ 469169689Skan struct clockinfo clkinfo; 470169689Skan /* 471169689Skan * Construct clockinfo structure. 472169689Skan */ 473169689Skan bzero(&clkinfo, sizeof(clkinfo)); 474169689Skan clkinfo.hz = hz; 475169689Skan clkinfo.tick = tick; 476169689Skan clkinfo.profhz = profhz; 477169689Skan clkinfo.stathz = stathz ? stathz : hz; 478169689Skan return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); 479169689Skan} 480169689Skan 481169689SkanSYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, 482169689Skan 0, 0, sysctl_kern_clockrate, "S,clockinfo", 483169689Skan "Rate and period of various kernel clocks"); 484169689Skan