kern_clock.c revision 113355
1169689Skan/*-
2169689Skan * Copyright (c) 1982, 1986, 1991, 1993
3169689Skan *	The Regents of the University of California.  All rights reserved.
4169689Skan * (c) UNIX System Laboratories, Inc.
5169689Skan * All or some portions of this file are derived from material licensed
6169689Skan * to the University of California by American Telephone and Telegraph
7169689Skan * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8169689Skan * the permission of UNIX System Laboratories, Inc.
9169689Skan *
10169689Skan * Redistribution and use in source and binary forms, with or without
11169689Skan * modification, are permitted provided that the following conditions
12169689Skan * are met:
13169689Skan * 1. Redistributions of source code must retain the above copyright
14169689Skan *    notice, this list of conditions and the following disclaimer.
15169689Skan * 2. Redistributions in binary form must reproduce the above copyright
16169689Skan *    notice, this list of conditions and the following disclaimer in the
17169689Skan *    documentation and/or other materials provided with the distribution.
18169689Skan * 3. All advertising materials mentioning features or use of this software
19169689Skan *    must display the following acknowledgement:
20169689Skan *	This product includes software developed by the University of
21169689Skan *	California, Berkeley and its contributors.
22169689Skan * 4. Neither the name of the University nor the names of its contributors
23169689Skan *    may be used to endorse or promote products derived from this software
24169689Skan *    without specific prior written permission.
25169689Skan *
26169689Skan * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27169689Skan * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28169689Skan * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29169689Skan * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30169689Skan * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31169689Skan * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32169689Skan * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33169689Skan * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34169689Skan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35169689Skan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36169689Skan * SUCH DAMAGE.
37169689Skan *
38169689Skan *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
39169689Skan * $FreeBSD: head/sys/kern/kern_clock.c 113355 2003-04-11 03:39:07Z jeff $
40169689Skan */
41169689Skan
42169689Skan#include "opt_ntp.h"
43169689Skan
44169689Skan#include <sys/param.h>
45169689Skan#include <sys/systm.h>
46169689Skan#include <sys/callout.h>
47169689Skan#include <sys/kernel.h>
48169689Skan#include <sys/lock.h>
49169689Skan#include <sys/ktr.h>
50169689Skan#include <sys/mutex.h>
51169689Skan#include <sys/proc.h>
52169689Skan#include <sys/resource.h>
53169689Skan#include <sys/resourcevar.h>
54169689Skan#include <sys/sched.h>
55169689Skan#include <sys/signalvar.h>
56169689Skan#include <sys/smp.h>
57169689Skan#include <vm/vm.h>
58169689Skan#include <vm/pmap.h>
59169689Skan#include <vm/vm_map.h>
60169689Skan#include <sys/sysctl.h>
61169689Skan#include <sys/bus.h>
62169689Skan#include <sys/interrupt.h>
63169689Skan#include <sys/timetc.h>
64169689Skan
65169689Skan#include <machine/cpu.h>
66169689Skan#include <machine/limits.h>
67169689Skan
68169689Skan#ifdef GPROF
69169689Skan#include <sys/gmon.h>
70169689Skan#endif
71169689Skan
72169689Skan#ifdef DEVICE_POLLING
73169689Skanextern void hardclock_device_poll(void);
74169689Skan#endif /* DEVICE_POLLING */
75169689Skan
static void initclocks(void *dummy);
/* Run clock initialization once at boot, first thing in SI_SUB_CLOCKS. */
SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)

/* Some of these don't belong here, but it's easiest to concentrate them. */
long cp_time[CPUSTATES];	/* system-wide CPU time, bumped in statclock() */

/* Export cp_time read-only as kern.cp_time. */
SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time),
    "LU", "CPU time statistics");
84169689Skan
85169689Skan/*
86169689Skan * Clock handling routines.
87169689Skan *
88169689Skan * This code is written to operate with two timers that run independently of
89169689Skan * each other.
90169689Skan *
91169689Skan * The main timer, running hz times per second, is used to trigger interval
92169689Skan * timers, timeouts and rescheduling as needed.
93169689Skan *
94169689Skan * The second timer handles kernel and user profiling,
95169689Skan * and does resource use estimation.  If the second timer is programmable,
96169689Skan * it is randomized to avoid aliasing between the two clocks.  For example,
97169689Skan * the randomization prevents an adversary from always giving up the cpu
98169689Skan * just before its quantum expires.  Otherwise, it would never accumulate
99169689Skan * cpu ticks.  The mean frequency of the second timer is stathz.
100169689Skan *
101169689Skan * If no second timer exists, stathz will be zero; in this case we drive
102169689Skan * profiling and statistics off the main clock.  This WILL NOT be accurate;
103169689Skan * do not do it unless absolutely necessary.
104169689Skan *
105169689Skan * The statistics clock may (or may not) be run at a higher rate while
106169689Skan * profiling.  This profile clock runs at profhz.  We require that profhz
107169689Skan * be an integral multiple of stathz.
108169689Skan *
109169689Skan * If the statistics clock is running fast, it must be divided by the ratio
110169689Skan * profhz/stathz for statistics.  (For profiling, every tick counts.)
111169689Skan *
112169689Skan * Time-of-day is maintained using a "timecounter", which may or may
113169689Skan * not be related to the hardware generating the above mentioned
114169689Skan * interrupts.
115169689Skan */
116169689Skan
int	stathz;			/* statistics clock frequency; 0 if no separate
				 * stat clock exists (then hz drives stats) */
int	profhz;			/* profiling clock frequency */
int	profprocs;		/* count of processes with PS_PROFIL set */
int	ticks;			/* hardclock ticks since boot */
int	psratio;		/* profhz / stathz, computed in initclocks() */
122169689Skan
123169689Skan/*
124169689Skan * Initialize clock frequencies and start both clocks running.
125169689Skan */
126169689Skan/* ARGSUSED*/
127169689Skanstatic void
128169689Skaninitclocks(dummy)
129169689Skan	void *dummy;
130169689Skan{
131169689Skan	register int i;
132169689Skan
133169689Skan	/*
134169689Skan	 * Set divisors to 1 (normal case) and let the machine-specific
135169689Skan	 * code do its bit.
136169689Skan	 */
137169689Skan	cpu_initclocks();
138169689Skan
139169689Skan	/*
140169689Skan	 * Compute profhz/stathz, and fix profhz if needed.
141169689Skan	 */
142169689Skan	i = stathz ? stathz : hz;
143169689Skan	if (profhz == 0)
144169689Skan		profhz = i;
145169689Skan	psratio = profhz / i;
146169689Skan}
147169689Skan
148169689Skan/*
149169689Skan * Each time the real-time timer fires, this function is called on all CPUs.
150169689Skan * Note that hardclock() calls hardclock_process() for the boot CPU, so only
151169689Skan * the other CPUs in the system need to call this function.
152169689Skan */
void
hardclock_process(frame)
	register struct clockframe *frame;
{
	struct pstats *pstats;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;

	/*
	 * Run current process's virtual and profile time, as needed.
	 * sched_lock protects the p_sflag/td_flags updates below;
	 * MTX_QUIET keeps this hot path out of the KTR lock logs.
	 */
	mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
	if (p->p_flag & P_THREADED) {
		/* XXXKSE What to do? */
	} else {
		pstats = p->p_stats;
		/*
		 * ITIMER_VIRTUAL only counts down while the process is
		 * executing in user mode.  When a timer expires
		 * (itimerdecr() == 0), flag the pending signal and post
		 * an AST so it is delivered on return to user mode.
		 */
		if (CLKF_USERMODE(frame) &&
		    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) {
			p->p_sflag |= PS_ALRMPEND;
			td->td_flags |= TDF_ASTPENDING;
		}
		/* ITIMER_PROF counts down in both user and kernel mode. */
		if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) {
			p->p_sflag |= PS_PROFPEND;
			td->td_flags |= TDF_ASTPENDING;
		}
	}
	mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
}
183169689Skan
184169689Skan/*
185169689Skan * The real-time timer, interrupting hz times per second.
186169689Skan */
void
hardclock(frame)
	register struct clockframe *frame;
{
	int need_softclock = 0;

	CTR0(KTR_CLK, "hardclock fired");
	/* Per-CPU interval-timer work (other CPUs call this directly). */
	hardclock_process(frame);

	/* Advance the timecounter machinery by one tick. */
	tc_ticktock();
	/*
	 * If no separate statistics clock is available, run it from here.
	 *
	 * XXX: this only works for UP
	 */
	if (stathz == 0) {
		profclock(frame);
		statclock(frame);
	}

#ifdef DEVICE_POLLING
	hardclock_device_poll();	/* this is very short and quick */
#endif /* DEVICE_POLLING */

	/*
	 * Process callouts at a very low cpu priority, so we don't keep the
	 * relatively high clock interrupt priority any longer than necessary.
	 */
	mtx_lock_spin_flags(&callout_lock, MTX_QUIET);
	ticks++;
	/*
	 * Only wake the softclock SWI when the new tick's callwheel
	 * bucket is non-empty; if softticks is exactly one tick behind
	 * and the bucket is empty, advance it here on the cheap.
	 */
	if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
		need_softclock = 1;
	} else if (softticks + 1 == ticks)
		++softticks;
	mtx_unlock_spin_flags(&callout_lock, MTX_QUIET);

	/*
	 * swi_sched acquires sched_lock, so we don't want to call it with
	 * callout_lock held; incorrect locking order.
	 */
	if (need_softclock)
		swi_sched(softclock_ih, 0);
}
230169689Skan
231169689Skan/*
232169689Skan * Compute number of ticks in the specified amount of time.
233169689Skan */
234169689Skanint
235169689Skantvtohz(tv)
236169689Skan	struct timeval *tv;
237169689Skan{
238169689Skan	register unsigned long ticks;
239169689Skan	register long sec, usec;
240169689Skan
241169689Skan	/*
242169689Skan	 * If the number of usecs in the whole seconds part of the time
243169689Skan	 * difference fits in a long, then the total number of usecs will
244169689Skan	 * fit in an unsigned long.  Compute the total and convert it to
245169689Skan	 * ticks, rounding up and adding 1 to allow for the current tick
246169689Skan	 * to expire.  Rounding also depends on unsigned long arithmetic
247169689Skan	 * to avoid overflow.
248169689Skan	 *
249169689Skan	 * Otherwise, if the number of ticks in the whole seconds part of
250169689Skan	 * the time difference fits in a long, then convert the parts to
251169689Skan	 * ticks separately and add, using similar rounding methods and
252169689Skan	 * overflow avoidance.  This method would work in the previous
253169689Skan	 * case but it is slightly slower and assumes that hz is integral.
254169689Skan	 *
255169689Skan	 * Otherwise, round the time difference down to the maximum
256169689Skan	 * representable value.
257169689Skan	 *
258169689Skan	 * If ints have 32 bits, then the maximum value for any timeout in
259169689Skan	 * 10ms ticks is 248 days.
260169689Skan	 */
261169689Skan	sec = tv->tv_sec;
262169689Skan	usec = tv->tv_usec;
263169689Skan	if (usec < 0) {
264169689Skan		sec--;
265169689Skan		usec += 1000000;
266169689Skan	}
267169689Skan	if (sec < 0) {
268169689Skan#ifdef DIAGNOSTIC
269169689Skan		if (usec > 0) {
270169689Skan			sec++;
271169689Skan			usec -= 1000000;
272169689Skan		}
273169689Skan		printf("tvotohz: negative time difference %ld sec %ld usec\n",
274169689Skan		       sec, usec);
275169689Skan#endif
276169689Skan		ticks = 1;
277169689Skan	} else if (sec <= LONG_MAX / 1000000)
278169689Skan		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
279169689Skan			/ tick + 1;
280169689Skan	else if (sec <= LONG_MAX / hz)
281169689Skan		ticks = sec * hz
282169689Skan			+ ((unsigned long)usec + (tick - 1)) / tick + 1;
283169689Skan	else
284169689Skan		ticks = LONG_MAX;
285169689Skan	if (ticks > INT_MAX)
286169689Skan		ticks = INT_MAX;
287169689Skan	return ((int)ticks);
288169689Skan}
289169689Skan
290169689Skan/*
291169689Skan * Start profiling on a process.
292169689Skan *
293169689Skan * Kernel profiling passes proc0 which never exits and hence
294169689Skan * keeps the profile clock running constantly.
295169689Skan */
void
startprofclock(p)
	register struct proc *p;
{

	/*
	 * XXX; Right now sched_lock protects statclock(), but perhaps
	 * it should be protected later on by a time_lock, which would
	 * cover psdiv, etc. as well.
	 */
	mtx_lock_spin(&sched_lock);
	/* A stop is pending in stopprofclock(); don't (re)start now. */
	if (p->p_sflag & PS_STOPPROF) {
		mtx_unlock_spin(&sched_lock);
		return;
	}
	if ((p->p_sflag & PS_PROFIL) == 0) {
		p->p_sflag |= PS_PROFIL;
		/* First profiled process system-wide: start the clock. */
		if (++profprocs == 1)
			cpu_startprofclock();
	}
	mtx_unlock_spin(&sched_lock);
}
318169689Skan
319169689Skan/*
320169689Skan * Stop profiling on a process.
321169689Skan */
void
stopprofclock(p)
	register struct proc *p;
{

	/* Caller must hold the proc lock; msleep() below relies on it. */
	PROC_LOCK_ASSERT(p, MA_OWNED);
retry:
	mtx_lock_spin(&sched_lock);
	if (p->p_sflag & PS_PROFIL) {
		/*
		 * Threads still hold profiling references
		 * (p_profthreads != 0): mark the stop as pending so
		 * startprofclock() refuses to restart, sleep until the
		 * count drains, then re-check from the top.
		 */
		if (p->p_profthreads) {
			p->p_sflag |= PS_STOPPROF;
			mtx_unlock_spin(&sched_lock);
			msleep(&p->p_profthreads, &p->p_mtx, PPAUSE,
			       "stopprof", NULL);
			goto retry;
		}
		p->p_sflag &= ~(PS_PROFIL|PS_STOPPROF);
		/* Last profiled process system-wide: stop the clock. */
		if (--profprocs == 0)
			cpu_stopprofclock();
	}
	mtx_unlock_spin(&sched_lock);
}
344169689Skan
345169689Skan/*
346169689Skan * Statistics clock.  Grab profile sample, and if divider reaches 0,
347169689Skan * do process and kernel statistics.  Most of the statistics are only
348169689Skan * used by user-level statistics programs.  The main exceptions are
349169689Skan * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu.
350169689Skan * This should be called by all active processors.
351169689Skan */
void
statclock(frame)
	register struct clockframe *frame;
{
	struct pstats *pstats;
	struct rusage *ru;
	struct vmspace *vm;
	struct thread *td;
	struct kse *ke;
	struct proc *p;
	long rss;

	td = curthread;
	p = td->td_proc;

	mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
	ke = td->td_kse;
	if (CLKF_USERMODE(frame)) {
		/*
		 * Charge the time as appropriate.
		 */
		if (p->p_flag & P_THREADED)
			thread_statclock(1);
		p->p_uticks++;
		/* Niced processes accumulate under CP_NICE, not CP_USER. */
		if (ke->ke_ksegrp->kg_nice > NZERO)
			cp_time[CP_NICE]++;
		else
			cp_time[CP_USER]++;
	} else {
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) {
			p->p_iticks++;
			cp_time[CP_INTR]++;
		} else {
			if (p->p_flag & P_THREADED)
				thread_statclock(0);
			td->td_sticks++;
			p->p_sticks++;
			/* The idle thread's ticks count as idle, not system. */
			if (p != PCPU_GET(idlethread)->td_proc)
				cp_time[CP_SYS]++;
			else
				cp_time[CP_IDLE]++;
		}
	}

	/* Give the scheduler a chance to account for this stat tick. */
	sched_clock(ke);

	/* Update resource usage integrals and maximums. */
	if ((pstats = p->p_stats) != NULL &&
	    (ru = &pstats->p_ru) != NULL &&
	    (vm = p->p_vmspace) != NULL) {
		ru->ru_ixrss += pgtok(vm->vm_tsize);
		ru->ru_idrss += pgtok(vm->vm_dsize);
		ru->ru_isrss += pgtok(vm->vm_ssize);
		rss = pgtok(vmspace_resident_count(vm));
		if (ru->ru_maxrss < rss)
			ru->ru_maxrss = rss;
	}
	mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
}
423169689Skan
424169689Skanvoid
425169689Skanprofclock(frame)
426169689Skan	register struct clockframe *frame;
427169689Skan{
428169689Skan	struct thread *td;
429169689Skan#ifdef GPROF
430169689Skan	struct gmonparam *g;
431169689Skan	int i;
432169689Skan#endif
433169689Skan
434169689Skan	td = curthread;
435169689Skan	if (CLKF_USERMODE(frame)) {
436169689Skan		/*
437169689Skan		 * Came from user mode; CPU was in user state.
438169689Skan		 * If this process is being profiled, record the tick.
439169689Skan		 * if there is no related user location yet, don't
440169689Skan		 * bother trying to count it.
441169689Skan		 */
442169689Skan		td = curthread;
443169689Skan		if (td->td_proc->p_sflag & PS_PROFIL)
444169689Skan			addupc_intr(td, CLKF_PC(frame), 1);
445169689Skan	}
446169689Skan#ifdef GPROF
447169689Skan	else {
448169689Skan		/*
449169689Skan		 * Kernel statistics are just like addupc_intr, only easier.
450169689Skan		 */
451169689Skan		g = &_gmonparam;
452169689Skan		if (g->state == GMON_PROF_ON) {
453169689Skan			i = CLKF_PC(frame) - g->lowpc;
454169689Skan			if (i < g->textsize) {
455169689Skan				i /= HISTFRACTION * sizeof(*g->kcount);
456169689Skan				g->kcount[i]++;
457169689Skan			}
458169689Skan		}
459169689Skan	}
460169689Skan#endif
461169689Skan}
462169689Skan
463169689Skan/*
464169689Skan * Return information about system clocks.
465169689Skan */
466169689Skanstatic int
467169689Skansysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
468169689Skan{
469169689Skan	struct clockinfo clkinfo;
470169689Skan	/*
471169689Skan	 * Construct clockinfo structure.
472169689Skan	 */
473169689Skan	bzero(&clkinfo, sizeof(clkinfo));
474169689Skan	clkinfo.hz = hz;
475169689Skan	clkinfo.tick = tick;
476169689Skan	clkinfo.profhz = profhz;
477169689Skan	clkinfo.stathz = stathz ? stathz : hz;
478169689Skan	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
479169689Skan}
480169689Skan
481169689SkanSYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
482169689Skan	0, 0, sysctl_kern_clockrate, "S,clockinfo",
483169689Skan	"Rate and period of various kernel clocks");
484169689Skan