/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: head/sys/kern/kern_clock.c 110996 2003-02-16 13:22:15Z phk $
 */

#include "opt_ntp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dkstat.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/sysctl.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/timetc.h>

#include <machine/cpu.h>
#include <machine/limits.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

#ifdef DEVICE_POLLING
extern void init_device_poll(void);
extern void hardclock_device_poll(void);
#endif /* DEVICE_POLLING */

static void initclocks(void *dummy);
SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)

/* Some of these don't belong here, but it's easiest to concentrate them. */
long cp_time[CPUSTATES];

SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time),
    "LU", "CPU time statistics");

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently of
 * each other.
 *
 * The main timer, running hz times per second, is used to trigger interval
 * timers, timeouts and rescheduling as needed.
 *
 * The second timer handles kernel and user profiling,
 * and does resource use estimation.  If the second timer is programmable,
 * it is randomized to avoid aliasing between the two clocks.  For example,
 * the randomization prevents an adversary from always giving up the cpu
 * just before its quantum expires.  Otherwise, it would never accumulate
 * cpu ticks.  The mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 *
 * Time-of-day is maintained using a "timecounter", which may or may
 * not be related to the hardware generating the above mentioned
 * interrupts.
 */
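
/*
 * Illustrative example (actual rates are machine-dependent): with hz = 100
 * and a programmable second timer running at stathz = 128, the profile
 * clock might run at profhz = 1024; psratio is then profhz / stathz = 8,
 * so only every eighth profile tick is counted for statistics.
 */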

int	stathz;
int	profhz;
int	profprocs;
int	ticks;
int	psratio;

/*
 * Initialize clock frequencies and start both clocks running.
 */
/* ARGSUSED*/
static void
initclocks(dummy)
	void *dummy;
{
	register int i;

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */
	cpu_initclocks();

#ifdef DEVICE_POLLING
	init_device_poll();
#endif
	/*
	 * Compute profhz/stathz, and fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;
}

/*
 * Each time the real-time timer fires, this function is called on all CPUs.
 * Note that hardclock() calls hardclock_process() for the boot CPU, so only
 * the other CPUs in the system need to call this function.
 */
void
hardclock_process(frame)
	register struct clockframe *frame;
{
	struct pstats *pstats;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;

	/*
	 * Run current process's virtual and profile time, as needed.
	 */
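	/*
	 * itimerdecr() subtracts one tick's worth of microseconds from the
	 * timer and returns zero once it has expired.  We only flag an AST
	 * here (PS_ALRMPEND/PS_PROFPEND); the corresponding SIGVTALRM or
	 * SIGPROF is posted later from ast(), not from this spin-locked
	 * interrupt path.
	 */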
	mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
	if (p->p_flag & P_KSES) {
		/* XXXKSE What to do? */
	} else {
		pstats = p->p_stats;
		if (CLKF_USERMODE(frame) &&
		    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) {
			p->p_sflag |= PS_ALRMPEND;
			td->td_kse->ke_flags |= KEF_ASTPENDING;
		}
		if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) {
			p->p_sflag |= PS_PROFPEND;
			td->td_kse->ke_flags |= KEF_ASTPENDING;
		}
	}
	mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
}

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(frame)
	register struct clockframe *frame;
{
	int need_softclock = 0;

	CTR0(KTR_CLK, "hardclock fired");
	hardclock_process(frame);

	tc_ticktock();
	/*
	 * If no separate statistics clock is available, run it from here.
	 *
	 * XXX: this only works for UP
	 */
	if (stathz == 0) {
		profclock(frame);
		statclock(frame);
	}

#ifdef DEVICE_POLLING
	hardclock_device_poll();	/* this is very short and quick */
#endif /* DEVICE_POLLING */

	/*
	 * Process callouts at a very low cpu priority, so we don't keep the
	 * relatively high clock interrupt priority any longer than necessary.
	 */
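	/*
	 * The callwheel is a hash of pending callouts, bucketed by expiry
	 * tick.  If the bucket for the new tick is empty and softclock()
	 * is already caught up, softticks can simply be advanced in step
	 * with ticks, saving a software interrupt.
	 */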
	mtx_lock_spin_flags(&callout_lock, MTX_QUIET);
	ticks++;
	if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
		need_softclock = 1;
	} else if (softticks + 1 == ticks)
		++softticks;
	mtx_unlock_spin_flags(&callout_lock, MTX_QUIET);

	/*
	 * swi_sched acquires sched_lock, so we don't want to call it with
	 * callout_lock held; incorrect locking order.
	 */
	if (need_softclock)
		swi_sched(softclock_ih, 0);
}

/*
 * Compute number of ticks in the specified amount of time.
 */
int
tvtohz(tv)
	struct timeval *tv;
{
	register unsigned long ticks;
	register long sec, usec;

	/*
	 * If the number of usecs in the whole seconds part of the time
	 * difference fits in a long, then the total number of usecs will
	 * fit in an unsigned long.  Compute the total and convert it to
	 * ticks, rounding up and adding 1 to allow for the current tick
	 * to expire.  Rounding also depends on unsigned long arithmetic
	 * to avoid overflow.
	 *
	 * Otherwise, if the number of ticks in the whole seconds part of
	 * the time difference fits in a long, then convert the parts to
	 * ticks separately and add, using similar rounding methods and
	 * overflow avoidance.  This method would work in the previous
	 * case but it is slightly slower and assumes that hz is integral.
	 *
	 * Otherwise, round the time difference down to the maximum
	 * representable value.
	 *
	 * If ints have 32 bits, then the maximum value for any timeout in
	 * 10ms ticks is 248 days.
	 */
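	/*
	 * Worked example, assuming hz = 100 (tick = 10000 usec): a timeval
	 * of 2.5 seconds gives (2500000 + 9999) / 10000 + 1 = 251 ticks,
	 * i.e. 250 full ticks plus one for the currently expiring tick.
	 */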
	sec = tv->tv_sec;
	usec = tv->tv_usec;
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	if (sec < 0) {
#ifdef DIAGNOSTIC
		if (usec > 0) {
			sec++;
			usec -= 1000000;
		}
		printf("tvtohz: negative time difference %ld sec %ld usec\n",
		       sec, usec);
#endif
		ticks = 1;
	} else if (sec <= LONG_MAX / 1000000)
		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
			/ tick + 1;
	else if (sec <= LONG_MAX / hz)
		ticks = sec * hz
			+ ((unsigned long)usec + (tick - 1)) / tick + 1;
	else
		ticks = LONG_MAX;
	if (ticks > INT_MAX)
		ticks = INT_MAX;
	return ((int)ticks);
}

/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(p)
	register struct proc *p;
{

	/*
	 * XXX; Right now sched_lock protects statclock(), but perhaps
	 * it should be protected later on by a time_lock, which would
	 * cover psdiv, etc. as well.
	 */
	mtx_lock_spin(&sched_lock);
	if (p->p_sflag & PS_STOPPROF) {
		mtx_unlock_spin(&sched_lock);
		return;
	}
	if ((p->p_sflag & PS_PROFIL) == 0) {
		p->p_sflag |= PS_PROFIL;
		if (++profprocs == 1)
			cpu_startprofclock();
	}
	mtx_unlock_spin(&sched_lock);
}

/*
 * Stop profiling on a process.
 */
void
stopprofclock(p)
	register struct proc *p;
{

	PROC_LOCK_ASSERT(p, MA_OWNED);
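	/*
	 * If other threads are still writing profile samples (counted by
	 * p_profthreads), mark the process PS_STOPPROF and sleep; the last
	 * such thread wakes us, after which it is safe to clear PS_PROFIL
	 * and, for the final profiled process, stop the profile clock.
	 */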
retry:
	mtx_lock_spin(&sched_lock);
	if (p->p_sflag & PS_PROFIL) {
		if (p->p_profthreads) {
			p->p_sflag |= PS_STOPPROF;
			mtx_unlock_spin(&sched_lock);
			msleep(&p->p_profthreads, &p->p_mtx, PPAUSE,
			       "stopprof", 0);
			goto retry;
		}
		p->p_sflag &= ~(PS_PROFIL|PS_STOPPROF);
		if (--profprocs == 0)
			cpu_stopprofclock();
	}
	mtx_unlock_spin(&sched_lock);
}

/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.  Most of the statistics are only
 * used by user-level statistics programs.  The main exceptions are
 * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu.
 * This should be called by all active processors.
 */
void
statclock(frame)
	register struct clockframe *frame;
{
	struct pstats *pstats;
	struct rusage *ru;
	struct vmspace *vm;
	struct thread *td;
	struct kse *ke;
	struct proc *p;
	long rss;

	td = curthread;
	p = td->td_proc;

	mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
	ke = td->td_kse;
	if (CLKF_USERMODE(frame)) {
		/*
		 * Charge the time as appropriate.
		 */
		if (p->p_flag & P_KSES)
			thread_add_ticks_intr(1, 1);
		ke->ke_uticks++;
		if (ke->ke_ksegrp->kg_nice > NZERO)
			cp_time[CP_NICE]++;
		else
			cp_time[CP_USER]++;
	} else {
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) {
			ke->ke_iticks++;
			cp_time[CP_INTR]++;
		} else {
			if (p->p_flag & P_KSES)
				thread_add_ticks_intr(0, 1);
			ke->ke_sticks++;
			if (p != PCPU_GET(idlethread)->td_proc)
				cp_time[CP_SYS]++;
			else
				cp_time[CP_IDLE]++;
		}
	}

	sched_clock(td);

	/* Update resource usage integrals and maximums. */
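	/*
	 * The i*rss fields accumulate "integrals" in kilobyte-ticks:
	 * pgtok() converts each segment size from pages to kilobytes and
	 * one sample is added per statclock tick, so an average can later
	 * be recovered by dividing by the process's total run ticks.
	 */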
	if ((pstats = p->p_stats) != NULL &&
	    (ru = &pstats->p_ru) != NULL &&
	    (vm = p->p_vmspace) != NULL) {
		ru->ru_ixrss += pgtok(vm->vm_tsize);
		ru->ru_idrss += pgtok(vm->vm_dsize);
		ru->ru_isrss += pgtok(vm->vm_ssize);
		rss = pgtok(vmspace_resident_count(vm));
		if (ru->ru_maxrss < rss)
			ru->ru_maxrss = rss;
	}
	mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
}

void
profclock(frame)
	register struct clockframe *frame;
{
	struct thread *td;
#ifdef GPROF
	struct gmonparam *g;
	int i;
#endif

	if (CLKF_USERMODE(frame)) {
		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled, record the tick.
		 * If there is no related user location yet, don't
		 * bother trying to count it.
		 */
		td = curthread;
		if ((td->td_proc->p_sflag & PS_PROFIL) &&
		    !(td->td_flags & TDF_UPCALLING))
			addupc_intr(td->td_kse, CLKF_PC(frame), 1);
	}
#ifdef GPROF
	else {
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
455		if (g->state == GMON_PROF_ON) {
456			i = CLKF_PC(frame) - g->lowpc;
457			if (i < g->textsize) {
458				i /= HISTFRACTION * sizeof(*g->kcount);
459				g->kcount[i]++;
460			}
461		}
462	}
463#endif
464}
465
466/*
467 * Return information about system clocks.
468 */
469static int
470sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
471{
472	struct clockinfo clkinfo;
473	/*
474	 * Construct clockinfo structure.
475	 */
476	bzero(&clkinfo, sizeof(clkinfo));
477	clkinfo.hz = hz;
478	clkinfo.tick = tick;
479	clkinfo.profhz = profhz;
480	clkinfo.stathz = stathz ? stathz : hz;
481	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
482}
483
484SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
485	0, 0, sysctl_kern_clockrate, "S,clockinfo",
486	"Rate and period of various kernel clocks");
487