/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_clock.c 177253 2008-03-16 10:58:09Z rwatson $");

#include "opt_kdb.h"
#include "opt_device_polling.h"
#include "opt_hwpmc_hooks.h"
#include "opt_ntp.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/sysctl.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/limits.h>
#include <sys/timetc.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#ifdef DEVICE_POLLING
extern void hardclock_device_poll(void);
#endif /* DEVICE_POLLING */

static void initclocks(void *dummy);
SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL);

/* Spin-lock protecting profiling statistics. */
static struct mtx time_lock;

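/*
 * Sysctl handler for kern.cp_time: export the system-wide tick counts for
 * each CPU state.  For 32-bit compatibility requests (SCTL_MASK32) the
 * long counters are truncated to 32 bits before being copied out.
 */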
static int
sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS)
{
	int error;
	long cp_time[CPUSTATES];
#ifdef SCTL_MASK32
	int i;
	unsigned int cp_time32[CPUSTATES];
#endif

	read_cpu_time(cp_time);
#ifdef SCTL_MASK32
	if (req->flags & SCTL_MASK32) {
		if (!req->oldptr)
			return (SYSCTL_OUT(req, 0, sizeof(cp_time32)));
		for (i = 0; i < CPUSTATES; i++)
			cp_time32[i] = (unsigned int)cp_time[i];
		error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
	} else
#endif
	{
		if (!req->oldptr)
			return (SYSCTL_OUT(req, 0, sizeof(cp_time)));
		error = SYSCTL_OUT(req, cp_time, sizeof(cp_time));
	}
	return (error);
}

SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, sysctl_kern_cp_time, "LU", "CPU time statistics");
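
/*
 * Example (userland sketch, not part of this file): the counters exported
 * above can be read with sysctlbyname(3):
 *
 *	long cp_time[CPUSTATES];
 *	size_t len = sizeof(cp_time);
 *
 *	if (sysctlbyname("kern.cp_time", cp_time, &len, NULL, 0) == 0)
 *		printf("user ticks: %ld\n", cp_time[CP_USER]);
 */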

static long empty[CPUSTATES];

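/*
 * Sysctl handler for kern.cp_times: export one CPUSTATES tick array per
 * CPU slot up to mp_maxid.  Absent CPUs report the all-zero empty[] array.
 */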
static int
sysctl_kern_cp_times(SYSCTL_HANDLER_ARGS)
{
	struct pcpu *pcpu;
	int error;
	int c;
	long *cp_time;
#ifdef SCTL_MASK32
	unsigned int cp_time32[CPUSTATES];
	int i;
#endif

	if (!req->oldptr) {
#ifdef SCTL_MASK32
		if (req->flags & SCTL_MASK32)
			return (SYSCTL_OUT(req, 0,
			    sizeof(cp_time32) * (mp_maxid + 1)));
		else
#endif
			return (SYSCTL_OUT(req, 0,
			    sizeof(long) * CPUSTATES * (mp_maxid + 1)));
	}
	for (error = 0, c = 0; error == 0 && c <= mp_maxid; c++) {
		if (!CPU_ABSENT(c)) {
			pcpu = pcpu_find(c);
			cp_time = pcpu->pc_cp_time;
		} else {
			cp_time = empty;
		}
#ifdef SCTL_MASK32
		if (req->flags & SCTL_MASK32) {
			for (i = 0; i < CPUSTATES; i++)
				cp_time32[i] = (unsigned int)cp_time[i];
			error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
		} else
#endif
			error = SYSCTL_OUT(req, cp_time,
			    sizeof(long) * CPUSTATES);
	}
	return (error);
}

SYSCTL_PROC(_kern, OID_AUTO, cp_times, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, sysctl_kern_cp_times, "LU", "per-CPU time statistics");

void
read_cpu_time(long *cp_time)
{
	struct pcpu *pc;
	int i, j;

	/* Sum up global cp_time[]. */
	bzero(cp_time, sizeof(long) * CPUSTATES);
	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		pc = pcpu_find(i);
		for (j = 0; j < CPUSTATES; j++)
			cp_time[j] += pc->pc_cp_time[j];
	}
}

#ifdef SW_WATCHDOG
#include <sys/watchdog.h>

static int watchdog_ticks;
static int watchdog_enabled;
static void watchdog_fire(void);
static void watchdog_config(void *, u_int, int *);
#endif /* SW_WATCHDOG */

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently of
 * each other.
 *
 * The main timer, running hz times per second, is used to trigger interval
 * timers, timeouts and rescheduling as needed.
 *
 * The second timer handles kernel and user profiling,
 * and does resource use estimation.  If the second timer is programmable,
 * it is randomized to avoid aliasing between the two clocks.  For example,
 * the randomization prevents an adversary from always giving up the cpu
 * just before its quantum expires.  Otherwise, it would never accumulate
 * cpu ticks.  The mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 *
 * Time-of-day is maintained using a "timecounter", which may or may
 * not be related to the hardware generating the above mentioned
 * interrupts.
 */
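
/*
 * stathz and profhz are the statistics and profiling clock frequencies,
 * psratio caches profhz / stathz, and ticks counts hardclock() calls since
 * boot.  As a purely illustrative example of the ratio described above:
 * with stathz = 128 and profhz = 1024, psratio = 8, so statistics gathered
 * at profhz must be divided by 8.
 */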

int	stathz;
int	profhz;
int	profprocs;
int	ticks;
int	psratio;

/*
 * Initialize clock frequencies and start both clocks running.
 */
/* ARGSUSED*/
static void
initclocks(void *dummy)
{
	int i;

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */
	mtx_init(&time_lock, "time lock", NULL, MTX_SPIN);
	cpu_initclocks();

	/*
	 * Compute profhz/stathz, and fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;
#ifdef SW_WATCHDOG
	EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0);
#endif
}

/*
 * Each time the real-time timer fires, this function is called on all CPUs.
 * Note that hardclock() calls hardclock_cpu() for the boot CPU, so only
 * the other CPUs in the system need to call this function.
 */
void
hardclock_cpu(int usermode)
{
	struct pstats *pstats;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int flags;

	/*
	 * Run current process's virtual and profile time, as needed.
	 */
	pstats = p->p_stats;
	flags = 0;
	if (usermode &&
	    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
		PROC_SLOCK(p);
		if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
			flags |= TDF_ALRMPEND | TDF_ASTPENDING;
		PROC_SUNLOCK(p);
	}
	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
		PROC_SLOCK(p);
		if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
			flags |= TDF_PROFPEND | TDF_ASTPENDING;
		PROC_SUNLOCK(p);
	}
	thread_lock(td);
	sched_tick();
	td->td_flags |= flags;
	thread_unlock(td);

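	/*
	 * Let hwpmc(4) process any sampled program counters queued for
	 * this CPU.
	 */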
#ifdef	HWPMC_HOOKS
	if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
		PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
#endif
}

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(int usermode, uintfptr_t pc)
{
	int need_softclock = 0;

	hardclock_cpu(usermode);

	tc_ticktock();
	/*
	 * If no separate statistics clock is available, run it from here.
	 *
	 * XXX: this only works for UP
	 */
	if (stathz == 0) {
		profclock(usermode, pc);
		statclock(usermode);
	}

#ifdef DEVICE_POLLING
	hardclock_device_poll();	/* this is very short and quick */
#endif /* DEVICE_POLLING */

	/*
	 * Process callouts at a very low cpu priority, so we don't keep the
	 * relatively high clock interrupt priority any longer than necessary.
	 */
	mtx_lock_spin_flags(&callout_lock, MTX_QUIET);
	ticks++;
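	/*
	 * If the callwheel bucket for the new tick holds pending callouts,
	 * schedule a softclock() run; otherwise advance softticks here so
	 * softclock() has no empty bucket to catch up on later.
	 */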
	if (!TAILQ_EMPTY(&callwheel[ticks & callwheelmask])) {
		need_softclock = 1;
	} else if (softticks + 1 == ticks)
		++softticks;
	mtx_unlock_spin_flags(&callout_lock, MTX_QUIET);

	/*
	 * swi_sched acquires the thread lock, so we don't want to call it
	 * with callout_lock held; incorrect locking order.
	 */
	if (need_softclock)
		swi_sched(softclock_ih, 0);

#ifdef SW_WATCHDOG
	if (watchdog_enabled > 0 && --watchdog_ticks <= 0)
		watchdog_fire();
#endif /* SW_WATCHDOG */
}

/*
 * Compute number of ticks in the specified amount of time.
 */
int
tvtohz(struct timeval *tv)
{
	unsigned long ticks;
	long sec, usec;

	/*
	 * If the number of usecs in the whole seconds part of the time
	 * difference fits in a long, then the total number of usecs will
	 * fit in an unsigned long.  Compute the total and convert it to
	 * ticks, rounding up and adding 1 to allow for the current tick
	 * to expire.  Rounding also depends on unsigned long arithmetic
	 * to avoid overflow.
	 *
	 * Otherwise, if the number of ticks in the whole seconds part of
	 * the time difference fits in a long, then convert the parts to
	 * ticks separately and add, using similar rounding methods and
	 * overflow avoidance.  This method would work in the previous
	 * case but it is slightly slower and assumes that hz is integral.
	 *
	 * Otherwise, round the time difference down to the maximum
	 * representable value.
	 *
	 * If ints have 32 bits, then the maximum value for any timeout in
	 * 10ms ticks is 248 days.
	 */
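	/*
	 * Worked example (assuming hz = 1000, hence tick = 1000 usec):
	 * for tv = { 0, 1500 } the first branch below computes
	 * (0 * 1000000 + 1500 + 999) / 1000 + 1 = 3 ticks.
	 */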
	sec = tv->tv_sec;
	usec = tv->tv_usec;
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	if (sec < 0) {
#ifdef DIAGNOSTIC
		if (usec > 0) {
			sec++;
			usec -= 1000000;
		}
		printf("tvtohz: negative time difference %ld sec %ld usec\n",
		       sec, usec);
#endif
		ticks = 1;
	} else if (sec <= LONG_MAX / 1000000)
		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
			/ tick + 1;
	else if (sec <= LONG_MAX / hz)
		ticks = sec * hz
			+ ((unsigned long)usec + (tick - 1)) / tick + 1;
	else
		ticks = LONG_MAX;
	if (ticks > INT_MAX)
		ticks = INT_MAX;
	return ((int)ticks);
}

/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(struct proc *p)
{

	PROC_LOCK_ASSERT(p, MA_OWNED);
	if (p->p_flag & P_STOPPROF)
		return;
	if ((p->p_flag & P_PROFIL) == 0) {
		p->p_flag |= P_PROFIL;
		mtx_lock_spin(&time_lock);
		if (++profprocs == 1)
			cpu_startprofclock();
		mtx_unlock_spin(&time_lock);
	}
}

/*
 * Stop profiling on a process.
 */
void
stopprofclock(struct proc *p)
{

	PROC_LOCK_ASSERT(p, MA_OWNED);
	if (p->p_flag & P_PROFIL) {
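		/*
		 * Other threads may be in the middle of recording profiling
		 * samples; flag the stop with P_STOPPROF and sleep until
		 * the last of them drops p_profthreads to zero.
		 */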
		if (p->p_profthreads != 0) {
			p->p_flag |= P_STOPPROF;
			while (p->p_profthreads != 0)
				msleep(&p->p_profthreads, &p->p_mtx, PPAUSE,
				    "stopprof", 0);
			p->p_flag &= ~P_STOPPROF;
		}
		if ((p->p_flag & P_PROFIL) == 0)
			return;
		p->p_flag &= ~P_PROFIL;
		mtx_lock_spin(&time_lock);
		if (--profprocs == 0)
			cpu_stopprofclock();
		mtx_unlock_spin(&time_lock);
	}
}

/*
 * Statistics clock.  Updates rusage information and calls the scheduler
 * to adjust priorities of the active thread.
 *
 * This should be called by all active processors.
 */
void
statclock(int usermode)
{
	struct rusage *ru;
	struct vmspace *vm;
	struct thread *td;
	struct proc *p;
	long rss;
	long *cp_time;

	td = curthread;
	p = td->td_proc;

	cp_time = (long *)PCPU_PTR(cp_time);
	if (usermode) {
		/*
		 * Charge the time as appropriate.
		 */
		td->td_uticks++;
		if (p->p_nice > NZERO)
			cp_time[CP_NICE]++;
		else
			cp_time[CP_USER]++;
	} else {
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		if ((td->td_pflags & TDP_ITHREAD) ||
		    td->td_intr_nesting_level >= 2) {
			td->td_iticks++;
			cp_time[CP_INTR]++;
		} else {
			td->td_pticks++;
			td->td_sticks++;
			if (!TD_IS_IDLETHREAD(td))
				cp_time[CP_SYS]++;
			else
				cp_time[CP_IDLE]++;
		}
	}

	/* Update resource usage integrals and maximums. */
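	/*
	 * The ru_ixrss/ru_idrss/ru_isrss integrals accumulate size-in-KB
	 * per statclock tick, matching the getrusage(2) semantics.
	 */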
	MPASS(p->p_vmspace != NULL);
	vm = p->p_vmspace;
	ru = &td->td_ru;
	ru->ru_ixrss += pgtok(vm->vm_tsize);
	ru->ru_idrss += pgtok(vm->vm_dsize);
	ru->ru_isrss += pgtok(vm->vm_ssize);
	rss = pgtok(vmspace_resident_count(vm));
	if (ru->ru_maxrss < rss)
		ru->ru_maxrss = rss;
	CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d",
	    td, td->td_name, td->td_priority, (stathz)?stathz:hz);
	thread_lock_flags(td, MTX_QUIET);
	sched_clock(td);
	thread_unlock(td);
}

void
profclock(int usermode, uintfptr_t pc)
{
	struct thread *td;
#ifdef GPROF
	struct gmonparam *g;
	uintfptr_t i;
#endif

	td = curthread;
	if (usermode) {
		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled, record the tick.
		 * If there is no related user location yet, don't
		 * bother trying to count it.
		 */
		if (td->td_proc->p_flag & P_PROFIL)
			addupc_intr(td, pc, 1);
	}
#ifdef GPROF
	else {
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
		g = &_gmonparam;
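		/*
		 * Map the interrupted PC to a kcount[] slot and bump it if
		 * it falls within the profiled kernel text.
		 */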
		if (g->state == GMON_PROF_ON && pc >= g->lowpc) {
			i = PC_TO_I(g, pc);
			if (i < g->textsize) {
				KCOUNT(g, i)++;
			}
		}
	}
#endif
}

/*
 * Return information about system clocks.
 */
static int
sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
{
	struct clockinfo clkinfo;
	/*
	 * Construct clockinfo structure.
	 */
	bzero(&clkinfo, sizeof(clkinfo));
	clkinfo.hz = hz;
	clkinfo.tick = tick;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;
	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
}

SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
	0, 0, sysctl_kern_clockrate, "S,clockinfo",
	"Rate and period of various kernel clocks");

#ifdef SW_WATCHDOG

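/*
 * Configure the software watchdog from a wd(4) command.  The WD_INTERVAL
 * bits encode the timeout as a power of two in nanoseconds, so
 * (u - WD_TO_1SEC) is the exponent of a power-of-two number of seconds;
 * the timeout is kept in hardclock ticks.  Requests below one second
 * (including WD_TO_NEVER) just disable the software watchdog.
 */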
static void
watchdog_config(void *unused __unused, u_int cmd, int *error)
{
	u_int u;

	u = cmd & WD_INTERVAL;
	if (u >= WD_TO_1SEC) {
		watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz;
		watchdog_enabled = 1;
		*error = 0;
	} else {
		watchdog_enabled = 0;
	}
}

/*
 * Handle a watchdog timeout by dumping interrupt information and
 * then either dropping to DDB or panicking.
 */
static void
watchdog_fire(void)
{
	int nintr;
	u_int64_t inttotal;
	u_long *curintr;
	char *curname;

	curintr = intrcnt;
	curname = intrnames;
	inttotal = 0;
	nintr = eintrcnt - intrcnt;
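	/*
	 * intrcnt[] runs up to eintrcnt; walk it in step with the
	 * NUL-separated names in intrnames[].
	 */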

	printf("interrupt                   total\n");
	while (--nintr >= 0) {
		if (*curintr)
			printf("%-12s %20lu\n", curname, *curintr);
		curname += strlen(curname) + 1;
		inttotal += *curintr++;
	}
	printf("Total        %20ju\n", (uintmax_t)inttotal);

#if defined(KDB) && !defined(KDB_UNATTENDED)
	kdb_backtrace();
	kdb_enter(KDB_WHY_WATCHDOG, "watchdog timeout");
#else
	panic("watchdog timeout");
#endif
}

#endif /* SW_WATCHDOG */
