/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * Copyright (c) 2011, 2015, 2016 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Julien Ridoux at the University
 * of Melbourne under sponsorship from the FreeBSD Foundation.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/kern_tc.c 305866 2016-09-16 10:04:28Z kib $");

#include "opt_compat.h"
#include "opt_ntp.h"
#include "opt_ffclock.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/timeffc.h>
#include <sys/timepps.h>
#include <sys/timetc.h>
#include <sys/timex.h>
#include <sys/vdso.h>

/*
 * A large step happens on boot.  This constant detects such steps.
 * It is relatively small so that ntp_update_second gets called enough
 * in the typical 'missed a couple of seconds' case, but doesn't loop
 * forever when the time step is large.
 */
#define LARGE_STEP	200

/*
 * Implement a dummy timecounter which we can use until we get a real one
 * in the air.  This allows the console and other early stuff to use
 * time services.
 */

static u_int
dummy_get_timecount(struct timecounter *tc)
{
	static u_int now;

	return (++now);
}

static struct timecounter dummy_timecounter = {
	dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000
};

struct timehands {
	/* These fields must be initialized by the driver. */
	struct timecounter	*th_counter;
	int64_t			th_adjustment;
	uint64_t		th_scale;
	u_int			th_offset_count;
	struct bintime		th_offset;
	struct bintime		th_bintime;
	struct timeval		th_microtime;
	struct timespec		th_nanotime;
	struct bintime		th_boottime;
	/* Fields not to be copied in tc_windup start with th_generation. */
	u_int			th_generation;
	struct timehands	*th_next;
};

static struct timehands th0;
static struct timehands th1 = {
	.th_next = &th0
};
static struct timehands th0 = {
	.th_counter = &dummy_timecounter,
	.th_scale = (uint64_t)-1 / 1000000,
	.th_offset = { .sec = 1 },
	.th_generation = 1,
	.th_next = &th1
};

static struct timehands *volatile timehands = &th0;
struct timecounter *timecounter = &dummy_timecounter;
static struct timecounter *timecounters = &dummy_timecounter;

int tc_min_ticktock_freq = 1;

volatile time_t time_second = 1;
volatile time_t time_uptime = 1;

struct bintime boottimebin;
struct timeval boottime;
static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, CTLTYPE_STRUCT|CTLFLAG_RD,
    NULL, 0, sysctl_kern_boottime, "S,timeval", "System boottime");

SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
static SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW, 0, "");

static int timestepwarnings;
SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
    &timestepwarnings, 0, "Log time steps");

struct bintime bt_timethreshold;
struct bintime bt_tickthreshold;
sbintime_t sbt_timethreshold;
sbintime_t sbt_tickthreshold;
struct bintime tc_tick_bt;
sbintime_t tc_tick_sbt;
int tc_precexp;
int tc_timepercentage = TC_DEFAULTPERC;
static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    sysctl_kern_timecounter_adjprecision, "I",
    "Allowed time interval deviation in percents");

static int tc_chosen;	/* Non-zero if a specific tc was chosen via sysctl. */

static void tc_windup(struct bintime *new_boottimebin);
static void cpu_tick_calibrate(int);

void dtrace_getnanotime(struct timespec *tsp);

static int
sysctl_kern_boottime(SYSCTL_HANDLER_ARGS)
{
	struct timeval boottime_x;

	getboottime(&boottime_x);

#ifndef __mips__
#ifdef SCTL_MASK32
	int tv[2];

	if (req->flags & SCTL_MASK32) {
		tv[0] = boottime_x.tv_sec;
		tv[1] = boottime_x.tv_usec;
		return (SYSCTL_OUT(req, tv, sizeof(tv)));
	}
#endif
#endif
	return (SYSCTL_OUT(req, &boottime_x, sizeof(boottime_x)));
}

static int
sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS)
{
	u_int ncount;
	struct timecounter *tc = arg1;

	ncount = tc->tc_get_timecount(tc);
	return (sysctl_handle_int(oidp, &ncount, 0, req));
}

static int
sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS)
{
	uint64_t freq;
	struct timecounter *tc = arg1;

	freq = tc->tc_frequency;
	return (sysctl_handle_64(oidp, &freq, 0, req));
}

/*
 * Return the difference between the timehands' counter value now and what
 * was when we copied it to the timehands' offset_count.
 */
static __inline u_int
tc_delta(struct timehands *th)
{
	struct timecounter *tc;

	tc = th->th_counter;
	return ((tc->tc_get_timecount(tc) - th->th_offset_count) &
	    tc->tc_counter_mask);
}
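
/*
 * Illustrative only (not part of the original file): a worked example of
 * the masked subtraction above.  Assume a hypothetical 8-bit counter
 * (tc_counter_mask == 0xff) that has wrapped since the last windup:
 * th_offset_count == 0xfc and the counter now reads 0x05.  Then
 *
 *	(0x05 - 0xfc) & 0xff == 0x09
 *
 * i.e. 9 ticks elapsed.  The unsigned subtraction plus mask yields the
 * correct delta across a wrap, provided tc_windup() runs at least once
 * per counter rollover.
 */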

/*
 * Functions for reading the time.  We have to loop until we are sure that
 * the timehands that we operated on was not updated under our feet.  See
 * the comment in <sys/time.h> for a description of these 12 functions.
 */

#ifdef FFCLOCK
void
fbclock_binuptime(struct bintime *bt)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*bt = th->th_offset;
		bintime_addx(bt, th->th_scale * tc_delta(th));
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_nanouptime(struct timespec *tsp)
{
	struct bintime bt;

	fbclock_binuptime(&bt);
	bintime2timespec(&bt, tsp);
}

void
fbclock_microuptime(struct timeval *tvp)
{
	struct bintime bt;

	fbclock_binuptime(&bt);
	bintime2timeval(&bt, tvp);
}

void
fbclock_bintime(struct bintime *bt)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*bt = th->th_bintime;
		bintime_addx(bt, th->th_scale * tc_delta(th));
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_nanotime(struct timespec *tsp)
{
	struct bintime bt;

	fbclock_bintime(&bt);
	bintime2timespec(&bt, tsp);
}

void
fbclock_microtime(struct timeval *tvp)
{
	struct bintime bt;

	fbclock_bintime(&bt);
	bintime2timeval(&bt, tvp);
}

void
fbclock_getbinuptime(struct bintime *bt)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*bt = th->th_offset;
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_getnanouptime(struct timespec *tsp)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		bintime2timespec(&th->th_offset, tsp);
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_getmicrouptime(struct timeval *tvp)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		bintime2timeval(&th->th_offset, tvp);
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_getbintime(struct bintime *bt)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*bt = th->th_bintime;
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_getnanotime(struct timespec *tsp)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*tsp = th->th_nanotime;
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
fbclock_getmicrotime(struct timeval *tvp)
{
	struct timehands *th;
	unsigned int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*tvp = th->th_microtime;
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}
#else /* !FFCLOCK */
void
binuptime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*bt = th->th_offset;
		bintime_addx(bt, th->th_scale * tc_delta(th));
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
nanouptime(struct timespec *tsp)
{
	struct bintime bt;

	binuptime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microuptime(struct timeval *tvp)
{
	struct bintime bt;

	binuptime(&bt);
	bintime2timeval(&bt, tvp);
}

void
bintime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*bt = th->th_bintime;
		bintime_addx(bt, th->th_scale * tc_delta(th));
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
nanotime(struct timespec *tsp)
{
	struct bintime bt;

	bintime(&bt);
	bintime2timespec(&bt, tsp);
}

void
microtime(struct timeval *tvp)
{
	struct bintime bt;

	bintime(&bt);
	bintime2timeval(&bt, tvp);
}

void
getbinuptime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*bt = th->th_offset;
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanouptime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		bintime2timespec(&th->th_offset, tsp);
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrouptime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		bintime2timeval(&th->th_offset, tvp);
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
getbintime(struct bintime *bt)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*bt = th->th_bintime;
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
getnanotime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*tsp = th->th_nanotime;
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

void
getmicrotime(struct timeval *tvp)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*tvp = th->th_microtime;
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}
#endif /* FFCLOCK */

void
getboottime(struct timeval *boottime_x)
{
	struct bintime boottimebin_x;

	getboottimebin(&boottimebin_x);
	bintime2timeval(&boottimebin_x, boottime_x);
}

void
getboottimebin(struct bintime *boottimebin_x)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*boottimebin_x = th->th_boottime;
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

#ifdef FFCLOCK
/*
 * Support for feed-forward synchronization algorithms. This is heavily inspired
 * by the timehands mechanism but kept independent from it. *_windup() functions
 * have some connection to avoid accessing the timecounter hardware more than
 * necessary.
 */

/* Feed-forward clock estimates kept updated by the synchronization daemon. */
struct ffclock_estimate ffclock_estimate;
struct bintime ffclock_boottime;	/* Feed-forward boot time estimate. */
uint32_t ffclock_status;		/* Feed-forward clock status. */
int8_t ffclock_updated;			/* New estimates are available. */
struct mtx ffclock_mtx;			/* Mutex on ffclock_estimate. */

struct fftimehands {
	struct ffclock_estimate	cest;
	struct bintime		tick_time;
	struct bintime		tick_time_lerp;
	ffcounter		tick_ffcount;
	uint64_t		period_lerp;
	volatile uint8_t	gen;
	struct fftimehands	*next;
};

#define	NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x))

static struct fftimehands ffth[10];
static struct fftimehands *volatile fftimehands = ffth;

static void
ffclock_init(void)
{
	struct fftimehands *cur;
	struct fftimehands *last;

	memset(ffth, 0, sizeof(ffth));

	last = ffth + NUM_ELEMENTS(ffth) - 1;
	for (cur = ffth; cur < last; cur++)
		cur->next = cur + 1;
	last->next = ffth;

	ffclock_updated = 0;
	ffclock_status = FFCLOCK_STA_UNSYNC;
	mtx_init(&ffclock_mtx, "ffclock lock", NULL, MTX_DEF);
}

/*
 * Reset the feed-forward clock estimates. Called from inittodr() to get things
 * kick-started and uses the timecounter nominal frequency as a first period
 * estimate. Note: this function may be called several times just after boot.
 * Note: this is the only function that sets the value of boot time for the
 * monotonic (i.e. uptime) version of the feed-forward clock.
 */
void
ffclock_reset_clock(struct timespec *ts)
{
	struct timecounter *tc;
	struct ffclock_estimate cest;

	tc = timehands->th_counter;
	memset(&cest, 0, sizeof(struct ffclock_estimate));

	timespec2bintime(ts, &ffclock_boottime);
	timespec2bintime(ts, &(cest.update_time));
	ffclock_read_counter(&cest.update_ffcount);
	cest.leapsec_next = 0;
	cest.period = ((1ULL << 63) / tc->tc_frequency) << 1;
	cest.errb_abs = 0;
	cest.errb_rate = 0;
	cest.status = FFCLOCK_STA_UNSYNC;
	cest.leapsec_total = 0;
	cest.leapsec = 0;

	mtx_lock(&ffclock_mtx);
	bcopy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate));
	ffclock_updated = INT8_MAX;
	mtx_unlock(&ffclock_mtx);

	printf("ffclock reset: %s (%llu Hz), time = %ld.%09lu\n", tc->tc_name,
	    (unsigned long long)tc->tc_frequency, (long)ts->tv_sec,
	    (unsigned long)ts->tv_nsec);
}

/*
 * Sub-routine to convert a time interval measured in RAW counter units to time
 * in seconds stored in bintime format.
 * NOTE: bintime_mul requires u_int, but the value of the ffcounter may be
 * larger than the max value of u_int (on 32 bit architecture). Loop to consume
 * extra cycles.
 */
static void
ffclock_convert_delta(ffcounter ffdelta, uint64_t period, struct bintime *bt)
{
	struct bintime bt2;
	ffcounter delta, delta_max;

	delta_max = (1ULL << (8 * sizeof(unsigned int))) - 1;
	bintime_clear(bt);
	do {
		if (ffdelta > delta_max)
			delta = delta_max;
		else
			delta = ffdelta;
		bt2.sec = 0;
		bt2.frac = period;
		bintime_mul(&bt2, (unsigned int)delta);
		bintime_add(bt, &bt2);
		ffdelta -= delta;
	} while (ffdelta > 0);
}
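
/*
 * Illustrative only (not part of the original file): a worked instance of
 * the conversion above.  The period argument is the counter period in
 * units of 1/2^64 second, as computed by ((1ULL << 63) / tc_frequency) << 1.
 * For a hypothetical 10 MHz counter, period ~= 2^64 / 10^7 ~= 1844674407370,
 * so an ffdelta of 10^7 ticks accumulates 10^7 * period / 2^64 ~= 1.0
 * second, as expected.  The loop merely feeds bintime_mul() chunks that
 * fit in a u_int.
 */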

/*
 * Update the fftimehands.
 * Push the tick ffcount and time(s) forward based on current clock estimate.
 * The conversion from ffcounter to bintime relies on the difference clock
 * principle, whose accuracy relies on computing small time intervals. If a new
 * clock estimate has been passed by the synchronisation daemon, make it
 * current, and compute the linear interpolation for monotonic time if needed.
 */
static void
ffclock_windup(unsigned int delta)
{
	struct ffclock_estimate *cest;
	struct fftimehands *ffth;
	struct bintime bt, gap_lerp;
	ffcounter ffdelta;
	uint64_t frac;
	unsigned int polling;
	uint8_t forward_jump, ogen;

	/*
	 * Pick the next timehand, copy current ffclock estimates and move tick
	 * times and counter forward.
	 */
	forward_jump = 0;
	ffth = fftimehands->next;
	ogen = ffth->gen;
	ffth->gen = 0;
	cest = &ffth->cest;
	bcopy(&fftimehands->cest, cest, sizeof(struct ffclock_estimate));
	ffdelta = (ffcounter)delta;
	ffth->period_lerp = fftimehands->period_lerp;

	ffth->tick_time = fftimehands->tick_time;
	ffclock_convert_delta(ffdelta, cest->period, &bt);
	bintime_add(&ffth->tick_time, &bt);

	ffth->tick_time_lerp = fftimehands->tick_time_lerp;
	ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt);
	bintime_add(&ffth->tick_time_lerp, &bt);

	ffth->tick_ffcount = fftimehands->tick_ffcount + ffdelta;

	/*
	 * Assess the status of the clock: if the last update is too old, it is
	 * likely the synchronisation daemon is dead and the clock is free
	 * running.
	 */
	if (ffclock_updated == 0) {
		ffdelta = ffth->tick_ffcount - cest->update_ffcount;
		ffclock_convert_delta(ffdelta, cest->period, &bt);
		if (bt.sec > 2 * FFCLOCK_SKM_SCALE)
			ffclock_status |= FFCLOCK_STA_UNSYNC;
	}

	/*
	 * If available, grab updated clock estimates and make them current.
	 * Recompute time at this tick using the updated estimates. The clock
	 * estimates passed by the feed-forward synchronisation daemon may
	 * result in time conversion that is not monotonically increasing (just
	 * after the update). time_lerp is a particular linear interpolation
	 * over the synchronisation algo polling period that ensures
	 * monotonicity for the clock ids requesting it.
	 */
	if (ffclock_updated > 0) {
		bcopy(&ffclock_estimate, cest, sizeof(struct ffclock_estimate));
		ffdelta = ffth->tick_ffcount - cest->update_ffcount;
		ffth->tick_time = cest->update_time;
		ffclock_convert_delta(ffdelta, cest->period, &bt);
		bintime_add(&ffth->tick_time, &bt);

		/* ffclock_reset sets ffclock_updated to INT8_MAX */
		if (ffclock_updated == INT8_MAX)
			ffth->tick_time_lerp = ffth->tick_time;

		if (bintime_cmp(&ffth->tick_time, &ffth->tick_time_lerp, >))
			forward_jump = 1;
		else
			forward_jump = 0;

		bintime_clear(&gap_lerp);
		if (forward_jump) {
			gap_lerp = ffth->tick_time;
			bintime_sub(&gap_lerp, &ffth->tick_time_lerp);
		} else {
			gap_lerp = ffth->tick_time_lerp;
			bintime_sub(&gap_lerp, &ffth->tick_time);
		}

		/*
		 * The reset from the RTC clock may be far from accurate, and
		 * reducing the gap between real time and interpolated time
		 * could take a very long time if the interpolated clock insists
		 * on strict monotonicity. The clock is reset under very strict
		 * conditions (kernel time is known to be wrong and the
		 * synchronization daemon has been restarted recently).
		 * ffclock_boottime absorbs the jump to ensure boot time is
		 * correct and uptime functions stay consistent.
		 */
		if (((ffclock_status & FFCLOCK_STA_UNSYNC) == FFCLOCK_STA_UNSYNC) &&
		    ((cest->status & FFCLOCK_STA_UNSYNC) == 0) &&
		    ((cest->status & FFCLOCK_STA_WARMUP) == FFCLOCK_STA_WARMUP)) {
			if (forward_jump)
				bintime_add(&ffclock_boottime, &gap_lerp);
			else
				bintime_sub(&ffclock_boottime, &gap_lerp);
			ffth->tick_time_lerp = ffth->tick_time;
			bintime_clear(&gap_lerp);
		}

		ffclock_status = cest->status;
		ffth->period_lerp = cest->period;

		/*
		 * Compute corrected period used for the linear interpolation of
		 * time. The rate of linear interpolation is capped to 5000PPM
		 * (5ms/s).
		 */
		if (bintime_isset(&gap_lerp)) {
			ffdelta = cest->update_ffcount;
			ffdelta -= fftimehands->cest.update_ffcount;
			ffclock_convert_delta(ffdelta, cest->period, &bt);
			polling = bt.sec;
			bt.sec = 0;
			bt.frac = 5000000 * (uint64_t)18446744073LL;
			bintime_mul(&bt, polling);
			if (bintime_cmp(&gap_lerp, &bt, >))
				gap_lerp = bt;

			/* Approximate 1 sec by 1-(1/2^64) to ease arithmetic */
			frac = 0;
			if (gap_lerp.sec > 0) {
				frac -= 1;
				frac /= ffdelta / gap_lerp.sec;
			}
			frac += gap_lerp.frac / ffdelta;

			if (forward_jump)
				ffth->period_lerp += frac;
			else
				ffth->period_lerp -= frac;
		}

		ffclock_updated = 0;
	}
	if (++ogen == 0)
		ogen = 1;
	ffth->gen = ogen;
	fftimehands = ffth;
}
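
/*
 * Illustrative only (not part of the original file): sanity-checking the
 * 5000 PPM cap above.  18446744073 == floor(2^64 / 10^9), i.e. one
 * nanosecond expressed in 1/2^64 fractions of a second, so
 * 5000000 * 18446744073 is a bintime fraction of 5 ms.  Multiplying by
 * 'polling' (the daemon poll period in whole seconds) bounds the gap the
 * interpolated clock may close to 5 ms per elapsed second, i.e. 5000 PPM.
 */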

/*
 * Adjust the fftimehands when the timecounter is changed. Stating the obvious,
 * the old and new hardware counter cannot be read simultaneously. tc_windup()
 * does read the two counters 'back to back', but a few cycles are effectively
 * lost, and not accumulated in tick_ffcount. This is a fairly radical
 * operation for a feed-forward synchronization daemon, and it is its job not
 * to push irrelevant data to the kernel. Because there is no locking here,
 * simply force the pending or next update to be ignored, to give the daemon a
 * chance to realize the counter has changed.
 */
static void
ffclock_change_tc(struct timehands *th)
{
	struct fftimehands *ffth;
	struct ffclock_estimate *cest;
	struct timecounter *tc;
	uint8_t ogen;

	tc = th->th_counter;
	ffth = fftimehands->next;
	ogen = ffth->gen;
	ffth->gen = 0;

	cest = &ffth->cest;
	bcopy(&(fftimehands->cest), cest, sizeof(struct ffclock_estimate));
	cest->period = ((1ULL << 63) / tc->tc_frequency) << 1;
	cest->errb_abs = 0;
	cest->errb_rate = 0;
	cest->status |= FFCLOCK_STA_UNSYNC;

	ffth->tick_ffcount = fftimehands->tick_ffcount;
	ffth->tick_time_lerp = fftimehands->tick_time_lerp;
	ffth->tick_time = fftimehands->tick_time;
	ffth->period_lerp = cest->period;

	/* Do not lock but ignore next update from synchronization daemon. */
	ffclock_updated--;

	if (++ogen == 0)
		ogen = 1;
	ffth->gen = ogen;
	fftimehands = ffth;
}

/*
 * Retrieve feed-forward counter and time of last kernel tick.
 */
void
ffclock_last_tick(ffcounter *ffcount, struct bintime *bt, uint32_t flags)
{
	struct fftimehands *ffth;
	uint8_t gen;

	/* No locking, but check that the generation has not changed. */
	do {
		ffth = fftimehands;
		gen = ffth->gen;
		if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP)
			*bt = ffth->tick_time_lerp;
		else
			*bt = ffth->tick_time;
		*ffcount = ffth->tick_ffcount;
	} while (gen == 0 || gen != ffth->gen);
}

/*
 * Absolute clock conversion. Low level function to convert ffcounter to
 * bintime. The ffcounter is converted using the current ffclock period estimate
 * or the "interpolated period" to ensure monotonicity.
 * NOTE: this conversion may have been deferred, and the clock updated since the
 * hardware counter has been read.
 */
void
ffclock_convert_abs(ffcounter ffcount, struct bintime *bt, uint32_t flags)
{
	struct fftimehands *ffth;
	struct bintime bt2;
	ffcounter ffdelta;
	uint8_t gen;

	/*
	 * No locking but check generation has not changed. Also need to make
	 * sure ffdelta is positive, i.e. ffcount > tick_ffcount.
	 */
	do {
		ffth = fftimehands;
		gen = ffth->gen;
		if (ffcount > ffth->tick_ffcount)
			ffdelta = ffcount - ffth->tick_ffcount;
		else
			ffdelta = ffth->tick_ffcount - ffcount;

		if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) {
			*bt = ffth->tick_time_lerp;
			ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt2);
		} else {
			*bt = ffth->tick_time;
			ffclock_convert_delta(ffdelta, ffth->cest.period, &bt2);
		}

		if (ffcount > ffth->tick_ffcount)
			bintime_add(bt, &bt2);
		else
			bintime_sub(bt, &bt2);
	} while (gen == 0 || gen != ffth->gen);
}

/*
 * Difference clock conversion.
 * Low level function to convert a time interval measured in RAW counter units
 * into bintime. The difference clock allows measuring small intervals much more
 * reliably than the absolute clock.
 */
void
ffclock_convert_diff(ffcounter ffdelta, struct bintime *bt)
{
	struct fftimehands *ffth;
	uint8_t gen;

	/* No locking but check generation has not changed. */
	do {
		ffth = fftimehands;
		gen = ffth->gen;
		ffclock_convert_delta(ffdelta, ffth->cest.period, bt);
	} while (gen == 0 || gen != ffth->gen);
}

/*
 * Access to current ffcounter value.
 */
void
ffclock_read_counter(ffcounter *ffcount)
{
	struct timehands *th;
	struct fftimehands *ffth;
	unsigned int gen, delta;

	/*
	 * ffclock_windup() called from tc_windup(), safe to rely on
	 * th->th_generation only, for correct delta and ffcounter.
	 */
	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		ffth = fftimehands;
		delta = tc_delta(th);
		*ffcount = ffth->tick_ffcount;
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);

	*ffcount += delta;
}

void
binuptime(struct bintime *bt)
{

	binuptime_fromclock(bt, sysclock_active);
}

void
nanouptime(struct timespec *tsp)
{

	nanouptime_fromclock(tsp, sysclock_active);
}

void
microuptime(struct timeval *tvp)
{

	microuptime_fromclock(tvp, sysclock_active);
}

void
bintime(struct bintime *bt)
{

	bintime_fromclock(bt, sysclock_active);
}

void
nanotime(struct timespec *tsp)
{

	nanotime_fromclock(tsp, sysclock_active);
}

void
microtime(struct timeval *tvp)
{

	microtime_fromclock(tvp, sysclock_active);
}

void
getbinuptime(struct bintime *bt)
{

	getbinuptime_fromclock(bt, sysclock_active);
}

void
getnanouptime(struct timespec *tsp)
{

	getnanouptime_fromclock(tsp, sysclock_active);
}

void
getmicrouptime(struct timeval *tvp)
{

	getmicrouptime_fromclock(tvp, sysclock_active);
}

void
getbintime(struct bintime *bt)
{

	getbintime_fromclock(bt, sysclock_active);
}

void
getnanotime(struct timespec *tsp)
{

	getnanotime_fromclock(tsp, sysclock_active);
}

void
getmicrotime(struct timeval *tvp)
{

	getmicrotime_fromclock(tvp, sysclock_active);
}

#endif /* FFCLOCK */

/*
 * This is a clone of getnanotime and used for walltimestamps.
 * The dtrace_ prefix prevents fbt from creating probes for
 * it so walltimestamp can be safely used in all fbt probes.
 */
void
dtrace_getnanotime(struct timespec *tsp)
{
	struct timehands *th;
	u_int gen;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		*tsp = th->th_nanotime;
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);
}

/*
 * System clock currently providing time to the system. Modifiable via sysctl
 * when the FFCLOCK option is defined.
 */
int sysclock_active = SYSCLOCK_FBCK;

/* Internal NTP status and error estimates. */
extern int time_status;
extern long time_esterror;

/*
 * Take a snapshot of sysclock data which can be used to compare system clocks
 * and generate timestamps after the fact.
 */
void
sysclock_getsnapshot(struct sysclock_snap *clock_snap, int fast)
{
	struct fbclock_info *fbi;
	struct timehands *th;
	struct bintime bt;
	unsigned int delta, gen;
#ifdef FFCLOCK
	ffcounter ffcount;
	struct fftimehands *ffth;
	struct ffclock_info *ffi;
	struct ffclock_estimate cest;

	ffi = &clock_snap->ff_info;
#endif

	fbi = &clock_snap->fb_info;
	delta = 0;

	do {
		th = timehands;
		gen = atomic_load_acq_int(&th->th_generation);
		fbi->th_scale = th->th_scale;
		fbi->tick_time = th->th_offset;
#ifdef FFCLOCK
		ffth = fftimehands;
		ffi->tick_time = ffth->tick_time;
		ffi->tick_time_lerp = ffth->tick_time_lerp;
		ffi->period = ffth->cest.period;
		ffi->period_lerp = ffth->period_lerp;
		clock_snap->ffcount = ffth->tick_ffcount;
		cest = ffth->cest;
#endif
		if (!fast)
			delta = tc_delta(th);
		atomic_thread_fence_acq();
	} while (gen == 0 || gen != th->th_generation);

	clock_snap->delta = delta;
	clock_snap->sysclock_active = sysclock_active;

	/* Record feedback clock status and error. */
	clock_snap->fb_info.status = time_status;
	/* XXX: Very crude estimate of feedback clock error. */
	bt.sec = time_esterror / 1000000;
	bt.frac = (time_esterror - bt.sec * 1000000) *
	    (uint64_t)18446744073709ULL;
	clock_snap->fb_info.error = bt;

#ifdef FFCLOCK
	if (!fast)
		clock_snap->ffcount += delta;

	/* Record feed-forward clock leap second adjustment. */
	ffi->leapsec_adjustment = cest.leapsec_total;
	if (clock_snap->ffcount > cest.leapsec_next)
		ffi->leapsec_adjustment -= cest.leapsec;

	/* Record feed-forward clock status and error. */
	clock_snap->ff_info.status = cest.status;
	ffcount = clock_snap->ffcount - cest.update_ffcount;
	ffclock_convert_delta(ffcount, cest.period, &bt);
	/* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s]. */
	bintime_mul(&bt, cest.errb_rate * (uint64_t)18446744073709ULL);
	/* 18446744073 = int(2^64 / 1e9), since err_abs in [ns]. */
	bintime_addx(&bt, cest.errb_abs * (uint64_t)18446744073ULL);
	clock_snap->ff_info.error = bt;
#endif
}

/*
 * Convert a sysclock snapshot into a struct bintime based on the specified
 * clock source and flags.
 */
int
sysclock_snap2bintime(struct sysclock_snap *cs, struct bintime *bt,
    int whichclock, uint32_t flags)
{
	struct bintime boottimebin_x;
#ifdef FFCLOCK
	struct bintime bt2;
	uint64_t period;
#endif

	switch (whichclock) {
	case SYSCLOCK_FBCK:
		*bt = cs->fb_info.tick_time;

		/* If snapshot was created with !fast, delta will be >0. */
		if (cs->delta > 0)
			bintime_addx(bt, cs->fb_info.th_scale * cs->delta);

		if ((flags & FBCLOCK_UPTIME) == 0) {
			getboottimebin(&boottimebin_x);
			bintime_add(bt, &boottimebin_x);
		}
		break;
#ifdef FFCLOCK
	case SYSCLOCK_FFWD:
		if (flags & FFCLOCK_LERP) {
			*bt = cs->ff_info.tick_time_lerp;
			period = cs->ff_info.period_lerp;
		} else {
			*bt = cs->ff_info.tick_time;
			period = cs->ff_info.period;
		}

		/* If snapshot was created with !fast, delta will be >0. */
		if (cs->delta > 0) {
			ffclock_convert_delta(cs->delta, period, &bt2);
			bintime_add(bt, &bt2);
		}

		/* Leap second adjustment. */
		if (flags & FFCLOCK_LEAPSEC)
			bt->sec -= cs->ff_info.leapsec_adjustment;

		/* Boot time adjustment, for uptime/monotonic clocks. */
		if (flags & FFCLOCK_UPTIME)
			bintime_sub(bt, &ffclock_boottime);
		break;
#endif
	default:
		return (EINVAL);
	}

	return (0);
}
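
/*
 * Illustrative only (not part of the original file): a minimal sketch of
 * how a kernel consumer might use the snapshot API to timestamp one event
 * after the fact.  Compiled out; the function name is hypothetical.
 */
#if 0
static void
example_snapshot_usage(void)
{
	struct sysclock_snap snap;
	struct bintime fb_bt;

	/* One counter read, reusable for several conversions later. */
	sysclock_getsnapshot(&snap, 0);
	/* Convert against the feedback clock; flags 0 yields UTC. */
	if (sysclock_snap2bintime(&snap, &fb_bt, SYSCLOCK_FBCK, 0) == 0)
		printf("feedback UTC: %jd.%016jx\n", (intmax_t)fb_bt.sec,
		    (uintmax_t)fb_bt.frac);
}
#endif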

/*
 * Initialize a new timecounter and possibly use it.
 */
void
tc_init(struct timecounter *tc)
{
	u_int u;
	struct sysctl_oid *tc_root;

	u = tc->tc_frequency / tc->tc_counter_mask;
	/* XXX: We need some margin here, 10% is a guess */
	u *= 11;
	u /= 10;
	if (u > hz && tc->tc_quality >= 0) {
		tc->tc_quality = -2000;
		if (bootverbose) {
			printf("Timecounter \"%s\" frequency %ju Hz",
			    tc->tc_name, (uintmax_t)tc->tc_frequency);
			printf(" -- Insufficient hz, needs at least %u\n", u);
		}
	} else if (tc->tc_quality >= 0 || bootverbose) {
		printf("Timecounter \"%s\" frequency %ju Hz quality %d\n",
		    tc->tc_name, (uintmax_t)tc->tc_frequency,
		    tc->tc_quality);
	}

	tc->tc_next = timecounters;
	timecounters = tc;
	/*
	 * Set up sysctl tree for this counter.
	 */
	tc_root = SYSCTL_ADD_NODE(NULL,
	    SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name,
	    CTLFLAG_RW, 0, "timecounter description");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
	    "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0,
	    "mask for implemented bits");
	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
	    "counter", CTLTYPE_UINT | CTLFLAG_RD, tc, sizeof(*tc),
	    sysctl_kern_timecounter_get, "IU", "current timecounter value");
	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
	    "frequency", CTLTYPE_U64 | CTLFLAG_RD, tc, sizeof(*tc),
	    sysctl_kern_timecounter_freq, "QU", "timecounter frequency");
	SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
	    "quality", CTLFLAG_RD, &(tc->tc_quality), 0,
	    "goodness of time counter");
	/*
	 * Do not automatically switch if the current tc was specifically
	 * chosen.  Never automatically use a timecounter with negative quality.
	 * Even though we run on the dummy counter, switching here may be
	 * worse since this timecounter may not be monotonic.
	 */
	if (tc_chosen)
		return;
	if (tc->tc_quality < 0)
		return;
	if (tc->tc_quality < timecounter->tc_quality)
		return;
	if (tc->tc_quality == timecounter->tc_quality &&
	    tc->tc_frequency < timecounter->tc_frequency)
		return;
	(void)tc->tc_get_timecount(tc);
	(void)tc->tc_get_timecount(tc);
	timecounter = tc;
}
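
/*
 * Illustrative only (not part of the original file): a minimal sketch of
 * how a hardware driver would register with tc_init().  The device name,
 * read routine and frequency below are hypothetical.
 */
#if 0
static u_int
example_get_timecount(struct timecounter *tc)
{

	/* Read the free-running hardware register here. */
	return (example_read_hw_counter());	/* hypothetical helper */
}

static struct timecounter example_timecounter = {
	.tc_get_timecount = example_get_timecount,
	.tc_counter_mask = ~0u,		/* full 32-bit counter */
	.tc_frequency = 24000000,	/* 24 MHz, known fixed rate */
	.tc_name = "example",
	.tc_quality = 800,
};

static void
example_attach(void)
{

	tc_init(&example_timecounter);	/* may become the active counter */
}
#endif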

/* Report the frequency of the current timecounter. */
uint64_t
tc_getfrequency(void)
{

	return (timehands->th_counter->tc_frequency);
}

static struct mtx tc_setclock_mtx;
MTX_SYSINIT(tc_setclock_init, &tc_setclock_mtx, "tcsetc", MTX_SPIN);

/*
 * Step our concept of UTC.  This is done by modifying our estimate of
 * when we booted.
 */
void
tc_setclock(struct timespec *ts)
{
	struct timespec tbef, taft;
	struct bintime bt, bt2;

	timespec2bintime(ts, &bt);
	nanotime(&tbef);
	mtx_lock_spin(&tc_setclock_mtx);
	cpu_tick_calibrate(1);
	binuptime(&bt2);
	bintime_sub(&bt, &bt2);

	/* XXX fiddle all the little crinkly bits around the fiords... */
	tc_windup(&bt);
	mtx_unlock_spin(&tc_setclock_mtx);
	getboottimebin(&boottimebin);
	bintime2timeval(&boottimebin, &boottime);
	if (timestepwarnings) {
		nanotime(&taft);
		log(LOG_INFO,
		    "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n",
		    (intmax_t)tbef.tv_sec, tbef.tv_nsec,
		    (intmax_t)taft.tv_sec, taft.tv_nsec,
		    (intmax_t)ts->tv_sec, ts->tv_nsec);
	}
}

/*
 * Initialize the next struct timehands in the ring and make
 * it the active timehands.  Along the way we might switch to a different
 * timecounter and/or do seconds processing in NTP.  Slightly magic.
 */
static void
tc_windup(struct bintime *new_boottimebin)
{
	struct bintime bt;
	struct timehands *th, *tho;
	uint64_t scale;
	u_int delta, ncount, ogen;
	int i;
	time_t t;

	/*
	 * Make the next timehands a copy of the current one, but do
	 * not overwrite the generation or next pointer.  While we
	 * update the contents, the generation must be zero.  We need
	 * to ensure that the zero generation is visible before the
	 * data updates become visible, which requires release fence.
	 * For similar reasons, re-reading of the generation after the
	 * data is read should use acquire fence.
	 */
	tho = timehands;
	th = tho->th_next;
	ogen = th->th_generation;
	th->th_generation = 0;
	atomic_thread_fence_rel();
	bcopy(tho, th, offsetof(struct timehands, th_generation));
	if (new_boottimebin != NULL)
		th->th_boottime = *new_boottimebin;

	/*
	 * Capture a timecounter delta on the current timecounter and if
	 * changing timecounters, a counter value from the new timecounter.
	 * Update the offset fields accordingly.
	 */
	delta = tc_delta(th);
	if (th->th_counter != timecounter)
		ncount = timecounter->tc_get_timecount(timecounter);
	else
		ncount = 0;
#ifdef FFCLOCK
	ffclock_windup(delta);
#endif
	th->th_offset_count += delta;
	th->th_offset_count &= th->th_counter->tc_counter_mask;
	while (delta > th->th_counter->tc_frequency) {
		/* Eat complete unadjusted seconds. */
		delta -= th->th_counter->tc_frequency;
		th->th_offset.sec++;
	}
	if ((delta > th->th_counter->tc_frequency / 2) &&
	    (th->th_scale * delta < ((uint64_t)1 << 63))) {
		/* The product th_scale * delta just barely overflows. */
		th->th_offset.sec++;
	}
	bintime_addx(&th->th_offset, th->th_scale * delta);

	/*
	 * Hardware latching timecounters may not generate interrupts on
	 * PPS events, so instead we poll them.  There is a finite risk that
	 * the hardware might capture a count which is later than the one we
	 * got above, and therefore possibly in the next NTP second which might
	 * have a different rate than the current NTP second.  It doesn't
	 * matter in practice.
	 */
	if (tho->th_counter->tc_poll_pps)
		tho->th_counter->tc_poll_pps(tho->th_counter);

	/*
	 * Deal with NTP second processing.  The for loop normally
	 * iterates at most once, but in extreme situations it might
	 * keep NTP sane if timeouts are not run for several seconds.
	 * At boot, the time step can be large when the TOD hardware
	 * has been read, so on really large steps, we call
	 * ntp_update_second only twice.  We need to call it twice in
	 * case we missed a leap second.
	 */
	bt = th->th_offset;
	bintime_add(&bt, &th->th_boottime);
	i = bt.sec - tho->th_microtime.tv_sec;
	if (i > LARGE_STEP)
		i = 2;
	for (; i > 0; i--) {
		t = bt.sec;
		ntp_update_second(&th->th_adjustment, &bt.sec);
		if (bt.sec != t)
			th->th_boottime.sec += bt.sec - t;
	}
	th->th_bintime = th->th_offset;
	bintime_add(&th->th_bintime, &th->th_boottime);
	/* Update the UTC timestamps used by the get*() functions. */
	/* XXX shouldn't do this here.  Should force non-`get' versions. */
	bintime2timeval(&bt, &th->th_microtime);
	bintime2timespec(&bt, &th->th_nanotime);

	/* Now is a good time to change timecounters. */
	if (th->th_counter != timecounter) {
#ifndef __arm__
		if ((timecounter->tc_flags & TC_FLAGS_C2STOP) != 0)
			cpu_disable_c2_sleep++;
		if ((th->th_counter->tc_flags & TC_FLAGS_C2STOP) != 0)
			cpu_disable_c2_sleep--;
#endif
		th->th_counter = timecounter;
		th->th_offset_count = ncount;
		tc_min_ticktock_freq = max(1, timecounter->tc_frequency /
		    (((uint64_t)timecounter->tc_counter_mask + 1) / 3));
#ifdef FFCLOCK
		ffclock_change_tc(th);
#endif
	}

	/*-
	 * Recalculate the scaling factor.  We want the number of 1/2^64
	 * fractions of a second per period of the hardware counter, taking
	 * into account the th_adjustment factor which the NTP PLL/adjtime(2)
	 * processing provides us with.
	 *
	 * The th_adjustment is nanoseconds per second with 32 bit binary
	 * fraction and we want 64 bit binary fraction of second:
	 *
	 *	 x = a * 2^32 / 10^9 = a * 4.294967296
	 *
	 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
	 * we can only multiply by about 850 without overflowing, that
	 * leaves no suitably precise fractions for multiply before divide.
	 *
	 * Divide before multiply with a fraction of 2199/512 results in a
	 * systematic undercompensation of 10PPM of th_adjustment.  On a
	 * 5000PPM adjustment this is a 0.05PPM error.  This is acceptable.
	 *
	 * We happily sacrifice the lowest of the 64 bits of our result
	 * to the goddess of code clarity.
	 */
	scale = (uint64_t)1 << 63;
	scale += (th->th_adjustment / 1024) * 2199;
	scale /= th->th_counter->tc_frequency;
	th->th_scale = scale * 2;
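
	/*
	 * Illustrative only (not part of the original file): checking the
	 * 2199/512 approximation used above.  (th_adjustment / 1024) * 2199,
	 * doubled by the final '* 2', equals th_adjustment * 2199 / 512.
	 * 2199/512 = 4.294921875 versus the exact 2^32/10^9 = 4.294967296,
	 * a ratio of about 0.9999894, which is the ~10 PPM systematic
	 * undercompensation quoted in the comment.
	 */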

	/*
	 * Now that the struct timehands is again consistent, set the new
	 * generation number, making sure to not make it zero.
	 */
	if (++ogen == 0)
		ogen = 1;
	atomic_store_rel_int(&th->th_generation, ogen);

	/* Go live with the new struct timehands. */
#ifdef FFCLOCK
	switch (sysclock_active) {
	case SYSCLOCK_FBCK:
#endif
		time_second = th->th_microtime.tv_sec;
		time_uptime = th->th_offset.sec;
#ifdef FFCLOCK
		break;
	case SYSCLOCK_FFWD:
		time_second = fftimehands->tick_time_lerp.sec;
		time_uptime = fftimehands->tick_time_lerp.sec -
		    ffclock_boottime.sec;
		break;
	}
#endif

	timehands = th;
	timekeep_push_vdso();
}

/* Report or change the active timecounter hardware. */
static int
sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS)
{
	char newname[32];
	struct timecounter *newtc, *tc;
	int error;

	tc = timecounter;
	strlcpy(newname, tc->tc_name, sizeof(newname));

	error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	/* Record that the tc in use now was specifically chosen. */
	tc_chosen = 1;
	if (strcmp(newname, tc->tc_name) == 0)
		return (0);
	for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
		if (strcmp(newname, newtc->tc_name) != 0)
			continue;

		/* Warm up new timecounter. */
		(void)newtc->tc_get_timecount(newtc);
		(void)newtc->tc_get_timecount(newtc);

		timecounter = newtc;

		/*
		 * The vdso timehands update is deferred until the next
		 * 'tc_windup()'.
		 *
		 * This is prudent given that 'timekeep_push_vdso()' does not
		 * use any locking and that it can be called in hard interrupt
		 * context via 'tc_windup()'.
		 */
		return (0);
	}
	return (EINVAL);
}

SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW,
    0, 0, sysctl_kern_timecounter_hardware, "A",
    "Timecounter hardware selected");

/* Report the available timecounter hardware. */
static int
sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sb;
	struct timecounter *tc;
	int error;

	sbuf_new_for_sysctl(&sb, NULL, 0, req);
	for (tc = timecounters; tc != NULL; tc = tc->tc_next) {
		if (tc != timecounters)
			sbuf_putc(&sb, ' ');
		sbuf_printf(&sb, "%s(%d)", tc->tc_name, tc->tc_quality);
	}
	error = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error);
}

SYSCTL_PROC(_kern_timecounter, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD,
    0, 0, sysctl_kern_timecounter_choice, "A", "Timecounter hardware detected");

/*
 * RFC 2783 PPS-API implementation.
 */
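
/*
 * Illustrative only (not part of the original file): the userland view of
 * this API, as specified by RFC 2783 and exposed through <sys/timepps.h>.
 * A minimal, mostly unchecked sketch; the device path is hypothetical and
 * teardown (time_pps_destroy, close) is omitted for brevity.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <sys/timepps.h>

int
example_pps_wait(void)
{
	pps_handle_t handle;
	pps_info_t info;
	struct timespec timeout = { 3, 0 };
	int fd;

	fd = open("/dev/ppsexample", O_RDONLY);	/* hypothetical device */
	if (fd < 0 || time_pps_create(fd, &handle) < 0)
		return (-1);
	/* Blocks in pps_fetch() above until the next assert/clear event. */
	if (time_pps_fetch(handle, PPS_TSFMT_TSPEC, &info, &timeout) < 0)
		return (-1);
	printf("assert #%lu at %jd.%09ld\n",
	    (unsigned long)info.assert_sequence,
	    (intmax_t)info.assert_timestamp.tv_sec,
	    info.assert_timestamp.tv_nsec);
	return (0);
}
#endif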

/*
 * Return true if the driver is aware of the abi version extensions in the
 * pps_state structure, and it supports at least the given abi version number.
 */
static inline int
abi_aware(struct pps_state *pps, int vers)
{

	return ((pps->kcmode & KCMODE_ABIFLAG) && pps->driver_abi >= vers);
}

static int
pps_fetch(struct pps_fetch_args *fapi, struct pps_state *pps)
{
	int err, timo;
	pps_seq_t aseq, cseq;
	struct timeval tv;

	if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
		return (EINVAL);

	/*
	 * If no timeout is requested, immediately return whatever values were
	 * most recently captured.  If timeout seconds is -1, that's a request
	 * to block without a timeout.  WITNESS won't let us sleep forever
	 * without a lock (we really don't need a lock), so just repeatedly
	 * sleep a long time.
	 */
	if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) {
		if (fapi->timeout.tv_sec == -1)
			timo = 0x7fffffff;
		else {
			tv.tv_sec = fapi->timeout.tv_sec;
			tv.tv_usec = fapi->timeout.tv_nsec / 1000;
			timo = tvtohz(&tv);
		}
		aseq = pps->ppsinfo.assert_sequence;
		cseq = pps->ppsinfo.clear_sequence;
		while (aseq == pps->ppsinfo.assert_sequence &&
		    cseq == pps->ppsinfo.clear_sequence) {
			if (abi_aware(pps, 1) && pps->driver_mtx != NULL) {
				if (pps->flags & PPSFLAG_MTX_SPIN) {
					err = msleep_spin(pps, pps->driver_mtx,
					    "ppsfch", timo);
				} else {
					err = msleep(pps, pps->driver_mtx, PCATCH,
					    "ppsfch", timo);
				}
			} else {
				err = tsleep(pps, PCATCH, "ppsfch", timo);
			}
			if (err == EWOULDBLOCK) {
				if (fapi->timeout.tv_sec == -1) {
					continue;
				} else {
					return (ETIMEDOUT);
				}
			} else if (err != 0) {
				return (err);
			}
		}
	}

	pps->ppsinfo.current_mode = pps->ppsparam.mode;
	fapi->pps_info_buf = pps->ppsinfo;

	return (0);
}

int
pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
{
	pps_params_t *app;
	struct pps_fetch_args *fapi;
#ifdef FFCLOCK
	struct pps_fetch_ffc_args *fapi_ffc;
#endif
#ifdef PPS_SYNC
	struct pps_kcbind_args *kapi;
#endif

	KASSERT(pps != NULL, ("NULL pps pointer in pps_ioctl"));
	switch (cmd) {
	case PPS_IOC_CREATE:
		return (0);
	case PPS_IOC_DESTROY:
		return (0);
	case PPS_IOC_SETPARAMS:
		app = (pps_params_t *)data;
		if (app->mode & ~pps->ppscap)
			return (EINVAL);
#ifdef FFCLOCK
		/* Ensure only a single clock is selected for ffc timestamp. */
		if ((app->mode & PPS_TSCLK_MASK) == PPS_TSCLK_MASK)
			return (EINVAL);
#endif
		pps->ppsparam = *app;
		return (0);
	case PPS_IOC_GETPARAMS:
		app = (pps_params_t *)data;
		*app = pps->ppsparam;
		app->api_version = PPS_API_VERS_1;
		return (0);
	case PPS_IOC_GETCAP:
		*(int*)data = pps->ppscap;
		return (0);
	case PPS_IOC_FETCH:
		fapi = (struct pps_fetch_args *)data;
		return (pps_fetch(fapi, pps));
#ifdef FFCLOCK
	case PPS_IOC_FETCH_FFCOUNTER:
		fapi_ffc = (struct pps_fetch_ffc_args *)data;
		if (fapi_ffc->tsformat && fapi_ffc->tsformat !=
		    PPS_TSFMT_TSPEC)
			return (EINVAL);
		if (fapi_ffc->timeout.tv_sec || fapi_ffc->timeout.tv_nsec)
			return (EOPNOTSUPP);
		pps->ppsinfo_ffc.current_mode = pps->ppsparam.mode;
		fapi_ffc->pps_info_buf_ffc = pps->ppsinfo_ffc;
		/* Overwrite timestamps if feedback clock selected. */
		switch (pps->ppsparam.mode & PPS_TSCLK_MASK) {
		case PPS_TSCLK_FBCK:
			fapi_ffc->pps_info_buf_ffc.assert_timestamp =
			    pps->ppsinfo.assert_timestamp;
			fapi_ffc->pps_info_buf_ffc.clear_timestamp =
			    pps->ppsinfo.clear_timestamp;
			break;
		case PPS_TSCLK_FFWD:
			break;
		default:
			break;
		}
		return (0);
#endif /* FFCLOCK */
	case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
		kapi = (struct pps_kcbind_args *)data;
		/* XXX Only root should be able to do this */
		if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
			return (EINVAL);
		if (kapi->kernel_consumer != PPS_KC_HARDPPS)
			return (EINVAL);
		if (kapi->edge & ~pps->ppscap)
			return (EINVAL);
		pps->kcmode = (kapi->edge & KCMODE_EDGEMASK) |
		    (pps->kcmode & KCMODE_ABIFLAG);
		return (0);
#else
		return (EOPNOTSUPP);
#endif
	default:
		return (ENOIOCTL);
	}
}

void
pps_init(struct pps_state *pps)
{
	pps->ppscap |= PPS_TSFMT_TSPEC | PPS_CANWAIT;
	if (pps->ppscap & PPS_CAPTUREASSERT)
		pps->ppscap |= PPS_OFFSETASSERT;
	if (pps->ppscap & PPS_CAPTURECLEAR)
		pps->ppscap |= PPS_OFFSETCLEAR;
#ifdef FFCLOCK
	pps->ppscap |= PPS_TSCLK_MASK;
#endif
	pps->kcmode &= ~KCMODE_ABIFLAG;
}

void
pps_init_abi(struct pps_state *pps)
{

	pps_init(pps);
	if (pps->driver_abi > 0) {
		pps->kcmode |= KCMODE_ABIFLAG;
		pps->kernel_abi = PPS_ABI_VERSION;
	}
}

void
pps_capture(struct pps_state *pps)
{
	struct timehands *th;

	KASSERT(pps != NULL, ("NULL pps pointer in pps_capture"));
	th = timehands;
	pps->capgen = atomic_load_acq_int(&th->th_generation);
	pps->capth = th;
#ifdef FFCLOCK
	pps->capffth = fftimehands;
#endif
	pps->capcount = th->th_counter->tc_get_timecount(th->th_counter);
	atomic_thread_fence_acq();
	if (pps->capgen != th->th_generation)
		pps->capgen = 0;
}

void
pps_event(struct pps_state *pps, int event)
{
	struct bintime bt;
	struct timespec ts, *tsp, *osp;
	u_int tcount, *pcount;
	int foff;
	pps_seq_t *pseq;
#ifdef FFCLOCK
	struct timespec *tsp_ffc;
	pps_seq_t *pseq_ffc;
	ffcounter *ffcount;
#endif
#ifdef PPS_SYNC
	int fhard;
#endif

	KASSERT(pps != NULL, ("NULL pps pointer in pps_event"));
	/* Nothing to do if not currently set to capture this event type. */
	if ((event & pps->ppsparam.mode) == 0)
		return;
	/* If the timecounter was wound up underneath us, bail out. */
	if (pps->capgen == 0 || pps->capgen !=
	    atomic_load_acq_int(&pps->capth->th_generation))
		return;

	/* Things would be easier with arrays. */
	if (event == PPS_CAPTUREASSERT) {
		tsp = &pps->ppsinfo.assert_timestamp;
		osp = &pps->ppsparam.assert_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTUREASSERT;
#endif
		pcount = &pps->ppscount[0];
		pseq = &pps->ppsinfo.assert_sequence;
#ifdef FFCLOCK
		ffcount = &pps->ppsinfo_ffc.assert_ffcount;
		tsp_ffc = &pps->ppsinfo_ffc.assert_timestamp;
		pseq_ffc = &pps->ppsinfo_ffc.assert_sequence;
#endif
	} else {
		tsp = &pps->ppsinfo.clear_timestamp;
		osp = &pps->ppsparam.clear_offset;
		foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
#ifdef PPS_SYNC
		fhard = pps->kcmode & PPS_CAPTURECLEAR;
#endif
		pcount = &pps->ppscount[1];
		pseq = &pps->ppsinfo.clear_sequence;
#ifdef FFCLOCK
		ffcount = &pps->ppsinfo_ffc.clear_ffcount;
		tsp_ffc = &pps->ppsinfo_ffc.clear_timestamp;
		pseq_ffc = &pps->ppsinfo_ffc.clear_sequence;
#endif
	}

	/*
	 * If the timecounter changed, we cannot compare the count values, so
	 * we have to drop the rest of the PPS-stuff until the next event.
	 */
	if (pps->ppstc != pps->capth->th_counter) {
		pps->ppstc = pps->capth->th_counter;
		*pcount = pps->capcount;
		pps->ppscount[2] = pps->capcount;
		return;
	}

	/* Convert the count to a timespec. */
	tcount = pps->capcount - pps->capth->th_offset_count;
	tcount &= pps->capth->th_counter->tc_counter_mask;
	bt = pps->capth->th_bintime;
	bintime_addx(&bt, pps->capth->th_scale * tcount);
	bintime2timespec(&bt, &ts);

	/* If the timecounter was wound up underneath us, bail out. */
	atomic_thread_fence_acq();
	if (pps->capgen != pps->capth->th_generation)
		return;

	*pcount = pps->capcount;
	(*pseq)++;
	*tsp = ts;

	if (foff) {
		timespecadd(tsp, osp);
		if (tsp->tv_nsec < 0) {
			tsp->tv_nsec += 1000000000;
			tsp->tv_sec -= 1;
		}
	}

#ifdef FFCLOCK
	*ffcount = pps->capffth->tick_ffcount + tcount;
	bt = pps->capffth->tick_time;
	ffclock_convert_delta(tcount, pps->capffth->cest.period, &bt);
	bintime_add(&bt, &pps->capffth->tick_time);
	bintime2timespec(&bt, &ts);
	(*pseq_ffc)++;
	*tsp_ffc = ts;
#endif

#ifdef PPS_SYNC
	if (fhard) {
		uint64_t scale;

		/*
		 * Feed the NTP PLL/FLL.
		 * The FLL wants to know how many (hardware) nanoseconds
		 * elapsed since the previous event.
		 */
		tcount = pps->capcount - pps->ppscount[2];
		pps->ppscount[2] = pps->capcount;
		tcount &= pps->capth->th_counter->tc_counter_mask;
		scale = (uint64_t)1 << 63;
		scale /= pps->capth->th_counter->tc_frequency;
		scale *= 2;
		bt.sec = 0;
		bt.frac = 0;
		bintime_addx(&bt, scale * tcount);
		bintime2timespec(&bt, &ts);
		hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec);
	}
#endif

	/* Wakeup anyone sleeping in pps_fetch().  */
	wakeup(pps);
}

/*
 * Timecounters need to be updated every so often to prevent the hardware
 * counter from overflowing.  Updating also recalculates the cached values
 * used by the get*() family of functions, so their precision depends on
 * the update frequency.
 */

static int tc_tick;
SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0,
    "Approximate number of hardclock ticks in a millisecond");

void
tc_ticktock(int cnt)
{
	static int count;

	if (mtx_trylock_spin(&tc_setclock_mtx)) {
		count += cnt;
		if (count >= tc_tick) {
			count = 0;
			tc_windup(NULL);
		}
		mtx_unlock_spin(&tc_setclock_mtx);
	}
}

static void __inline
tc_adjprecision(void)
{
	int t;

	if (tc_timepercentage > 0) {
		t = (99 + tc_timepercentage) / tc_timepercentage;
		tc_precexp = fls(t + (t >> 1)) - 1;
		FREQ2BT(hz / tc_tick, &bt_timethreshold);
		FREQ2BT(hz, &bt_tickthreshold);
		bintime_shift(&bt_timethreshold, tc_precexp);
		bintime_shift(&bt_tickthreshold, tc_precexp);
	} else {
		tc_precexp = 31;
		bt_timethreshold.sec = INT_MAX;
		bt_timethreshold.frac = ~(uint64_t)0;
		bt_tickthreshold = bt_timethreshold;
	}
	sbt_timethreshold = bttosbt(bt_timethreshold);
	sbt_tickthreshold = bttosbt(bt_tickthreshold);
}
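
/*
 * Illustrative only (not part of the original file): a worked instance of
 * the precision-exponent computation above, assuming the stock
 * tc_timepercentage of 5 percent: t = (99 + 5) / 5 = 20, and
 * fls(20 + (20 >> 1)) - 1 = fls(30) - 1 = 5 - 1 = 4, so tc_precexp = 4
 * and both thresholds become their base periods shifted left by 4 bits
 * (16x), which is how the requested percentage deviation is realized in
 * powers of two.
 */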

static int
sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = tc_timepercentage;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	tc_timepercentage = val;
	if (cold)
		goto done;
	tc_adjprecision();
done:
	return (0);
}

static void
inittimecounter(void *dummy)
{
	u_int p;
	int tick_rate;

	/*
	 * Set the initial timeout to
	 * max(1, <approx. number of hardclock ticks in a millisecond>).
	 * People should probably not use the sysctl to set the timeout
	 * to smaller than its initial value, since that value is the
	 * smallest reasonable one.  If they want better timestamps they
	 * should use the non-"get"* functions.
	 */
	if (hz > 1000)
		tc_tick = (hz + 500) / 1000;
	else
		tc_tick = 1;
	tc_adjprecision();
	FREQ2BT(hz, &tick_bt);
	tick_sbt = bttosbt(tick_bt);
	tick_rate = hz / tc_tick;
	FREQ2BT(tick_rate, &tc_tick_bt);
	tc_tick_sbt = bttosbt(tc_tick_bt);
	p = (tc_tick * 1000000) / hz;
	printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000);

#ifdef FFCLOCK
	ffclock_init();
#endif
	/* warm up new timecounter (again) and get rolling. */
	(void)timecounter->tc_get_timecount(timecounter);
	(void)timecounter->tc_get_timecount(timecounter);
	mtx_lock_spin(&tc_setclock_mtx);
	tc_windup(NULL);
	mtx_unlock_spin(&tc_setclock_mtx);
}

SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL);

/* Cpu tick handling -------------------------------------------------*/

static int cpu_tick_variable;
static uint64_t	cpu_tick_frequency;

static DPCPU_DEFINE(uint64_t, tc_cpu_ticks_base);
static DPCPU_DEFINE(unsigned, tc_cpu_ticks_last);

static uint64_t
tc_cpu_ticks(void)
{
	struct timecounter *tc;
	uint64_t res, *base;
	unsigned u, *last;

	critical_enter();
	base = DPCPU_PTR(tc_cpu_ticks_base);
	last = DPCPU_PTR(tc_cpu_ticks_last);
	tc = timehands->th_counter;
	u = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
	if (u < *last)
		*base += (uint64_t)tc->tc_counter_mask + 1;
	*last = u;
	res = u + *base;
	critical_exit();
	return (res);
}

void
cpu_tick_calibration(void)
{
	static time_t last_calib;

	if (time_uptime != last_calib && !(time_uptime & 0xf)) {
		cpu_tick_calibrate(0);
		last_calib = time_uptime;
	}
}

/*
 * This function gets called every 16 seconds on only one designated
 * CPU in the system from hardclock() via cpu_tick_calibration().
 *
 * Whenever the real time clock is stepped we get called with reset=1
 * to make sure we handle suspend/resume and similar events correctly.
 */

static void
cpu_tick_calibrate(int reset)
{
	static uint64_t c_last;
	uint64_t c_this, c_delta;
	static struct bintime t_last;
	struct bintime t_this, t_delta;
	uint32_t divi;

	if (reset) {
		/* The clock was stepped, abort & reset */
		t_last.sec = 0;
		return;
	}

	/* we don't calibrate fixed rate cputicks */
	if (!cpu_tick_variable)
		return;

	getbinuptime(&t_this);
	c_this = cpu_ticks();
	if (t_last.sec != 0) {
		c_delta = c_this - c_last;
		t_delta = t_this;
		bintime_sub(&t_delta, &t_last);
		/*
		 * Headroom:
		 * 	2^(64-20) / 16[s] =
		 * 	2^(44) / 16[s] =
		 * 	17.592.186.044.416 / 16 =
		 * 	1.099.511.627.776 [Hz]
		 */
		divi = t_delta.sec << 20;
		divi |= t_delta.frac >> (64 - 20);
		c_delta <<= 20;
		c_delta /= divi;
		if (c_delta > cpu_tick_frequency) {
			if (0 && bootverbose)
				printf("cpu_tick increased to %ju Hz\n",
				    c_delta);
			cpu_tick_frequency = c_delta;
		}
	}
	c_last = c_this;
	t_last = t_this;
}

void
set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var)
{

	if (func == NULL) {
		cpu_ticks = tc_cpu_ticks;
	} else {
		cpu_tick_frequency = freq;
		cpu_tick_variable = var;
		cpu_ticks = func;
	}
}

uint64_t
cpu_tickrate(void)
{

	if (cpu_ticks == tc_cpu_ticks)
		return (tc_getfrequency());
	return (cpu_tick_frequency);
}

/*
 * We need to be slightly careful converting cputicks to microseconds.
 * There is plenty of margin in 64 bits of microseconds (half a million
 * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply
 * before divide conversion (to retain precision) we find that the
 * margin shrinks to 1.5 hours (one millionth of 146y).
 * With a three prong approach we never lose significant bits, no
 * matter what the cputick rate and length of timeinterval is.
 */

uint64_t
cputick2usec(uint64_t tick)
{

	if (tick > 18446744073709551LL)		/* floor(2^64 / 1000) */
		return (tick / (cpu_tickrate() / 1000000LL));
	else if (tick > 18446744073709LL)	/* floor(2^64 / 1000000) */
		return ((tick * 1000LL) / (cpu_tickrate() / 1000LL));
	else
		return ((tick * 1000000LL) / cpu_tickrate());
}
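
/*
 * Illustrative only (not part of the original file): why the three-prong
 * split above is safe.  The multiply-first branch (tick * 1000000) only
 * runs for tick <= floor(2^64 / 10^6), so the product cannot overflow a
 * uint64_t; the middle branch tolerates ticks up to floor(2^64 / 10^3);
 * beyond that we divide first and accept the precision loss.  E.g. one
 * day at a hypothetical 3 GHz tick rate is ~2.59e14 ticks, which takes
 * the middle branch: (2.59e14 * 1000) / (3e9 / 1000) = 8.64e10 us,
 * i.e. 86400 seconds.
 */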

cpu_tick_f	*cpu_ticks = tc_cpu_ticks;

static int vdso_th_enable = 1;
static int
sysctl_fast_gettime(SYSCTL_HANDLER_ARGS)
{
	int old_vdso_th_enable, error;

	old_vdso_th_enable = vdso_th_enable;
	error = sysctl_handle_int(oidp, &old_vdso_th_enable, 0, req);
	if (error != 0)
		return (error);
	vdso_th_enable = old_vdso_th_enable;
	return (0);
}
SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day");

uint32_t
tc_fill_vdso_timehands(struct vdso_timehands *vdso_th)
{
	struct timehands *th;
	uint32_t enabled;

	th = timehands;
	vdso_th->th_scale = th->th_scale;
	vdso_th->th_offset_count = th->th_offset_count;
	vdso_th->th_counter_mask = th->th_counter->tc_counter_mask;
	vdso_th->th_offset = th->th_offset;
	vdso_th->th_boottime = th->th_boottime;
	if (th->th_counter->tc_fill_vdso_timehands != NULL) {
		enabled = th->th_counter->tc_fill_vdso_timehands(vdso_th,
		    th->th_counter);
	} else
		enabled = 0;
	if (!vdso_th_enable)
		enabled = 0;
	return (enabled);
}

#ifdef COMPAT_FREEBSD32
uint32_t
tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
{
	struct timehands *th;
	uint32_t enabled;

	th = timehands;
	*(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale;
	vdso_th32->th_offset_count = th->th_offset_count;
	vdso_th32->th_counter_mask = th->th_counter->tc_counter_mask;
	vdso_th32->th_offset.sec = th->th_offset.sec;
	*(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac;
	vdso_th32->th_boottime.sec = th->th_boottime.sec;
	*(uint64_t *)&vdso_th32->th_boottime.frac[0] = th->th_boottime.frac;
	if (th->th_counter->tc_fill_vdso_timehands32 != NULL) {
		enabled = th->th_counter->tc_fill_vdso_timehands32(vdso_th32,
		    th->th_counter);
	} else
		enabled = 0;
	if (!vdso_th_enable)
		enabled = 0;
	return (enabled);
}
#endif