profile.c revision 314667
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 *
21 * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22 *
23 * $FreeBSD: stable/10/sys/cddl/dev/profile/profile.c 314667 2017-03-04 13:03:31Z avg $
24 *
25 */
26
27/*
28 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
29 * Use is subject to license terms.
30 */
31
32#include <sys/cdefs.h>
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/conf.h>
36#include <sys/cpuvar.h>
37#include <sys/fcntl.h>
38#include <sys/filio.h>
39#include <sys/kdb.h>
40#include <sys/kernel.h>
41#include <sys/kmem.h>
42#include <sys/kthread.h>
43#include <sys/limits.h>
44#include <sys/linker.h>
45#include <sys/lock.h>
46#include <sys/malloc.h>
47#include <sys/module.h>
48#include <sys/mutex.h>
49#include <sys/poll.h>
50#include <sys/proc.h>
51#include <sys/selinfo.h>
52#include <sys/smp.h>
53#include <sys/uio.h>
54#include <sys/unistd.h>
55#include <machine/cpu.h>
56#include <machine/stdarg.h>
57
58#include <sys/dtrace.h>
59#include <sys/dtrace_bsd.h>
60
61#define	PROF_NAMELEN		15
62
63#define	PROF_PROFILE		0
64#define	PROF_TICK		1
65#define	PROF_PREFIX_PROFILE	"profile-"
66#define	PROF_PREFIX_TICK	"tick-"
67
68/*
69 * Regardless of platform, there are five artificial frames in the case of the
70 * profile provider:
71 *
72 *	profile_fire
73 *	cyclic_expire
74 *	cyclic_fire
75 *	[ cbe ]
76 *	[ locore ]
77 *
78 * On amd64, there are two frames associated with locore:  one in locore, and
79 * another in common interrupt dispatch code.  (i386 has not been modified to
80 * use this common layer.)  Further, on i386, the interrupted instruction
81 * appears as its own stack frame.  All of this means that we need to add one
82 * frame for amd64, and then take one away for both amd64 and i386.
83 *
84 * On SPARC, the picture is further complicated because the compiler
85 * optimizes away tail-calls -- so the following frames are optimized away:
86 *
87 * 	profile_fire
88 *	cyclic_expire
89 *
90 * This gives three frames.  However, on DEBUG kernels, the cyclic_expire
91 * frame cannot be tail-call eliminated, yielding four frames in this case.
92 *
93 * All of the above constraints lead to the mess below.  Yes, the profile
94 * provider should ideally figure this out on-the-fly by hitting one of its own
95 * probes and then walking its own stack trace.  This is complicated, however,
96 * and the static definition doesn't seem to be overly brittle.  Still, we
97 * allow for a manual override in case we get it completely wrong.
98 */
/*
 * Number of artificial stack frames for the profile provider, selected per
 * architecture.  The mips and powerpc values are bogus and exist only to
 * make the module compilable on those platforms.
 */
#if defined(__amd64)
#define	PROF_ARTIFICIAL_FRAMES	10
#elif defined(__i386)
#define	PROF_ARTIFICIAL_FRAMES	6
#elif defined(__sparc)
#ifdef DEBUG
#define	PROF_ARTIFICIAL_FRAMES	4
#else
#define	PROF_ARTIFICIAL_FRAMES	3
#endif
#elif defined(__mips) || defined(__powerpc__)
#define	PROF_ARTIFICIAL_FRAMES	3
#endif
128
129struct profile_probe_percpu;
130
131typedef struct profile_probe {
132	char		prof_name[PROF_NAMELEN];
133	dtrace_id_t	prof_id;
134	int		prof_kind;
135#ifdef illumos
136	hrtime_t	prof_interval;
137	cyclic_id_t	prof_cyclic;
138#else
139	sbintime_t	prof_interval;
140	struct callout	prof_cyclic;
141	sbintime_t	prof_expected;
142	struct profile_probe_percpu **prof_pcpus;
143#endif
144} profile_probe_t;
145
146typedef struct profile_probe_percpu {
147	hrtime_t	profc_expected;
148	hrtime_t	profc_interval;
149	profile_probe_t	*profc_probe;
150#ifdef __FreeBSD__
151	struct callout	profc_cyclic;
152#endif
153} profile_probe_percpu_t;
154
155static d_open_t	profile_open;
156static int	profile_unload(void);
157static void	profile_create(hrtime_t, char *, int);
158static void	profile_destroy(void *, dtrace_id_t, void *);
159static void	profile_enable(void *, dtrace_id_t, void *);
160static void	profile_disable(void *, dtrace_id_t, void *);
161static void	profile_load(void *);
162static void	profile_provide(void *, dtrace_probedesc_t *);
163
/*
 * Frequencies (per second) of the profile-N probes provided by default.
 * Unlisted slots are zero-filled and skipped by profile_provide().
 */
static int profile_rates[20] = {
	97, 199, 499, 997, 1999, 4001, 4999
};

/*
 * Frequencies (per second) of the tick-N probes provided by default.
 * Unlisted slots are zero-filled and skipped by profile_provide().
 */
static int profile_ticks[15] = {
	1, 10, 100, 500, 1000, 5000
};
176
177/*
178 * profile_max defines the upper bound on the number of profile probes that
179 * can exist (this is to prevent malicious or clumsy users from exhausting
180 * system resources by creating a slew of profile probes). At mod load time,
181 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
182 * present in the profile.conf file.
183 */
184#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */
185static uint32_t profile_max = PROFILE_MAX_DEFAULT;
186					/* maximum number of profile probes */
187static uint32_t profile_total;		/* current number of profile probes */
188
189static struct cdevsw profile_cdevsw = {
190	.d_version	= D_VERSION,
191	.d_open		= profile_open,
192	.d_name		= "profile",
193};
194
195static dtrace_pattr_t profile_attr = {
196{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
197{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
198{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
199{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
200{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
201};
202
203static dtrace_pops_t profile_pops = {
204	profile_provide,
205	NULL,
206	profile_enable,
207	profile_disable,
208	NULL,
209	NULL,
210	NULL,
211	NULL,
212	NULL,
213	profile_destroy
214};
215
216static struct cdev		*profile_cdev;
217static dtrace_provider_id_t	profile_id;
218static hrtime_t			profile_interval_min = NANOSEC / 5000;	/* 5000 hz */
219static int			profile_aframes = 0;			/* override */
220
221static sbintime_t
222nsec_to_sbt(hrtime_t nsec)
223{
224	time_t sec;
225
226	/*
227	 * We need to calculate nsec * 2^32 / 10^9
228	 * Seconds and nanoseconds are split to avoid overflow.
229	 */
230	sec = nsec / NANOSEC;
231	nsec = nsec % NANOSEC;
232	return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC);
233}
234
235static hrtime_t
236sbt_to_nsec(sbintime_t sbt)
237{
238
239	return ((sbt >> 32) * NANOSEC +
240	    (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32));
241}
242
243static void
244profile_fire(void *arg)
245{
246	profile_probe_percpu_t *pcpu = arg;
247	profile_probe_t *prof = pcpu->profc_probe;
248	hrtime_t late;
249	struct trapframe *frame;
250	uintfptr_t pc, upc;
251
252#ifdef illumos
253	late = gethrtime() - pcpu->profc_expected;
254#else
255	late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected);
256#endif
257
258	pc = 0;
259	upc = 0;
260
261	/*
262	 * td_intr_frame can be unset if this is a catch up event
263	 * after waking up from idle sleep.
264	 * This can only happen on a CPU idle thread.
265	 */
266	frame = curthread->td_intr_frame;
267	if (frame != NULL) {
268		if (TRAPF_USERMODE(frame))
269			upc = TRAPF_PC(frame);
270		else
271			pc = TRAPF_PC(frame);
272	}
273	dtrace_probe(prof->prof_id, pc, upc, late, 0, 0);
274
275	pcpu->profc_expected += pcpu->profc_interval;
276	callout_schedule_sbt_curcpu(&pcpu->profc_cyclic,
277	    pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
278}
279
280static void
281profile_tick(void *arg)
282{
283	profile_probe_t *prof = arg;
284	struct trapframe *frame;
285	uintfptr_t pc, upc;
286
287	pc = 0;
288	upc = 0;
289
290	/*
291	 * td_intr_frame can be unset if this is a catch up event
292	 * after waking up from idle sleep.
293	 * This can only happen on a CPU idle thread.
294	 */
295	frame = curthread->td_intr_frame;
296	if (frame != NULL) {
297		if (TRAPF_USERMODE(frame))
298			upc = TRAPF_PC(frame);
299		else
300			pc = TRAPF_PC(frame);
301	}
302	dtrace_probe(prof->prof_id, pc, upc, 0, 0, 0);
303
304	prof->prof_expected += prof->prof_interval;
305	callout_schedule_sbt(&prof->prof_cyclic,
306	    prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
307}
308
309static void
310profile_create(hrtime_t interval, char *name, int kind)
311{
312	profile_probe_t *prof;
313
314	if (interval < profile_interval_min)
315		return;
316
317	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
318		return;
319
320	atomic_add_32(&profile_total, 1);
321	if (profile_total > profile_max) {
322		atomic_add_32(&profile_total, -1);
323		return;
324	}
325
326	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
327	(void) strcpy(prof->prof_name, name);
328#ifdef illumos
329	prof->prof_interval = interval;
330	prof->prof_cyclic = CYCLIC_NONE;
331#else
332	prof->prof_interval = nsec_to_sbt(interval);
333	callout_init(&prof->prof_cyclic, 1);
334#endif
335	prof->prof_kind = kind;
336	prof->prof_id = dtrace_probe_create(profile_id,
337	    NULL, NULL, name,
338	    profile_aframes ? profile_aframes : PROF_ARTIFICIAL_FRAMES, prof);
339}
340
341/*ARGSUSED*/
342static void
343profile_provide(void *arg, dtrace_probedesc_t *desc)
344{
345	int i, j, rate, kind;
346	hrtime_t val = 0, mult = 1, len = 0;
347	char *name, *suffix = NULL;
348
349	const struct {
350		char *prefix;
351		int kind;
352	} types[] = {
353		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
354		{ PROF_PREFIX_TICK, PROF_TICK },
355		{ 0, 0 }
356	};
357
358	const struct {
359		char *name;
360		hrtime_t mult;
361	} suffixes[] = {
362		{ "ns", 	NANOSEC / NANOSEC },
363		{ "nsec",	NANOSEC / NANOSEC },
364		{ "us",		NANOSEC / MICROSEC },
365		{ "usec",	NANOSEC / MICROSEC },
366		{ "ms",		NANOSEC / MILLISEC },
367		{ "msec",	NANOSEC / MILLISEC },
368		{ "s",		NANOSEC / SEC },
369		{ "sec",	NANOSEC / SEC },
370		{ "m",		NANOSEC * (hrtime_t)60 },
371		{ "min",	NANOSEC * (hrtime_t)60 },
372		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
373		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
374		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
375		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
376		{ "hz",		0 },
377		{ NULL }
378	};
379
380	if (desc == NULL) {
381		char n[PROF_NAMELEN];
382
383		/*
384		 * If no description was provided, provide all of our probes.
385		 */
386		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
387			if ((rate = profile_rates[i]) == 0)
388				continue;
389
390			(void) snprintf(n, PROF_NAMELEN, "%s%d",
391			    PROF_PREFIX_PROFILE, rate);
392			profile_create(NANOSEC / rate, n, PROF_PROFILE);
393		}
394
395		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
396			if ((rate = profile_ticks[i]) == 0)
397				continue;
398
399			(void) snprintf(n, PROF_NAMELEN, "%s%d",
400			    PROF_PREFIX_TICK, rate);
401			profile_create(NANOSEC / rate, n, PROF_TICK);
402		}
403
404		return;
405	}
406
407	name = desc->dtpd_name;
408
409	for (i = 0; types[i].prefix != NULL; i++) {
410		len = strlen(types[i].prefix);
411
412		if (strncmp(name, types[i].prefix, len) != 0)
413			continue;
414		break;
415	}
416
417	if (types[i].prefix == NULL)
418		return;
419
420	kind = types[i].kind;
421	j = strlen(name) - len;
422
423	/*
424	 * We need to start before any time suffix.
425	 */
426	for (j = strlen(name); j >= len; j--) {
427		if (name[j] >= '0' && name[j] <= '9')
428			break;
429		suffix = &name[j];
430	}
431
432	ASSERT(suffix != NULL);
433
434	/*
435	 * Now determine the numerical value present in the probe name.
436	 */
437	for (; j >= len; j--) {
438		if (name[j] < '0' || name[j] > '9')
439			return;
440
441		val += (name[j] - '0') * mult;
442		mult *= (hrtime_t)10;
443	}
444
445	if (val == 0)
446		return;
447
448	/*
449	 * Look-up the suffix to determine the multiplier.
450	 */
451	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
452		if (strcasecmp(suffixes[i].name, suffix) == 0) {
453			mult = suffixes[i].mult;
454			break;
455		}
456	}
457
458	if (suffixes[i].name == NULL && *suffix != '\0')
459		return;
460
461	if (mult == 0) {
462		/*
463		 * The default is frequency-per-second.
464		 */
465		val = NANOSEC / val;
466	} else {
467		val *= mult;
468	}
469
470	profile_create(val, name, kind);
471}
472
473/* ARGSUSED */
474static void
475profile_destroy(void *arg, dtrace_id_t id, void *parg)
476{
477	profile_probe_t *prof = parg;
478
479#ifdef illumos
480	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
481#else
482	ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL);
483#endif
484	kmem_free(prof, sizeof (profile_probe_t));
485
486	ASSERT(profile_total >= 1);
487	atomic_add_32(&profile_total, -1);
488}
489
490#ifdef illumos
491/*ARGSUSED*/
492static void
493profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
494{
495	profile_probe_t *prof = arg;
496	profile_probe_percpu_t *pcpu;
497
498	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
499	pcpu->profc_probe = prof;
500
501	hdlr->cyh_func = profile_fire;
502	hdlr->cyh_arg = pcpu;
503
504	when->cyt_interval = prof->prof_interval;
505	when->cyt_when = gethrtime() + when->cyt_interval;
506
507	pcpu->profc_expected = when->cyt_when;
508	pcpu->profc_interval = when->cyt_interval;
509}
510
511/*ARGSUSED*/
512static void
513profile_offline(void *arg, cpu_t *cpu, void *oarg)
514{
515	profile_probe_percpu_t *pcpu = oarg;
516
517	ASSERT(pcpu->profc_probe == arg);
518	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
519}
520
521/* ARGSUSED */
522static void
523profile_enable(void *arg, dtrace_id_t id, void *parg)
524{
525	profile_probe_t *prof = parg;
526	cyc_omni_handler_t omni;
527	cyc_handler_t hdlr;
528	cyc_time_t when;
529
530	ASSERT(prof->prof_interval != 0);
531	ASSERT(MUTEX_HELD(&cpu_lock));
532
533	if (prof->prof_kind == PROF_TICK) {
534		hdlr.cyh_func = profile_tick;
535		hdlr.cyh_arg = prof;
536
537		when.cyt_interval = prof->prof_interval;
538		when.cyt_when = gethrtime() + when.cyt_interval;
539	} else {
540		ASSERT(prof->prof_kind == PROF_PROFILE);
541		omni.cyo_online = profile_online;
542		omni.cyo_offline = profile_offline;
543		omni.cyo_arg = prof;
544	}
545
546	if (prof->prof_kind == PROF_TICK) {
547		prof->prof_cyclic = cyclic_add(&hdlr, &when);
548	} else {
549		prof->prof_cyclic = cyclic_add_omni(&omni);
550	}
551}
552
553/* ARGSUSED */
554static void
555profile_disable(void *arg, dtrace_id_t id, void *parg)
556{
557	profile_probe_t *prof = parg;
558
559	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
560	ASSERT(MUTEX_HELD(&cpu_lock));
561
562	cyclic_remove(prof->prof_cyclic);
563	prof->prof_cyclic = CYCLIC_NONE;
564}
565
566#else
567
568static void
569profile_enable_omni(profile_probe_t *prof)
570{
571	profile_probe_percpu_t *pcpu;
572	int cpu;
573
574	prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP);
575	CPU_FOREACH(cpu) {
576		pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP);
577		prof->prof_pcpus[cpu] = pcpu;
578		pcpu->profc_probe = prof;
579		pcpu->profc_expected = sbinuptime() + prof->prof_interval;
580		pcpu->profc_interval = prof->prof_interval;
581		callout_init(&pcpu->profc_cyclic, 1);
582		callout_reset_sbt_on(&pcpu->profc_cyclic,
583		    pcpu->profc_expected, 0, profile_fire, pcpu,
584		    cpu, C_DIRECT_EXEC | C_ABSOLUTE);
585	}
586}
587
588static void
589profile_disable_omni(profile_probe_t *prof)
590{
591	profile_probe_percpu_t *pcpu;
592	int cpu;
593
594	ASSERT(prof->prof_pcpus != NULL);
595	CPU_FOREACH(cpu) {
596		pcpu = prof->prof_pcpus[cpu];
597		ASSERT(pcpu->profc_probe == prof);
598		ASSERT(callout_active(&pcpu->profc_cyclic));
599		callout_stop(&pcpu->profc_cyclic);
600		callout_drain(&pcpu->profc_cyclic);
601		kmem_free(pcpu, sizeof(profile_probe_percpu_t));
602	}
603	kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu));
604	prof->prof_pcpus = NULL;
605}
606
607/* ARGSUSED */
608static void
609profile_enable(void *arg, dtrace_id_t id, void *parg)
610{
611	profile_probe_t *prof = parg;
612
613	if (prof->prof_kind == PROF_TICK) {
614		prof->prof_expected = sbinuptime() + prof->prof_interval;
615		callout_reset_sbt(&prof->prof_cyclic,
616		    prof->prof_expected, 0, profile_tick, prof,
617		    C_DIRECT_EXEC | C_ABSOLUTE);
618	} else {
619		ASSERT(prof->prof_kind == PROF_PROFILE);
620		profile_enable_omni(prof);
621	}
622}
623
624/* ARGSUSED */
625static void
626profile_disable(void *arg, dtrace_id_t id, void *parg)
627{
628	profile_probe_t *prof = parg;
629
630	if (prof->prof_kind == PROF_TICK) {
631		ASSERT(callout_active(&prof->prof_cyclic));
632		callout_stop(&prof->prof_cyclic);
633		callout_drain(&prof->prof_cyclic);
634	} else {
635		ASSERT(prof->prof_kind == PROF_PROFILE);
636		profile_disable_omni(prof);
637	}
638}
639#endif
640
641static void
642profile_load(void *dummy)
643{
644	/* Create the /dev/dtrace/profile entry. */
645	profile_cdev = make_dev(&profile_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
646	    "dtrace/profile");
647
648	if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
649	    NULL, &profile_pops, NULL, &profile_id) != 0)
650		return;
651}
652
653
654static int
655profile_unload()
656{
657	int error = 0;
658
659	if ((error = dtrace_unregister(profile_id)) != 0)
660		return (error);
661
662	destroy_dev(profile_cdev);
663
664	return (error);
665}
666
667/* ARGSUSED */
668static int
669profile_modevent(module_t mod __unused, int type, void *data __unused)
670{
671	int error = 0;
672
673	switch (type) {
674	case MOD_LOAD:
675		break;
676
677	case MOD_UNLOAD:
678		break;
679
680	case MOD_SHUTDOWN:
681		break;
682
683	default:
684		error = EOPNOTSUPP;
685		break;
686
687	}
688	return (error);
689}
690
691/* ARGSUSED */
692static int
693profile_open(struct cdev *dev __unused, int oflags __unused, int devtype __unused, struct thread *td __unused)
694{
695	return (0);
696}
697
698SYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
699SYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);
700
701DEV_MODULE(profile, profile_modevent, NULL);
702MODULE_VERSION(profile, 1);
703MODULE_DEPEND(profile, dtrace, 1, 1, 1);
704MODULE_DEPEND(profile, opensolaris, 1, 1, 1);
705