lockstat.c revision 297077
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <stdio.h>
29#include <stddef.h>
30#include <stdlib.h>
31#include <stdarg.h>
32#include <string.h>
33#include <strings.h>
34#include <ctype.h>
35#include <fcntl.h>
36#include <unistd.h>
37#include <errno.h>
38#include <limits.h>
39#include <sys/types.h>
40#include <sys/modctl.h>
41#include <sys/stat.h>
42#include <sys/wait.h>
43#include <dtrace.h>
44#include <sys/lockstat.h>
45#include <alloca.h>
46#include <signal.h>
47#include <assert.h>
48
49#ifdef illumos
50#define	GETOPT_EOF	EOF
51#else
52#include <sys/time.h>
53#include <sys/resource.h>
54
55#define	mergesort(a, b, c, d)	lsmergesort(a, b, c, d)
56#define	GETOPT_EOF		(-1)
57
58typedef	uintptr_t	pc_t;
59#endif
60
61#define	LOCKSTAT_OPTSTR	"x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"
62
63#define	LS_MAX_STACK_DEPTH	50
64#define	LS_MAX_EVENTS		64
65
66typedef struct lsrec {
67	struct lsrec	*ls_next;	/* next in hash chain */
68	uintptr_t	ls_lock;	/* lock address */
69	uintptr_t	ls_caller;	/* caller address */
70	uint32_t	ls_count;	/* cumulative event count */
71	uint32_t	ls_event;	/* type of event */
72	uintptr_t	ls_refcnt;	/* cumulative reference count */
73	uint64_t	ls_time;	/* cumulative event duration */
74	uint32_t	ls_hist[64];	/* log2(duration) histogram */
75	uintptr_t	ls_stack[LS_MAX_STACK_DEPTH];
76} lsrec_t;
77
78typedef struct lsdata {
79	struct lsrec	*lsd_next;	/* next available */
80	int		lsd_count;	/* number of records */
81} lsdata_t;
82
83/*
84 * Definitions for the types of experiments which can be run.  They are
85 * listed in increasing order of memory cost and processing time cost.
86 * The numerical value of each type is the number of bytes needed per record.
87 */
88#define	LS_BASIC	offsetof(lsrec_t, ls_time)
89#define	LS_TIME		offsetof(lsrec_t, ls_hist[0])
90#define	LS_HIST		offsetof(lsrec_t, ls_stack[0])
91#define	LS_STACK(depth)	offsetof(lsrec_t, ls_stack[depth])
92
93static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
94static void report_trace(FILE *, lsrec_t **);
95
96extern int symtab_init(void);
97extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
98extern uintptr_t sym_to_addr(char *name);
99extern size_t sym_size(char *name);
100extern char *strtok_r(char *, const char *, char **);
101
102#define	DEFAULT_NRECS	10000
103#define	DEFAULT_HZ	97
104#define	MAX_HZ		1000
105#define	MIN_AGGSIZE	(16 * 1024)
106#define	MAX_AGGSIZE	(32 * 1024 * 1024)
107
108static int g_stkdepth;
109static int g_topn = INT_MAX;
110static hrtime_t g_elapsed;
111static int g_rates = 0;
112static int g_pflag = 0;
113static int g_Pflag = 0;
114static int g_wflag = 0;
115static int g_Wflag = 0;
116static int g_cflag = 0;
117static int g_kflag = 0;
118static int g_gflag = 0;
119static int g_Vflag = 0;
120static int g_tracing = 0;
121static size_t g_recsize;
122static size_t g_nrecs;
123static int g_nrecs_used;
124static uchar_t g_enabled[LS_MAX_EVENTS];
125static hrtime_t g_min_duration[LS_MAX_EVENTS];
126static dtrace_hdl_t *g_dtp;
127static char *g_predicate;
128static char *g_ipredicate;
129static char *g_prog;
130static int g_proglen;
131static int g_dropped;
132
133typedef struct ls_event_info {
134	char	ev_type;
135	char	ev_lhdr[20];
136	char	ev_desc[80];
137	char	ev_units[10];
138	char	ev_name[DTRACE_NAMELEN];
139	char	*ev_predicate;
140	char	*ev_acquire;
141} ls_event_info_t;
142
143static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
144	{ 'C',	"Lock",	"Adaptive mutex spin",			"nsec",
145	    "lockstat:::adaptive-spin" },
146	{ 'C',	"Lock",	"Adaptive mutex block",			"nsec",
147	    "lockstat:::adaptive-block" },
148	{ 'C',	"Lock",	"Spin lock spin",			"nsec",
149	    "lockstat:::spin-spin" },
150	{ 'C',	"Lock",	"Thread lock spin",			"nsec",
151	    "lockstat:::thread-spin" },
152	{ 'C',	"Lock",	"R/W writer blocked by writer",		"nsec",
153	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
154	{ 'C',	"Lock",	"R/W writer blocked by readers",	"nsec",
155	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
156	{ 'C',	"Lock",	"R/W reader blocked by writer",		"nsec",
157	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
158	{ 'C',	"Lock",	"R/W reader blocked by write wanted",	"nsec",
159	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
160	{ 'C',	"Lock",	"R/W writer spin on writer",		"nsec",
161	    "lockstat:::rw-spin", "arg2 == 0 && arg3 == 1" },
162	{ 'C',	"Lock",	"R/W writer spin on readers",		"nsec",
163	    "lockstat:::rw-spin", "arg2 == 0 && arg3 == 0 && arg4" },
164	{ 'C',	"Lock",	"R/W reader spin on writer",		"nsec",
165	    "lockstat:::rw-spin", "arg2 != 0 && arg3 == 1" },
166	{ 'C',	"Lock",	"R/W reader spin on write wanted",	"nsec",
167	    "lockstat:::rw-spin", "arg2 != 0 && arg3 == 0 && arg4" },
168	{ 'C',	"Lock",	"SX exclusive block",			"nsec",
169	    "lockstat:::sx-block", "arg2 == 0" },
170	{ 'C',	"Lock",	"SX shared block",			"nsec",
171	    "lockstat:::sx-block", "arg2 != 0" },
172	{ 'C',	"Lock",	"SX exclusive spin",			"nsec",
173	    "lockstat:::sx-spin", "arg2 == 0" },
174	{ 'C',	"Lock",	"SX shared spin",			"nsec",
175	    "lockstat:::sx-spin", "arg2 != 0" },
176	{ 'C',	"Lock",	"Unknown event (type 16)",		"units"	},
177	{ 'C',	"Lock",	"Unknown event (type 17)",		"units"	},
178	{ 'C',	"Lock",	"Unknown event (type 18)",		"units"	},
179	{ 'C',	"Lock",	"Unknown event (type 19)",		"units"	},
180	{ 'C',	"Lock",	"Unknown event (type 20)",		"units"	},
181	{ 'C',	"Lock",	"Unknown event (type 21)",		"units"	},
182	{ 'C',	"Lock",	"Unknown event (type 22)",		"units"	},
183	{ 'C',	"Lock",	"Unknown event (type 23)",		"units"	},
184	{ 'C',	"Lock",	"Unknown event (type 24)",		"units"	},
185	{ 'C',	"Lock",	"Unknown event (type 25)",		"units"	},
186	{ 'C',	"Lock",	"Unknown event (type 26)",		"units"	},
187	{ 'C',	"Lock",	"Unknown event (type 27)",		"units"	},
188	{ 'C',	"Lock",	"Unknown event (type 28)",		"units"	},
189	{ 'C',	"Lock",	"Unknown event (type 29)",		"units"	},
190	{ 'C',	"Lock",	"Unknown event (type 30)",		"units"	},
191	{ 'C',	"Lock",	"Unknown event (type 31)",		"units"	},
192	{ 'H',	"Lock",	"Adaptive mutex hold",			"nsec",
193	    "lockstat:::adaptive-release", NULL,
194	    "lockstat:::adaptive-acquire" },
195	{ 'H',	"Lock",	"Spin lock hold",			"nsec",
196	    "lockstat:::spin-release", NULL,
197	    "lockstat:::spin-acquire" },
198	{ 'H',	"Lock",	"R/W writer hold",			"nsec",
199	    "lockstat::rw_wunlock:rw-release", NULL,
200	    "lockstat::rw_wlock:rw-acquire" },
201	{ 'H',	"Lock",	"R/W reader hold",			"nsec",
202	    "lockstat::rw_runlock:rw-release", NULL,
203	    "lockstat::rw_rlock:rw-acquire" },
204	{ 'H',	"Lock",	"SX shared hold",			"nsec",
205	    "lockstat::sx_sunlock:sx-release", NULL,
206	    "lockstat::sx_slock:sx-acquire" },
207	{ 'H',	"Lock",	"SX exclusive hold",			"nsec",
208	    "lockstat::sx_xunlock:sx-release", NULL,
209	    "lockstat::sx_xlock:sx-acquire" },
210	{ 'H',	"Lock",	"Unknown event (type 38)",		"units"	},
211	{ 'H',	"Lock",	"Unknown event (type 39)",		"units"	},
212	{ 'H',	"Lock",	"Unknown event (type 40)",		"units"	},
213	{ 'H',	"Lock",	"Unknown event (type 41)",		"units"	},
214	{ 'H',	"Lock",	"Unknown event (type 42)",		"units"	},
215	{ 'H',	"Lock",	"Unknown event (type 43)",		"units"	},
216	{ 'H',	"Lock",	"Unknown event (type 44)",		"units"	},
217	{ 'H',	"Lock",	"Unknown event (type 45)",		"units"	},
218	{ 'H',	"Lock",	"Unknown event (type 46)",		"units"	},
219	{ 'H',	"Lock",	"Unknown event (type 47)",		"units"	},
220	{ 'H',	"Lock",	"Unknown event (type 48)",		"units"	},
221	{ 'H',	"Lock",	"Unknown event (type 49)",		"units"	},
222	{ 'H',	"Lock",	"Unknown event (type 50)",		"units"	},
223	{ 'H',	"Lock",	"Unknown event (type 51)",		"units"	},
224	{ 'H',	"Lock",	"Unknown event (type 52)",		"units"	},
225	{ 'H',	"Lock",	"Unknown event (type 53)",		"units"	},
226	{ 'H',	"Lock",	"Unknown event (type 54)",		"units"	},
227	{ 'H',	"Lock",	"Unknown event (type 55)",		"units"	},
228#ifdef illumos
229	{ 'I',	"CPU+PIL", "Profiling interrupt",		"nsec",
230#else
231	{ 'I',	"CPU+Pri_Class", "Profiling interrupt",		"nsec",
232#endif
233	    "profile:::profile-97", NULL },
234	{ 'I',	"Lock",	"Unknown event (type 57)",		"units"	},
235	{ 'I',	"Lock",	"Unknown event (type 58)",		"units"	},
236	{ 'I',	"Lock",	"Unknown event (type 59)",		"units"	},
237	{ 'E',	"Lock",	"Recursive lock entry detected",	"(N/A)",
238	    "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
239	{ 'E',	"Lock",	"Lockstat enter failure",		"(N/A)"	},
240	{ 'E',	"Lock",	"Lockstat exit failure",		"nsec"	},
241	{ 'E',	"Lock",	"Lockstat record failure",		"(N/A)"	},
242};
243
244#ifndef illumos
245static char *g_pri_class[] = {
246	"",
247	"Intr",
248	"RealT",
249	"TShar",
250	"Idle"
251};
252#endif
253
254static void
255fail(int do_perror, const char *message, ...)
256{
257	va_list args;
258	int save_errno = errno;
259
260	va_start(args, message);
261	(void) fprintf(stderr, "lockstat: ");
262	(void) vfprintf(stderr, message, args);
263	va_end(args);
264	if (do_perror)
265		(void) fprintf(stderr, ": %s", strerror(save_errno));
266	(void) fprintf(stderr, "\n");
267	exit(2);
268}
269
270static void
271dfail(const char *message, ...)
272{
273	va_list args;
274
275	va_start(args, message);
276	(void) fprintf(stderr, "lockstat: ");
277	(void) vfprintf(stderr, message, args);
278	va_end(args);
279	(void) fprintf(stderr, ": %s\n",
280	    dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
281
282	exit(2);
283}
284
285static void
286show_events(char event_type, char *desc)
287{
288	int i, first = -1, last;
289
290	for (i = 0; i < LS_MAX_EVENTS; i++) {
291		ls_event_info_t *evp = &g_event_info[i];
292		if (evp->ev_type != event_type ||
293		    strncmp(evp->ev_desc, "Unknown event", 13) == 0)
294			continue;
295		if (first == -1)
296			first = i;
297		last = i;
298	}
299
300	(void) fprintf(stderr,
301	    "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
302	    desc, event_type, first, last);
303
304	for (i = first; i <= last; i++)
305		(void) fprintf(stderr,
306		    "%4d = %s\n", i, g_event_info[i].ev_desc);
307}
308
309static void
310usage(void)
311{
312	(void) fprintf(stderr,
313	    "Usage: lockstat [options] command [args]\n"
314	    "\nEvent selection options:\n\n"
315	    "  -C              watch contention events [on by default]\n"
316	    "  -E              watch error events [off by default]\n"
317	    "  -H              watch hold events [off by default]\n"
318	    "  -I              watch interrupt events [off by default]\n"
319	    "  -A              watch all lock events [equivalent to -CH]\n"
320	    "  -e event_list   only watch the specified events (shown below);\n"
321	    "                  <event_list> is a comma-separated list of\n"
322	    "                  events or ranges of events, e.g. 1,4-7,35\n"
323	    "  -i rate         interrupt rate for -I [default: %d Hz]\n"
324	    "\nData gathering options:\n\n"
325	    "  -b              basic statistics (lock, caller, event count)\n"
326	    "  -t              timing for all events [default]\n"
327	    "  -h              histograms for event times\n"
328	    "  -s depth        stack traces <depth> deep\n"
329	    "  -x opt[=val]    enable or modify DTrace options\n"
330	    "\nData filtering options:\n\n"
331	    "  -n nrecords     maximum number of data records [default: %d]\n"
332	    "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
333	    "                  symbolic name or hex address; <size> defaults\n"
334	    "                  to the ELF symbol size if available, 1 if not\n"
335	    "  -f func[,size]  only watch events generated by <func>\n"
336	    "  -d duration     only watch events longer than <duration>\n"
337	    "  -T              trace (rather than sample) events\n"
338	    "\nData reporting options:\n\n"
339	    "  -c              coalesce lock data for arrays like pse_mutex[]\n"
340	    "  -k              coalesce PCs within functions\n"
341	    "  -g              show total events generated by function\n"
342	    "  -w              wherever: don't distinguish events by caller\n"
343	    "  -W              whichever: don't distinguish events by lock\n"
344	    "  -R              display rates rather than counts\n"
345	    "  -p              parsable output format (awk(1)-friendly)\n"
346	    "  -P              sort lock data by (count * avg_time) product\n"
347	    "  -D n            only display top <n> events of each type\n"
348	    "  -o filename     send output to <filename>\n",
349	    DEFAULT_HZ, DEFAULT_NRECS);
350
351	show_events('C', "Contention");
352	show_events('H', "Hold-time");
353	show_events('I', "Interrupt");
354	show_events('E', "Error");
355	(void) fprintf(stderr, "\n");
356
357	exit(1);
358}
359
360static int
361lockcmp(lsrec_t *a, lsrec_t *b)
362{
363	int i;
364
365	if (a->ls_event < b->ls_event)
366		return (-1);
367	if (a->ls_event > b->ls_event)
368		return (1);
369
370	for (i = g_stkdepth - 1; i >= 0; i--) {
371		if (a->ls_stack[i] < b->ls_stack[i])
372			return (-1);
373		if (a->ls_stack[i] > b->ls_stack[i])
374			return (1);
375	}
376
377	if (a->ls_caller < b->ls_caller)
378		return (-1);
379	if (a->ls_caller > b->ls_caller)
380		return (1);
381
382	if (a->ls_lock < b->ls_lock)
383		return (-1);
384	if (a->ls_lock > b->ls_lock)
385		return (1);
386
387	return (0);
388}
389
390static int
391countcmp(lsrec_t *a, lsrec_t *b)
392{
393	if (a->ls_event < b->ls_event)
394		return (-1);
395	if (a->ls_event > b->ls_event)
396		return (1);
397
398	return (b->ls_count - a->ls_count);
399}
400
401static int
402timecmp(lsrec_t *a, lsrec_t *b)
403{
404	if (a->ls_event < b->ls_event)
405		return (-1);
406	if (a->ls_event > b->ls_event)
407		return (1);
408
409	if (a->ls_time < b->ls_time)
410		return (1);
411	if (a->ls_time > b->ls_time)
412		return (-1);
413
414	return (0);
415}
416
417static int
418lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
419{
420	if (a->ls_event < b->ls_event)
421		return (-1);
422	if (a->ls_event > b->ls_event)
423		return (1);
424
425	if (a->ls_lock < b->ls_lock)
426		return (-1);
427	if (a->ls_lock > b->ls_lock)
428		return (1);
429
430	return (0);
431}
432
433static int
434lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
435{
436	if (a->ls_event < b->ls_event)
437		return (-1);
438	if (a->ls_event > b->ls_event)
439		return (1);
440
441	if (a->ls_lock < b->ls_lock)
442		return (-1);
443	if (a->ls_lock > b->ls_lock)
444		return (1);
445
446	return (b->ls_count - a->ls_count);
447}
448
449static int
450sitecmp_anylock(lsrec_t *a, lsrec_t *b)
451{
452	int i;
453
454	if (a->ls_event < b->ls_event)
455		return (-1);
456	if (a->ls_event > b->ls_event)
457		return (1);
458
459	for (i = g_stkdepth - 1; i >= 0; i--) {
460		if (a->ls_stack[i] < b->ls_stack[i])
461			return (-1);
462		if (a->ls_stack[i] > b->ls_stack[i])
463			return (1);
464	}
465
466	if (a->ls_caller < b->ls_caller)
467		return (-1);
468	if (a->ls_caller > b->ls_caller)
469		return (1);
470
471	return (0);
472}
473
474static int
475site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
476{
477	int i;
478
479	if (a->ls_event < b->ls_event)
480		return (-1);
481	if (a->ls_event > b->ls_event)
482		return (1);
483
484	for (i = g_stkdepth - 1; i >= 0; i--) {
485		if (a->ls_stack[i] < b->ls_stack[i])
486			return (-1);
487		if (a->ls_stack[i] > b->ls_stack[i])
488			return (1);
489	}
490
491	if (a->ls_caller < b->ls_caller)
492		return (-1);
493	if (a->ls_caller > b->ls_caller)
494		return (1);
495
496	return (b->ls_count - a->ls_count);
497}
498
499static void
500lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
501{
502	int m = n / 2;
503	int i, j;
504
505	if (m > 1)
506		lsmergesort(cmp, a, b, m);
507	if (n - m > 1)
508		lsmergesort(cmp, a + m, b + m, n - m);
509	for (i = m; i > 0; i--)
510		b[i - 1] = a[i - 1];
511	for (j = m - 1; j < n - 1; j++)
512		b[n + m - j - 2] = a[j + 1];
513	while (i < j)
514		*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
515	*a = b[i];
516}
517
518static void
519coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
520{
521	int i, j;
522	lsrec_t *target, *current;
523
524	target = lock[0];
525
526	for (i = 1; i < n; i++) {
527		current = lock[i];
528		if (cmp(current, target) != 0) {
529			target = current;
530			continue;
531		}
532		current->ls_event = LS_MAX_EVENTS;
533		target->ls_count += current->ls_count;
534		target->ls_refcnt += current->ls_refcnt;
535		if (g_recsize < LS_TIME)
536			continue;
537		target->ls_time += current->ls_time;
538		if (g_recsize < LS_HIST)
539			continue;
540		for (j = 0; j < 64; j++)
541			target->ls_hist[j] += current->ls_hist[j];
542	}
543}
544
545static void
546coalesce_symbol(uintptr_t *addrp)
547{
548	uintptr_t symoff;
549	size_t symsize;
550
551	if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize)
552		*addrp -= symoff;
553}
554
555static void
556predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
557{
558	char *new;
559	int len, newlen;
560
561	if (what == NULL)
562		return;
563
564	if (*pred == NULL) {
565		*pred = malloc(1);
566		*pred[0] = '\0';
567	}
568
569	len = strlen(*pred);
570	newlen = len + strlen(what) + 32 + strlen("( && )");
571	new = malloc(newlen);
572
573	if (*pred[0] != '\0') {
574		if (cmp != NULL) {
575			(void) sprintf(new, "(%s) && (%s %s 0x%p)",
576			    *pred, what, cmp, (void *)value);
577		} else {
578			(void) sprintf(new, "(%s) && (%s)", *pred, what);
579		}
580	} else {
581		if (cmp != NULL) {
582			(void) sprintf(new, "%s %s 0x%p",
583			    what, cmp, (void *)value);
584		} else {
585			(void) sprintf(new, "%s", what);
586		}
587	}
588
589	free(*pred);
590	*pred = new;
591}
592
593static void
594predicate_destroy(char **pred)
595{
596	free(*pred);
597	*pred = NULL;
598}
599
600static void
601filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
602{
603	char buf[256], *c = buf, *new;
604	int len, newlen;
605
606	if (*filt == NULL) {
607		*filt = malloc(1);
608		*filt[0] = '\0';
609	}
610
611#ifdef illumos
612	(void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ?
613	    " || " : "", what, (void *)base, what, (void *)(base + size));
614#else
615	(void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ?
616	    " || " : "", what, (void *)base, what, (void *)(base + size));
617#endif
618
619	newlen = (len = strlen(*filt) + 1) + strlen(c);
620	new = malloc(newlen);
621	bcopy(*filt, new, len);
622	(void) strcat(new, c);
623	free(*filt);
624	*filt = new;
625}
626
627static void
628filter_destroy(char **filt)
629{
630	free(*filt);
631	*filt = NULL;
632}
633
634static void
635dprog_add(const char *fmt, ...)
636{
637	va_list args;
638	int size, offs;
639	char c;
640
641	va_start(args, fmt);
642	size = vsnprintf(&c, 1, fmt, args) + 1;
643	va_end(args);
644
645	if (g_proglen == 0) {
646		offs = 0;
647	} else {
648		offs = g_proglen - 1;
649	}
650
651	g_proglen = offs + size;
652
653	if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
654		fail(1, "failed to reallocate program text");
655
656	va_start(args, fmt);
657	(void) vsnprintf(&g_prog[offs], size, fmt, args);
658	va_end(args);
659}
660
661/*
662 * This function may read like an open sewer, but keep in mind that programs
663 * that generate other programs are rarely pretty.  If one has the unenviable
664 * task of maintaining or -- worse -- extending this code, use the -V option
665 * to examine the D program as generated by this function.
666 */
667static void
668dprog_addevent(int event)
669{
670	ls_event_info_t *info = &g_event_info[event];
671	char *pred = NULL;
672	char stack[20];
673	const char *arg0, *caller;
674	char *arg1 = "arg1";
675	char buf[80];
676	hrtime_t dur;
677	int depth;
678
679	if (info->ev_name[0] == '\0')
680		return;
681
682	if (info->ev_type == 'I') {
683		/*
684		 * For interrupt events, arg0 (normally the lock pointer) is
685		 * the CPU address plus the current pil, and arg1 (normally
686		 * the number of nanoseconds) is the number of nanoseconds
687		 * late -- and it's stored in arg2.
688		 */
689#ifdef illumos
690		arg0 = "(uintptr_t)curthread->t_cpu + \n"
691		    "\t    curthread->t_cpu->cpu_profile_pil";
692#else
693		arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n"
694		    "\t    0x01000000 + curthread->td_pri_class";
695#endif
696		caller = "(uintptr_t)arg0";
697		arg1 = "arg2";
698	} else {
699		arg0 = "(uintptr_t)arg0";
700		caller = "caller";
701	}
702
703	if (g_recsize > LS_HIST) {
704		for (depth = 0; g_recsize > LS_STACK(depth); depth++)
705			continue;
706
707		if (g_tracing) {
708			(void) sprintf(stack, "\tstack(%d);\n", depth);
709		} else {
710			(void) sprintf(stack, ", stack(%d)", depth);
711		}
712	} else {
713		(void) sprintf(stack, "");
714	}
715
716	if (info->ev_acquire != NULL) {
717		/*
718		 * If this is a hold event, we need to generate an additional
719		 * clause for the acquire; the clause for the release will be
720		 * generated with the aggregating statement, below.
721		 */
722		dprog_add("%s\n", info->ev_acquire);
723		predicate_add(&pred, info->ev_predicate, NULL, 0);
724		predicate_add(&pred, g_predicate, NULL, 0);
725		if (pred != NULL)
726			dprog_add("/%s/\n", pred);
727
728		dprog_add("{\n");
729		(void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
730
731		if (info->ev_type == 'H') {
732			dprog_add("\t%s = timestamp;\n", buf);
733		} else {
734			/*
735			 * If this isn't a hold event, it's the recursive
736			 * error event.  For this, we simply bump the
737			 * thread-local, per-lock count.
738			 */
739			dprog_add("\t%s++;\n", buf);
740		}
741
742		dprog_add("}\n\n");
743		predicate_destroy(&pred);
744		pred = NULL;
745
746		if (info->ev_type == 'E') {
747			/*
748			 * If this is the recursive lock error event, we need
749			 * to generate an additional clause to decrement the
750			 * thread-local, per-lock count.  This assures that we
751			 * only execute the aggregating clause if we have
752			 * recursive entry.
753			 */
754			dprog_add("%s\n", info->ev_name);
755			dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
756		}
757
758		predicate_add(&pred, buf, NULL, 0);
759
760		if (info->ev_type == 'H') {
761			(void) sprintf(buf, "timestamp -\n\t    "
762			    "self->ev%d[(uintptr_t)arg0]", event);
763		}
764
765		arg1 = buf;
766	} else {
767		predicate_add(&pred, info->ev_predicate, NULL, 0);
768		if (info->ev_type != 'I')
769			predicate_add(&pred, g_predicate, NULL, 0);
770		else
771			predicate_add(&pred, g_ipredicate, NULL, 0);
772	}
773
774	if ((dur = g_min_duration[event]) != 0)
775		predicate_add(&pred, arg1, ">=", dur);
776
777	dprog_add("%s\n", info->ev_name);
778
779	if (pred != NULL)
780		dprog_add("/%s/\n", pred);
781	predicate_destroy(&pred);
782
783	dprog_add("{\n");
784
785	if (g_tracing) {
786		dprog_add("\ttrace(%dULL);\n", event);
787		dprog_add("\ttrace(%s);\n", arg0);
788		dprog_add("\ttrace(%s);\n", caller);
789		dprog_add(stack);
790	} else {
791		/*
792		 * The ordering here is important:  when we process the
793		 * aggregate, we count on the fact that @avg appears before
794		 * @hist in program order to assure that @avg is assigned the
795		 * first aggregation variable ID and @hist assigned the
796		 * second; see the comment in process_aggregate() for details.
797		 */
798		dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
799		    event, arg0, caller, stack, arg1);
800
801		if (g_recsize >= LS_HIST) {
802			dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
803			    "(%s);\n", event, arg0, caller, stack, arg1);
804		}
805	}
806
807	if (info->ev_acquire != NULL)
808		dprog_add("\tself->ev%d[arg0] = 0;\n", event);
809
810	dprog_add("}\n\n");
811}
812
813static void
814dprog_compile()
815{
816	dtrace_prog_t *prog;
817	dtrace_proginfo_t info;
818
819	if (g_Vflag) {
820		(void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
821		(void) fputs(g_prog, stderr);
822		(void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
823	}
824
825	if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
826	    DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
827		dfail("failed to compile program");
828
829	if (dtrace_program_exec(g_dtp, prog, &info) == -1)
830		dfail("failed to enable probes");
831
832	if (dtrace_go(g_dtp) != 0)
833		dfail("couldn't start tracing");
834}
835
836static void
837#ifdef illumos
838status_fire(void)
839#else
840status_fire(int i)
841#endif
842{}
843
844static void
845status_init(void)
846{
847	dtrace_optval_t val, status, agg;
848	struct sigaction act;
849	struct itimerspec ts;
850	struct sigevent ev;
851	timer_t tid;
852
853	if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
854		dfail("failed to get 'statusrate'");
855
856	if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
857		dfail("failed to get 'statusrate'");
858
859	/*
860	 * We would want to awaken at a rate that is the GCD of the statusrate
861	 * and the aggrate -- but that seems a bit absurd.  Instead, we'll
862	 * simply awaken at a rate that is the more frequent of the two, which
863	 * assures that we're never later than the interval implied by the
864	 * more frequent rate.
865	 */
866	val = status < agg ? status : agg;
867
868	(void) sigemptyset(&act.sa_mask);
869	act.sa_flags = 0;
870	act.sa_handler = status_fire;
871	(void) sigaction(SIGUSR1, &act, NULL);
872
873	ev.sigev_notify = SIGEV_SIGNAL;
874	ev.sigev_signo = SIGUSR1;
875
876	if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
877		dfail("cannot create CLOCK_REALTIME timer");
878
879	ts.it_value.tv_sec = val / NANOSEC;
880	ts.it_value.tv_nsec = val % NANOSEC;
881	ts.it_interval = ts.it_value;
882
883	if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
884		dfail("cannot set time on CLOCK_REALTIME timer");
885}
886
887static void
888status_check(void)
889{
890	if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
891		dfail("failed to snap aggregate");
892
893	if (dtrace_status(g_dtp) == -1)
894		dfail("dtrace_status()");
895}
896
897static void
898lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
899{
900	bzero(lsrec, g_recsize);
901	lsrec->ls_count = 1;
902
903	if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
904		fail(0, "truncated DTrace record");
905
906	if (rec->dtrd_size != sizeof (uint64_t))
907		fail(0, "bad event size in first record");
908
909	/* LINTED - alignment */
910	lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
911	rec++;
912
913	if (rec->dtrd_size != sizeof (uintptr_t))
914		fail(0, "bad lock address size in second record");
915
916	/* LINTED - alignment */
917	lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
918	rec++;
919
920	if (rec->dtrd_size != sizeof (uintptr_t))
921		fail(0, "bad caller size in third record");
922
923	/* LINTED - alignment */
924	lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
925	rec++;
926
927	if (g_recsize > LS_HIST) {
928		int frames, i;
929		pc_t *stack;
930
931		frames = rec->dtrd_size / sizeof (pc_t);
932		/* LINTED - alignment */
933		stack = (pc_t *)(data + rec->dtrd_offset);
934
935		for (i = 1; i < frames; i++)
936			lsrec->ls_stack[i - 1] = stack[i];
937	}
938}
939
940/*ARGSUSED*/
941static int
942count_aggregate(const dtrace_aggdata_t *agg, void *arg)
943{
944	*((size_t *)arg) += 1;
945
946	return (DTRACE_AGGWALK_NEXT);
947}
948
949static int
950process_aggregate(const dtrace_aggdata_t *agg, void *arg)
951{
952	const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
953	caddr_t data = agg->dtada_data;
954	lsdata_t *lsdata = arg;
955	lsrec_t *lsrec = lsdata->lsd_next;
956	const dtrace_recdesc_t *rec;
957	uint64_t *avg, *quantized;
958	int i, j;
959
960	assert(lsdata->lsd_count < g_nrecs);
961
962	/*
963	 * Aggregation variable IDs are guaranteed to be generated in program
964	 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
965	 * plus one.  As "avg" appears before "hist" in program order, we know
966	 * that "avg" will be allocated the first aggregation variable ID, and
967	 * "hist" will be allocated the second aggregation variable ID -- and
968	 * we therefore use the aggregation variable ID to differentiate the
969	 * cases.
970	 */
971	if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
972		/*
973		 * If this is the histogram entry.  We'll copy the quantized
974		 * data into lc_hist, and jump over the rest.
975		 */
976		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
977
978		if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
979			fail(0, "bad variable ID in aggregation record");
980
981		if (rec->dtrd_size !=
982		    DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
983			fail(0, "bad quantize size in aggregation record");
984
985		/* LINTED - alignment */
986		quantized = (uint64_t *)(data + rec->dtrd_offset);
987
988		for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
989		    i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
990			lsrec->ls_hist[j] = quantized[i];
991
992		goto out;
993	}
994
995	lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
996	    aggdesc->dtagd_nrecs - 1, data);
997
998	rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
999
1000	if (rec->dtrd_size != 2 * sizeof (uint64_t))
1001		fail(0, "bad avg size in aggregation record");
1002
1003	/* LINTED - alignment */
1004	avg = (uint64_t *)(data + rec->dtrd_offset);
1005	lsrec->ls_count = (uint32_t)avg[0];
1006	lsrec->ls_time = (uintptr_t)avg[1];
1007
1008	if (g_recsize >= LS_HIST)
1009		return (DTRACE_AGGWALK_NEXT);
1010
1011out:
1012	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1013	lsdata->lsd_count++;
1014
1015	return (DTRACE_AGGWALK_NEXT);
1016}
1017
1018static int
1019process_trace(const dtrace_probedata_t *pdata, void *arg)
1020{
1021	lsdata_t *lsdata = arg;
1022	lsrec_t *lsrec = lsdata->lsd_next;
1023	dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
1024	caddr_t data = pdata->dtpda_data;
1025
1026	if (lsdata->lsd_count >= g_nrecs)
1027		return (DTRACE_CONSUME_NEXT);
1028
1029	lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
1030
1031	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1032	lsdata->lsd_count++;
1033
1034	return (DTRACE_CONSUME_NEXT);
1035}
1036
1037static int
1038process_data(FILE *out, char *data)
1039{
1040	lsdata_t lsdata;
1041
1042	/* LINTED - alignment */
1043	lsdata.lsd_next = (lsrec_t *)data;
1044	lsdata.lsd_count = 0;
1045
1046	if (g_tracing) {
1047		if (dtrace_consume(g_dtp, out,
1048		    process_trace, NULL, &lsdata) != 0)
1049			dfail("failed to consume buffer");
1050
1051		return (lsdata.lsd_count);
1052	}
1053
1054	if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1055	    process_aggregate, &lsdata) != 0)
1056		dfail("failed to walk aggregate");
1057
1058	return (lsdata.lsd_count);
1059}
1060
1061/*ARGSUSED*/
1062static int
1063drophandler(const dtrace_dropdata_t *data, void *arg)
1064{
1065	g_dropped++;
1066	(void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1067	return (DTRACE_HANDLE_OK);
1068}
1069
1070int
1071main(int argc, char **argv)
1072{
1073	char *data_buf;
1074	lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1075	FILE *out = stdout;
1076	int c;
1077	pid_t child;
1078	int status;
1079	int i, j;
1080	hrtime_t duration;
1081	char *addrp, *offp, *sizep, *evp, *lastp, *p;
1082	uintptr_t addr;
1083	size_t size, off;
1084	int events_specified = 0;
1085	int exec_errno = 0;
1086	uint32_t event;
1087	char *filt = NULL, *ifilt = NULL;
1088	static uint64_t ev_count[LS_MAX_EVENTS + 1];
1089	static uint64_t ev_time[LS_MAX_EVENTS + 1];
1090	dtrace_optval_t aggsize;
1091	char aggstr[10];
1092	long ncpus;
1093	int dynvar = 0;
1094	int err;
1095
1096	if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1097		fail(0, "cannot open dtrace library: %s",
1098		    dtrace_errmsg(NULL, err));
1099	}
1100
1101	if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1102		dfail("couldn't establish drop handler");
1103
1104	if (symtab_init() == -1)
1105		fail(1, "can't load kernel symbols");
1106
1107	g_nrecs = DEFAULT_NRECS;
1108
1109	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1110		switch (c) {
1111		case 'b':
1112			g_recsize = LS_BASIC;
1113			break;
1114
1115		case 't':
1116			g_recsize = LS_TIME;
1117			break;
1118
1119		case 'h':
1120			g_recsize = LS_HIST;
1121			break;
1122
1123		case 's':
1124			if (!isdigit(optarg[0]))
1125				usage();
1126			g_stkdepth = atoi(optarg);
1127			if (g_stkdepth > LS_MAX_STACK_DEPTH)
1128				fail(0, "max stack depth is %d",
1129				    LS_MAX_STACK_DEPTH);
1130			g_recsize = LS_STACK(g_stkdepth);
1131			break;
1132
1133		case 'n':
1134			if (!isdigit(optarg[0]))
1135				usage();
1136			g_nrecs = atoi(optarg);
1137			break;
1138
1139		case 'd':
1140			if (!isdigit(optarg[0]))
1141				usage();
1142			duration = atoll(optarg);
1143
1144			/*
1145			 * XXX -- durations really should be per event
1146			 * since the units are different, but it's hard
1147			 * to express this nicely in the interface.
1148			 * Not clear yet what the cleanest solution is.
1149			 */
1150			for (i = 0; i < LS_MAX_EVENTS; i++)
1151				if (g_event_info[i].ev_type != 'E')
1152					g_min_duration[i] = duration;
1153
1154			break;
1155
1156		case 'i':
1157			if (!isdigit(optarg[0]))
1158				usage();
1159			i = atoi(optarg);
1160			if (i <= 0)
1161				usage();
1162			if (i > MAX_HZ)
1163				fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1164
1165			for (j = 0; j < LS_MAX_EVENTS; j++)
1166				if (strcmp(g_event_info[j].ev_desc,
1167				    "Profiling interrupt") == 0)
1168					break;
1169
1170			(void) sprintf(g_event_info[j].ev_name,
1171			    "profile:::profile-%d", i);
1172			break;
1173
1174		case 'l':
1175		case 'f':
1176			addrp = strtok(optarg, ",");
1177			sizep = strtok(NULL, ",");
1178			addrp = strtok(optarg, ",+");
1179			offp = strtok(NULL, ",");
1180
1181			size = sizep ? strtoul(sizep, NULL, 0) : 1;
1182			off = offp ? strtoul(offp, NULL, 0) : 0;
1183
1184			if (addrp[0] == '0') {
1185				addr = strtoul(addrp, NULL, 16) + off;
1186			} else {
1187				addr = sym_to_addr(addrp) + off;
1188				if (sizep == NULL)
1189					size = sym_size(addrp) - off;
1190				if (addr - off == 0)
1191					fail(0, "symbol '%s' not found", addrp);
1192				if (size == 0)
1193					size = 1;
1194			}
1195
1196
1197			if (c == 'l') {
1198				filter_add(&filt, "arg0", addr, size);
1199			} else {
1200				filter_add(&filt, "caller", addr, size);
1201				filter_add(&ifilt, "arg0", addr, size);
1202			}
1203			break;
1204
1205		case 'e':
1206			evp = strtok_r(optarg, ",", &lastp);
1207			while (evp) {
1208				int ev1, ev2;
1209				char *evp2;
1210
1211				(void) strtok(evp, "-");
1212				evp2 = strtok(NULL, "-");
1213				ev1 = atoi(evp);
1214				ev2 = evp2 ? atoi(evp2) : ev1;
1215				if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1216				    (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1217					fail(0, "-e events out of range");
1218				for (i = ev1; i <= ev2; i++)
1219					g_enabled[i] = 1;
1220				evp = strtok_r(NULL, ",", &lastp);
1221			}
1222			events_specified = 1;
1223			break;
1224
1225		case 'c':
1226			g_cflag = 1;
1227			break;
1228
1229		case 'k':
1230			g_kflag = 1;
1231			break;
1232
1233		case 'w':
1234			g_wflag = 1;
1235			break;
1236
1237		case 'W':
1238			g_Wflag = 1;
1239			break;
1240
1241		case 'g':
1242			g_gflag = 1;
1243			break;
1244
1245		case 'C':
1246		case 'E':
1247		case 'H':
1248		case 'I':
1249			for (i = 0; i < LS_MAX_EVENTS; i++)
1250				if (g_event_info[i].ev_type == c)
1251					g_enabled[i] = 1;
1252			events_specified = 1;
1253			break;
1254
1255		case 'A':
1256			for (i = 0; i < LS_MAX_EVENTS; i++)
1257				if (strchr("CH", g_event_info[i].ev_type))
1258					g_enabled[i] = 1;
1259			events_specified = 1;
1260			break;
1261
1262		case 'T':
1263			g_tracing = 1;
1264			break;
1265
1266		case 'D':
1267			if (!isdigit(optarg[0]))
1268				usage();
1269			g_topn = atoi(optarg);
1270			break;
1271
1272		case 'R':
1273			g_rates = 1;
1274			break;
1275
1276		case 'p':
1277			g_pflag = 1;
1278			break;
1279
1280		case 'P':
1281			g_Pflag = 1;
1282			break;
1283
1284		case 'o':
1285			if ((out = fopen(optarg, "w")) == NULL)
1286				fail(1, "error opening file");
1287			break;
1288
1289		case 'V':
1290			g_Vflag = 1;
1291			break;
1292
1293		default:
1294			if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1295				usage();
1296		}
1297	}
1298
1299	if (filt != NULL) {
1300		predicate_add(&g_predicate, filt, NULL, 0);
1301		filter_destroy(&filt);
1302	}
1303
1304	if (ifilt != NULL) {
1305		predicate_add(&g_ipredicate, ifilt, NULL, 0);
1306		filter_destroy(&ifilt);
1307	}
1308
1309	if (g_recsize == 0) {
1310		if (g_gflag) {
1311			g_stkdepth = LS_MAX_STACK_DEPTH;
1312			g_recsize = LS_STACK(g_stkdepth);
1313		} else {
1314			g_recsize = LS_TIME;
1315		}
1316	}
1317
1318	if (g_gflag && g_recsize <= LS_STACK(0))
1319		fail(0, "'-g' requires at least '-s 1' data gathering");
1320
1321	/*
1322	 * Make sure the alignment is reasonable
1323	 */
1324	g_recsize = -(-g_recsize & -sizeof (uint64_t));
1325
1326	for (i = 0; i < LS_MAX_EVENTS; i++) {
1327		/*
1328		 * If no events were specified, enable -C.
1329		 */
1330		if (!events_specified && g_event_info[i].ev_type == 'C')
1331			g_enabled[i] = 1;
1332	}
1333
1334	for (i = 0; i < LS_MAX_EVENTS; i++) {
1335		if (!g_enabled[i])
1336			continue;
1337
1338		if (g_event_info[i].ev_acquire != NULL) {
1339			/*
1340			 * If we've enabled a hold event, we must explicitly
1341			 * allocate dynamic variable space.
1342			 */
1343			dynvar = 1;
1344		}
1345
1346		dprog_addevent(i);
1347	}
1348
1349	/*
1350	 * Make sure there are remaining arguments to specify a child command
1351	 * to execute.
1352	 */
1353	if (argc <= optind)
1354		usage();
1355
1356	if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1357		dfail("couldn't determine number of online CPUs");
1358
1359	/*
1360	 * By default, we set our data buffer size to be the number of records
1361	 * multiplied by the size of the record, doubled to account for some
1362	 * DTrace slop and divided by the number of CPUs.  We silently clamp
1363	 * the aggregation size at both a minimum and a maximum to prevent
1364	 * absurdly low or high values.
1365	 */
1366	if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1367		aggsize = MIN_AGGSIZE;
1368
1369	if (aggsize > MAX_AGGSIZE)
1370		aggsize = MAX_AGGSIZE;
1371
1372	(void) sprintf(aggstr, "%lld", (long long)aggsize);
1373
1374	if (!g_tracing) {
1375		if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1376			dfail("failed to set 'bufsize'");
1377
1378		if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1379			dfail("failed to set 'aggsize'");
1380
1381		if (dynvar) {
1382			/*
1383			 * If we're using dynamic variables, we set our
1384			 * dynamic variable size to be one megabyte per CPU,
1385			 * with a hard-limit of 32 megabytes.  This may still
1386			 * be too small in some cases, but it can be tuned
1387			 * manually via -x if need be.
1388			 */
1389			(void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1390
1391			if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1392				dfail("failed to set 'dynvarsize'");
1393		}
1394	} else {
1395		if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1396			dfail("failed to set 'bufsize'");
1397	}
1398
1399	if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1400		dfail("failed to set 'statusrate'");
1401
1402	optind = 1;
1403	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1404		switch (c) {
1405		case 'x':
1406			if ((p = strchr(optarg, '=')) != NULL)
1407				*p++ = '\0';
1408
1409			if (dtrace_setopt(g_dtp, optarg, p) != 0)
1410				dfail("failed to set -x %s", optarg);
1411			break;
1412		}
1413	}
1414
1415	argc -= optind;
1416	argv += optind;
1417
1418	dprog_compile();
1419	status_init();
1420
1421	g_elapsed = -gethrtime();
1422
1423	/*
1424	 * Spawn the specified command and wait for it to complete.
1425	 */
1426	child = fork();
1427	if (child == -1)
1428		fail(1, "cannot fork");
1429	if (child == 0) {
1430		(void) dtrace_close(g_dtp);
1431		(void) execvp(argv[0], &argv[0]);
1432		exec_errno = errno;
1433		exit(127);
1434	}
1435
1436#ifdef illumos
1437	while (waitpid(child, &status, WEXITED) != child)
1438#else
1439	while (waitpid(child, &status, 0) != child)
1440#endif
1441		status_check();
1442
1443	g_elapsed += gethrtime();
1444
1445	if (WIFEXITED(status)) {
1446		if (WEXITSTATUS(status) != 0) {
1447			if (exec_errno != 0) {
1448				errno = exec_errno;
1449				fail(1, "could not execute %s", argv[0]);
1450			}
1451			(void) fprintf(stderr,
1452			    "lockstat: warning: %s exited with code %d\n",
1453			    argv[0], WEXITSTATUS(status));
1454		}
1455	} else {
1456		(void) fprintf(stderr,
1457		    "lockstat: warning: %s died on signal %d\n",
1458		    argv[0], WTERMSIG(status));
1459	}
1460
1461	if (dtrace_stop(g_dtp) == -1)
1462		dfail("failed to stop dtrace");
1463
1464	/*
1465	 * Before we read out the results, we need to allocate our buffer.
1466	 * If we're tracing, then we'll just use the precalculated size.  If
1467	 * we're not, then we'll take a snapshot of the aggregate, and walk
1468	 * it to count the number of records.
1469	 */
1470	if (!g_tracing) {
1471		if (dtrace_aggregate_snap(g_dtp) != 0)
1472			dfail("failed to snap aggregate");
1473
1474		g_nrecs = 0;
1475
1476		if (dtrace_aggregate_walk(g_dtp,
1477		    count_aggregate, &g_nrecs) != 0)
1478			dfail("failed to walk aggregate");
1479	}
1480
1481#ifdef illumos
1482	if ((data_buf = memalign(sizeof (uint64_t),
1483	    (g_nrecs + 1) * g_recsize)) == NULL)
1484#else
1485	if (posix_memalign((void **)&data_buf, sizeof (uint64_t),
1486	    (g_nrecs + 1) * g_recsize) )
1487#endif
1488		fail(1, "Memory allocation failed");
1489
1490	/*
1491	 * Read out the DTrace data.
1492	 */
1493	g_nrecs_used = process_data(out, data_buf);
1494
1495	if (g_nrecs_used > g_nrecs || g_dropped)
1496		(void) fprintf(stderr, "lockstat: warning: "
1497		    "ran out of data records (use -n for more)\n");
1498
1499	/* LINTED - alignment */
1500	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1501	    /* LINTED - alignment */
1502	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1503		ev_count[lsp->ls_event] += lsp->ls_count;
1504		ev_time[lsp->ls_event] += lsp->ls_time;
1505	}
1506
1507	/*
1508	 * If -g was specified, convert stacks into individual records.
1509	 */
1510	if (g_gflag) {
1511		lsrec_t *newlsp, *oldlsp;
1512
1513#ifdef illumos
1514		newlsp = memalign(sizeof (uint64_t),
1515		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1516#else
1517		posix_memalign((void **)&newlsp, sizeof (uint64_t),
1518		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1519#endif
1520		if (newlsp == NULL)
1521			fail(1, "Cannot allocate space for -g processing");
1522		lsp = newlsp;
1523		/* LINTED - alignment */
1524		for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1525		    /* LINTED - alignment */
1526		    oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1527			int fr;
1528			int caller_in_stack = 0;
1529
1530			if (oldlsp->ls_count == 0)
1531				continue;
1532
1533			for (fr = 0; fr < g_stkdepth; fr++) {
1534				if (oldlsp->ls_stack[fr] == 0)
1535					break;
1536				if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1537					caller_in_stack = 1;
1538				bcopy(oldlsp, lsp, LS_TIME);
1539				lsp->ls_caller = oldlsp->ls_stack[fr];
1540				/* LINTED - alignment */
1541				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1542			}
1543			if (!caller_in_stack) {
1544				bcopy(oldlsp, lsp, LS_TIME);
1545				/* LINTED - alignment */
1546				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1547			}
1548		}
1549		g_nrecs = g_nrecs_used =
1550		    ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1551		g_recsize = LS_TIME;
1552		g_stkdepth = 0;
1553		free(data_buf);
1554		data_buf = (char *)newlsp;
1555	}
1556
1557	if ((sort_buf = calloc(2 * (g_nrecs + 1),
1558	    sizeof (void *))) == NULL)
1559		fail(1, "Sort buffer allocation failed");
1560	merge_buf = sort_buf + (g_nrecs + 1);
1561
1562	/*
1563	 * Build the sort buffer, discarding zero-count records along the way.
1564	 */
1565	/* LINTED - alignment */
1566	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1567	    /* LINTED - alignment */
1568	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1569		if (lsp->ls_count == 0)
1570			lsp->ls_event = LS_MAX_EVENTS;
1571		sort_buf[i] = lsp;
1572	}
1573
1574	if (g_nrecs_used == 0)
1575		exit(0);
1576
1577	/*
1578	 * Add a sentinel after the last record
1579	 */
1580	sort_buf[i] = lsp;
1581	lsp->ls_event = LS_MAX_EVENTS;
1582
1583	if (g_tracing) {
1584		report_trace(out, sort_buf);
1585		return (0);
1586	}
1587
1588	/*
1589	 * Application of -g may have resulted in multiple records
1590	 * with the same signature; coalesce them.
1591	 */
1592	if (g_gflag) {
1593		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1594		coalesce(lockcmp, sort_buf, g_nrecs_used);
1595	}
1596
1597	/*
1598	 * Coalesce locks within the same symbol if -c option specified.
1599	 * Coalesce PCs within the same function if -k option specified.
1600	 */
1601	if (g_cflag || g_kflag) {
1602		for (i = 0; i < g_nrecs_used; i++) {
1603			int fr;
1604			lsp = sort_buf[i];
1605			if (g_cflag)
1606				coalesce_symbol(&lsp->ls_lock);
1607			if (g_kflag) {
1608				for (fr = 0; fr < g_stkdepth; fr++)
1609					coalesce_symbol(&lsp->ls_stack[fr]);
1610				coalesce_symbol(&lsp->ls_caller);
1611			}
1612		}
1613		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1614		coalesce(lockcmp, sort_buf, g_nrecs_used);
1615	}
1616
1617	/*
1618	 * Coalesce callers if -w option specified
1619	 */
1620	if (g_wflag) {
1621		mergesort(lock_and_count_cmp_anywhere,
1622		    sort_buf, merge_buf, g_nrecs_used);
1623		coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1624	}
1625
1626	/*
1627	 * Coalesce locks if -W option specified
1628	 */
1629	if (g_Wflag) {
1630		mergesort(site_and_count_cmp_anylock,
1631		    sort_buf, merge_buf, g_nrecs_used);
1632		coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1633	}
1634
1635	/*
1636	 * Sort data by contention count (ls_count) or total time (ls_time),
1637	 * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1638	 */
1639	if (g_recsize < LS_TIME)
1640		g_Pflag = 0;
1641
1642	if (g_Pflag)
1643		mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1644	else
1645		mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1646
1647	/*
1648	 * Display data by event type
1649	 */
1650	first = &sort_buf[0];
1651	while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1652		current = first;
1653		while ((lsp = *current)->ls_event == event)
1654			current++;
1655		report_stats(out, first, current - first, ev_count[event],
1656		    ev_time[event]);
1657		first = current;
1658	}
1659
1660	return (0);
1661}
1662
1663static char *
1664format_symbol(char *buf, uintptr_t addr, int show_size)
1665{
1666	uintptr_t symoff;
1667	char *symname;
1668	size_t symsize;
1669
1670	symname = addr_to_sym(addr, &symoff, &symsize);
1671
1672	if (show_size && symoff == 0)
1673		(void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1674	else if (symoff == 0)
1675		(void) sprintf(buf, "%s", symname);
1676	else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0)	/* CPU+PIL */
1677#ifdef illumos
1678		(void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1679#else
1680		(void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]);
1681#endif
1682	else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1683		(void) sprintf(buf, "%s+0x%llx", symname,
1684		    (unsigned long long)symoff);
1685	else
1686		(void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1687	return (buf);
1688}
1689
1690static void
1691report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1692	uint64_t total_time)
1693{
1694	uint32_t event = sort_buf[0]->ls_event;
1695	lsrec_t *lsp;
1696	double ptotal = 0.0;
1697	double percent;
1698	int i, j, fr;
1699	int displayed;
1700	int first_bin, last_bin, max_bin_count, total_bin_count;
1701	int rectype;
1702	char buf[256];
1703	char lhdr[80], chdr[80];
1704
1705	rectype = g_recsize;
1706
1707	if (g_topn == 0) {
1708		(void) fprintf(out, "%20llu %s\n",
1709		    g_rates == 0 ? total_count :
1710		    ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1711		    g_event_info[event].ev_desc);
1712		return;
1713	}
1714
1715	(void) sprintf(lhdr, "%s%s",
1716	    g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1717	(void) sprintf(chdr, "%s%s",
1718	    g_wflag ? "Hottest " : "", "Caller");
1719
1720	if (!g_pflag)
1721		(void) fprintf(out,
1722		    "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1723		    g_event_info[event].ev_desc, (double)total_count,
1724		    (double)g_elapsed / NANOSEC,
1725		    (double)total_count * NANOSEC / g_elapsed);
1726
1727	if (!g_pflag && rectype < LS_HIST) {
1728		(void) sprintf(buf, "%s", g_event_info[event].ev_units);
1729		(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1730		    g_rates ? "ops/s" : "Count",
1731		    g_gflag ? "genr" : "indv",
1732		    "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1733		(void) fprintf(out, "---------------------------------"
1734		    "----------------------------------------------\n");
1735	}
1736
1737	displayed = 0;
1738	for (i = 0; i < nrecs; i++) {
1739		lsp = sort_buf[i];
1740
1741		if (displayed++ >= g_topn)
1742			break;
1743
1744		if (g_pflag) {
1745			int j;
1746
1747			(void) fprintf(out, "%u %u",
1748			    lsp->ls_event, lsp->ls_count);
1749			(void) fprintf(out, " %s",
1750			    format_symbol(buf, lsp->ls_lock, g_cflag));
1751			(void) fprintf(out, " %s",
1752			    format_symbol(buf, lsp->ls_caller, 0));
1753			(void) fprintf(out, " %f",
1754			    (double)lsp->ls_refcnt / lsp->ls_count);
1755			if (rectype >= LS_TIME)
1756				(void) fprintf(out, " %llu",
1757				    (unsigned long long)lsp->ls_time);
1758			if (rectype >= LS_HIST) {
1759				for (j = 0; j < 64; j++)
1760					(void) fprintf(out, " %u",
1761					    lsp->ls_hist[j]);
1762			}
1763			for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1764				if (rectype <= LS_STACK(j) ||
1765				    lsp->ls_stack[j] == 0)
1766					break;
1767				(void) fprintf(out, " %s",
1768				    format_symbol(buf, lsp->ls_stack[j], 0));
1769			}
1770			(void) fprintf(out, "\n");
1771			continue;
1772		}
1773
1774		if (rectype >= LS_HIST) {
1775			(void) fprintf(out, "---------------------------------"
1776			    "----------------------------------------------\n");
1777			(void) sprintf(buf, "%s",
1778			    g_event_info[event].ev_units);
1779			(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1780			    g_rates ? "ops/s" : "Count",
1781			    g_gflag ? "genr" : "indv",
1782			    "cuml", "rcnt", buf, lhdr, chdr);
1783		}
1784
1785		if (g_Pflag && total_time != 0)
1786			percent = (lsp->ls_time * 100.00) / total_time;
1787		else
1788			percent = (lsp->ls_count * 100.00) / total_count;
1789
1790		ptotal += percent;
1791
1792		if (rectype >= LS_TIME)
1793			(void) sprintf(buf, "%llu",
1794			    (unsigned long long)(lsp->ls_time / lsp->ls_count));
1795		else
1796			buf[0] = '\0';
1797
1798		(void) fprintf(out, "%5llu ",
1799		    g_rates == 0 ? lsp->ls_count :
1800		    ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1801
1802		(void) fprintf(out, "%3.0f%% ", percent);
1803
1804		if (g_gflag)
1805			(void) fprintf(out, "---- ");
1806		else
1807			(void) fprintf(out, "%3.0f%% ", ptotal);
1808
1809		(void) fprintf(out, "%4.2f %8s ",
1810		    (double)lsp->ls_refcnt / lsp->ls_count, buf);
1811
1812		(void) fprintf(out, "%-22s ",
1813		    format_symbol(buf, lsp->ls_lock, g_cflag));
1814
1815		(void) fprintf(out, "%-24s\n",
1816		    format_symbol(buf, lsp->ls_caller, 0));
1817
1818		if (rectype < LS_HIST)
1819			continue;
1820
1821		(void) fprintf(out, "\n");
1822		(void) fprintf(out, "%10s %31s %-9s %-24s\n",
1823		    g_event_info[event].ev_units,
1824		    "------ Time Distribution ------",
1825		    g_rates ? "ops/s" : "count",
1826		    rectype > LS_STACK(0) ? "Stack" : "");
1827
1828		first_bin = 0;
1829		while (lsp->ls_hist[first_bin] == 0)
1830			first_bin++;
1831
1832		last_bin = 63;
1833		while (lsp->ls_hist[last_bin] == 0)
1834			last_bin--;
1835
1836		max_bin_count = 0;
1837		total_bin_count = 0;
1838		for (j = first_bin; j <= last_bin; j++) {
1839			total_bin_count += lsp->ls_hist[j];
1840			if (lsp->ls_hist[j] > max_bin_count)
1841				max_bin_count = lsp->ls_hist[j];
1842		}
1843
1844		/*
1845		 * If we went a few frames below the caller, ignore them
1846		 */
1847		for (fr = 3; fr > 0; fr--)
1848			if (lsp->ls_stack[fr] == lsp->ls_caller)
1849				break;
1850
1851		for (j = first_bin; j <= last_bin; j++) {
1852			uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1853			(void) fprintf(out, "%10llu |%s%s %-9u ",
1854			    1ULL << j,
1855			    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1856			    "                              " + depth,
1857			    g_rates == 0 ? lsp->ls_hist[j] :
1858			    (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1859			    g_elapsed));
1860			if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1861				(void) fprintf(out, "\n");
1862				continue;
1863			}
1864			(void) fprintf(out, "%-24s\n",
1865			    format_symbol(buf, lsp->ls_stack[fr], 0));
1866			fr++;
1867		}
1868		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1869			(void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1870			    format_symbol(buf, lsp->ls_stack[fr], 0));
1871			fr++;
1872		}
1873	}
1874
1875	if (!g_pflag)
1876		(void) fprintf(out, "---------------------------------"
1877		    "----------------------------------------------\n");
1878
1879	(void) fflush(out);
1880}
1881
1882static void
1883report_trace(FILE *out, lsrec_t **sort_buf)
1884{
1885	lsrec_t *lsp;
1886	int i, fr;
1887	int rectype;
1888	char buf[256], buf2[256];
1889
1890	rectype = g_recsize;
1891
1892	if (!g_pflag) {
1893		(void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1894		    "Event", "Time", "Owner", "Lock", "Caller");
1895		(void) fprintf(out, "---------------------------------"
1896		    "----------------------------------------------\n");
1897	}
1898
1899	for (i = 0; i < g_nrecs_used; i++) {
1900
1901		lsp = sort_buf[i];
1902
1903		if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1904			continue;
1905
1906		(void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1907		    lsp->ls_event, (unsigned long long)lsp->ls_time,
1908		    (void *)lsp->ls_next,
1909		    format_symbol(buf, lsp->ls_lock, 0),
1910		    format_symbol(buf2, lsp->ls_caller, 0));
1911
1912		if (rectype <= LS_STACK(0))
1913			continue;
1914
1915		/*
1916		 * If we went a few frames below the caller, ignore them
1917		 */
1918		for (fr = 3; fr > 0; fr--)
1919			if (lsp->ls_stack[fr] == lsp->ls_caller)
1920				break;
1921
1922		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1923			(void) fprintf(out, "%53s  %-24s\n", "",
1924			    format_symbol(buf, lsp->ls_stack[fr], 0));
1925			fr++;
1926		}
1927		(void) fprintf(out, "\n");
1928	}
1929
1930	(void) fflush(out);
1931}
1932