1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <stdio.h>
29#include <stddef.h>
30#include <stdlib.h>
31#include <stdarg.h>
32#include <string.h>
33#include <strings.h>
34#include <ctype.h>
35#include <fcntl.h>
36#include <unistd.h>
37#include <errno.h>
38#include <limits.h>
39#include <sys/types.h>
40#include <sys/modctl.h>
41#include <sys/stat.h>
42#include <sys/wait.h>
43#include <dtrace.h>
44#include <sys/lockstat.h>
45#include <alloca.h>
46#include <signal.h>
47#include <assert.h>
48
/*
 * Platform glue.  On illumos only GETOPT_EOF needs to be defined.  Elsewhere
 * the time/resource headers are pulled in, calls spelled mergesort() are
 * routed to this file's lsmergesort(), GETOPT_EOF is -1, and pc_t (the type
 * of one stack frame entry) is supplied locally.
 */
#ifdef illumos
#define	GETOPT_EOF	EOF
#else
#include <sys/time.h>
#include <sys/resource.h>

#define	mergesort(a, b, c, d)	lsmergesort(a, b, c, d)
#define	GETOPT_EOF		(-1)

typedef	uintptr_t	pc_t;
#endif
60
61#define	LOCKSTAT_OPTSTR	"x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"
62
63#define	LS_MAX_STACK_DEPTH	50
64#define	LS_MAX_EVENTS		64
65
/*
 * One lock-statistics record.  The members are ordered so that a record can
 * be cut short after ls_refcnt, ls_time or ls_hist: the LS_BASIC, LS_TIME,
 * LS_HIST and LS_STACK() sizes are offsets into this structure, and records
 * are stored g_recsize bytes apart rather than sizeof (lsrec_t) apart.
 */
typedef struct lsrec {
	struct lsrec	*ls_next;	/* next in hash chain */
#ifdef illumos
	uintptr_t	ls_lock;	/* lock address */
#else
	char		*ls_lock;	/* lock name */
#endif
	uintptr_t	ls_caller;	/* caller address */
	uint32_t	ls_count;	/* cumulative event count */
	uint32_t	ls_event;	/* type of event */
	uintptr_t	ls_refcnt;	/* cumulative reference count */
	uint64_t	ls_time;	/* cumulative event duration */
	uint32_t	ls_hist[64];	/* log2(duration) histogram */
	/* stack frames as recorded by stack(); frame 0 is not stored */
	uintptr_t	ls_stack[LS_MAX_STACK_DEPTH];
} lsrec_t;
81
/*
 * Cursor handed to the DTrace consume/aggregate-walk callbacks: it tracks
 * where the next record is to be written and how many have been written.
 */
typedef struct lsdata {
	struct lsrec	*lsd_next;	/* next available */
	int		lsd_count;	/* number of records */
} lsdata_t;
86
87/*
88 * Definitions for the types of experiments which can be run.  They are
89 * listed in increasing order of memory cost and processing time cost.
90 * The numerical value of each type is the number of bytes needed per record.
91 */
92#define	LS_BASIC	offsetof(lsrec_t, ls_time)
93#define	LS_TIME		offsetof(lsrec_t, ls_hist[0])
94#define	LS_HIST		offsetof(lsrec_t, ls_stack[0])
95#define	LS_STACK(depth)	offsetof(lsrec_t, ls_stack[depth])
96
97static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
98static void report_trace(FILE *, lsrec_t **);
99
100extern int symtab_init(void);
101extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
102extern uintptr_t sym_to_addr(char *name);
103extern size_t sym_size(char *name);
104extern char *strtok_r(char *, const char *, char **);
105
106#define	DEFAULT_NRECS	10000
107#define	DEFAULT_HZ	97
108#define	MAX_HZ		1000
109#define	MIN_AGGSIZE	(16 * 1024)
110#define	MAX_AGGSIZE	(32 * 1024 * 1024)
111
/*
 * Program-wide state.  Only the variables whose use is visible in the code
 * below are annotated; the remaining flags are set and consumed elsewhere.
 */
static int g_stkdepth;		/* ls_stack[] entries compared when sorting */
static int g_topn = INT_MAX;
static hrtime_t g_elapsed;
static int g_rates = 0;
static int g_pflag = 0;
static int g_Pflag = 0;
static int g_wflag = 0;
static int g_Wflag = 0;
static int g_cflag = 0;
static int g_kflag = 0;
static int g_gflag = 0;
static int g_Vflag = 0;		/* print the generated D program */
static int g_tracing = 0;	/* emit trace() records, not aggregations */
static size_t g_recsize;	/* bytes per record (see the LS_* sizes) */
static size_t g_nrecs;		/* upper bound on records stored */
static int g_nrecs_used;
static uchar_t g_enabled[LS_MAX_EVENTS];
static hrtime_t g_min_duration[LS_MAX_EVENTS];	/* 0 means no minimum */
static dtrace_hdl_t *g_dtp;	/* libdtrace handle */
static char *g_predicate;	/* predicate for non-interrupt events */
static char *g_ipredicate;	/* predicate for interrupt ('I') events */
static char *g_prog;		/* generated D program text */
static int g_proglen;		/* bytes in g_prog, including the NUL */
static int g_dropped;		/* drop notifications received */
136
/*
 * Static description of one event type; see g_event_info[].
 */
typedef struct ls_event_info {
	char	ev_type;		/* class letter: 'C', 'H', 'I' or 'E' */
	/* label for the lock field, e.g. "Lock" (presumably a report header) */
	char	ev_lhdr[20];
	char	ev_desc[80];		/* description shown by show_events() */
	char	ev_units[10];		/* units string, e.g. "nsec" */
	char	ev_name[DTRACE_NAMELEN]; /* probe(s) for the main clause */
	char	*ev_predicate;		/* extra D predicate, or NULL */
	char	*ev_acquire;		/* probe for the acquire clause, or NULL */
} ls_event_info_t;
146
/*
 * Event table, indexed by event number (the number accepted by -e and the
 * value stored in ls_event).  Slots 0-31 are contention ('C') events, 32-55
 * are hold ('H') events, 56-59 are interrupt ('I') events and 60-63 are
 * error ('E') events.  Unused slots carry an "Unknown event" description and
 * an empty probe name; dprog_addevent() generates nothing for an entry whose
 * ev_name is empty.
 */
static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
	{ 'C',	"Lock",	"Adaptive mutex spin",			"nsec",
	    "lockstat:::adaptive-spin" },
	{ 'C',	"Lock",	"Adaptive mutex block",			"nsec",
	    "lockstat:::adaptive-block" },
	{ 'C',	"Lock",	"Spin lock spin",			"nsec",
	    "lockstat:::spin-spin" },
	{ 'C',	"Lock",	"Thread lock spin",			"nsec",
	    "lockstat:::thread-spin" },
	{ 'C',	"Lock",	"R/W writer blocked by writer",		"nsec",
	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
	{ 'C',	"Lock",	"R/W writer blocked by readers",	"nsec",
	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
	{ 'C',	"Lock",	"R/W reader blocked by writer",		"nsec",
	    "lockstat:::rw-block", "arg2 == 1 && arg3 == 1" },
	{ 'C',	"Lock",	"R/W reader blocked by write wanted",	"nsec",
	    "lockstat:::rw-block", "arg2 == 1 && arg3 == 0 && arg4" },
	{ 'C',	"Lock",	"R/W writer spin on writer",		"nsec",
	    "lockstat:::rw-spin", "arg2 == 0 && arg3 == 1" },
	{ 'C',	"Lock",	"R/W writer spin on readers",		"nsec",
	    "lockstat:::rw-spin", "arg2 == 0 && arg3 == 0 && arg4" },
	{ 'C',	"Lock",	"R/W reader spin on writer",		"nsec",
	    "lockstat:::rw-spin", "arg2 == 1 && arg3 == 1" },
	{ 'C',	"Lock",	"R/W reader spin on write wanted",	"nsec",
	    "lockstat:::rw-spin", "arg2 == 1 && arg3 == 0 && arg4" },
	{ 'C',	"Lock",	"SX exclusive block",			"nsec",
	    "lockstat:::sx-block", "arg2 == 0" },
	{ 'C',	"Lock",	"SX shared block",			"nsec",
	    "lockstat:::sx-block", "arg2 == 1" },
	{ 'C',	"Lock",	"SX exclusive spin",			"nsec",
	    "lockstat:::sx-spin", "arg2 == 0" },
	{ 'C',	"Lock",	"SX shared spin",			"nsec",
	    "lockstat:::sx-spin", "arg2 == 1" },
	{ 'C',	"Lock",	"lockmgr writer blocked by writer",	"nsec",
	    "lockstat:::lockmgr-block", "arg2 == 0 && arg3 == 1" },
	{ 'C',	"Lock",	"lockmgr writer blocked by readers",	"nsec",
	    "lockstat:::lockmgr-block", "arg2 == 0 && arg3 == 0 && arg4" },
	{ 'C',	"Lock",	"lockmgr reader blocked by writer",	"nsec",
	    "lockstat:::lockmgr-block", "arg2 == 1 && arg3 == 1" },
	{ 'C',	"Lock",	"lockmgr reader blocked by write wanted", "nsec",
	    "lockstat:::lockmgr-block", "arg2 == 1 && arg3 == 0 && arg4" },
	{ 'C',	"Lock",	"Unknown event (type 20)",		"units"	},
	{ 'C',	"Lock",	"Unknown event (type 21)",		"units"	},
	{ 'C',	"Lock",	"Unknown event (type 22)",		"units"	},
	{ 'C',	"Lock",	"Unknown event (type 23)",		"units"	},
	{ 'C',	"Lock",	"Unknown event (type 24)",		"units"	},
	{ 'C',	"Lock",	"Unknown event (type 25)",		"units"	},
	{ 'C',	"Lock",	"Unknown event (type 26)",		"units"	},
	{ 'C',	"Lock",	"Unknown event (type 27)",		"units"	},
	{ 'C',	"Lock",	"Unknown event (type 28)",		"units"	},
	{ 'C',	"Lock",	"Unknown event (type 29)",		"units"	},
	{ 'C',	"Lock",	"Unknown event (type 30)",		"units"	},
	{ 'C',	"Lock",	"Unknown event (type 31)",		"units"	},
	{ 'H',	"Lock",	"Adaptive mutex hold",			"nsec",
	    "lockstat:::adaptive-release", NULL,
	    "lockstat:::adaptive-acquire" },
	{ 'H',	"Lock",	"Spin lock hold",			"nsec",
	    "lockstat:::spin-release", NULL,
	    "lockstat:::spin-acquire" },
	{ 'H',	"Lock",	"R/W writer hold",			"nsec",
	    "lockstat:::rw-release", "arg1 == 0",
	    "lockstat:::rw-acquire" },
	{ 'H',	"Lock",	"R/W reader hold",			"nsec",
	    "lockstat:::rw-release", "arg1 == 1",
	    "lockstat:::rw-acquire" },
	{ 'H',	"Lock",	"SX shared hold",			"nsec",
	    "lockstat:::sx-release", "arg1 == 1",
	    "lockstat:::sx-acquire" },
	{ 'H',	"Lock",	"SX exclusive hold",			"nsec",
	    "lockstat:::sx-release", "arg1 == 0",
	    "lockstat:::sx-acquire" },
	{ 'H',	"Lock",	"lockmgr shared hold",			"nsec",
	    "lockstat:::lockmgr-release", "arg1 == 1",
	    "lockstat:::lockmgr-acquire" },
	{ 'H',	"Lock",	"lockmgr exclusive hold",		"nsec",
	    "lockstat:::lockmgr-release,lockstat:::lockmgr-disown", "arg1 == 0",
	    "lockstat:::lockmgr-acquire" },
	{ 'H',	"Lock",	"Unknown event (type 40)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 41)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 42)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 43)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 44)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 45)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 46)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 47)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 48)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 49)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 50)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 51)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 52)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 53)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 54)",		"units"	},
	{ 'H',	"Lock",	"Unknown event (type 55)",		"units"	},
#ifdef illumos
	{ 'I',	"CPU+PIL", "Profiling interrupt",		"nsec",
#else
	{ 'I',	"CPU+Pri_Class", "Profiling interrupt",		"nsec",
#endif
	    "profile:::profile-97", NULL },
	{ 'I',	"Lock",	"Unknown event (type 57)",		"units"	},
	{ 'I',	"Lock",	"Unknown event (type 58)",		"units"	},
	{ 'I',	"Lock",	"Unknown event (type 59)",		"units"	},
	{ 'E',	"Lock",	"Recursive lock entry detected",	"(N/A)",
	    "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
	{ 'E',	"Lock",	"Lockstat enter failure",		"(N/A)"	},
	{ 'E',	"Lock",	"Lockstat exit failure",		"nsec"	},
	{ 'E',	"Lock",	"Lockstat record failure",		"(N/A)"	},
};
255
#ifndef illumos
/*
 * Short names for priority classes on the non-illumos build, where the
 * interrupt-event clause folds curthread->td_pri_class into the value
 * recorded in the lock field.
 * NOTE(review): presumably indexed by that class value when reporting; the
 * consumer is outside this section -- confirm.
 */
static char *g_pri_class[] = {
	"",
	"Intr",
	"RealT",
	"TShar",
	"Idle"
};
#endif
265
/*
 * Report a fatal error and exit with status 2.
 *
 * The output on stderr is "lockstat: " followed by the printf-style message;
 * when do_perror is non-zero, ": " and the strerror() text for the errno
 * value that was current on entry are appended.  errno is captured first so
 * that the stdio calls made here cannot disturb it.
 */
static void
fail(int do_perror, const char *message, ...)
{
	const int saved = errno;
	va_list ap;

	(void) fputs("lockstat: ", stderr);

	va_start(ap, message);
	(void) vfprintf(stderr, message, ap);
	va_end(ap);

	if (do_perror != 0)
		(void) fprintf(stderr, ": %s", strerror(saved));

	(void) fputc('\n', stderr);
	exit(2);
}
281
282static void
283dfail(const char *message, ...)
284{
285	va_list args;
286
287	va_start(args, message);
288	(void) fprintf(stderr, "lockstat: ");
289	(void) vfprintf(stderr, message, args);
290	va_end(args);
291	(void) fprintf(stderr, ": %s\n",
292	    dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
293
294	exit(2);
295}
296
297static void
298show_events(char event_type, char *desc)
299{
300	int i, first = -1, last;
301
302	for (i = 0; i < LS_MAX_EVENTS; i++) {
303		ls_event_info_t *evp = &g_event_info[i];
304		if (evp->ev_type != event_type ||
305		    strncmp(evp->ev_desc, "Unknown event", 13) == 0)
306			continue;
307		if (first == -1)
308			first = i;
309		last = i;
310	}
311
312	(void) fprintf(stderr,
313	    "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
314	    desc, event_type, first, last);
315
316	for (i = first; i <= last; i++)
317		(void) fprintf(stderr,
318		    "%4d = %s\n", i, g_event_info[i].ev_desc);
319}
320
321static void
322usage(void)
323{
324	(void) fprintf(stderr,
325	    "Usage: lockstat [options] command [args]\n"
326	    "\nGeneral options:\n\n"
327	    "  -V              print the corresponding D program\n"
328	    "\nEvent selection options:\n\n"
329	    "  -C              watch contention events [on by default]\n"
330	    "  -E              watch error events [off by default]\n"
331	    "  -H              watch hold events [off by default]\n"
332	    "  -I              watch interrupt events [off by default]\n"
333	    "  -A              watch all lock events [equivalent to -CH]\n"
334	    "  -e event_list   only watch the specified events (shown below);\n"
335	    "                  <event_list> is a comma-separated list of\n"
336	    "                  events or ranges of events, e.g. 1,4-7,35\n"
337	    "  -i rate         interrupt rate for -I [default: %d Hz]\n"
338	    "\nData gathering options:\n\n"
339	    "  -b              basic statistics (lock, caller, event count)\n"
340	    "  -t              timing for all events [default]\n"
341	    "  -h              histograms for event times\n"
342	    "  -s depth        stack traces <depth> deep\n"
343	    "  -x opt[=val]    enable or modify DTrace options\n"
344	    "\nData filtering options:\n\n"
345	    "  -n nrecords     maximum number of data records [default: %d]\n"
346	    "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
347	    "                  symbolic name or hex address; <size> defaults\n"
348	    "                  to the ELF symbol size if available, 1 if not\n"
349	    "  -f func[,size]  only watch events generated by <func>\n"
350	    "  -d duration     only watch events longer than <duration>\n"
351	    "  -T              trace (rather than sample) events\n"
352	    "\nData reporting options:\n\n"
353#ifdef illumos
354	    "  -c              coalesce lock data for arrays like pse_mutex[]\n"
355#endif
356	    "  -k              coalesce PCs within functions\n"
357	    "  -g              show total events generated by function\n"
358	    "  -w              wherever: don't distinguish events by caller\n"
359	    "  -W              whichever: don't distinguish events by lock\n"
360	    "  -R              display rates rather than counts\n"
361	    "  -p              parsable output format (awk(1)-friendly)\n"
362	    "  -P              sort lock data by (count * avg_time) product\n"
363	    "  -D n            only display top <n> events of each type\n"
364	    "  -o filename     send output to <filename>\n",
365	    DEFAULT_HZ, DEFAULT_NRECS);
366
367	show_events('C', "Contention");
368	show_events('H', "Hold-time");
369	show_events('I', "Interrupt");
370	show_events('E', "Error");
371	(void) fprintf(stderr, "\n");
372
373	exit(1);
374}
375
376static int
377lockcmp(lsrec_t *a, lsrec_t *b)
378{
379	int i;
380
381	if (a->ls_event < b->ls_event)
382		return (-1);
383	if (a->ls_event > b->ls_event)
384		return (1);
385
386	for (i = g_stkdepth - 1; i >= 0; i--) {
387		if (a->ls_stack[i] < b->ls_stack[i])
388			return (-1);
389		if (a->ls_stack[i] > b->ls_stack[i])
390			return (1);
391	}
392
393	if (a->ls_caller < b->ls_caller)
394		return (-1);
395	if (a->ls_caller > b->ls_caller)
396		return (1);
397
398#ifdef illumos
399	if (a->ls_lock < b->ls_lock)
400		return (-1);
401	if (a->ls_lock > b->ls_lock)
402		return (1);
403
404	return (0);
405#else
406	return (strcmp(a->ls_lock, b->ls_lock));
407#endif
408}
409
410static int
411countcmp(lsrec_t *a, lsrec_t *b)
412{
413	if (a->ls_event < b->ls_event)
414		return (-1);
415	if (a->ls_event > b->ls_event)
416		return (1);
417
418	return (b->ls_count - a->ls_count);
419}
420
421static int
422timecmp(lsrec_t *a, lsrec_t *b)
423{
424	if (a->ls_event < b->ls_event)
425		return (-1);
426	if (a->ls_event > b->ls_event)
427		return (1);
428
429	if (a->ls_time < b->ls_time)
430		return (1);
431	if (a->ls_time > b->ls_time)
432		return (-1);
433
434	return (0);
435}
436
437static int
438lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
439{
440	if (a->ls_event < b->ls_event)
441		return (-1);
442	if (a->ls_event > b->ls_event)
443		return (1);
444
445#ifdef illumos
446	if (a->ls_lock < b->ls_lock)
447		return (-1);
448	if (a->ls_lock > b->ls_lock)
449		return (1);
450
451	return (0);
452#else
453	return (strcmp(a->ls_lock, b->ls_lock));
454#endif
455}
456
457static int
458lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
459{
460#ifndef illumos
461	int cmp;
462#endif
463
464	if (a->ls_event < b->ls_event)
465		return (-1);
466	if (a->ls_event > b->ls_event)
467		return (1);
468
469#ifdef illumos
470	if (a->ls_lock < b->ls_lock)
471		return (-1);
472	if (a->ls_lock > b->ls_lock)
473		return (1);
474#else
475	cmp = strcmp(a->ls_lock, b->ls_lock);
476	if (cmp != 0)
477		return (cmp);
478#endif
479
480	return (b->ls_count - a->ls_count);
481}
482
483static int
484sitecmp_anylock(lsrec_t *a, lsrec_t *b)
485{
486	int i;
487
488	if (a->ls_event < b->ls_event)
489		return (-1);
490	if (a->ls_event > b->ls_event)
491		return (1);
492
493	for (i = g_stkdepth - 1; i >= 0; i--) {
494		if (a->ls_stack[i] < b->ls_stack[i])
495			return (-1);
496		if (a->ls_stack[i] > b->ls_stack[i])
497			return (1);
498	}
499
500	if (a->ls_caller < b->ls_caller)
501		return (-1);
502	if (a->ls_caller > b->ls_caller)
503		return (1);
504
505	return (0);
506}
507
508static int
509site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
510{
511	int i;
512
513	if (a->ls_event < b->ls_event)
514		return (-1);
515	if (a->ls_event > b->ls_event)
516		return (1);
517
518	for (i = g_stkdepth - 1; i >= 0; i--) {
519		if (a->ls_stack[i] < b->ls_stack[i])
520			return (-1);
521		if (a->ls_stack[i] > b->ls_stack[i])
522			return (1);
523	}
524
525	if (a->ls_caller < b->ls_caller)
526		return (-1);
527	if (a->ls_caller > b->ls_caller)
528		return (1);
529
530	return (b->ls_count - a->ls_count);
531}
532
533static void
534lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
535{
536	int m = n / 2;
537	int i, j;
538
539	if (m > 1)
540		lsmergesort(cmp, a, b, m);
541	if (n - m > 1)
542		lsmergesort(cmp, a + m, b + m, n - m);
543	for (i = m; i > 0; i--)
544		b[i - 1] = a[i - 1];
545	for (j = m - 1; j < n - 1; j++)
546		b[n + m - j - 2] = a[j + 1];
547	while (i < j)
548		*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
549	*a = b[i];
550}
551
552static void
553coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
554{
555	int i, j;
556	lsrec_t *target, *current;
557
558	target = lock[0];
559
560	for (i = 1; i < n; i++) {
561		current = lock[i];
562		if (cmp(current, target) != 0) {
563			target = current;
564			continue;
565		}
566		current->ls_event = LS_MAX_EVENTS;
567		target->ls_count += current->ls_count;
568		target->ls_refcnt += current->ls_refcnt;
569		if (g_recsize < LS_TIME)
570			continue;
571		target->ls_time += current->ls_time;
572		if (g_recsize < LS_HIST)
573			continue;
574		for (j = 0; j < 64; j++)
575			target->ls_hist[j] += current->ls_hist[j];
576	}
577}
578
/*
 * Look *addrp up with addr_to_sym(), which reports an offset and a size for
 * the address.  When a symbol is found and the offset is smaller than the
 * size, the offset is subtracted from *addrp; otherwise *addrp is left alone.
 */
static void
coalesce_symbol(uintptr_t *addrp)
{
	uintptr_t offset;
	size_t length;

	if (addr_to_sym(*addrp, &offset, &length) == NULL)
		return;

	if (offset >= length)
		return;

	*addrp -= offset;
}
588
/*
 * AND another term onto the heap-allocated D predicate string *pred.
 *
 *   pred   in/out; *pred may be NULL, meaning "no predicate yet".  On return
 *          it points to a newly allocated string and the old one is freed.
 *   what   the term to add; if NULL the call does nothing.
 *   cmp    optional comparison operator.  When non-NULL the term becomes
 *          "<what> <cmp> <value>", with value formatted by %p.
 *   value  right-hand operand, used only when cmp is non-NULL.
 *
 * With an existing non-empty predicate the result is "(<old>) && (<term>)";
 * otherwise it is just "<term>".
 *
 * The string is formatted twice: once against a zero-length buffer to learn
 * the exact size, then into a buffer of that size.  This avoids guessing how
 * wide %p and cmp are.  Any allocation or formatting failure is fatal and is
 * reported in the same form as fail(1, ...): message on stderr, exit 2.
 */
static void
predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
{
	const char *old;
	char *new = NULL;
	size_t size = 0;
	int pass, len = -1;

	if (what == NULL)
		return;

	old = (*pred != NULL) ? *pred : "";

	for (pass = 0; pass < 2; pass++) {
		if (old[0] != '\0' && cmp != NULL) {
			len = snprintf(new, size, "(%s) && (%s %s %p)",
			    old, what, cmp, (void *)value);
		} else if (old[0] != '\0') {
			len = snprintf(new, size, "(%s) && (%s)", old, what);
		} else if (cmp != NULL) {
			len = snprintf(new, size, "%s %s %p",
			    what, cmp, (void *)value);
		} else {
			len = snprintf(new, size, "%s", what);
		}

		if (len < 0)
			break;

		if (pass == 0) {
			size = (size_t)len + 1;
			if ((new = malloc(size)) == NULL) {
				len = -1;
				break;
			}
		}
	}

	if (len < 0) {
		(void) fprintf(stderr,
		    "lockstat: failed to build predicate: %s\n",
		    strerror(errno));
		exit(2);
	}

	free(*pred);
	*pred = new;
}
626
/*
 * Release the predicate string *pred (which may already be NULL) and leave
 * *pred set to NULL.
 */
static void
predicate_destroy(char **pred)
{
	char *doomed = *pred;

	*pred = NULL;
	free(doomed);
}
633
/*
 * OR another address-range test onto the heap-allocated D filter *filt.
 *
 *   filt   in/out; *filt may be NULL, meaning "no filter yet".  On return it
 *          points to a newly allocated string and the old one is freed.
 *   what   the D expression being range-checked.
 *   base   inclusive lower bound of the range.
 *   size   length of the range; the exclusive upper bound is base + size.
 *
 * The appended term is "(<what> >= <base> && <what> < <base+size>)", joined
 * to any existing filter text with " || ".
 *
 * The string is formatted twice: once against a zero-length buffer to learn
 * the exact size, then into a buffer of that size, so no fixed-size scratch
 * buffer limits the length of <what>.  Any allocation or formatting failure
 * is fatal and is reported in the same form as fail(1, ...): message on
 * stderr, exit 2.
 */
static void
filter_add(char **filt, char *what, uintptr_t base, size_t size)
{
	const char *old = (*filt != NULL) ? *filt : "";
	const char *sep = (old[0] != '\0') ? " || " : "";
	char *new = NULL;
	size_t cap = 0;
	int pass, len = -1;

	for (pass = 0; pass < 2; pass++) {
#ifdef illumos
		len = snprintf(new, cap, "%s%s(%s >= 0x%p && %s < 0x%p)",
		    old, sep, what, (void *)base, what,
		    (void *)(base + size));
#else
		len = snprintf(new, cap, "%s%s(%s >= %p && %s < %p)",
		    old, sep, what, (void *)base, what,
		    (void *)(base + size));
#endif
		if (len < 0)
			break;

		if (pass == 0) {
			cap = (size_t)len + 1;
			if ((new = malloc(cap)) == NULL) {
				len = -1;
				break;
			}
		}
	}

	if (len < 0) {
		(void) fprintf(stderr,
		    "lockstat: failed to build filter: %s\n",
		    strerror(errno));
		exit(2);
	}

	free(*filt);
	*filt = new;
}
660
/*
 * Release the filter string *filt (which may already be NULL) and leave
 * *filt set to NULL.
 */
static void
filter_destroy(char **filt)
{
	char *doomed = *filt;

	*filt = NULL;
	free(doomed);
}
667
668static void
669dprog_add(const char *fmt, ...)
670{
671	va_list args;
672	int size, offs;
673	char c;
674
675	va_start(args, fmt);
676	size = vsnprintf(&c, 1, fmt, args) + 1;
677	va_end(args);
678
679	if (g_proglen == 0) {
680		offs = 0;
681	} else {
682		offs = g_proglen - 1;
683	}
684
685	g_proglen = offs + size;
686
687	if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
688		fail(1, "failed to reallocate program text");
689
690	va_start(args, fmt);
691	(void) vsnprintf(&g_prog[offs], size, fmt, args);
692	va_end(args);
693}
694
695/*
696 * This function may read like an open sewer, but keep in mind that programs
697 * that generate other programs are rarely pretty.  If one has the unenviable
698 * task of maintaining or -- worse -- extending this code, use the -V option
699 * to examine the D program as generated by this function.
700 */
701static void
702dprog_addevent(int event)
703{
704	ls_event_info_t *info = &g_event_info[event];
705	char *pred = NULL;
706	char stack[20];
707	const char *arg0, *caller;
708	char *arg1 = "arg1";
709	char buf[80];
710	hrtime_t dur;
711	int depth;
712
713	if (info->ev_name[0] == '\0')
714		return;
715
716	if (info->ev_type == 'I') {
717		/*
718		 * For interrupt events, arg0 (normally the lock pointer) is
719		 * the CPU address plus the current pil, and arg1 (normally
720		 * the number of nanoseconds) is the number of nanoseconds
721		 * late -- and it's stored in arg2.
722		 */
723#ifdef illumos
724		arg0 = "(uintptr_t)curthread->t_cpu + \n"
725		    "\t    curthread->t_cpu->cpu_profile_pil";
726#else
727		arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n"
728		    "\t    0x01000000 + curthread->td_pri_class";
729#endif
730		caller = "(uintptr_t)arg0";
731		arg1 = "arg2";
732	} else {
733#ifdef illumos
734		arg0 = "(uintptr_t)arg0";
735#else
736		arg0 = "stringof(args[0]->lock_object.lo_name)";
737#endif
738		caller = "caller";
739	}
740
741	if (g_recsize > LS_HIST) {
742		for (depth = 0; g_recsize > LS_STACK(depth); depth++)
743			continue;
744
745		if (g_tracing) {
746			(void) sprintf(stack, "\tstack(%d);\n", depth);
747		} else {
748			(void) sprintf(stack, ", stack(%d)", depth);
749		}
750	} else {
751		(void) sprintf(stack, "");
752	}
753
754	if (info->ev_acquire != NULL) {
755		/*
756		 * If this is a hold event, we need to generate an additional
757		 * clause for the acquire; the clause for the release will be
758		 * generated with the aggregating statement, below.
759		 */
760		dprog_add("%s\n", info->ev_acquire);
761		predicate_add(&pred, info->ev_predicate, NULL, 0);
762		predicate_add(&pred, g_predicate, NULL, 0);
763		if (pred != NULL)
764			dprog_add("/%s/\n", pred);
765
766		dprog_add("{\n");
767		(void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
768
769		if (info->ev_type == 'H') {
770			dprog_add("\t%s = timestamp;\n", buf);
771		} else {
772			/*
773			 * If this isn't a hold event, it's the recursive
774			 * error event.  For this, we simply bump the
775			 * thread-local, per-lock count.
776			 */
777			dprog_add("\t%s++;\n", buf);
778		}
779
780		dprog_add("}\n\n");
781		predicate_destroy(&pred);
782		pred = NULL;
783
784		if (info->ev_type == 'E') {
785			/*
786			 * If this is the recursive lock error event, we need
787			 * to generate an additional clause to decrement the
788			 * thread-local, per-lock count.  This assures that we
789			 * only execute the aggregating clause if we have
790			 * recursive entry.
791			 */
792			dprog_add("%s\n", info->ev_name);
793			dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
794		}
795
796		predicate_add(&pred, buf, NULL, 0);
797
798		if (info->ev_type == 'H') {
799			(void) sprintf(buf, "timestamp -\n\t    "
800			    "self->ev%d[(uintptr_t)arg0]", event);
801		}
802
803		arg1 = buf;
804	} else {
805		predicate_add(&pred, info->ev_predicate, NULL, 0);
806		if (info->ev_type != 'I')
807			predicate_add(&pred, g_predicate, NULL, 0);
808		else
809			predicate_add(&pred, g_ipredicate, NULL, 0);
810	}
811
812	if ((dur = g_min_duration[event]) != 0)
813		predicate_add(&pred, arg1, ">=", dur);
814
815	dprog_add("%s\n", info->ev_name);
816
817	if (pred != NULL)
818		dprog_add("/%s/\n", pred);
819	predicate_destroy(&pred);
820
821	dprog_add("{\n");
822
823	if (g_tracing) {
824		dprog_add("\ttrace(%dULL);\n", event);
825		dprog_add("\ttrace(%s);\n", arg0);
826		dprog_add("\ttrace(%s);\n", caller);
827		dprog_add(stack);
828	} else {
829		/*
830		 * The ordering here is important:  when we process the
831		 * aggregate, we count on the fact that @avg appears before
832		 * @hist in program order to assure that @avg is assigned the
833		 * first aggregation variable ID and @hist assigned the
834		 * second; see the comment in process_aggregate() for details.
835		 */
836		dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
837		    event, arg0, caller, stack, arg1);
838
839		if (g_recsize >= LS_HIST) {
840			dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
841			    "(%s);\n", event, arg0, caller, stack, arg1);
842		}
843	}
844
845	if (info->ev_acquire != NULL)
846		dprog_add("\tself->ev%d[arg0] = 0;\n", event);
847
848	dprog_add("}\n\n");
849}
850
851static void
852dprog_compile()
853{
854	dtrace_prog_t *prog;
855	dtrace_proginfo_t info;
856
857	if (g_Vflag) {
858		(void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
859		(void) fputs(g_prog, stderr);
860		(void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
861	}
862
863	if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
864	    DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
865		dfail("failed to compile program");
866
867	if (dtrace_program_exec(g_dtp, prog, &info) == -1)
868		dfail("failed to enable probes");
869
870	if (dtrace_go(g_dtp) != 0)
871		dfail("couldn't start tracing");
872}
873
/*
 * Handler that status_init() installs for SIGUSR1, the signal raised by the
 * periodic status timer.  It deliberately does nothing.
 * NOTE(review): presumably the signal exists only to interrupt a blocking
 * call in the main loop -- confirm against main().
 */
static void
#ifdef illumos
status_fire(void)
#else
status_fire(int i)
#endif
{}
881
882static void
883status_init(void)
884{
885	dtrace_optval_t val, status, agg;
886	struct sigaction act;
887	struct itimerspec ts;
888	struct sigevent ev;
889	timer_t tid;
890
891	if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
892		dfail("failed to get 'statusrate'");
893
894	if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
895		dfail("failed to get 'statusrate'");
896
897	/*
898	 * We would want to awaken at a rate that is the GCD of the statusrate
899	 * and the aggrate -- but that seems a bit absurd.  Instead, we'll
900	 * simply awaken at a rate that is the more frequent of the two, which
901	 * assures that we're never later than the interval implied by the
902	 * more frequent rate.
903	 */
904	val = status < agg ? status : agg;
905
906	(void) sigemptyset(&act.sa_mask);
907	act.sa_flags = 0;
908	act.sa_handler = status_fire;
909	(void) sigaction(SIGUSR1, &act, NULL);
910
911	ev.sigev_notify = SIGEV_SIGNAL;
912	ev.sigev_signo = SIGUSR1;
913
914	if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
915		dfail("cannot create CLOCK_REALTIME timer");
916
917	ts.it_value.tv_sec = val / NANOSEC;
918	ts.it_value.tv_nsec = val % NANOSEC;
919	ts.it_interval = ts.it_value;
920
921	if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
922		dfail("cannot set time on CLOCK_REALTIME timer");
923}
924
925static void
926status_check(void)
927{
928	if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
929		dfail("failed to snap aggregate");
930
931	if (dtrace_status(g_dtp) == -1)
932		dfail("dtrace_status()");
933}
934
935static void
936lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
937{
938	bzero(lsrec, g_recsize);
939	lsrec->ls_count = 1;
940
941	if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
942		fail(0, "truncated DTrace record");
943
944	if (rec->dtrd_size != sizeof (uint64_t))
945		fail(0, "bad event size in first record");
946
947	/* LINTED - alignment */
948	lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
949	rec++;
950
951#ifdef illumos
952	if (rec->dtrd_size != sizeof (uintptr_t))
953		fail(0, "bad lock address size in second record");
954
955	/* LINTED - alignment */
956	lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
957	rec++;
958#else
959	lsrec->ls_lock = strdup((const char *)(data + rec->dtrd_offset));
960	rec++;
961#endif
962
963	if (rec->dtrd_size != sizeof (uintptr_t))
964		fail(0, "bad caller size in third record");
965
966	/* LINTED - alignment */
967	lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
968	rec++;
969
970	if (g_recsize > LS_HIST) {
971		int frames, i;
972		pc_t *stack;
973
974		frames = rec->dtrd_size / sizeof (pc_t);
975		/* LINTED - alignment */
976		stack = (pc_t *)(data + rec->dtrd_offset);
977
978		for (i = 1; i < frames; i++)
979			lsrec->ls_stack[i - 1] = stack[i];
980	}
981}
982
983/*ARGSUSED*/
984static int
985count_aggregate(const dtrace_aggdata_t *agg, void *arg)
986{
987	*((size_t *)arg) += 1;
988
989	return (DTRACE_AGGWALK_NEXT);
990}
991
992static int
993process_aggregate(const dtrace_aggdata_t *agg, void *arg)
994{
995	const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
996	caddr_t data = agg->dtada_data;
997	lsdata_t *lsdata = arg;
998	lsrec_t *lsrec = lsdata->lsd_next;
999	const dtrace_recdesc_t *rec;
1000	uint64_t *avg, *quantized;
1001	int i, j;
1002
1003	assert(lsdata->lsd_count < g_nrecs);
1004
1005	/*
1006	 * Aggregation variable IDs are guaranteed to be generated in program
1007	 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
1008	 * plus one.  As "avg" appears before "hist" in program order, we know
1009	 * that "avg" will be allocated the first aggregation variable ID, and
1010	 * "hist" will be allocated the second aggregation variable ID -- and
1011	 * we therefore use the aggregation variable ID to differentiate the
1012	 * cases.
1013	 */
1014	if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
1015		/*
1016		 * If this is the histogram entry.  We'll copy the quantized
1017		 * data into lc_hist, and jump over the rest.
1018		 */
1019		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1020
1021		if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
1022			fail(0, "bad variable ID in aggregation record");
1023
1024		if (rec->dtrd_size !=
1025		    DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
1026			fail(0, "bad quantize size in aggregation record");
1027
1028		/* LINTED - alignment */
1029		quantized = (uint64_t *)(data + rec->dtrd_offset);
1030
1031		for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
1032		    i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
1033			lsrec->ls_hist[j] = quantized[i];
1034
1035		goto out;
1036	}
1037
1038	lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
1039	    aggdesc->dtagd_nrecs - 1, data);
1040
1041	rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1042
1043	if (rec->dtrd_size != 2 * sizeof (uint64_t))
1044		fail(0, "bad avg size in aggregation record");
1045
1046	/* LINTED - alignment */
1047	avg = (uint64_t *)(data + rec->dtrd_offset);
1048	lsrec->ls_count = (uint32_t)avg[0];
1049	lsrec->ls_time = (uintptr_t)avg[1];
1050
1051	if (g_recsize >= LS_HIST)
1052		return (DTRACE_AGGWALK_NEXT);
1053
1054out:
1055	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1056	lsdata->lsd_count++;
1057
1058	return (DTRACE_AGGWALK_NEXT);
1059}
1060
1061static int
1062process_trace(const dtrace_probedata_t *pdata, void *arg)
1063{
1064	lsdata_t *lsdata = arg;
1065	lsrec_t *lsrec = lsdata->lsd_next;
1066	dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
1067	caddr_t data = pdata->dtpda_data;
1068
1069	if (lsdata->lsd_count >= g_nrecs)
1070		return (DTRACE_CONSUME_NEXT);
1071
1072	lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
1073
1074	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1075	lsdata->lsd_count++;
1076
1077	return (DTRACE_CONSUME_NEXT);
1078}
1079
1080static int
1081process_data(FILE *out, char *data)
1082{
1083	lsdata_t lsdata;
1084
1085	/* LINTED - alignment */
1086	lsdata.lsd_next = (lsrec_t *)data;
1087	lsdata.lsd_count = 0;
1088
1089	if (g_tracing) {
1090		if (dtrace_consume(g_dtp, out,
1091		    process_trace, NULL, &lsdata) != 0)
1092			dfail("failed to consume buffer");
1093
1094		return (lsdata.lsd_count);
1095	}
1096
1097	if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1098	    process_aggregate, &lsdata) != 0)
1099		dfail("failed to walk aggregate");
1100
1101	return (lsdata.lsd_count);
1102}
1103
1104/*ARGSUSED*/
1105static int
1106drophandler(const dtrace_dropdata_t *data, void *arg)
1107{
1108	g_dropped++;
1109	(void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1110	return (DTRACE_HANDLE_OK);
1111}
1112
1113int
1114main(int argc, char **argv)
1115{
1116	char *data_buf;
1117	lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1118	FILE *out = stdout;
1119	int c;
1120	pid_t child;
1121	int status;
1122	int i, j;
1123	hrtime_t duration;
1124	char *addrp, *offp, *sizep, *evp, *lastp, *p;
1125	uintptr_t addr;
1126	size_t size, off;
1127	int events_specified = 0;
1128	int exec_errno = 0;
1129	uint32_t event;
1130	char *filt = NULL, *ifilt = NULL;
1131	static uint64_t ev_count[LS_MAX_EVENTS + 1];
1132	static uint64_t ev_time[LS_MAX_EVENTS + 1];
1133	dtrace_optval_t aggsize;
1134	char aggstr[10];
1135	long ncpus;
1136	int dynvar = 0;
1137	int err;
1138
1139	if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1140		fail(0, "cannot open dtrace library: %s",
1141		    dtrace_errmsg(NULL, err));
1142	}
1143
1144	if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1145		dfail("couldn't establish drop handler");
1146
1147	if (symtab_init() == -1)
1148		fail(1, "can't load kernel symbols");
1149
1150	g_nrecs = DEFAULT_NRECS;
1151
1152	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1153		switch (c) {
1154		case 'b':
1155			g_recsize = LS_BASIC;
1156			break;
1157
1158		case 't':
1159			g_recsize = LS_TIME;
1160			break;
1161
1162		case 'h':
1163			g_recsize = LS_HIST;
1164			break;
1165
1166		case 's':
1167			if (!isdigit(optarg[0]))
1168				usage();
1169			g_stkdepth = atoi(optarg);
1170			if (g_stkdepth > LS_MAX_STACK_DEPTH)
1171				fail(0, "max stack depth is %d",
1172				    LS_MAX_STACK_DEPTH);
1173			g_recsize = LS_STACK(g_stkdepth);
1174			break;
1175
1176		case 'n':
1177			if (!isdigit(optarg[0]))
1178				usage();
1179			g_nrecs = atoi(optarg);
1180			break;
1181
1182		case 'd':
1183			if (!isdigit(optarg[0]))
1184				usage();
1185			duration = atoll(optarg);
1186
1187			/*
1188			 * XXX -- durations really should be per event
1189			 * since the units are different, but it's hard
1190			 * to express this nicely in the interface.
1191			 * Not clear yet what the cleanest solution is.
1192			 */
1193			for (i = 0; i < LS_MAX_EVENTS; i++)
1194				if (g_event_info[i].ev_type != 'E')
1195					g_min_duration[i] = duration;
1196
1197			break;
1198
1199		case 'i':
1200			if (!isdigit(optarg[0]))
1201				usage();
1202			i = atoi(optarg);
1203			if (i <= 0)
1204				usage();
1205			if (i > MAX_HZ)
1206				fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1207
1208			for (j = 0; j < LS_MAX_EVENTS; j++)
1209				if (strcmp(g_event_info[j].ev_desc,
1210				    "Profiling interrupt") == 0)
1211					break;
1212
1213			(void) sprintf(g_event_info[j].ev_name,
1214			    "profile:::profile-%d", i);
1215			break;
1216
1217		case 'l':
1218		case 'f':
1219			addrp = strtok(optarg, ",");
1220			sizep = strtok(NULL, ",");
1221			addrp = strtok(optarg, ",+");
1222			offp = strtok(NULL, ",");
1223
1224			size = sizep ? strtoul(sizep, NULL, 0) : 1;
1225			off = offp ? strtoul(offp, NULL, 0) : 0;
1226
1227			if (addrp[0] == '0') {
1228				addr = strtoul(addrp, NULL, 16) + off;
1229			} else {
1230				addr = sym_to_addr(addrp) + off;
1231				if (sizep == NULL)
1232					size = sym_size(addrp) - off;
1233				if (addr - off == 0)
1234					fail(0, "symbol '%s' not found", addrp);
1235				if (size == 0)
1236					size = 1;
1237			}
1238
1239
1240			if (c == 'l') {
1241				filter_add(&filt, "arg0", addr, size);
1242			} else {
1243				filter_add(&filt, "caller", addr, size);
1244				filter_add(&ifilt, "arg0", addr, size);
1245			}
1246			break;
1247
1248		case 'e':
1249			evp = strtok_r(optarg, ",", &lastp);
1250			while (evp) {
1251				int ev1, ev2;
1252				char *evp2;
1253
1254				(void) strtok(evp, "-");
1255				evp2 = strtok(NULL, "-");
1256				ev1 = atoi(evp);
1257				ev2 = evp2 ? atoi(evp2) : ev1;
1258				if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1259				    (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1260					fail(0, "-e events out of range");
1261				for (i = ev1; i <= ev2; i++)
1262					g_enabled[i] = 1;
1263				evp = strtok_r(NULL, ",", &lastp);
1264			}
1265			events_specified = 1;
1266			break;
1267
1268#ifdef illumos
1269		case 'c':
1270			g_cflag = 1;
1271			break;
1272#endif
1273
1274		case 'k':
1275			g_kflag = 1;
1276			break;
1277
1278		case 'w':
1279			g_wflag = 1;
1280			break;
1281
1282		case 'W':
1283			g_Wflag = 1;
1284			break;
1285
1286		case 'g':
1287			g_gflag = 1;
1288			break;
1289
1290		case 'C':
1291		case 'E':
1292		case 'H':
1293		case 'I':
1294			for (i = 0; i < LS_MAX_EVENTS; i++)
1295				if (g_event_info[i].ev_type == c)
1296					g_enabled[i] = 1;
1297			events_specified = 1;
1298			break;
1299
1300		case 'A':
1301			for (i = 0; i < LS_MAX_EVENTS; i++)
1302				if (strchr("CH", g_event_info[i].ev_type))
1303					g_enabled[i] = 1;
1304			events_specified = 1;
1305			break;
1306
1307		case 'T':
1308			g_tracing = 1;
1309			break;
1310
1311		case 'D':
1312			if (!isdigit(optarg[0]))
1313				usage();
1314			g_topn = atoi(optarg);
1315			break;
1316
1317		case 'R':
1318			g_rates = 1;
1319			break;
1320
1321		case 'p':
1322			g_pflag = 1;
1323			break;
1324
1325		case 'P':
1326			g_Pflag = 1;
1327			break;
1328
1329		case 'o':
1330			if ((out = fopen(optarg, "w")) == NULL)
1331				fail(1, "error opening file");
1332			break;
1333
1334		case 'V':
1335			g_Vflag = 1;
1336			break;
1337
1338		default:
1339			if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1340				usage();
1341		}
1342	}
1343
1344	if (filt != NULL) {
1345		predicate_add(&g_predicate, filt, NULL, 0);
1346		filter_destroy(&filt);
1347	}
1348
1349	if (ifilt != NULL) {
1350		predicate_add(&g_ipredicate, ifilt, NULL, 0);
1351		filter_destroy(&ifilt);
1352	}
1353
1354	if (g_recsize == 0) {
1355		if (g_gflag) {
1356			g_stkdepth = LS_MAX_STACK_DEPTH;
1357			g_recsize = LS_STACK(g_stkdepth);
1358		} else {
1359			g_recsize = LS_TIME;
1360		}
1361	}
1362
1363	if (g_gflag && g_recsize <= LS_STACK(0))
1364		fail(0, "'-g' requires at least '-s 1' data gathering");
1365
1366	/*
1367	 * Make sure the alignment is reasonable
1368	 */
1369	g_recsize = -(-g_recsize & -sizeof (uint64_t));
1370
1371	for (i = 0; i < LS_MAX_EVENTS; i++) {
1372		/*
1373		 * If no events were specified, enable -C.
1374		 */
1375		if (!events_specified && g_event_info[i].ev_type == 'C')
1376			g_enabled[i] = 1;
1377	}
1378
1379	for (i = 0; i < LS_MAX_EVENTS; i++) {
1380		if (!g_enabled[i])
1381			continue;
1382
1383		if (g_event_info[i].ev_acquire != NULL) {
1384			/*
1385			 * If we've enabled a hold event, we must explicitly
1386			 * allocate dynamic variable space.
1387			 */
1388			dynvar = 1;
1389		}
1390
1391		dprog_addevent(i);
1392	}
1393
1394	/*
1395	 * Make sure there are remaining arguments to specify a child command
1396	 * to execute.
1397	 */
1398	if (argc <= optind)
1399		usage();
1400
1401	if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1402		dfail("couldn't determine number of online CPUs");
1403
1404	/*
1405	 * By default, we set our data buffer size to be the number of records
1406	 * multiplied by the size of the record, doubled to account for some
1407	 * DTrace slop and divided by the number of CPUs.  We silently clamp
1408	 * the aggregation size at both a minimum and a maximum to prevent
1409	 * absurdly low or high values.
1410	 */
1411	if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1412		aggsize = MIN_AGGSIZE;
1413
1414	if (aggsize > MAX_AGGSIZE)
1415		aggsize = MAX_AGGSIZE;
1416
1417	(void) sprintf(aggstr, "%lld", (long long)aggsize);
1418
1419	if (!g_tracing) {
1420		if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1421			dfail("failed to set 'bufsize'");
1422
1423		if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1424			dfail("failed to set 'aggsize'");
1425
1426		if (dynvar) {
1427			/*
1428			 * If we're using dynamic variables, we set our
1429			 * dynamic variable size to be one megabyte per CPU,
1430			 * with a hard-limit of 32 megabytes.  This may still
1431			 * be too small in some cases, but it can be tuned
1432			 * manually via -x if need be.
1433			 */
1434			(void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1435
1436			if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1437				dfail("failed to set 'dynvarsize'");
1438		}
1439	} else {
1440		if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1441			dfail("failed to set 'bufsize'");
1442	}
1443
1444	if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1445		dfail("failed to set 'statusrate'");
1446
1447	optind = 1;
1448	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1449		switch (c) {
1450		case 'x':
1451			if ((p = strchr(optarg, '=')) != NULL)
1452				*p++ = '\0';
1453
1454			if (dtrace_setopt(g_dtp, optarg, p) != 0)
1455				dfail("failed to set -x %s", optarg);
1456			break;
1457		}
1458	}
1459
1460	argc -= optind;
1461	argv += optind;
1462
1463	dprog_compile();
1464	status_init();
1465
1466	g_elapsed = -gethrtime();
1467
1468	/*
1469	 * Spawn the specified command and wait for it to complete.
1470	 */
1471	child = fork();
1472	if (child == -1)
1473		fail(1, "cannot fork");
1474	if (child == 0) {
1475		(void) dtrace_close(g_dtp);
1476		(void) execvp(argv[0], &argv[0]);
1477		exec_errno = errno;
1478		exit(127);
1479	}
1480
1481#ifdef illumos
1482	while (waitpid(child, &status, WEXITED) != child)
1483#else
1484	while (waitpid(child, &status, 0) != child)
1485#endif
1486		status_check();
1487
1488	g_elapsed += gethrtime();
1489
1490	if (WIFEXITED(status)) {
1491		if (WEXITSTATUS(status) != 0) {
1492			if (exec_errno != 0) {
1493				errno = exec_errno;
1494				fail(1, "could not execute %s", argv[0]);
1495			}
1496			(void) fprintf(stderr,
1497			    "lockstat: warning: %s exited with code %d\n",
1498			    argv[0], WEXITSTATUS(status));
1499		}
1500	} else {
1501		(void) fprintf(stderr,
1502		    "lockstat: warning: %s died on signal %d\n",
1503		    argv[0], WTERMSIG(status));
1504	}
1505
1506	if (dtrace_stop(g_dtp) == -1)
1507		dfail("failed to stop dtrace");
1508
1509	/*
1510	 * Before we read out the results, we need to allocate our buffer.
1511	 * If we're tracing, then we'll just use the precalculated size.  If
1512	 * we're not, then we'll take a snapshot of the aggregate, and walk
1513	 * it to count the number of records.
1514	 */
1515	if (!g_tracing) {
1516		if (dtrace_aggregate_snap(g_dtp) != 0)
1517			dfail("failed to snap aggregate");
1518
1519		g_nrecs = 0;
1520
1521		if (dtrace_aggregate_walk(g_dtp,
1522		    count_aggregate, &g_nrecs) != 0)
1523			dfail("failed to walk aggregate");
1524	}
1525
1526#ifdef illumos
1527	if ((data_buf = memalign(sizeof (uint64_t),
1528	    (g_nrecs + 1) * g_recsize)) == NULL)
1529#else
1530	if (posix_memalign((void **)&data_buf, sizeof (uint64_t),
1531	    (g_nrecs + 1) * g_recsize) )
1532#endif
1533		fail(1, "Memory allocation failed");
1534
1535	/*
1536	 * Read out the DTrace data.
1537	 */
1538	g_nrecs_used = process_data(out, data_buf);
1539
1540	if (g_nrecs_used > g_nrecs || g_dropped)
1541		(void) fprintf(stderr, "lockstat: warning: "
1542		    "ran out of data records (use -n for more)\n");
1543
1544	/* LINTED - alignment */
1545	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1546	    /* LINTED - alignment */
1547	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1548		ev_count[lsp->ls_event] += lsp->ls_count;
1549		ev_time[lsp->ls_event] += lsp->ls_time;
1550	}
1551
1552	/*
1553	 * If -g was specified, convert stacks into individual records.
1554	 */
1555	if (g_gflag) {
1556		lsrec_t *newlsp, *oldlsp;
1557
1558#ifdef illumos
1559		newlsp = memalign(sizeof (uint64_t),
1560		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1561#else
1562		posix_memalign((void **)&newlsp, sizeof (uint64_t),
1563		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1564#endif
1565		if (newlsp == NULL)
1566			fail(1, "Cannot allocate space for -g processing");
1567		lsp = newlsp;
1568		/* LINTED - alignment */
1569		for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1570		    /* LINTED - alignment */
1571		    oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1572			int fr;
1573			int caller_in_stack = 0;
1574
1575			if (oldlsp->ls_count == 0)
1576				continue;
1577
1578			for (fr = 0; fr < g_stkdepth; fr++) {
1579				if (oldlsp->ls_stack[fr] == 0)
1580					break;
1581				if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1582					caller_in_stack = 1;
1583				bcopy(oldlsp, lsp, LS_TIME);
1584				lsp->ls_caller = oldlsp->ls_stack[fr];
1585#ifndef illumos
1586				lsp->ls_lock = strdup(oldlsp->ls_lock);
1587#endif
1588				/* LINTED - alignment */
1589				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1590			}
1591			if (!caller_in_stack) {
1592				bcopy(oldlsp, lsp, LS_TIME);
1593				/* LINTED - alignment */
1594				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1595			}
1596#ifndef illumos
1597			free(oldlsp->ls_lock);
1598#endif
1599		}
1600		g_nrecs = g_nrecs_used =
1601		    ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1602		g_recsize = LS_TIME;
1603		g_stkdepth = 0;
1604		free(data_buf);
1605		data_buf = (char *)newlsp;
1606	}
1607
1608	if ((sort_buf = calloc(2 * (g_nrecs + 1),
1609	    sizeof (void *))) == NULL)
1610		fail(1, "Sort buffer allocation failed");
1611	merge_buf = sort_buf + (g_nrecs + 1);
1612
1613	/*
1614	 * Build the sort buffer, discarding zero-count records along the way.
1615	 */
1616	/* LINTED - alignment */
1617	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1618	    /* LINTED - alignment */
1619	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1620		if (lsp->ls_count == 0)
1621			lsp->ls_event = LS_MAX_EVENTS;
1622		sort_buf[i] = lsp;
1623	}
1624
1625	if (g_nrecs_used == 0)
1626		exit(0);
1627
1628	/*
1629	 * Add a sentinel after the last record
1630	 */
1631	sort_buf[i] = lsp;
1632	lsp->ls_event = LS_MAX_EVENTS;
1633
1634	if (g_tracing) {
1635		report_trace(out, sort_buf);
1636		return (0);
1637	}
1638
1639	/*
1640	 * Application of -g may have resulted in multiple records
1641	 * with the same signature; coalesce them.
1642	 */
1643	if (g_gflag) {
1644		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1645		coalesce(lockcmp, sort_buf, g_nrecs_used);
1646	}
1647
1648	/*
1649	 * Coalesce locks within the same symbol if -c option specified.
1650	 * Coalesce PCs within the same function if -k option specified.
1651	 */
1652	if (g_cflag || g_kflag) {
1653		for (i = 0; i < g_nrecs_used; i++) {
1654			int fr;
1655			lsp = sort_buf[i];
1656#ifdef illumos
1657			if (g_cflag)
1658				coalesce_symbol(&lsp->ls_lock);
1659#endif
1660			if (g_kflag) {
1661				for (fr = 0; fr < g_stkdepth; fr++)
1662					coalesce_symbol(&lsp->ls_stack[fr]);
1663				coalesce_symbol(&lsp->ls_caller);
1664			}
1665		}
1666		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1667		coalesce(lockcmp, sort_buf, g_nrecs_used);
1668	}
1669
1670	/*
1671	 * Coalesce callers if -w option specified
1672	 */
1673	if (g_wflag) {
1674		mergesort(lock_and_count_cmp_anywhere,
1675		    sort_buf, merge_buf, g_nrecs_used);
1676		coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1677	}
1678
1679	/*
1680	 * Coalesce locks if -W option specified
1681	 */
1682	if (g_Wflag) {
1683		mergesort(site_and_count_cmp_anylock,
1684		    sort_buf, merge_buf, g_nrecs_used);
1685		coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1686	}
1687
1688	/*
1689	 * Sort data by contention count (ls_count) or total time (ls_time),
1690	 * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1691	 */
1692	if (g_recsize < LS_TIME)
1693		g_Pflag = 0;
1694
1695	if (g_Pflag)
1696		mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1697	else
1698		mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1699
1700	/*
1701	 * Display data by event type
1702	 */
1703	first = &sort_buf[0];
1704	while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1705		current = first;
1706		while ((lsp = *current)->ls_event == event)
1707			current++;
1708		report_stats(out, first, current - first, ev_count[event],
1709		    ev_time[event]);
1710		first = current;
1711	}
1712
1713#ifndef illumos
1714	/*
1715	 * Free lock name buffers
1716	 */
1717	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1718	    lsp = (lsrec_t *)((char *)lsp + g_recsize))
1719		free(lsp->ls_lock);
1720#endif
1721
1722	return (0);
1723}
1724
1725static char *
1726format_symbol(char *buf, uintptr_t addr, int show_size)
1727{
1728	uintptr_t symoff;
1729	char *symname;
1730	size_t symsize;
1731
1732	symname = addr_to_sym(addr, &symoff, &symsize);
1733
1734	if (show_size && symoff == 0)
1735		(void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1736	else if (symoff == 0)
1737		(void) sprintf(buf, "%s", symname);
1738	else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0)	/* CPU+PIL */
1739#ifdef illumos
1740		(void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1741#else
1742		(void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]);
1743#endif
1744	else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1745		(void) sprintf(buf, "%s+0x%llx", symname,
1746		    (unsigned long long)symoff);
1747	else
1748		(void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1749	return (buf);
1750}
1751
1752static void
1753report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1754	uint64_t total_time)
1755{
1756	uint32_t event = sort_buf[0]->ls_event;
1757	lsrec_t *lsp;
1758	double ptotal = 0.0;
1759	double percent;
1760	int i, j, fr;
1761	int displayed;
1762	int first_bin, last_bin, max_bin_count, total_bin_count;
1763	int rectype;
1764	char buf[256];
1765	char lhdr[80], chdr[80];
1766
1767	rectype = g_recsize;
1768
1769	if (g_topn == 0) {
1770		(void) fprintf(out, "%20llu %s\n",
1771		    g_rates == 0 ? total_count :
1772		    ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1773		    g_event_info[event].ev_desc);
1774		return;
1775	}
1776
1777	(void) sprintf(lhdr, "%s%s",
1778	    g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1779	(void) sprintf(chdr, "%s%s",
1780	    g_wflag ? "Hottest " : "", "Caller");
1781
1782	if (!g_pflag)
1783		(void) fprintf(out,
1784		    "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1785		    g_event_info[event].ev_desc, (double)total_count,
1786		    (double)g_elapsed / NANOSEC,
1787		    (double)total_count * NANOSEC / g_elapsed);
1788
1789	if (!g_pflag && rectype < LS_HIST) {
1790		(void) sprintf(buf, "%s", g_event_info[event].ev_units);
1791		(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1792		    g_rates ? "ops/s" : "Count",
1793		    g_gflag ? "genr" : "indv",
1794		    "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1795		(void) fprintf(out, "---------------------------------"
1796		    "----------------------------------------------\n");
1797	}
1798
1799	displayed = 0;
1800	for (i = 0; i < nrecs; i++) {
1801		lsp = sort_buf[i];
1802
1803		if (displayed++ >= g_topn)
1804			break;
1805
1806		if (g_pflag) {
1807			int j;
1808
1809			(void) fprintf(out, "%u %u",
1810			    lsp->ls_event, lsp->ls_count);
1811#ifdef illumos
1812			(void) fprintf(out, " %s",
1813			    format_symbol(buf, lsp->ls_lock, g_cflag));
1814#else
1815			(void) fprintf(out, " %s", lsp->ls_lock);
1816#endif
1817			(void) fprintf(out, " %s",
1818			    format_symbol(buf, lsp->ls_caller, 0));
1819			(void) fprintf(out, " %f",
1820			    (double)lsp->ls_refcnt / lsp->ls_count);
1821			if (rectype >= LS_TIME)
1822				(void) fprintf(out, " %llu",
1823				    (unsigned long long)lsp->ls_time);
1824			if (rectype >= LS_HIST) {
1825				for (j = 0; j < 64; j++)
1826					(void) fprintf(out, " %u",
1827					    lsp->ls_hist[j]);
1828			}
1829			for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1830				if (rectype <= LS_STACK(j) ||
1831				    lsp->ls_stack[j] == 0)
1832					break;
1833				(void) fprintf(out, " %s",
1834				    format_symbol(buf, lsp->ls_stack[j], 0));
1835			}
1836			(void) fprintf(out, "\n");
1837			continue;
1838		}
1839
1840		if (rectype >= LS_HIST) {
1841			(void) fprintf(out, "---------------------------------"
1842			    "----------------------------------------------\n");
1843			(void) sprintf(buf, "%s",
1844			    g_event_info[event].ev_units);
1845			(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1846			    g_rates ? "ops/s" : "Count",
1847			    g_gflag ? "genr" : "indv",
1848			    "cuml", "rcnt", buf, lhdr, chdr);
1849		}
1850
1851		if (g_Pflag && total_time != 0)
1852			percent = (lsp->ls_time * 100.00) / total_time;
1853		else
1854			percent = (lsp->ls_count * 100.00) / total_count;
1855
1856		ptotal += percent;
1857
1858		if (rectype >= LS_TIME)
1859			(void) sprintf(buf, "%llu",
1860			    (unsigned long long)(lsp->ls_time / lsp->ls_count));
1861		else
1862			buf[0] = '\0';
1863
1864		(void) fprintf(out, "%5llu ",
1865		    g_rates == 0 ? lsp->ls_count :
1866		    ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1867
1868		(void) fprintf(out, "%3.0f%% ", percent);
1869
1870		if (g_gflag)
1871			(void) fprintf(out, "---- ");
1872		else
1873			(void) fprintf(out, "%3.0f%% ", ptotal);
1874
1875		(void) fprintf(out, "%4.2f %8s ",
1876		    (double)lsp->ls_refcnt / lsp->ls_count, buf);
1877
1878#ifdef illumos
1879		(void) fprintf(out, "%-22s ",
1880		    format_symbol(buf, lsp->ls_lock, g_cflag));
1881#else
1882		(void) fprintf(out, "%-22s ", lsp->ls_lock);
1883#endif
1884
1885		(void) fprintf(out, "%-24s\n",
1886		    format_symbol(buf, lsp->ls_caller, 0));
1887
1888		if (rectype < LS_HIST)
1889			continue;
1890
1891		(void) fprintf(out, "\n");
1892		(void) fprintf(out, "%10s %31s %-9s %-24s\n",
1893		    g_event_info[event].ev_units,
1894		    "------ Time Distribution ------",
1895		    g_rates ? "ops/s" : "count",
1896		    rectype > LS_STACK(0) ? "Stack" : "");
1897
1898		first_bin = 0;
1899		while (lsp->ls_hist[first_bin] == 0)
1900			first_bin++;
1901
1902		last_bin = 63;
1903		while (lsp->ls_hist[last_bin] == 0)
1904			last_bin--;
1905
1906		max_bin_count = 0;
1907		total_bin_count = 0;
1908		for (j = first_bin; j <= last_bin; j++) {
1909			total_bin_count += lsp->ls_hist[j];
1910			if (lsp->ls_hist[j] > max_bin_count)
1911				max_bin_count = lsp->ls_hist[j];
1912		}
1913
1914		/*
1915		 * If we went a few frames below the caller, ignore them
1916		 */
1917		for (fr = 3; fr > 0; fr--)
1918			if (lsp->ls_stack[fr] == lsp->ls_caller)
1919				break;
1920
1921		for (j = first_bin; j <= last_bin; j++) {
1922			uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1923			(void) fprintf(out, "%10llu |%s%s %-9u ",
1924			    1ULL << j,
1925			    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1926			    "                              " + depth,
1927			    g_rates == 0 ? lsp->ls_hist[j] :
1928			    (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1929			    g_elapsed));
1930			if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1931				(void) fprintf(out, "\n");
1932				continue;
1933			}
1934			(void) fprintf(out, "%-24s\n",
1935			    format_symbol(buf, lsp->ls_stack[fr], 0));
1936			fr++;
1937		}
1938		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1939			(void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1940			    format_symbol(buf, lsp->ls_stack[fr], 0));
1941			fr++;
1942		}
1943	}
1944
1945	if (!g_pflag)
1946		(void) fprintf(out, "---------------------------------"
1947		    "----------------------------------------------\n");
1948
1949	(void) fflush(out);
1950}
1951
1952static void
1953report_trace(FILE *out, lsrec_t **sort_buf)
1954{
1955	lsrec_t *lsp;
1956	int i, fr;
1957	int rectype;
1958	char buf[256], buf2[256];
1959
1960	rectype = g_recsize;
1961
1962	if (!g_pflag) {
1963		(void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1964		    "Event", "Time", "Owner", "Lock", "Caller");
1965		(void) fprintf(out, "---------------------------------"
1966		    "----------------------------------------------\n");
1967	}
1968
1969	for (i = 0; i < g_nrecs_used; i++) {
1970
1971		lsp = sort_buf[i];
1972
1973		if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1974			continue;
1975
1976		(void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1977		    lsp->ls_event, (unsigned long long)lsp->ls_time,
1978		    (void *)lsp->ls_next,
1979#ifdef illumos
1980		    format_symbol(buf, lsp->ls_lock, 0),
1981#else
1982		    lsp->ls_lock,
1983#endif
1984		    format_symbol(buf2, lsp->ls_caller, 0));
1985
1986		if (rectype <= LS_STACK(0))
1987			continue;
1988
1989		/*
1990		 * If we went a few frames below the caller, ignore them
1991		 */
1992		for (fr = 3; fr > 0; fr--)
1993			if (lsp->ls_stack[fr] == lsp->ls_caller)
1994				break;
1995
1996		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1997			(void) fprintf(out, "%53s  %-24s\n", "",
1998			    format_symbol(buf, lsp->ls_stack[fr], 0));
1999			fr++;
2000		}
2001		(void) fprintf(out, "\n");
2002	}
2003
2004	(void) fflush(out);
2005}
2006