1192867Ssson/*
2192867Ssson * CDDL HEADER START
3192867Ssson *
4192867Ssson * The contents of this file are subject to the terms of the
5192867Ssson * Common Development and Distribution License (the "License").
6192867Ssson * You may not use this file except in compliance with the License.
7192867Ssson *
8192867Ssson * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9192867Ssson * or http://www.opensolaris.org/os/licensing.
10192867Ssson * See the License for the specific language governing permissions
11192867Ssson * and limitations under the License.
12192867Ssson *
13192867Ssson * When distributing Covered Code, include this CDDL HEADER in each
14192867Ssson * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15192867Ssson * If applicable, add the following below this CDDL HEADER, with the
16192867Ssson * fields enclosed by brackets "[]" replaced with your own identifying
17192867Ssson * information: Portions Copyright [yyyy] [name of copyright owner]
18192867Ssson *
19192867Ssson * CDDL HEADER END
20192867Ssson */
21192867Ssson/*
22192867Ssson * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23192867Ssson * Use is subject to license terms.
24192867Ssson */
25192867Ssson
26192867Ssson#pragma ident	"%Z%%M%	%I%	%E% SMI"
27192867Ssson
28192867Ssson#include <stdio.h>
29192867Ssson#include <stddef.h>
30192867Ssson#include <stdlib.h>
31192867Ssson#include <stdarg.h>
32192867Ssson#include <string.h>
33192867Ssson#include <strings.h>
34192867Ssson#include <ctype.h>
35192867Ssson#include <fcntl.h>
36192867Ssson#include <unistd.h>
37192867Ssson#include <errno.h>
38192867Ssson#include <limits.h>
39192867Ssson#include <sys/types.h>
40192867Ssson#include <sys/modctl.h>
41192867Ssson#include <sys/stat.h>
42192867Ssson#include <sys/wait.h>
43192867Ssson#include <dtrace.h>
44192867Ssson#include <sys/lockstat.h>
45192867Ssson#include <alloca.h>
46192867Ssson#include <signal.h>
47192867Ssson#include <assert.h>
48192867Ssson
49192937Ssson#if defined(sun)
50192937Ssson#define	GETOPT_EOF	EOF
51192937Ssson#else
52192937Ssson/* FreeBSD */
53192867Ssson#include <sys/time.h>
54192867Ssson#include <sys/resource.h>
55192867Ssson
56192937Ssson#define	mergesort(a, b, c, d)	lsmergesort(a, b, c, d)
57192937Ssson#define	GETOPT_EOF		(-1)
58192867Ssson
59192937Sssontypedef	uintptr_t	pc_t;
60192937Ssson#endif /* defined(sun) */
61192937Ssson
62192867Ssson#define	LOCKSTAT_OPTSTR	"x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"
63192867Ssson
64192867Ssson#define	LS_MAX_STACK_DEPTH	50
65192867Ssson#define	LS_MAX_EVENTS		64
66192867Ssson
67192867Sssontypedef struct lsrec {
68192867Ssson	struct lsrec	*ls_next;	/* next in hash chain */
69192867Ssson	uintptr_t	ls_lock;	/* lock address */
70192867Ssson	uintptr_t	ls_caller;	/* caller address */
71192867Ssson	uint32_t	ls_count;	/* cumulative event count */
72192867Ssson	uint32_t	ls_event;	/* type of event */
73192867Ssson	uintptr_t	ls_refcnt;	/* cumulative reference count */
74192867Ssson	uint64_t	ls_time;	/* cumulative event duration */
75192867Ssson	uint32_t	ls_hist[64];	/* log2(duration) histogram */
76192867Ssson	uintptr_t	ls_stack[LS_MAX_STACK_DEPTH];
77192867Ssson} lsrec_t;
78192867Ssson
/* Tracks a set of lsrec records: the next available one and how many. */
struct lsdata {
	struct lsrec	*lsd_next;	/* next available record */
	int		lsd_count;	/* record count */
};

typedef struct lsdata lsdata_t;
83192867Ssson
84192867Ssson/*
85192867Ssson * Definitions for the types of experiments which can be run.  They are
86192867Ssson * listed in increasing order of memory cost and processing time cost.
87192867Ssson * The numerical value of each type is the number of bytes needed per record.
88192867Ssson */
89192867Ssson#define	LS_BASIC	offsetof(lsrec_t, ls_time)
90192867Ssson#define	LS_TIME		offsetof(lsrec_t, ls_hist[0])
91192867Ssson#define	LS_HIST		offsetof(lsrec_t, ls_stack[0])
92192867Ssson#define	LS_STACK(depth)	offsetof(lsrec_t, ls_stack[depth])
93192867Ssson
94192867Sssonstatic void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
95192867Sssonstatic void report_trace(FILE *, lsrec_t **);
96192867Ssson
97192867Sssonextern int symtab_init(void);
98192867Sssonextern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
99192867Sssonextern uintptr_t sym_to_addr(char *name);
100192867Sssonextern size_t sym_size(char *name);
101192867Sssonextern char *strtok_r(char *, const char *, char **);
102192867Ssson
103192867Ssson#define	DEFAULT_NRECS	10000
104192867Ssson#define	DEFAULT_HZ	97
105192867Ssson#define	MAX_HZ		1000
106192867Ssson#define	MIN_AGGSIZE	(16 * 1024)
107192867Ssson#define	MAX_AGGSIZE	(32 * 1024 * 1024)
108192867Ssson
109192867Sssonstatic int g_stkdepth;
110192867Sssonstatic int g_topn = INT_MAX;
111192867Sssonstatic hrtime_t g_elapsed;
112192867Sssonstatic int g_rates = 0;
113192867Sssonstatic int g_pflag = 0;
114192867Sssonstatic int g_Pflag = 0;
115192867Sssonstatic int g_wflag = 0;
116192867Sssonstatic int g_Wflag = 0;
117192867Sssonstatic int g_cflag = 0;
118192867Sssonstatic int g_kflag = 0;
119192867Sssonstatic int g_gflag = 0;
120192867Sssonstatic int g_Vflag = 0;
121192867Sssonstatic int g_tracing = 0;
122192867Sssonstatic size_t g_recsize;
123192867Sssonstatic size_t g_nrecs;
124192867Sssonstatic int g_nrecs_used;
125192867Sssonstatic uchar_t g_enabled[LS_MAX_EVENTS];
126192867Sssonstatic hrtime_t g_min_duration[LS_MAX_EVENTS];
127192867Sssonstatic dtrace_hdl_t *g_dtp;
128192867Sssonstatic char *g_predicate;
129192867Sssonstatic char *g_ipredicate;
130192867Sssonstatic char *g_prog;
131192867Sssonstatic int g_proglen;
132192867Sssonstatic int g_dropped;
133192867Ssson
134192867Sssontypedef struct ls_event_info {
135192867Ssson	char	ev_type;
136192867Ssson	char	ev_lhdr[20];
137192867Ssson	char	ev_desc[80];
138192867Ssson	char	ev_units[10];
139192867Ssson	char	ev_name[DTRACE_NAMELEN];
140192867Ssson	char	*ev_predicate;
141192867Ssson	char	*ev_acquire;
142192867Ssson} ls_event_info_t;
143192867Ssson
144192867Sssonstatic ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
145192867Ssson	{ 'C',	"Lock",	"Adaptive mutex spin",			"nsec",
146192867Ssson	    "lockstat:::adaptive-spin" },
147192867Ssson	{ 'C',	"Lock",	"Adaptive mutex block",			"nsec",
148192867Ssson	    "lockstat:::adaptive-block" },
149192867Ssson	{ 'C',	"Lock",	"Spin lock spin",			"nsec",
150192867Ssson	    "lockstat:::spin-spin" },
151192867Ssson	{ 'C',	"Lock",	"Thread lock spin",			"nsec",
152192867Ssson	    "lockstat:::thread-spin" },
153192867Ssson	{ 'C',	"Lock",	"R/W writer blocked by writer",		"nsec",
154192867Ssson	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
155192867Ssson	{ 'C',	"Lock",	"R/W writer blocked by readers",	"nsec",
156192867Ssson	    "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
157192867Ssson	{ 'C',	"Lock",	"R/W reader blocked by writer",		"nsec",
158192867Ssson	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
159192867Ssson	{ 'C',	"Lock",	"R/W reader blocked by write wanted",	"nsec",
160192867Ssson	    "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
161192867Ssson	{ 'C',	"Lock",	"Unknown event (type 8)",		"units"	},
162192867Ssson	{ 'C',	"Lock",	"Unknown event (type 9)",		"units"	},
163192867Ssson	{ 'C',	"Lock",	"Unknown event (type 10)",		"units"	},
164192867Ssson	{ 'C',	"Lock",	"Unknown event (type 11)",		"units"	},
165192867Ssson	{ 'C',	"Lock",	"Unknown event (type 12)",		"units"	},
166192867Ssson	{ 'C',	"Lock",	"Unknown event (type 13)",		"units"	},
167192867Ssson	{ 'C',	"Lock",	"Unknown event (type 14)",		"units"	},
168192867Ssson	{ 'C',	"Lock",	"Unknown event (type 15)",		"units"	},
169192867Ssson	{ 'C',	"Lock",	"Unknown event (type 16)",		"units"	},
170192867Ssson	{ 'C',	"Lock",	"Unknown event (type 17)",		"units"	},
171192867Ssson	{ 'C',	"Lock",	"Unknown event (type 18)",		"units"	},
172192867Ssson	{ 'C',	"Lock",	"Unknown event (type 19)",		"units"	},
173192867Ssson	{ 'C',	"Lock",	"Unknown event (type 20)",		"units"	},
174192867Ssson	{ 'C',	"Lock",	"Unknown event (type 21)",		"units"	},
175192867Ssson	{ 'C',	"Lock",	"Unknown event (type 22)",		"units"	},
176192867Ssson	{ 'C',	"Lock",	"Unknown event (type 23)",		"units"	},
177192867Ssson	{ 'C',	"Lock",	"Unknown event (type 24)",		"units"	},
178192867Ssson	{ 'C',	"Lock",	"Unknown event (type 25)",		"units"	},
179192867Ssson	{ 'C',	"Lock",	"Unknown event (type 26)",		"units"	},
180192867Ssson	{ 'C',	"Lock",	"Unknown event (type 27)",		"units"	},
181192867Ssson	{ 'C',	"Lock",	"Unknown event (type 28)",		"units"	},
182192867Ssson	{ 'C',	"Lock",	"Unknown event (type 29)",		"units"	},
183192867Ssson	{ 'C',	"Lock",	"Unknown event (type 30)",		"units"	},
184192867Ssson	{ 'C',	"Lock",	"Unknown event (type 31)",		"units"	},
185192867Ssson	{ 'H',	"Lock",	"Adaptive mutex hold",			"nsec",
186192867Ssson	    "lockstat:::adaptive-release", NULL,
187192867Ssson	    "lockstat:::adaptive-acquire" },
188192867Ssson	{ 'H',	"Lock",	"Spin lock hold",			"nsec",
189192867Ssson	    "lockstat:::spin-release", NULL,
190192867Ssson	    "lockstat:::spin-acquire" },
191192867Ssson	{ 'H',	"Lock",	"R/W writer hold",			"nsec",
192192867Ssson	    "lockstat:::rw-release", "arg1 == 0",
193192867Ssson	    "lockstat:::rw-acquire" },
194192867Ssson	{ 'H',	"Lock",	"R/W reader hold",			"nsec",
195192867Ssson	    "lockstat:::rw-release", "arg1 != 0",
196192867Ssson	    "lockstat:::rw-acquire" },
197192867Ssson	{ 'H',	"Lock",	"Unknown event (type 36)",		"units"	},
198192867Ssson	{ 'H',	"Lock",	"Unknown event (type 37)",		"units"	},
199192867Ssson	{ 'H',	"Lock",	"Unknown event (type 38)",		"units"	},
200192867Ssson	{ 'H',	"Lock",	"Unknown event (type 39)",		"units"	},
201192867Ssson	{ 'H',	"Lock",	"Unknown event (type 40)",		"units"	},
202192867Ssson	{ 'H',	"Lock",	"Unknown event (type 41)",		"units"	},
203192867Ssson	{ 'H',	"Lock",	"Unknown event (type 42)",		"units"	},
204192867Ssson	{ 'H',	"Lock",	"Unknown event (type 43)",		"units"	},
205192867Ssson	{ 'H',	"Lock",	"Unknown event (type 44)",		"units"	},
206192867Ssson	{ 'H',	"Lock",	"Unknown event (type 45)",		"units"	},
207192867Ssson	{ 'H',	"Lock",	"Unknown event (type 46)",		"units"	},
208192867Ssson	{ 'H',	"Lock",	"Unknown event (type 47)",		"units"	},
209192867Ssson	{ 'H',	"Lock",	"Unknown event (type 48)",		"units"	},
210192867Ssson	{ 'H',	"Lock",	"Unknown event (type 49)",		"units"	},
211192867Ssson	{ 'H',	"Lock",	"Unknown event (type 50)",		"units"	},
212192867Ssson	{ 'H',	"Lock",	"Unknown event (type 51)",		"units"	},
213192867Ssson	{ 'H',	"Lock",	"Unknown event (type 52)",		"units"	},
214192867Ssson	{ 'H',	"Lock",	"Unknown event (type 53)",		"units"	},
215192867Ssson	{ 'H',	"Lock",	"Unknown event (type 54)",		"units"	},
216192867Ssson	{ 'H',	"Lock",	"Unknown event (type 55)",		"units"	},
217192867Ssson#if defined(sun)
218192867Ssson	{ 'I',	"CPU+PIL", "Profiling interrupt",		"nsec",
219192867Ssson#else
220192867Ssson	/* FreeBSD */
221192867Ssson	{ 'I',	"CPU+Pri_Class", "Profiling interrupt",		"nsec",
222192867Ssson#endif
223192867Ssson	    "profile:::profile-97", NULL },
224192867Ssson	{ 'I',	"Lock",	"Unknown event (type 57)",		"units"	},
225192867Ssson	{ 'I',	"Lock",	"Unknown event (type 58)",		"units"	},
226192867Ssson	{ 'I',	"Lock",	"Unknown event (type 59)",		"units"	},
227192867Ssson	{ 'E',	"Lock",	"Recursive lock entry detected",	"(N/A)",
228192867Ssson	    "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
229192867Ssson	{ 'E',	"Lock",	"Lockstat enter failure",		"(N/A)"	},
230192867Ssson	{ 'E',	"Lock",	"Lockstat exit failure",		"nsec"	},
231192867Ssson	{ 'E',	"Lock",	"Lockstat record failure",		"(N/A)"	},
232192867Ssson};
233192867Ssson
#if !defined(sun)
/*
 * Short names used on FreeBSD only.
 * NOTE(review): presumably indexed by the td_pri_class value that
 * dprog_addevent() folds into the interrupt "lock" value -- the consumer
 * is not in this part of the file; confirm there.
 */
static char *g_pri_class[] = {
	"",		/* 0 */
	"Intr",		/* 1 */
	"RealT",	/* 2 */
	"TShar",	/* 3 */
	"Idle"		/* 4 */
};
#endif
243192867Ssson
/*
 * Report a fatal error and exit with status 2.
 *
 * Writes "lockstat: " and the printf-style message to stderr, followed by
 * ": <strerror text>" when do_perror is non-zero, then a newline.  errno is
 * captured on entry so the output calls cannot disturb the value reported.
 */
static void
fail(int do_perror, const char *message, ...)
{
	const int err = errno;
	va_list ap;

	(void) fputs("lockstat: ", stderr);

	va_start(ap, message);
	(void) vfprintf(stderr, message, ap);
	va_end(ap);

	if (do_perror != 0)
		(void) fprintf(stderr, ": %s", strerror(err));

	(void) fputc('\n', stderr);
	exit(2);
}
259192867Ssson
260192867Sssonstatic void
261192867Sssondfail(const char *message, ...)
262192867Ssson{
263192867Ssson	va_list args;
264192867Ssson
265192867Ssson	va_start(args, message);
266192867Ssson	(void) fprintf(stderr, "lockstat: ");
267192867Ssson	(void) vfprintf(stderr, message, args);
268192867Ssson	va_end(args);
269192867Ssson	(void) fprintf(stderr, ": %s\n",
270192867Ssson	    dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
271192867Ssson
272192867Ssson	exit(2);
273192867Ssson}
274192867Ssson
275192867Sssonstatic void
276192867Sssonshow_events(char event_type, char *desc)
277192867Ssson{
278192867Ssson	int i, first = -1, last;
279192867Ssson
280192867Ssson	for (i = 0; i < LS_MAX_EVENTS; i++) {
281192867Ssson		ls_event_info_t *evp = &g_event_info[i];
282192867Ssson		if (evp->ev_type != event_type ||
283192867Ssson		    strncmp(evp->ev_desc, "Unknown event", 13) == 0)
284192867Ssson			continue;
285192867Ssson		if (first == -1)
286192867Ssson			first = i;
287192867Ssson		last = i;
288192867Ssson	}
289192867Ssson
290192867Ssson	(void) fprintf(stderr,
291192867Ssson	    "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
292192867Ssson	    desc, event_type, first, last);
293192867Ssson
294192867Ssson	for (i = first; i <= last; i++)
295192867Ssson		(void) fprintf(stderr,
296192867Ssson		    "%4d = %s\n", i, g_event_info[i].ev_desc);
297192867Ssson}
298192867Ssson
299192867Sssonstatic void
300192867Sssonusage(void)
301192867Ssson{
302192867Ssson	(void) fprintf(stderr,
303192867Ssson	    "Usage: lockstat [options] command [args]\n"
304192867Ssson	    "\nEvent selection options:\n\n"
305192867Ssson	    "  -C              watch contention events [on by default]\n"
306192867Ssson	    "  -E              watch error events [off by default]\n"
307192867Ssson	    "  -H              watch hold events [off by default]\n"
308192867Ssson	    "  -I              watch interrupt events [off by default]\n"
309192867Ssson	    "  -A              watch all lock events [equivalent to -CH]\n"
310192867Ssson	    "  -e event_list   only watch the specified events (shown below);\n"
311192867Ssson	    "                  <event_list> is a comma-separated list of\n"
312192867Ssson	    "                  events or ranges of events, e.g. 1,4-7,35\n"
313192867Ssson	    "  -i rate         interrupt rate for -I [default: %d Hz]\n"
314192867Ssson	    "\nData gathering options:\n\n"
315192867Ssson	    "  -b              basic statistics (lock, caller, event count)\n"
316192867Ssson	    "  -t              timing for all events [default]\n"
317192867Ssson	    "  -h              histograms for event times\n"
318192867Ssson	    "  -s depth        stack traces <depth> deep\n"
319192867Ssson	    "  -x opt[=val]    enable or modify DTrace options\n"
320192867Ssson	    "\nData filtering options:\n\n"
321192867Ssson	    "  -n nrecords     maximum number of data records [default: %d]\n"
322192867Ssson	    "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
323192867Ssson	    "                  symbolic name or hex address; <size> defaults\n"
324192867Ssson	    "                  to the ELF symbol size if available, 1 if not\n"
325192867Ssson	    "  -f func[,size]  only watch events generated by <func>\n"
326192867Ssson	    "  -d duration     only watch events longer than <duration>\n"
327192867Ssson	    "  -T              trace (rather than sample) events\n"
328192867Ssson	    "\nData reporting options:\n\n"
329192867Ssson	    "  -c              coalesce lock data for arrays like pse_mutex[]\n"
330192867Ssson	    "  -k              coalesce PCs within functions\n"
331192867Ssson	    "  -g              show total events generated by function\n"
332192867Ssson	    "  -w              wherever: don't distinguish events by caller\n"
333192867Ssson	    "  -W              whichever: don't distinguish events by lock\n"
334192867Ssson	    "  -R              display rates rather than counts\n"
335192867Ssson	    "  -p              parsable output format (awk(1)-friendly)\n"
336192867Ssson	    "  -P              sort lock data by (count * avg_time) product\n"
337192867Ssson	    "  -D n            only display top <n> events of each type\n"
338192867Ssson	    "  -o filename     send output to <filename>\n",
339192867Ssson	    DEFAULT_HZ, DEFAULT_NRECS);
340192867Ssson
341192867Ssson	show_events('C', "Contention");
342192867Ssson	show_events('H', "Hold-time");
343192867Ssson	show_events('I', "Interrupt");
344192867Ssson	show_events('E', "Error");
345192867Ssson	(void) fprintf(stderr, "\n");
346192867Ssson
347192867Ssson	exit(1);
348192867Ssson}
349192867Ssson
350192867Sssonstatic int
351192867Sssonlockcmp(lsrec_t *a, lsrec_t *b)
352192867Ssson{
353192867Ssson	int i;
354192867Ssson
355192867Ssson	if (a->ls_event < b->ls_event)
356192867Ssson		return (-1);
357192867Ssson	if (a->ls_event > b->ls_event)
358192867Ssson		return (1);
359192867Ssson
360192867Ssson	for (i = g_stkdepth - 1; i >= 0; i--) {
361192867Ssson		if (a->ls_stack[i] < b->ls_stack[i])
362192867Ssson			return (-1);
363192867Ssson		if (a->ls_stack[i] > b->ls_stack[i])
364192867Ssson			return (1);
365192867Ssson	}
366192867Ssson
367192867Ssson	if (a->ls_caller < b->ls_caller)
368192867Ssson		return (-1);
369192867Ssson	if (a->ls_caller > b->ls_caller)
370192867Ssson		return (1);
371192867Ssson
372192867Ssson	if (a->ls_lock < b->ls_lock)
373192867Ssson		return (-1);
374192867Ssson	if (a->ls_lock > b->ls_lock)
375192867Ssson		return (1);
376192867Ssson
377192867Ssson	return (0);
378192867Ssson}
379192867Ssson
380192867Sssonstatic int
381192867Sssoncountcmp(lsrec_t *a, lsrec_t *b)
382192867Ssson{
383192867Ssson	if (a->ls_event < b->ls_event)
384192867Ssson		return (-1);
385192867Ssson	if (a->ls_event > b->ls_event)
386192867Ssson		return (1);
387192867Ssson
388192867Ssson	return (b->ls_count - a->ls_count);
389192867Ssson}
390192867Ssson
391192867Sssonstatic int
392192867Sssontimecmp(lsrec_t *a, lsrec_t *b)
393192867Ssson{
394192867Ssson	if (a->ls_event < b->ls_event)
395192867Ssson		return (-1);
396192867Ssson	if (a->ls_event > b->ls_event)
397192867Ssson		return (1);
398192867Ssson
399192867Ssson	if (a->ls_time < b->ls_time)
400192867Ssson		return (1);
401192867Ssson	if (a->ls_time > b->ls_time)
402192867Ssson		return (-1);
403192867Ssson
404192867Ssson	return (0);
405192867Ssson}
406192867Ssson
407192867Sssonstatic int
408192867Sssonlockcmp_anywhere(lsrec_t *a, lsrec_t *b)
409192867Ssson{
410192867Ssson	if (a->ls_event < b->ls_event)
411192867Ssson		return (-1);
412192867Ssson	if (a->ls_event > b->ls_event)
413192867Ssson		return (1);
414192867Ssson
415192867Ssson	if (a->ls_lock < b->ls_lock)
416192867Ssson		return (-1);
417192867Ssson	if (a->ls_lock > b->ls_lock)
418192867Ssson		return (1);
419192867Ssson
420192867Ssson	return (0);
421192867Ssson}
422192867Ssson
423192867Sssonstatic int
424192867Sssonlock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
425192867Ssson{
426192867Ssson	if (a->ls_event < b->ls_event)
427192867Ssson		return (-1);
428192867Ssson	if (a->ls_event > b->ls_event)
429192867Ssson		return (1);
430192867Ssson
431192867Ssson	if (a->ls_lock < b->ls_lock)
432192867Ssson		return (-1);
433192867Ssson	if (a->ls_lock > b->ls_lock)
434192867Ssson		return (1);
435192867Ssson
436192867Ssson	return (b->ls_count - a->ls_count);
437192867Ssson}
438192867Ssson
439192867Sssonstatic int
440192867Sssonsitecmp_anylock(lsrec_t *a, lsrec_t *b)
441192867Ssson{
442192867Ssson	int i;
443192867Ssson
444192867Ssson	if (a->ls_event < b->ls_event)
445192867Ssson		return (-1);
446192867Ssson	if (a->ls_event > b->ls_event)
447192867Ssson		return (1);
448192867Ssson
449192867Ssson	for (i = g_stkdepth - 1; i >= 0; i--) {
450192867Ssson		if (a->ls_stack[i] < b->ls_stack[i])
451192867Ssson			return (-1);
452192867Ssson		if (a->ls_stack[i] > b->ls_stack[i])
453192867Ssson			return (1);
454192867Ssson	}
455192867Ssson
456192867Ssson	if (a->ls_caller < b->ls_caller)
457192867Ssson		return (-1);
458192867Ssson	if (a->ls_caller > b->ls_caller)
459192867Ssson		return (1);
460192867Ssson
461192867Ssson	return (0);
462192867Ssson}
463192867Ssson
464192867Sssonstatic int
465192867Sssonsite_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
466192867Ssson{
467192867Ssson	int i;
468192867Ssson
469192867Ssson	if (a->ls_event < b->ls_event)
470192867Ssson		return (-1);
471192867Ssson	if (a->ls_event > b->ls_event)
472192867Ssson		return (1);
473192867Ssson
474192867Ssson	for (i = g_stkdepth - 1; i >= 0; i--) {
475192867Ssson		if (a->ls_stack[i] < b->ls_stack[i])
476192867Ssson			return (-1);
477192867Ssson		if (a->ls_stack[i] > b->ls_stack[i])
478192867Ssson			return (1);
479192867Ssson	}
480192867Ssson
481192867Ssson	if (a->ls_caller < b->ls_caller)
482192867Ssson		return (-1);
483192867Ssson	if (a->ls_caller > b->ls_caller)
484192867Ssson		return (1);
485192867Ssson
486192867Ssson	return (b->ls_count - a->ls_count);
487192867Ssson}
488192867Ssson
489192867Sssonstatic void
490192867Sssonlsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
491192867Ssson{
492192867Ssson	int m = n / 2;
493192867Ssson	int i, j;
494192867Ssson
495192867Ssson	if (m > 1)
496192867Ssson		lsmergesort(cmp, a, b, m);
497192867Ssson	if (n - m > 1)
498192867Ssson		lsmergesort(cmp, a + m, b + m, n - m);
499192867Ssson	for (i = m; i > 0; i--)
500192867Ssson		b[i - 1] = a[i - 1];
501192867Ssson	for (j = m - 1; j < n - 1; j++)
502192867Ssson		b[n + m - j - 2] = a[j + 1];
503192867Ssson	while (i < j)
504192867Ssson		*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
505192867Ssson	*a = b[i];
506192867Ssson}
507192867Ssson
508192867Sssonstatic void
509192867Sssoncoalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
510192867Ssson{
511192867Ssson	int i, j;
512192867Ssson	lsrec_t *target, *current;
513192867Ssson
514192867Ssson	target = lock[0];
515192867Ssson
516192867Ssson	for (i = 1; i < n; i++) {
517192867Ssson		current = lock[i];
518192867Ssson		if (cmp(current, target) != 0) {
519192867Ssson			target = current;
520192867Ssson			continue;
521192867Ssson		}
522192867Ssson		current->ls_event = LS_MAX_EVENTS;
523192867Ssson		target->ls_count += current->ls_count;
524192867Ssson		target->ls_refcnt += current->ls_refcnt;
525192867Ssson		if (g_recsize < LS_TIME)
526192867Ssson			continue;
527192867Ssson		target->ls_time += current->ls_time;
528192867Ssson		if (g_recsize < LS_HIST)
529192867Ssson			continue;
530192867Ssson		for (j = 0; j < 64; j++)
531192867Ssson			target->ls_hist[j] += current->ls_hist[j];
532192867Ssson	}
533192867Ssson}
534192867Ssson
/*
 * If *addrp falls inside a known symbol, move it back to that symbol's
 * start.  The address is left alone when addr_to_sym() finds no symbol or
 * when the reported offset is not smaller than the symbol's size.
 */
static void
coalesce_symbol(uintptr_t *addrp)
{
	uintptr_t off;
	size_t size;

	if (addr_to_sym(*addrp, &off, &size) == NULL)
		return;

	if (off < size)
		*addrp -= off;
}
544192867Ssson
/*
 * Append a term to the D predicate *pred.
 *
 * If what is NULL nothing happens.  Otherwise the term is "what" when cmp
 * is NULL, or "what cmp 0x<value in hex>" when it is not.  When *pred is
 * NULL or empty the term becomes the whole predicate; otherwise the result
 * is "(<old predicate>) && (<term>)".  *pred is replaced by a newly
 * allocated string and the previous string is freed.  If memory cannot be
 * allocated, a message is written to stderr and the process exits with
 * status 2.
 */
static void
predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
{
	const char *old;
	char *new;
	size_t newlen;

	if (what == NULL)
		return;

	old = (*pred != NULL) ? *pred : "";

	/*
	 * Size for the longest form, "(old) && (what cmp 0x<hex>)": the
	 * variable parts, two hex digits per byte of the value, and the
	 * fixed punctuation (the sizeof includes the terminating NUL).
	 */
	newlen = strlen(old) + strlen(what) +
	    (cmp != NULL ? strlen(cmp) : 0) +
	    2 * sizeof (unsigned long) + sizeof ("() && (  0x)");

	if ((new = malloc(newlen)) == NULL) {
		/* same message shape and exit status as fail() */
		(void) fprintf(stderr, "lockstat: failed to allocate "
		    "predicate: %s\n", strerror(errno));
		exit(2);
	}

	/*
	 * The value is printed with "0x%lx" rather than "0x%p": what %p
	 * emits is implementation-defined, and where it already supplies a
	 * "0x" prefix the result ("0x0x...") is not a valid D constant.
	 */
	if (old[0] != '\0') {
		if (cmp != NULL) {
			(void) snprintf(new, newlen, "(%s) && (%s %s 0x%lx)",
			    old, what, cmp, (unsigned long)value);
		} else {
			(void) snprintf(new, newlen, "(%s) && (%s)",
			    old, what);
		}
	} else {
		if (cmp != NULL) {
			(void) snprintf(new, newlen, "%s %s 0x%lx",
			    what, cmp, (unsigned long)value);
		} else {
			(void) snprintf(new, newlen, "%s", what);
		}
	}

	free(*pred);
	*pred = new;
}
582192867Ssson
/*
 * Release the predicate string *pred and leave *pred set to NULL.
 */
static void
predicate_destroy(char **pred)
{
	char *doomed = *pred;

	*pred = NULL;
	free(doomed);
}
589192867Ssson
/*
 * Append an address-range test to the filter expression *filt.
 *
 * The test is "(what >= 0x<base> && what < 0x<base + size>)", joined to
 * any existing expression with " || ".  A NULL *filt is treated as an
 * empty expression.  *filt is replaced by a newly allocated string and
 * the previous string is freed.  If memory cannot be allocated, a message
 * is written to stderr and the process exits with status 2.
 *
 * The addresses are printed with "0x%lx" on every platform.  The output of
 * %p is implementation-defined, which previously needed a per-OS format.
 * The string is sized from the formatted length, so a long "what" cannot
 * overrun a fixed buffer.
 */
static void
filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
{
	const char *old = (*filt != NULL) ? *filt : "";
	const char *sep = (old[0] != '\0') ? " || " : "";
	unsigned long lo = (unsigned long)base;
	unsigned long hi = (unsigned long)(base + size);
	char *new;
	int len;

	/* first pass only measures: nothing is stored when the size is 0 */
	len = snprintf(NULL, 0, "%s%s(%s >= 0x%lx && %s < 0x%lx)",
	    old, sep, what, lo, what, hi);

	if (len < 0 || (new = malloc((size_t)len + 1)) == NULL) {
		/* same message shape and exit status as fail() */
		(void) fprintf(stderr, "lockstat: failed to allocate "
		    "filter: %s\n", strerror(errno));
		exit(2);
	}

	(void) snprintf(new, (size_t)len + 1,
	    "%s%s(%s >= 0x%lx && %s < 0x%lx)",
	    old, sep, what, lo, what, hi);

	free(*filt);
	*filt = new;
}
616192867Ssson
/*
 * Release the filter string *filt and leave *filt set to NULL.
 */
static void
filter_destroy(char **filt)
{
	char *doomed = *filt;

	*filt = NULL;
	free(doomed);
}
623192867Ssson
624192867Sssonstatic void
625192867Sssondprog_add(const char *fmt, ...)
626192867Ssson{
627192867Ssson	va_list args;
628192867Ssson	int size, offs;
629192867Ssson	char c;
630192867Ssson
631192867Ssson	va_start(args, fmt);
632192867Ssson	size = vsnprintf(&c, 1, fmt, args) + 1;
633192867Ssson	va_end(args);
634192867Ssson
635192867Ssson	if (g_proglen == 0) {
636192867Ssson		offs = 0;
637192867Ssson	} else {
638192867Ssson		offs = g_proglen - 1;
639192867Ssson	}
640192867Ssson
641192867Ssson	g_proglen = offs + size;
642192867Ssson
643192867Ssson	if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
644192867Ssson		fail(1, "failed to reallocate program text");
645192867Ssson
646192867Ssson	va_start(args, fmt);
647192867Ssson	(void) vsnprintf(&g_prog[offs], size, fmt, args);
648192867Ssson	va_end(args);
649192867Ssson}
650192867Ssson
651192867Ssson/*
652192867Ssson * This function may read like an open sewer, but keep in mind that programs
653192867Ssson * that generate other programs are rarely pretty.  If one has the unenviable
654192867Ssson * task of maintaining or -- worse -- extending this code, use the -V option
655192867Ssson * to examine the D program as generated by this function.
656192867Ssson */
657192867Sssonstatic void
658192867Sssondprog_addevent(int event)
659192867Ssson{
660192867Ssson	ls_event_info_t *info = &g_event_info[event];
661192867Ssson	char *pred = NULL;
662192867Ssson	char stack[20];
663192867Ssson	const char *arg0, *caller;
664192867Ssson	char *arg1 = "arg1";
665192867Ssson	char buf[80];
666192867Ssson	hrtime_t dur;
667192867Ssson	int depth;
668192867Ssson
669192867Ssson	if (info->ev_name[0] == '\0')
670192867Ssson		return;
671192867Ssson
672192867Ssson	if (info->ev_type == 'I') {
673192867Ssson		/*
674192867Ssson		 * For interrupt events, arg0 (normally the lock pointer) is
675192867Ssson		 * the CPU address plus the current pil, and arg1 (normally
676192867Ssson		 * the number of nanoseconds) is the number of nanoseconds
677192867Ssson		 * late -- and it's stored in arg2.
678192867Ssson		 */
679192867Ssson#if defined(sun)
680192867Ssson		arg0 = "(uintptr_t)curthread->t_cpu + \n"
681192867Ssson		    "\t    curthread->t_cpu->cpu_profile_pil";
682192867Ssson#else
683192867Ssson		arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n"
684192867Ssson		    "\t    0x01000000 + curthread->td_pri_class";
685192867Ssson#endif
686192867Ssson		caller = "(uintptr_t)arg0";
687192867Ssson		arg1 = "arg2";
688192867Ssson	} else {
689192867Ssson		arg0 = "(uintptr_t)arg0";
690192867Ssson		caller = "caller";
691192867Ssson	}
692192867Ssson
693192867Ssson	if (g_recsize > LS_HIST) {
694192867Ssson		for (depth = 0; g_recsize > LS_STACK(depth); depth++)
695192867Ssson			continue;
696192867Ssson
697192867Ssson		if (g_tracing) {
698192867Ssson			(void) sprintf(stack, "\tstack(%d);\n", depth);
699192867Ssson		} else {
700192867Ssson			(void) sprintf(stack, ", stack(%d)", depth);
701192867Ssson		}
702192867Ssson	} else {
703192867Ssson		(void) sprintf(stack, "");
704192867Ssson	}
705192867Ssson
706192867Ssson	if (info->ev_acquire != NULL) {
707192867Ssson		/*
708192867Ssson		 * If this is a hold event, we need to generate an additional
709192867Ssson		 * clause for the acquire; the clause for the release will be
710192867Ssson		 * generated with the aggregating statement, below.
711192867Ssson		 */
712192867Ssson		dprog_add("%s\n", info->ev_acquire);
713192867Ssson		predicate_add(&pred, info->ev_predicate, NULL, 0);
714192867Ssson		predicate_add(&pred, g_predicate, NULL, 0);
715192867Ssson		if (pred != NULL)
716192867Ssson			dprog_add("/%s/\n", pred);
717192867Ssson
718192867Ssson		dprog_add("{\n");
719192867Ssson		(void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
720192867Ssson
721192867Ssson		if (info->ev_type == 'H') {
722192867Ssson			dprog_add("\t%s = timestamp;\n", buf);
723192867Ssson		} else {
724192867Ssson			/*
725192867Ssson			 * If this isn't a hold event, it's the recursive
726192867Ssson			 * error event.  For this, we simply bump the
727192867Ssson			 * thread-local, per-lock count.
728192867Ssson			 */
729192867Ssson			dprog_add("\t%s++;\n", buf);
730192867Ssson		}
731192867Ssson
732192867Ssson		dprog_add("}\n\n");
733192867Ssson		predicate_destroy(&pred);
734192867Ssson		pred = NULL;
735192867Ssson
736192867Ssson		if (info->ev_type == 'E') {
737192867Ssson			/*
738192867Ssson			 * If this is the recursive lock error event, we need
739192867Ssson			 * to generate an additional clause to decrement the
740192867Ssson			 * thread-local, per-lock count.  This assures that we
741192867Ssson			 * only execute the aggregating clause if we have
742192867Ssson			 * recursive entry.
743192867Ssson			 */
744192867Ssson			dprog_add("%s\n", info->ev_name);
745192867Ssson			dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
746192867Ssson		}
747192867Ssson
748192867Ssson		predicate_add(&pred, buf, NULL, 0);
749192867Ssson
750192867Ssson		if (info->ev_type == 'H') {
751192867Ssson			(void) sprintf(buf, "timestamp -\n\t    "
752192867Ssson			    "self->ev%d[(uintptr_t)arg0]", event);
753192867Ssson		}
754192867Ssson
755192867Ssson		arg1 = buf;
756192867Ssson	} else {
757192867Ssson		predicate_add(&pred, info->ev_predicate, NULL, 0);
758192867Ssson		if (info->ev_type != 'I')
759192867Ssson			predicate_add(&pred, g_predicate, NULL, 0);
760192867Ssson		else
761192867Ssson			predicate_add(&pred, g_ipredicate, NULL, 0);
762192867Ssson	}
763192867Ssson
764192867Ssson	if ((dur = g_min_duration[event]) != 0)
765192867Ssson		predicate_add(&pred, arg1, ">=", dur);
766192867Ssson
767192867Ssson	dprog_add("%s\n", info->ev_name);
768192867Ssson
769192867Ssson	if (pred != NULL)
770192867Ssson		dprog_add("/%s/\n", pred);
771192867Ssson	predicate_destroy(&pred);
772192867Ssson
773192867Ssson	dprog_add("{\n");
774192867Ssson
775192867Ssson	if (g_tracing) {
776192867Ssson		dprog_add("\ttrace(%dULL);\n", event);
777192867Ssson		dprog_add("\ttrace(%s);\n", arg0);
778192867Ssson		dprog_add("\ttrace(%s);\n", caller);
779192867Ssson		dprog_add(stack);
780192867Ssson	} else {
781192867Ssson		/*
782192867Ssson		 * The ordering here is important:  when we process the
783192867Ssson		 * aggregate, we count on the fact that @avg appears before
784192867Ssson		 * @hist in program order to assure that @avg is assigned the
785192867Ssson		 * first aggregation variable ID and @hist assigned the
786192867Ssson		 * second; see the comment in process_aggregate() for details.
787192867Ssson		 */
788192867Ssson		dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
789192867Ssson		    event, arg0, caller, stack, arg1);
790192867Ssson
791192867Ssson		if (g_recsize >= LS_HIST) {
792192867Ssson			dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
793192867Ssson			    "(%s);\n", event, arg0, caller, stack, arg1);
794192867Ssson		}
795192867Ssson	}
796192867Ssson
797192867Ssson	if (info->ev_acquire != NULL)
798192867Ssson		dprog_add("\tself->ev%d[arg0] = 0;\n", event);
799192867Ssson
800192867Ssson	dprog_add("}\n\n");
801192867Ssson}
802192867Ssson
803192867Sssonstatic void
804192867Sssondprog_compile()
805192867Ssson{
806192867Ssson	dtrace_prog_t *prog;
807192867Ssson	dtrace_proginfo_t info;
808192867Ssson
809192867Ssson	if (g_Vflag) {
810192867Ssson		(void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
811192867Ssson		(void) fputs(g_prog, stderr);
812192867Ssson		(void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
813192867Ssson	}
814192867Ssson
815192867Ssson	if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
816192867Ssson	    DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
817192867Ssson		dfail("failed to compile program");
818192867Ssson
819192867Ssson	if (dtrace_program_exec(g_dtp, prog, &info) == -1)
820192867Ssson		dfail("failed to enable probes");
821192867Ssson
822192867Ssson	if (dtrace_go(g_dtp) != 0)
823192867Ssson		dfail("couldn't start tracing");
824192867Ssson}
825192867Ssson
/*
 * Handler that status_init() installs for SIGUSR1.  It deliberately does
 * nothing; presumably the periodic signal exists only to interrupt the
 * waitpid() loop in main() so that status_check() runs -- confirm.
 */
static void
#if defined(sun)
status_fire(void)
#else
status_fire(int i)
#endif
{
}
833192867Ssson
834192867Sssonstatic void
835192867Sssonstatus_init(void)
836192867Ssson{
837192867Ssson	dtrace_optval_t val, status, agg;
838192867Ssson	struct sigaction act;
839192867Ssson	struct itimerspec ts;
840192867Ssson	struct sigevent ev;
841192867Ssson	timer_t tid;
842192867Ssson
843192867Ssson	if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
844192867Ssson		dfail("failed to get 'statusrate'");
845192867Ssson
846192867Ssson	if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
847192867Ssson		dfail("failed to get 'statusrate'");
848192867Ssson
849192867Ssson	/*
850192867Ssson	 * We would want to awaken at a rate that is the GCD of the statusrate
851192867Ssson	 * and the aggrate -- but that seems a bit absurd.  Instead, we'll
852192867Ssson	 * simply awaken at a rate that is the more frequent of the two, which
853192867Ssson	 * assures that we're never later than the interval implied by the
854192867Ssson	 * more frequent rate.
855192867Ssson	 */
856192867Ssson	val = status < agg ? status : agg;
857192867Ssson
858192867Ssson	(void) sigemptyset(&act.sa_mask);
859192867Ssson	act.sa_flags = 0;
860192867Ssson	act.sa_handler = status_fire;
861192867Ssson	(void) sigaction(SIGUSR1, &act, NULL);
862192867Ssson
863192867Ssson	ev.sigev_notify = SIGEV_SIGNAL;
864192867Ssson	ev.sigev_signo = SIGUSR1;
865192867Ssson
866192867Ssson	if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
867192867Ssson		dfail("cannot create CLOCK_REALTIME timer");
868192867Ssson
869192867Ssson	ts.it_value.tv_sec = val / NANOSEC;
870192867Ssson	ts.it_value.tv_nsec = val % NANOSEC;
871192867Ssson	ts.it_interval = ts.it_value;
872192867Ssson
873192867Ssson	if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
874192867Ssson		dfail("cannot set time on CLOCK_REALTIME timer");
875192867Ssson}
876192867Ssson
877192867Sssonstatic void
878192867Sssonstatus_check(void)
879192867Ssson{
880192867Ssson	if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
881192867Ssson		dfail("failed to snap aggregate");
882192867Ssson
883192867Ssson	if (dtrace_status(g_dtp) == -1)
884192867Ssson		dfail("dtrace_status()");
885192867Ssson}
886192867Ssson
887192867Sssonstatic void
888192867Sssonlsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
889192867Ssson{
890192867Ssson	bzero(lsrec, g_recsize);
891192867Ssson	lsrec->ls_count = 1;
892192867Ssson
893192867Ssson	if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
894192867Ssson		fail(0, "truncated DTrace record");
895192867Ssson
896192867Ssson	if (rec->dtrd_size != sizeof (uint64_t))
897192867Ssson		fail(0, "bad event size in first record");
898192867Ssson
899192867Ssson	/* LINTED - alignment */
900192867Ssson	lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
901192867Ssson	rec++;
902192867Ssson
903192867Ssson	if (rec->dtrd_size != sizeof (uintptr_t))
904192867Ssson		fail(0, "bad lock address size in second record");
905192867Ssson
906192867Ssson	/* LINTED - alignment */
907192867Ssson	lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
908192867Ssson	rec++;
909192867Ssson
910192867Ssson	if (rec->dtrd_size != sizeof (uintptr_t))
911192867Ssson		fail(0, "bad caller size in third record");
912192867Ssson
913192867Ssson	/* LINTED - alignment */
914192867Ssson	lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
915192867Ssson	rec++;
916192867Ssson
917192867Ssson	if (g_recsize > LS_HIST) {
918192867Ssson		int frames, i;
919192867Ssson		pc_t *stack;
920192867Ssson
921192867Ssson		frames = rec->dtrd_size / sizeof (pc_t);
922192867Ssson		/* LINTED - alignment */
923192867Ssson		stack = (pc_t *)(data + rec->dtrd_offset);
924192867Ssson
925192867Ssson		for (i = 1; i < frames; i++)
926192867Ssson			lsrec->ls_stack[i - 1] = stack[i];
927192867Ssson	}
928192867Ssson}
929192867Ssson
930192867Ssson/*ARGSUSED*/
931192867Sssonstatic int
932192867Sssoncount_aggregate(const dtrace_aggdata_t *agg, void *arg)
933192867Ssson{
934192867Ssson	*((size_t *)arg) += 1;
935192867Ssson
936192867Ssson	return (DTRACE_AGGWALK_NEXT);
937192867Ssson}
938192867Ssson
939192867Sssonstatic int
940192867Sssonprocess_aggregate(const dtrace_aggdata_t *agg, void *arg)
941192867Ssson{
942192867Ssson	const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
943192867Ssson	caddr_t data = agg->dtada_data;
944192867Ssson	lsdata_t *lsdata = arg;
945192867Ssson	lsrec_t *lsrec = lsdata->lsd_next;
946192867Ssson	const dtrace_recdesc_t *rec;
947192867Ssson	uint64_t *avg, *quantized;
948192867Ssson	int i, j;
949192867Ssson
950192867Ssson	assert(lsdata->lsd_count < g_nrecs);
951192867Ssson
952192867Ssson	/*
953192867Ssson	 * Aggregation variable IDs are guaranteed to be generated in program
954192867Ssson	 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
955192867Ssson	 * plus one.  As "avg" appears before "hist" in program order, we know
956192867Ssson	 * that "avg" will be allocated the first aggregation variable ID, and
957192867Ssson	 * "hist" will be allocated the second aggregation variable ID -- and
958192867Ssson	 * we therefore use the aggregation variable ID to differentiate the
959192867Ssson	 * cases.
960192867Ssson	 */
961192867Ssson	if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
962192867Ssson		/*
963192867Ssson		 * If this is the histogram entry.  We'll copy the quantized
964192867Ssson		 * data into lc_hist, and jump over the rest.
965192867Ssson		 */
966192867Ssson		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
967192867Ssson
968192867Ssson		if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
969192867Ssson			fail(0, "bad variable ID in aggregation record");
970192867Ssson
971192867Ssson		if (rec->dtrd_size !=
972192867Ssson		    DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
973192867Ssson			fail(0, "bad quantize size in aggregation record");
974192867Ssson
975192867Ssson		/* LINTED - alignment */
976192867Ssson		quantized = (uint64_t *)(data + rec->dtrd_offset);
977192867Ssson
978192867Ssson		for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
979192867Ssson		    i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
980192867Ssson			lsrec->ls_hist[j] = quantized[i];
981192867Ssson
982192867Ssson		goto out;
983192867Ssson	}
984192867Ssson
985192867Ssson	lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
986192867Ssson	    aggdesc->dtagd_nrecs - 1, data);
987192867Ssson
988192867Ssson	rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
989192867Ssson
990192867Ssson	if (rec->dtrd_size != 2 * sizeof (uint64_t))
991192867Ssson		fail(0, "bad avg size in aggregation record");
992192867Ssson
993192867Ssson	/* LINTED - alignment */
994192867Ssson	avg = (uint64_t *)(data + rec->dtrd_offset);
995192867Ssson	lsrec->ls_count = (uint32_t)avg[0];
996192867Ssson	lsrec->ls_time = (uintptr_t)avg[1];
997192867Ssson
998192867Ssson	if (g_recsize >= LS_HIST)
999192867Ssson		return (DTRACE_AGGWALK_NEXT);
1000192867Ssson
1001192867Sssonout:
1002192867Ssson	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1003192867Ssson	lsdata->lsd_count++;
1004192867Ssson
1005192867Ssson	return (DTRACE_AGGWALK_NEXT);
1006192867Ssson}
1007192867Ssson
1008192867Sssonstatic int
1009192867Sssonprocess_trace(const dtrace_probedata_t *pdata, void *arg)
1010192867Ssson{
1011192867Ssson	lsdata_t *lsdata = arg;
1012192867Ssson	lsrec_t *lsrec = lsdata->lsd_next;
1013192867Ssson	dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
1014192867Ssson	caddr_t data = pdata->dtpda_data;
1015192867Ssson
1016192867Ssson	if (lsdata->lsd_count >= g_nrecs)
1017192867Ssson		return (DTRACE_CONSUME_NEXT);
1018192867Ssson
1019192867Ssson	lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
1020192867Ssson
1021192867Ssson	lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1022192867Ssson	lsdata->lsd_count++;
1023192867Ssson
1024192867Ssson	return (DTRACE_CONSUME_NEXT);
1025192867Ssson}
1026192867Ssson
1027192867Sssonstatic int
1028192867Sssonprocess_data(FILE *out, char *data)
1029192867Ssson{
1030192867Ssson	lsdata_t lsdata;
1031192867Ssson
1032192867Ssson	/* LINTED - alignment */
1033192867Ssson	lsdata.lsd_next = (lsrec_t *)data;
1034192867Ssson	lsdata.lsd_count = 0;
1035192867Ssson
1036192867Ssson	if (g_tracing) {
1037192867Ssson		if (dtrace_consume(g_dtp, out,
1038192867Ssson		    process_trace, NULL, &lsdata) != 0)
1039192867Ssson			dfail("failed to consume buffer");
1040192867Ssson
1041192867Ssson		return (lsdata.lsd_count);
1042192867Ssson	}
1043192867Ssson
1044192867Ssson	if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1045192867Ssson	    process_aggregate, &lsdata) != 0)
1046192867Ssson		dfail("failed to walk aggregate");
1047192867Ssson
1048192867Ssson	return (lsdata.lsd_count);
1049192867Ssson}
1050192867Ssson
1051192867Ssson/*ARGSUSED*/
1052192867Sssonstatic int
1053192867Sssondrophandler(const dtrace_dropdata_t *data, void *arg)
1054192867Ssson{
1055192867Ssson	g_dropped++;
1056192867Ssson	(void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1057192867Ssson	return (DTRACE_HANDLE_OK);
1058192867Ssson}
1059192867Ssson
1060192867Sssonint
1061192867Sssonmain(int argc, char **argv)
1062192867Ssson{
1063192867Ssson	char *data_buf;
1064192867Ssson	lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1065192867Ssson	FILE *out = stdout;
1066192937Ssson	int c;
1067192867Ssson	pid_t child;
1068192867Ssson	int status;
1069192867Ssson	int i, j;
1070192867Ssson	hrtime_t duration;
1071192867Ssson	char *addrp, *offp, *sizep, *evp, *lastp, *p;
1072192867Ssson	uintptr_t addr;
1073192867Ssson	size_t size, off;
1074192867Ssson	int events_specified = 0;
1075192867Ssson	int exec_errno = 0;
1076192867Ssson	uint32_t event;
1077192867Ssson	char *filt = NULL, *ifilt = NULL;
1078192867Ssson	static uint64_t ev_count[LS_MAX_EVENTS + 1];
1079192867Ssson	static uint64_t ev_time[LS_MAX_EVENTS + 1];
1080192867Ssson	dtrace_optval_t aggsize;
1081192867Ssson	char aggstr[10];
1082192867Ssson	long ncpus;
1083192867Ssson	int dynvar = 0;
1084192867Ssson	int err;
1085192867Ssson
1086192867Ssson	if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1087192867Ssson		fail(0, "cannot open dtrace library: %s",
1088192867Ssson		    dtrace_errmsg(NULL, err));
1089192867Ssson	}
1090192867Ssson
1091192867Ssson	if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1092192867Ssson		dfail("couldn't establish drop handler");
1093192867Ssson
1094192867Ssson	if (symtab_init() == -1)
1095192867Ssson		fail(1, "can't load kernel symbols");
1096192867Ssson
1097192867Ssson	g_nrecs = DEFAULT_NRECS;
1098192867Ssson
1099192937Ssson	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1100192867Ssson		switch (c) {
1101192867Ssson		case 'b':
1102192867Ssson			g_recsize = LS_BASIC;
1103192867Ssson			break;
1104192867Ssson
1105192867Ssson		case 't':
1106192867Ssson			g_recsize = LS_TIME;
1107192867Ssson			break;
1108192867Ssson
1109192867Ssson		case 'h':
1110192867Ssson			g_recsize = LS_HIST;
1111192867Ssson			break;
1112192867Ssson
1113192867Ssson		case 's':
1114192867Ssson			if (!isdigit(optarg[0]))
1115192867Ssson				usage();
1116192867Ssson			g_stkdepth = atoi(optarg);
1117192867Ssson			if (g_stkdepth > LS_MAX_STACK_DEPTH)
1118192867Ssson				fail(0, "max stack depth is %d",
1119192867Ssson				    LS_MAX_STACK_DEPTH);
1120192867Ssson			g_recsize = LS_STACK(g_stkdepth);
1121192867Ssson			break;
1122192867Ssson
1123192867Ssson		case 'n':
1124192867Ssson			if (!isdigit(optarg[0]))
1125192867Ssson				usage();
1126192867Ssson			g_nrecs = atoi(optarg);
1127192867Ssson			break;
1128192867Ssson
1129192867Ssson		case 'd':
1130192867Ssson			if (!isdigit(optarg[0]))
1131192867Ssson				usage();
1132192867Ssson			duration = atoll(optarg);
1133192867Ssson
1134192867Ssson			/*
1135192867Ssson			 * XXX -- durations really should be per event
1136192867Ssson			 * since the units are different, but it's hard
1137192867Ssson			 * to express this nicely in the interface.
1138192867Ssson			 * Not clear yet what the cleanest solution is.
1139192867Ssson			 */
1140192867Ssson			for (i = 0; i < LS_MAX_EVENTS; i++)
1141192867Ssson				if (g_event_info[i].ev_type != 'E')
1142192867Ssson					g_min_duration[i] = duration;
1143192867Ssson
1144192867Ssson			break;
1145192867Ssson
1146192867Ssson		case 'i':
1147192867Ssson			if (!isdigit(optarg[0]))
1148192867Ssson				usage();
1149192867Ssson			i = atoi(optarg);
1150192867Ssson			if (i <= 0)
1151192867Ssson				usage();
1152192867Ssson			if (i > MAX_HZ)
1153192867Ssson				fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1154192867Ssson
1155192867Ssson			for (j = 0; j < LS_MAX_EVENTS; j++)
1156192867Ssson				if (strcmp(g_event_info[j].ev_desc,
1157192867Ssson				    "Profiling interrupt") == 0)
1158192867Ssson					break;
1159192867Ssson
1160192867Ssson			(void) sprintf(g_event_info[j].ev_name,
1161192867Ssson			    "profile:::profile-%d", i);
1162192867Ssson			break;
1163192867Ssson
1164192867Ssson		case 'l':
1165192867Ssson		case 'f':
1166192867Ssson			addrp = strtok(optarg, ",");
1167192867Ssson			sizep = strtok(NULL, ",");
1168192867Ssson			addrp = strtok(optarg, ",+");
1169192867Ssson			offp = strtok(NULL, ",");
1170192867Ssson
1171192867Ssson			size = sizep ? strtoul(sizep, NULL, 0) : 1;
1172192867Ssson			off = offp ? strtoul(offp, NULL, 0) : 0;
1173192867Ssson
1174192867Ssson			if (addrp[0] == '0') {
1175192867Ssson				addr = strtoul(addrp, NULL, 16) + off;
1176192867Ssson			} else {
1177192867Ssson				addr = sym_to_addr(addrp) + off;
1178192867Ssson				if (sizep == NULL)
1179192867Ssson					size = sym_size(addrp) - off;
1180192867Ssson				if (addr - off == 0)
1181192867Ssson					fail(0, "symbol '%s' not found", addrp);
1182192867Ssson				if (size == 0)
1183192867Ssson					size = 1;
1184192867Ssson			}
1185192867Ssson
1186192867Ssson
1187192867Ssson			if (c == 'l') {
1188192867Ssson				filter_add(&filt, "arg0", addr, size);
1189192867Ssson			} else {
1190192867Ssson				filter_add(&filt, "caller", addr, size);
1191192867Ssson				filter_add(&ifilt, "arg0", addr, size);
1192192867Ssson			}
1193192867Ssson			break;
1194192867Ssson
1195192867Ssson		case 'e':
1196192867Ssson			evp = strtok_r(optarg, ",", &lastp);
1197192867Ssson			while (evp) {
1198192867Ssson				int ev1, ev2;
1199192867Ssson				char *evp2;
1200192867Ssson
1201192867Ssson				(void) strtok(evp, "-");
1202192867Ssson				evp2 = strtok(NULL, "-");
1203192867Ssson				ev1 = atoi(evp);
1204192867Ssson				ev2 = evp2 ? atoi(evp2) : ev1;
1205192867Ssson				if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1206192867Ssson				    (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1207192867Ssson					fail(0, "-e events out of range");
1208192867Ssson				for (i = ev1; i <= ev2; i++)
1209192867Ssson					g_enabled[i] = 1;
1210192867Ssson				evp = strtok_r(NULL, ",", &lastp);
1211192867Ssson			}
1212192867Ssson			events_specified = 1;
1213192867Ssson			break;
1214192867Ssson
1215192867Ssson		case 'c':
1216192867Ssson			g_cflag = 1;
1217192867Ssson			break;
1218192867Ssson
1219192867Ssson		case 'k':
1220192867Ssson			g_kflag = 1;
1221192867Ssson			break;
1222192867Ssson
1223192867Ssson		case 'w':
1224192867Ssson			g_wflag = 1;
1225192867Ssson			break;
1226192867Ssson
1227192867Ssson		case 'W':
1228192867Ssson			g_Wflag = 1;
1229192867Ssson			break;
1230192867Ssson
1231192867Ssson		case 'g':
1232192867Ssson			g_gflag = 1;
1233192867Ssson			break;
1234192867Ssson
1235192867Ssson		case 'C':
1236192867Ssson		case 'E':
1237192867Ssson		case 'H':
1238192867Ssson		case 'I':
1239192867Ssson			for (i = 0; i < LS_MAX_EVENTS; i++)
1240192867Ssson				if (g_event_info[i].ev_type == c)
1241192867Ssson					g_enabled[i] = 1;
1242192867Ssson			events_specified = 1;
1243192867Ssson			break;
1244192867Ssson
1245192867Ssson		case 'A':
1246192867Ssson			for (i = 0; i < LS_MAX_EVENTS; i++)
1247192867Ssson				if (strchr("CH", g_event_info[i].ev_type))
1248192867Ssson					g_enabled[i] = 1;
1249192867Ssson			events_specified = 1;
1250192867Ssson			break;
1251192867Ssson
1252192867Ssson		case 'T':
1253192867Ssson			g_tracing = 1;
1254192867Ssson			break;
1255192867Ssson
1256192867Ssson		case 'D':
1257192867Ssson			if (!isdigit(optarg[0]))
1258192867Ssson				usage();
1259192867Ssson			g_topn = atoi(optarg);
1260192867Ssson			break;
1261192867Ssson
1262192867Ssson		case 'R':
1263192867Ssson			g_rates = 1;
1264192867Ssson			break;
1265192867Ssson
1266192867Ssson		case 'p':
1267192867Ssson			g_pflag = 1;
1268192867Ssson			break;
1269192867Ssson
1270192867Ssson		case 'P':
1271192867Ssson			g_Pflag = 1;
1272192867Ssson			break;
1273192867Ssson
1274192867Ssson		case 'o':
1275192867Ssson			if ((out = fopen(optarg, "w")) == NULL)
1276192867Ssson				fail(1, "error opening file");
1277192867Ssson			break;
1278192867Ssson
1279192867Ssson		case 'V':
1280192867Ssson			g_Vflag = 1;
1281192867Ssson			break;
1282192867Ssson
1283192867Ssson		default:
1284192867Ssson			if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1285192867Ssson				usage();
1286192867Ssson		}
1287192867Ssson	}
1288192867Ssson
1289192867Ssson	if (filt != NULL) {
1290192867Ssson		predicate_add(&g_predicate, filt, NULL, 0);
1291192867Ssson		filter_destroy(&filt);
1292192867Ssson	}
1293192867Ssson
1294192867Ssson	if (ifilt != NULL) {
1295192867Ssson		predicate_add(&g_ipredicate, ifilt, NULL, 0);
1296192867Ssson		filter_destroy(&ifilt);
1297192867Ssson	}
1298192867Ssson
1299192867Ssson	if (g_recsize == 0) {
1300192867Ssson		if (g_gflag) {
1301192867Ssson			g_stkdepth = LS_MAX_STACK_DEPTH;
1302192867Ssson			g_recsize = LS_STACK(g_stkdepth);
1303192867Ssson		} else {
1304192867Ssson			g_recsize = LS_TIME;
1305192867Ssson		}
1306192867Ssson	}
1307192867Ssson
1308192867Ssson	if (g_gflag && g_recsize <= LS_STACK(0))
1309192867Ssson		fail(0, "'-g' requires at least '-s 1' data gathering");
1310192867Ssson
1311192867Ssson	/*
1312192867Ssson	 * Make sure the alignment is reasonable
1313192867Ssson	 */
1314192867Ssson	g_recsize = -(-g_recsize & -sizeof (uint64_t));
1315192867Ssson
1316192867Ssson	for (i = 0; i < LS_MAX_EVENTS; i++) {
1317192867Ssson		/*
1318192867Ssson		 * If no events were specified, enable -C.
1319192867Ssson		 */
1320192867Ssson		if (!events_specified && g_event_info[i].ev_type == 'C')
1321192867Ssson			g_enabled[i] = 1;
1322192867Ssson	}
1323192867Ssson
1324192867Ssson	for (i = 0; i < LS_MAX_EVENTS; i++) {
1325192867Ssson		if (!g_enabled[i])
1326192867Ssson			continue;
1327192867Ssson
1328192867Ssson		if (g_event_info[i].ev_acquire != NULL) {
1329192867Ssson			/*
1330192867Ssson			 * If we've enabled a hold event, we must explicitly
1331192867Ssson			 * allocate dynamic variable space.
1332192867Ssson			 */
1333192867Ssson			dynvar = 1;
1334192867Ssson		}
1335192867Ssson
1336192867Ssson		dprog_addevent(i);
1337192867Ssson	}
1338192867Ssson
1339192867Ssson	/*
1340192867Ssson	 * Make sure there are remaining arguments to specify a child command
1341192867Ssson	 * to execute.
1342192867Ssson	 */
1343192867Ssson	if (argc <= optind)
1344192867Ssson		usage();
1345192867Ssson
1346192867Ssson	if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1347192867Ssson		dfail("couldn't determine number of online CPUs");
1348192867Ssson
1349192867Ssson	/*
1350192867Ssson	 * By default, we set our data buffer size to be the number of records
1351192867Ssson	 * multiplied by the size of the record, doubled to account for some
1352192867Ssson	 * DTrace slop and divided by the number of CPUs.  We silently clamp
1353192867Ssson	 * the aggregation size at both a minimum and a maximum to prevent
1354192867Ssson	 * absurdly low or high values.
1355192867Ssson	 */
1356192867Ssson	if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1357192867Ssson		aggsize = MIN_AGGSIZE;
1358192867Ssson
1359192867Ssson	if (aggsize > MAX_AGGSIZE)
1360192867Ssson		aggsize = MAX_AGGSIZE;
1361192867Ssson
1362192867Ssson	(void) sprintf(aggstr, "%lld", (long long)aggsize);
1363192867Ssson
1364192867Ssson	if (!g_tracing) {
1365192867Ssson		if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1366192867Ssson			dfail("failed to set 'bufsize'");
1367192867Ssson
1368192867Ssson		if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1369192867Ssson			dfail("failed to set 'aggsize'");
1370192867Ssson
1371192867Ssson		if (dynvar) {
1372192867Ssson			/*
1373192867Ssson			 * If we're using dynamic variables, we set our
1374192867Ssson			 * dynamic variable size to be one megabyte per CPU,
1375192867Ssson			 * with a hard-limit of 32 megabytes.  This may still
1376192867Ssson			 * be too small in some cases, but it can be tuned
1377192867Ssson			 * manually via -x if need be.
1378192867Ssson			 */
1379192867Ssson			(void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1380192867Ssson
1381192867Ssson			if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1382192867Ssson				dfail("failed to set 'dynvarsize'");
1383192867Ssson		}
1384192867Ssson	} else {
1385192867Ssson		if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1386192867Ssson			dfail("failed to set 'bufsize'");
1387192867Ssson	}
1388192867Ssson
1389192867Ssson	if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1390192867Ssson		dfail("failed to set 'statusrate'");
1391192867Ssson
1392192867Ssson	optind = 1;
1393192937Ssson	while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1394192867Ssson		switch (c) {
1395192867Ssson		case 'x':
1396192867Ssson			if ((p = strchr(optarg, '=')) != NULL)
1397192867Ssson				*p++ = '\0';
1398192867Ssson
1399192867Ssson			if (dtrace_setopt(g_dtp, optarg, p) != 0)
1400192867Ssson				dfail("failed to set -x %s", optarg);
1401192867Ssson			break;
1402192867Ssson		}
1403192867Ssson	}
1404192867Ssson
1405192867Ssson	argc -= optind;
1406192867Ssson	argv += optind;
1407192867Ssson
1408192867Ssson	dprog_compile();
1409192867Ssson	status_init();
1410192867Ssson
1411192867Ssson	g_elapsed = -gethrtime();
1412192867Ssson
1413192867Ssson	/*
1414192867Ssson	 * Spawn the specified command and wait for it to complete.
1415192867Ssson	 */
1416192867Ssson	child = fork();
1417192867Ssson	if (child == -1)
1418192867Ssson		fail(1, "cannot fork");
1419192867Ssson	if (child == 0) {
1420192867Ssson		(void) dtrace_close(g_dtp);
1421192867Ssson		(void) execvp(argv[0], &argv[0]);
1422192867Ssson		exec_errno = errno;
1423192867Ssson		exit(127);
1424192867Ssson	}
1425192867Ssson
1426192867Ssson#if defined(sun)
1427192867Ssson	while (waitpid(child, &status, WEXITED) != child)
1428192867Ssson#else
1429192867Ssson	while (waitpid(child, &status, 0) != child)
1430192867Ssson#endif
1431192867Ssson		status_check();
1432192867Ssson
1433192867Ssson	g_elapsed += gethrtime();
1434192867Ssson
1435192867Ssson	if (WIFEXITED(status)) {
1436192867Ssson		if (WEXITSTATUS(status) != 0) {
1437192867Ssson			if (exec_errno != 0) {
1438192867Ssson				errno = exec_errno;
1439192867Ssson				fail(1, "could not execute %s", argv[0]);
1440192867Ssson			}
1441192867Ssson			(void) fprintf(stderr,
1442192867Ssson			    "lockstat: warning: %s exited with code %d\n",
1443192867Ssson			    argv[0], WEXITSTATUS(status));
1444192867Ssson		}
1445192867Ssson	} else {
1446192867Ssson		(void) fprintf(stderr,
1447192867Ssson		    "lockstat: warning: %s died on signal %d\n",
1448192867Ssson		    argv[0], WTERMSIG(status));
1449192867Ssson	}
1450192867Ssson
1451192867Ssson	if (dtrace_stop(g_dtp) == -1)
1452192867Ssson		dfail("failed to stop dtrace");
1453192867Ssson
1454192867Ssson	/*
1455192867Ssson	 * Before we read out the results, we need to allocate our buffer.
1456192867Ssson	 * If we're tracing, then we'll just use the precalculated size.  If
1457192867Ssson	 * we're not, then we'll take a snapshot of the aggregate, and walk
1458192867Ssson	 * it to count the number of records.
1459192867Ssson	 */
1460192867Ssson	if (!g_tracing) {
1461192867Ssson		if (dtrace_aggregate_snap(g_dtp) != 0)
1462192867Ssson			dfail("failed to snap aggregate");
1463192867Ssson
1464192867Ssson		g_nrecs = 0;
1465192867Ssson
1466192867Ssson		if (dtrace_aggregate_walk(g_dtp,
1467192867Ssson		    count_aggregate, &g_nrecs) != 0)
1468192867Ssson			dfail("failed to walk aggregate");
1469192867Ssson	}
1470192867Ssson
1471192867Ssson#if defined(sun)
1472192867Ssson	if ((data_buf = memalign(sizeof (uint64_t),
1473192867Ssson	    (g_nrecs + 1) * g_recsize)) == NULL)
1474192867Ssson#else
1475192867Ssson	if (posix_memalign((void **)&data_buf, sizeof (uint64_t),
1476192867Ssson	    (g_nrecs + 1) * g_recsize) )
1477192867Ssson#endif
1478192867Ssson		fail(1, "Memory allocation failed");
1479192867Ssson
1480192867Ssson	/*
1481192867Ssson	 * Read out the DTrace data.
1482192867Ssson	 */
1483192867Ssson	g_nrecs_used = process_data(out, data_buf);
1484192867Ssson
1485192867Ssson	if (g_nrecs_used > g_nrecs || g_dropped)
1486192867Ssson		(void) fprintf(stderr, "lockstat: warning: "
1487192867Ssson		    "ran out of data records (use -n for more)\n");
1488192867Ssson
1489192867Ssson	/* LINTED - alignment */
1490192867Ssson	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1491192867Ssson	    /* LINTED - alignment */
1492192867Ssson	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1493192867Ssson		ev_count[lsp->ls_event] += lsp->ls_count;
1494192867Ssson		ev_time[lsp->ls_event] += lsp->ls_time;
1495192867Ssson	}
1496192867Ssson
1497192867Ssson	/*
1498192867Ssson	 * If -g was specified, convert stacks into individual records.
1499192867Ssson	 */
1500192867Ssson	if (g_gflag) {
1501192867Ssson		lsrec_t *newlsp, *oldlsp;
1502192867Ssson
1503192867Ssson#if defined(sun)
1504192867Ssson		newlsp = memalign(sizeof (uint64_t),
1505192867Ssson		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1506192867Ssson#else
1507192867Ssson		posix_memalign((void **)&newlsp, sizeof (uint64_t),
1508192867Ssson		    g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1509192867Ssson#endif
1510192867Ssson		if (newlsp == NULL)
1511192867Ssson			fail(1, "Cannot allocate space for -g processing");
1512192867Ssson		lsp = newlsp;
1513192867Ssson		/* LINTED - alignment */
1514192867Ssson		for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1515192867Ssson		    /* LINTED - alignment */
1516192867Ssson		    oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1517192867Ssson			int fr;
1518192867Ssson			int caller_in_stack = 0;
1519192867Ssson
1520192867Ssson			if (oldlsp->ls_count == 0)
1521192867Ssson				continue;
1522192867Ssson
1523192867Ssson			for (fr = 0; fr < g_stkdepth; fr++) {
1524192867Ssson				if (oldlsp->ls_stack[fr] == 0)
1525192867Ssson					break;
1526192867Ssson				if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1527192867Ssson					caller_in_stack = 1;
1528192867Ssson				bcopy(oldlsp, lsp, LS_TIME);
1529192867Ssson				lsp->ls_caller = oldlsp->ls_stack[fr];
1530192867Ssson				/* LINTED - alignment */
1531192867Ssson				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1532192867Ssson			}
1533192867Ssson			if (!caller_in_stack) {
1534192867Ssson				bcopy(oldlsp, lsp, LS_TIME);
1535192867Ssson				/* LINTED - alignment */
1536192867Ssson				lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1537192867Ssson			}
1538192867Ssson		}
1539192867Ssson		g_nrecs = g_nrecs_used =
1540192867Ssson		    ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1541192867Ssson		g_recsize = LS_TIME;
1542192867Ssson		g_stkdepth = 0;
1543192867Ssson		free(data_buf);
1544192867Ssson		data_buf = (char *)newlsp;
1545192867Ssson	}
1546192867Ssson
1547192867Ssson	if ((sort_buf = calloc(2 * (g_nrecs + 1),
1548192867Ssson	    sizeof (void *))) == NULL)
1549192867Ssson		fail(1, "Sort buffer allocation failed");
1550192867Ssson	merge_buf = sort_buf + (g_nrecs + 1);
1551192867Ssson
1552192867Ssson	/*
1553192867Ssson	 * Build the sort buffer, discarding zero-count records along the way.
1554192867Ssson	 */
1555192867Ssson	/* LINTED - alignment */
1556192867Ssson	for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1557192867Ssson	    /* LINTED - alignment */
1558192867Ssson	    lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1559192867Ssson		if (lsp->ls_count == 0)
1560192867Ssson			lsp->ls_event = LS_MAX_EVENTS;
1561192867Ssson		sort_buf[i] = lsp;
1562192867Ssson	}
1563192867Ssson
1564192867Ssson	if (g_nrecs_used == 0)
1565192867Ssson		exit(0);
1566192867Ssson
1567192867Ssson	/*
1568192867Ssson	 * Add a sentinel after the last record
1569192867Ssson	 */
1570192867Ssson	sort_buf[i] = lsp;
1571192867Ssson	lsp->ls_event = LS_MAX_EVENTS;
1572192867Ssson
1573192867Ssson	if (g_tracing) {
1574192867Ssson		report_trace(out, sort_buf);
1575192867Ssson		return (0);
1576192867Ssson	}
1577192867Ssson
1578192867Ssson	/*
1579192867Ssson	 * Application of -g may have resulted in multiple records
1580192867Ssson	 * with the same signature; coalesce them.
1581192867Ssson	 */
1582192867Ssson	if (g_gflag) {
1583192867Ssson		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1584192867Ssson		coalesce(lockcmp, sort_buf, g_nrecs_used);
1585192867Ssson	}
1586192867Ssson
1587192867Ssson	/*
1588192867Ssson	 * Coalesce locks within the same symbol if -c option specified.
1589192867Ssson	 * Coalesce PCs within the same function if -k option specified.
1590192867Ssson	 */
1591192867Ssson	if (g_cflag || g_kflag) {
1592192867Ssson		for (i = 0; i < g_nrecs_used; i++) {
1593192867Ssson			int fr;
1594192867Ssson			lsp = sort_buf[i];
1595192867Ssson			if (g_cflag)
1596192867Ssson				coalesce_symbol(&lsp->ls_lock);
1597192867Ssson			if (g_kflag) {
1598192867Ssson				for (fr = 0; fr < g_stkdepth; fr++)
1599192867Ssson					coalesce_symbol(&lsp->ls_stack[fr]);
1600192867Ssson				coalesce_symbol(&lsp->ls_caller);
1601192867Ssson			}
1602192867Ssson		}
1603192867Ssson		mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1604192867Ssson		coalesce(lockcmp, sort_buf, g_nrecs_used);
1605192867Ssson	}
1606192867Ssson
1607192867Ssson	/*
1608192867Ssson	 * Coalesce callers if -w option specified
1609192867Ssson	 */
1610192867Ssson	if (g_wflag) {
1611192867Ssson		mergesort(lock_and_count_cmp_anywhere,
1612192867Ssson		    sort_buf, merge_buf, g_nrecs_used);
1613192867Ssson		coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1614192867Ssson	}
1615192867Ssson
1616192867Ssson	/*
1617192867Ssson	 * Coalesce locks if -W option specified
1618192867Ssson	 */
1619192867Ssson	if (g_Wflag) {
1620192867Ssson		mergesort(site_and_count_cmp_anylock,
1621192867Ssson		    sort_buf, merge_buf, g_nrecs_used);
1622192867Ssson		coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1623192867Ssson	}
1624192867Ssson
1625192867Ssson	/*
1626192867Ssson	 * Sort data by contention count (ls_count) or total time (ls_time),
1627192867Ssson	 * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1628192867Ssson	 */
1629192867Ssson	if (g_recsize < LS_TIME)
1630192867Ssson		g_Pflag = 0;
1631192867Ssson
1632192867Ssson	if (g_Pflag)
1633192867Ssson		mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1634192867Ssson	else
1635192867Ssson		mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1636192867Ssson
1637192867Ssson	/*
1638192867Ssson	 * Display data by event type
1639192867Ssson	 */
1640192867Ssson	first = &sort_buf[0];
1641192867Ssson	while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1642192867Ssson		current = first;
1643192867Ssson		while ((lsp = *current)->ls_event == event)
1644192867Ssson			current++;
1645192867Ssson		report_stats(out, first, current - first, ev_count[event],
1646192867Ssson		    ev_time[event]);
1647192867Ssson		first = current;
1648192867Ssson	}
1649192867Ssson
1650192867Ssson	return (0);
1651192867Ssson}
1652192867Ssson
1653192867Sssonstatic char *
1654192867Sssonformat_symbol(char *buf, uintptr_t addr, int show_size)
1655192867Ssson{
1656192867Ssson	uintptr_t symoff;
1657192867Ssson	char *symname;
1658192867Ssson	size_t symsize;
1659192867Ssson
1660192867Ssson	symname = addr_to_sym(addr, &symoff, &symsize);
1661192867Ssson
1662192867Ssson	if (show_size && symoff == 0)
1663192867Ssson		(void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1664192867Ssson	else if (symoff == 0)
1665192867Ssson		(void) sprintf(buf, "%s", symname);
1666192867Ssson	else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0)	/* CPU+PIL */
1667192867Ssson#if defined(sun)
1668192867Ssson		(void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1669192867Ssson#else
1670192867Ssson		(void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]);
1671192867Ssson#endif
1672192867Ssson	else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1673192867Ssson		(void) sprintf(buf, "%s+0x%llx", symname,
1674192867Ssson		    (unsigned long long)symoff);
1675192867Ssson	else
1676192867Ssson		(void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1677192867Ssson	return (buf);
1678192867Ssson}
1679192867Ssson
1680192867Sssonstatic void
1681192867Sssonreport_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1682192867Ssson	uint64_t total_time)
1683192867Ssson{
1684192867Ssson	uint32_t event = sort_buf[0]->ls_event;
1685192867Ssson	lsrec_t *lsp;
1686192867Ssson	double ptotal = 0.0;
1687192867Ssson	double percent;
1688192867Ssson	int i, j, fr;
1689192867Ssson	int displayed;
1690192867Ssson	int first_bin, last_bin, max_bin_count, total_bin_count;
1691192867Ssson	int rectype;
1692192867Ssson	char buf[256];
1693192867Ssson	char lhdr[80], chdr[80];
1694192867Ssson
1695192867Ssson	rectype = g_recsize;
1696192867Ssson
1697192867Ssson	if (g_topn == 0) {
1698192867Ssson		(void) fprintf(out, "%20llu %s\n",
1699192867Ssson		    g_rates == 0 ? total_count :
1700192867Ssson		    ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1701192867Ssson		    g_event_info[event].ev_desc);
1702192867Ssson		return;
1703192867Ssson	}
1704192867Ssson
1705192867Ssson	(void) sprintf(lhdr, "%s%s",
1706192867Ssson	    g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1707192867Ssson	(void) sprintf(chdr, "%s%s",
1708192867Ssson	    g_wflag ? "Hottest " : "", "Caller");
1709192867Ssson
1710192867Ssson	if (!g_pflag)
1711192867Ssson		(void) fprintf(out,
1712192867Ssson		    "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1713192867Ssson		    g_event_info[event].ev_desc, (double)total_count,
1714192867Ssson		    (double)g_elapsed / NANOSEC,
1715192867Ssson		    (double)total_count * NANOSEC / g_elapsed);
1716192867Ssson
1717192867Ssson	if (!g_pflag && rectype < LS_HIST) {
1718192867Ssson		(void) sprintf(buf, "%s", g_event_info[event].ev_units);
1719192867Ssson		(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1720192867Ssson		    g_rates ? "ops/s" : "Count",
1721192867Ssson		    g_gflag ? "genr" : "indv",
1722192867Ssson		    "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1723192867Ssson		(void) fprintf(out, "---------------------------------"
1724192867Ssson		    "----------------------------------------------\n");
1725192867Ssson	}
1726192867Ssson
1727192867Ssson	displayed = 0;
1728192867Ssson	for (i = 0; i < nrecs; i++) {
1729192867Ssson		lsp = sort_buf[i];
1730192867Ssson
1731192867Ssson		if (displayed++ >= g_topn)
1732192867Ssson			break;
1733192867Ssson
1734192867Ssson		if (g_pflag) {
1735192867Ssson			int j;
1736192867Ssson
1737192867Ssson			(void) fprintf(out, "%u %u",
1738192867Ssson			    lsp->ls_event, lsp->ls_count);
1739192867Ssson			(void) fprintf(out, " %s",
1740192867Ssson			    format_symbol(buf, lsp->ls_lock, g_cflag));
1741192867Ssson			(void) fprintf(out, " %s",
1742192867Ssson			    format_symbol(buf, lsp->ls_caller, 0));
1743192867Ssson			(void) fprintf(out, " %f",
1744192867Ssson			    (double)lsp->ls_refcnt / lsp->ls_count);
1745192867Ssson			if (rectype >= LS_TIME)
1746192867Ssson				(void) fprintf(out, " %llu",
1747192867Ssson				    (unsigned long long)lsp->ls_time);
1748192867Ssson			if (rectype >= LS_HIST) {
1749192867Ssson				for (j = 0; j < 64; j++)
1750192867Ssson					(void) fprintf(out, " %u",
1751192867Ssson					    lsp->ls_hist[j]);
1752192867Ssson			}
1753192867Ssson			for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1754192867Ssson				if (rectype <= LS_STACK(j) ||
1755192867Ssson				    lsp->ls_stack[j] == 0)
1756192867Ssson					break;
1757192867Ssson				(void) fprintf(out, " %s",
1758192867Ssson				    format_symbol(buf, lsp->ls_stack[j], 0));
1759192867Ssson			}
1760192867Ssson			(void) fprintf(out, "\n");
1761192867Ssson			continue;
1762192867Ssson		}
1763192867Ssson
1764192867Ssson		if (rectype >= LS_HIST) {
1765192867Ssson			(void) fprintf(out, "---------------------------------"
1766192867Ssson			    "----------------------------------------------\n");
1767192867Ssson			(void) sprintf(buf, "%s",
1768192867Ssson			    g_event_info[event].ev_units);
1769192867Ssson			(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1770192867Ssson			    g_rates ? "ops/s" : "Count",
1771192867Ssson			    g_gflag ? "genr" : "indv",
1772192867Ssson			    "cuml", "rcnt", buf, lhdr, chdr);
1773192867Ssson		}
1774192867Ssson
1775192867Ssson		if (g_Pflag && total_time != 0)
1776192867Ssson			percent = (lsp->ls_time * 100.00) / total_time;
1777192867Ssson		else
1778192867Ssson			percent = (lsp->ls_count * 100.00) / total_count;
1779192867Ssson
1780192867Ssson		ptotal += percent;
1781192867Ssson
1782192867Ssson		if (rectype >= LS_TIME)
1783192867Ssson			(void) sprintf(buf, "%llu",
1784192867Ssson			    (unsigned long long)(lsp->ls_time / lsp->ls_count));
1785192867Ssson		else
1786192867Ssson			buf[0] = '\0';
1787192867Ssson
1788192867Ssson		(void) fprintf(out, "%5llu ",
1789192867Ssson		    g_rates == 0 ? lsp->ls_count :
1790192867Ssson		    ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1791192867Ssson
1792192867Ssson		(void) fprintf(out, "%3.0f%% ", percent);
1793192867Ssson
1794192867Ssson		if (g_gflag)
1795192867Ssson			(void) fprintf(out, "---- ");
1796192867Ssson		else
1797192867Ssson			(void) fprintf(out, "%3.0f%% ", ptotal);
1798192867Ssson
1799192867Ssson		(void) fprintf(out, "%4.2f %8s ",
1800192867Ssson		    (double)lsp->ls_refcnt / lsp->ls_count, buf);
1801192867Ssson
1802192867Ssson		(void) fprintf(out, "%-22s ",
1803192867Ssson		    format_symbol(buf, lsp->ls_lock, g_cflag));
1804192867Ssson
1805192867Ssson		(void) fprintf(out, "%-24s\n",
1806192867Ssson		    format_symbol(buf, lsp->ls_caller, 0));
1807192867Ssson
1808192867Ssson		if (rectype < LS_HIST)
1809192867Ssson			continue;
1810192867Ssson
1811192867Ssson		(void) fprintf(out, "\n");
1812192867Ssson		(void) fprintf(out, "%10s %31s %-9s %-24s\n",
1813192867Ssson		    g_event_info[event].ev_units,
1814192867Ssson		    "------ Time Distribution ------",
1815192867Ssson		    g_rates ? "ops/s" : "count",
1816192867Ssson		    rectype > LS_STACK(0) ? "Stack" : "");
1817192867Ssson
1818192867Ssson		first_bin = 0;
1819192867Ssson		while (lsp->ls_hist[first_bin] == 0)
1820192867Ssson			first_bin++;
1821192867Ssson
1822192867Ssson		last_bin = 63;
1823192867Ssson		while (lsp->ls_hist[last_bin] == 0)
1824192867Ssson			last_bin--;
1825192867Ssson
1826192867Ssson		max_bin_count = 0;
1827192867Ssson		total_bin_count = 0;
1828192867Ssson		for (j = first_bin; j <= last_bin; j++) {
1829192867Ssson			total_bin_count += lsp->ls_hist[j];
1830192867Ssson			if (lsp->ls_hist[j] > max_bin_count)
1831192867Ssson				max_bin_count = lsp->ls_hist[j];
1832192867Ssson		}
1833192867Ssson
1834192867Ssson		/*
1835192867Ssson		 * If we went a few frames below the caller, ignore them
1836192867Ssson		 */
1837192867Ssson		for (fr = 3; fr > 0; fr--)
1838192867Ssson			if (lsp->ls_stack[fr] == lsp->ls_caller)
1839192867Ssson				break;
1840192867Ssson
1841192867Ssson		for (j = first_bin; j <= last_bin; j++) {
1842192867Ssson			uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1843192867Ssson			(void) fprintf(out, "%10llu |%s%s %-9u ",
1844192867Ssson			    1ULL << j,
1845192867Ssson			    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1846192867Ssson			    "                              " + depth,
1847192867Ssson			    g_rates == 0 ? lsp->ls_hist[j] :
1848192867Ssson			    (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1849192867Ssson			    g_elapsed));
1850192867Ssson			if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1851192867Ssson				(void) fprintf(out, "\n");
1852192867Ssson				continue;
1853192867Ssson			}
1854192867Ssson			(void) fprintf(out, "%-24s\n",
1855192867Ssson			    format_symbol(buf, lsp->ls_stack[fr], 0));
1856192867Ssson			fr++;
1857192867Ssson		}
1858192867Ssson		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1859192867Ssson			(void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1860192867Ssson			    format_symbol(buf, lsp->ls_stack[fr], 0));
1861192867Ssson			fr++;
1862192867Ssson		}
1863192867Ssson	}
1864192867Ssson
1865192867Ssson	if (!g_pflag)
1866192867Ssson		(void) fprintf(out, "---------------------------------"
1867192867Ssson		    "----------------------------------------------\n");
1868192867Ssson
1869192867Ssson	(void) fflush(out);
1870192867Ssson}
1871192867Ssson
1872192867Sssonstatic void
1873192867Sssonreport_trace(FILE *out, lsrec_t **sort_buf)
1874192867Ssson{
1875192867Ssson	lsrec_t *lsp;
1876192867Ssson	int i, fr;
1877192867Ssson	int rectype;
1878192867Ssson	char buf[256], buf2[256];
1879192867Ssson
1880192867Ssson	rectype = g_recsize;
1881192867Ssson
1882192867Ssson	if (!g_pflag) {
1883192867Ssson		(void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1884192867Ssson		    "Event", "Time", "Owner", "Lock", "Caller");
1885192867Ssson		(void) fprintf(out, "---------------------------------"
1886192867Ssson		    "----------------------------------------------\n");
1887192867Ssson	}
1888192867Ssson
1889192867Ssson	for (i = 0; i < g_nrecs_used; i++) {
1890192867Ssson
1891192867Ssson		lsp = sort_buf[i];
1892192867Ssson
1893192867Ssson		if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1894192867Ssson			continue;
1895192867Ssson
1896192867Ssson		(void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1897192867Ssson		    lsp->ls_event, (unsigned long long)lsp->ls_time,
1898192867Ssson		    (void *)lsp->ls_next,
1899192867Ssson		    format_symbol(buf, lsp->ls_lock, 0),
1900192867Ssson		    format_symbol(buf2, lsp->ls_caller, 0));
1901192867Ssson
1902192867Ssson		if (rectype <= LS_STACK(0))
1903192867Ssson			continue;
1904192867Ssson
1905192867Ssson		/*
1906192867Ssson		 * If we went a few frames below the caller, ignore them
1907192867Ssson		 */
1908192867Ssson		for (fr = 3; fr > 0; fr--)
1909192867Ssson			if (lsp->ls_stack[fr] == lsp->ls_caller)
1910192867Ssson				break;
1911192867Ssson
1912192867Ssson		while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1913192867Ssson			(void) fprintf(out, "%53s  %-24s\n", "",
1914192867Ssson			    format_symbol(buf, lsp->ls_stack[fr], 0));
1915192867Ssson			fr++;
1916192867Ssson		}
1917192867Ssson		(void) fprintf(out, "\n");
1918192867Ssson	}
1919192867Ssson
1920192867Ssson	(void) fflush(out);
1921192867Ssson}
1922