1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
29 * Copyright (c) 2012 by Delphix. All rights reserved.
30 */
31
32#include <stdlib.h>
33#include <strings.h>
34#include <errno.h>
35#include <unistd.h>
36#include <dt_impl.h>
37#include <assert.h>
38#include <dt_oformat.h>
39#ifdef illumos
40#include <alloca.h>
41#else
42#include <sys/sysctl.h>
43#include <libproc_compat.h>
44#endif
45#include <limits.h>
46
/*
 * Number of buckets in the aggregation hash table; each bucket heads a chain
 * of dt_ahashent_t entries.
 */
#define	DTRACE_AHASHSIZE	32779		/* big 'ol prime */

/*
 * Because qsort(3C) does not allow an argument to be passed to a comparison
 * function, the variables that affect comparison must regrettably be global;
 * they are protected by a global static lock, dt_qsort_lock.
 */
static pthread_mutex_t dt_qsort_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Sort state consulted by the comparison routines.  dt_aggregate_qsort()
 * derives these from the aggsortrev, aggsortkey and aggsortkeypos options
 * for the duration of a sort and restores the previous values afterwards.
 */
static int dt_revsort;
static int dt_keysort;
static int dt_keypos;

/*
 * Values returned by the comparison routines for "less than" and "greater
 * than"; the two are swapped when dt_revsort is non-zero.
 */
#define	DT_LESSTHAN	(dt_revsort == 0 ? -1 : 1)
#define	DT_GREATERTHAN	(dt_revsort == 0 ? 1 : -1)
62
/*
 * Aggregating action that adds each 64-bit word of 'new' into the
 * corresponding word of 'existing'.  'size' is the size of the aggregation
 * value in bytes; a trailing partial word, if any, is ignored.  This is the
 * action installed for count(), sum(), avg(), stddev() and quantize().
 *
 * The index is a size_t so that it has the same width as the word count it
 * is compared against.
 */
static void
dt_aggregate_count(int64_t *existing, int64_t *new, size_t size)
{
	size_t nwords = size / sizeof (int64_t);
	size_t i;

	for (i = 0; i < nwords; i++)
		existing[i] += new[i];
}
71
72static int
73dt_aggregate_countcmp(int64_t *lhs, int64_t *rhs)
74{
75	int64_t lvar = *lhs;
76	int64_t rvar = *rhs;
77
78	if (lvar < rvar)
79		return (DT_LESSTHAN);
80
81	if (lvar > rvar)
82		return (DT_GREATERTHAN);
83
84	return (0);
85}
86
/*
 * Aggregating action for min():  keep the smaller of the existing value and
 * the new value.  The size argument is not used.
 */
/*ARGSUSED*/
static void
dt_aggregate_min(int64_t *existing, int64_t *new, size_t size)
{
	int64_t candidate = *new;

	if (candidate >= *existing)
		return;

	*existing = candidate;
}
94
/*
 * Aggregating action for max():  keep the larger of the existing value and
 * the new value.  The size argument is not used.
 */
/*ARGSUSED*/
static void
dt_aggregate_max(int64_t *existing, int64_t *new, size_t size)
{
	int64_t candidate = *new;

	if (candidate <= *existing)
		return;

	*existing = candidate;
}
102
103static int
104dt_aggregate_averagecmp(int64_t *lhs, int64_t *rhs)
105{
106	int64_t lavg = lhs[0] ? (lhs[1] / lhs[0]) : 0;
107	int64_t ravg = rhs[0] ? (rhs[1] / rhs[0]) : 0;
108
109	if (lavg < ravg)
110		return (DT_LESSTHAN);
111
112	if (lavg > ravg)
113		return (DT_GREATERTHAN);
114
115	return (0);
116}
117
118static int
119dt_aggregate_stddevcmp(int64_t *lhs, int64_t *rhs)
120{
121	uint64_t lsd = dt_stddev((uint64_t *)lhs, 1);
122	uint64_t rsd = dt_stddev((uint64_t *)rhs, 1);
123
124	if (lsd < rsd)
125		return (DT_LESSTHAN);
126
127	if (lsd > rsd)
128		return (DT_GREATERTHAN);
129
130	return (0);
131}
132
/*
 * Aggregating action for lquantize().  The first word of the value encodes
 * the lquantize() parameters and is left alone; the levels + 2 words that
 * follow it are summed word by word.  The size argument is not used.
 */
/*ARGSUSED*/
static void
dt_aggregate_lquantize(int64_t *existing, int64_t *new, size_t size)
{
	int64_t arg = existing[0];
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	int64_t *dst = &existing[1];
	int64_t *src = &new[1];
	int bucket;

	for (bucket = 0; bucket <= levels + 1; bucket++)
		dst[bucket] += src[bucket];
}
144
/*
 * Compute a weighted sum over an lquantize() value for use in sorting.  The
 * first bucket is weighted by (base - 1), each of the 'levels' buckets that
 * follow by its running base value, and the final bucket by one more than
 * the base value reached after the last step.
 */
static long double
dt_aggregate_lquantizedsum(int64_t *lquanta)
{
	int64_t arg = lquanta[0];
	int64_t *bucket = &lquanta[1];
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	uint16_t i;
	long double total;

	total = (long double)bucket[0] * (long double)(base - 1);

	for (i = 0; i < levels; i++) {
		total += (long double)bucket[i + 1] * (long double)base;
		base += step;
	}

	total += (long double)bucket[levels + 1] * (long double)(base + 1);

	return (total);
}
160
/*
 * Return the weight of the lquantize() bucket whose value is zero, using the
 * same bucket values as dt_aggregate_lquantizedsum().  If no bucket has the
 * value zero, return 0.
 */
static int64_t
dt_aggregate_lquantizedzero(int64_t *lquanta)
{
	int64_t arg = lquanta[0];
	int64_t *bucket = &lquanta[1];
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	uint16_t i;

	/* The first bucket carries the value (base - 1). */
	if (base - 1 == 0)
		return (bucket[0]);

	for (i = 0; i < levels; i++, base += step) {
		if (base == 0)
			return (bucket[i + 1]);
	}

	/* The last bucket carries the value (base + 1). */
	return (base + 1 == 0 ? bucket[levels + 1] : 0);
}
184
185static int
186dt_aggregate_lquantizedcmp(int64_t *lhs, int64_t *rhs)
187{
188	long double lsum = dt_aggregate_lquantizedsum(lhs);
189	long double rsum = dt_aggregate_lquantizedsum(rhs);
190	int64_t lzero, rzero;
191
192	if (lsum < rsum)
193		return (DT_LESSTHAN);
194
195	if (lsum > rsum)
196		return (DT_GREATERTHAN);
197
198	/*
199	 * If they're both equal, then we will compare based on the weights at
200	 * zero.  If the weights at zero are equal (or if zero is not within
201	 * the range of the linear quantization), then this will be judged a
202	 * tie and will be resolved based on the key comparison.
203	 */
204	lzero = dt_aggregate_lquantizedzero(lhs);
205	rzero = dt_aggregate_lquantizedzero(rhs);
206
207	if (lzero < rzero)
208		return (DT_LESSTHAN);
209
210	if (lzero > rzero)
211		return (DT_GREATERTHAN);
212
213	return (0);
214}
215
/*
 * Aggregating action for llquantize().  The first word of the value encodes
 * the llquantize() parameters and is left alone; every remaining 64-bit word
 * of 'new' is added into the corresponding word of 'existing'.  'size' is
 * the size of the whole value in bytes.
 *
 * The index is a size_t so that it is not a signed quantity compared against
 * the unsigned word count.
 */
static void
dt_aggregate_llquantize(int64_t *existing, int64_t *new, size_t size)
{
	size_t nwords = size / sizeof (int64_t);
	size_t i;

	for (i = 1; i < nwords; i++)
		existing[i] += new[i];
}
224
/*
 * Compute a weighted sum over an llquantize() value for use in sorting.  The
 * first bucket is weighted by (factor^low - 1); each subsequent bucket is
 * weighted by a running value that advances by 'step', where the step is
 * recomputed every time the value reaches the next power of the factor.  The
 * final bucket is weighted by the value reached once the order exceeds
 * 'high'.
 */
static long double
dt_aggregate_llquantizedsum(int64_t *llquanta)
{
	int64_t arg = llquanta[0];
	int64_t *bucket = &llquanta[1];
	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
	int64_t value, next, step;
	long double total;
	int order, bin;

	assert(nsteps >= factor);
	assert(nsteps % factor == 0);

	/* Raise the factor to the low order to find the first bucket value. */
	for (value = 1, order = 0; order < low; order++)
		value *= factor;

	total = (long double)bucket[0] * (long double)(value - 1);
	bin = 1;

	next = value * factor;
	step = next > nsteps ? next / nsteps : 1;

	while (order <= high) {
		assert(value < next);
		total += (long double)bucket[bin] * (long double)(value);
		bin++;
		value += step;

		if (value == next) {
			/* Crossed into the next order of magnitude. */
			next = value * factor;
			step = next > nsteps ? next / nsteps : 1;
			order++;
		}
	}

	total += (long double)bucket[bin] * (long double)value;

	return (total);
}
262
263static int
264dt_aggregate_llquantizedcmp(int64_t *lhs, int64_t *rhs)
265{
266	long double lsum = dt_aggregate_llquantizedsum(lhs);
267	long double rsum = dt_aggregate_llquantizedsum(rhs);
268	int64_t lzero, rzero;
269
270	if (lsum < rsum)
271		return (DT_LESSTHAN);
272
273	if (lsum > rsum)
274		return (DT_GREATERTHAN);
275
276	/*
277	 * If they're both equal, then we will compare based on the weights at
278	 * zero.  If the weights at zero are equal, then this will be judged a
279	 * tie and will be resolved based on the key comparison.
280	 */
281	lzero = lhs[1];
282	rzero = rhs[1];
283
284	if (lzero < rzero)
285		return (DT_LESSTHAN);
286
287	if (lzero > rzero)
288		return (DT_GREATERTHAN);
289
290	return (0);
291}
292
293static int
294dt_aggregate_quantizedcmp(int64_t *lhs, int64_t *rhs)
295{
296	int nbuckets = DTRACE_QUANTIZE_NBUCKETS;
297	long double ltotal = 0, rtotal = 0;
298	int64_t lzero, rzero;
299	uint_t i;
300
301	for (i = 0; i < nbuckets; i++) {
302		int64_t bucketval = DTRACE_QUANTIZE_BUCKETVAL(i);
303
304		if (bucketval == 0) {
305			lzero = lhs[i];
306			rzero = rhs[i];
307		}
308
309		ltotal += (long double)bucketval * (long double)lhs[i];
310		rtotal += (long double)bucketval * (long double)rhs[i];
311	}
312
313	if (ltotal < rtotal)
314		return (DT_LESSTHAN);
315
316	if (ltotal > rtotal)
317		return (DT_GREATERTHAN);
318
319	/*
320	 * If they're both equal, then we will compare based on the weights at
321	 * zero.  If the weights at zero are equal, then this will be judged a
322	 * tie and will be resolved based on the key comparison.
323	 */
324	if (lzero < rzero)
325		return (DT_LESSTHAN);
326
327	if (lzero > rzero)
328		return (DT_GREATERTHAN);
329
330	return (0);
331}
332
333static void
334dt_aggregate_usym(dtrace_hdl_t *dtp, uint64_t *data)
335{
336	uint64_t pid = data[0];
337	uint64_t *pc = &data[1];
338	struct ps_prochandle *P;
339	GElf_Sym sym;
340
341	if (dtp->dt_vector != NULL)
342		return;
343
344	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
345		return;
346
347	dt_proc_lock(dtp, P);
348
349	if (Plookup_by_addr(P, *pc, NULL, 0, &sym) == 0)
350		*pc = sym.st_value;
351
352	dt_proc_unlock(dtp, P);
353	dt_proc_release(dtp, P);
354}
355
356static void
357dt_aggregate_umod(dtrace_hdl_t *dtp, uint64_t *data)
358{
359	uint64_t pid = data[0];
360	uint64_t *pc = &data[1];
361	struct ps_prochandle *P;
362	const prmap_t *map;
363
364	if (dtp->dt_vector != NULL)
365		return;
366
367	if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
368		return;
369
370	dt_proc_lock(dtp, P);
371
372	if ((map = Paddr_to_map(P, *pc)) != NULL)
373		*pc = map->pr_vaddr;
374
375	dt_proc_unlock(dtp, P);
376	dt_proc_release(dtp, P);
377}
378
379static void
380dt_aggregate_sym(dtrace_hdl_t *dtp, uint64_t *data)
381{
382	GElf_Sym sym;
383	uint64_t *pc = data;
384
385	if (dtrace_lookup_by_addr(dtp, *pc, &sym, NULL) == 0)
386		*pc = sym.st_value;
387}
388
389static void
390dt_aggregate_mod(dtrace_hdl_t *dtp, uint64_t *data)
391{
392	uint64_t *pc = data;
393	dt_module_t *dmp;
394
395	if (dtp->dt_vector != NULL) {
396		/*
397		 * We don't have a way of just getting the module for a
398		 * vectored open, and it doesn't seem to be worth defining
399		 * one.  This means that use of mod() won't get true
400		 * aggregation in the postmortem case (some modules may
401		 * appear more than once in aggregation output).  It seems
402		 * unlikely that anyone will ever notice or care...
403		 */
404		return;
405	}
406
407	for (dmp = dt_list_next(&dtp->dt_modlist); dmp != NULL;
408	    dmp = dt_list_next(dmp)) {
409		if (*pc - dmp->dm_text_va < dmp->dm_text_size) {
410			*pc = dmp->dm_text_va;
411			return;
412		}
413	}
414}
415
416static dtrace_aggvarid_t
417dt_aggregate_aggvarid(dt_ahashent_t *ent)
418{
419	dtrace_aggdesc_t *agg = ent->dtahe_data.dtada_desc;
420	caddr_t data = ent->dtahe_data.dtada_data;
421	dtrace_recdesc_t *rec = agg->dtagd_rec;
422
423	/*
424	 * First, we'll check the variable ID in the aggdesc.  If it's valid,
425	 * we'll return it.  If not, we'll use the compiler-generated ID
426	 * present as the first record.
427	 */
428	if (agg->dtagd_varid != DTRACE_AGGVARIDNONE)
429		return (agg->dtagd_varid);
430
431	agg->dtagd_varid = *((dtrace_aggvarid_t *)(uintptr_t)(data +
432	    rec->dtrd_offset));
433
434	return (agg->dtagd_varid);
435}
436
437
438static int
439dt_aggregate_snap_cpu(dtrace_hdl_t *dtp, processorid_t cpu)
440{
441	dtrace_epid_t id;
442	uint64_t hashval;
443	size_t offs, roffs, size, ndx;
444	int i, j, rval;
445	caddr_t addr, data;
446	dtrace_recdesc_t *rec;
447	dt_aggregate_t *agp = &dtp->dt_aggregate;
448	dtrace_aggdesc_t *agg;
449	dt_ahash_t *hash = &agp->dtat_hash;
450	dt_ahashent_t *h;
451	dtrace_bufdesc_t b = agp->dtat_buf, *buf = &b;
452	dtrace_aggdata_t *aggdata;
453	int flags = agp->dtat_flags;
454
455	buf->dtbd_cpu = cpu;
456
457#ifdef illumos
458	if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, buf) == -1) {
459#else
460	if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, &buf) == -1) {
461#endif
462		if (errno == ENOENT) {
463			/*
464			 * If that failed with ENOENT, it may be because the
465			 * CPU was unconfigured.  This is okay; we'll just
466			 * do nothing but return success.
467			 */
468			return (0);
469		}
470
471		return (dt_set_errno(dtp, errno));
472	}
473
474	if (buf->dtbd_drops != 0) {
475		xo_open_instance("probes");
476		dt_oformat_drop(dtp, cpu);
477		if (dt_handle_cpudrop(dtp, cpu,
478		    DTRACEDROP_AGGREGATION, buf->dtbd_drops) == -1) {
479			xo_close_instance("probes");
480			return (-1);
481		}
482		xo_close_instance("probes");
483	}
484
485	if (buf->dtbd_size == 0)
486		return (0);
487
488	if (hash->dtah_hash == NULL) {
489		size_t size;
490
491		hash->dtah_size = DTRACE_AHASHSIZE;
492		size = hash->dtah_size * sizeof (dt_ahashent_t *);
493
494		if ((hash->dtah_hash = malloc(size)) == NULL)
495			return (dt_set_errno(dtp, EDT_NOMEM));
496
497		bzero(hash->dtah_hash, size);
498	}
499
500	for (offs = 0; offs < buf->dtbd_size; ) {
501		/*
502		 * We're guaranteed to have an ID.
503		 */
504		id = *((dtrace_epid_t *)((uintptr_t)buf->dtbd_data +
505		    (uintptr_t)offs));
506
507		if (id == DTRACE_AGGIDNONE) {
508			/*
509			 * This is filler to assure proper alignment of the
510			 * next record; we simply ignore it.
511			 */
512			offs += sizeof (id);
513			continue;
514		}
515
516		if ((rval = dt_aggid_lookup(dtp, id, &agg)) != 0)
517			return (rval);
518
519		addr = buf->dtbd_data + offs;
520		size = agg->dtagd_size;
521		hashval = 0;
522
523		for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
524			rec = &agg->dtagd_rec[j];
525			roffs = rec->dtrd_offset;
526
527			switch (rec->dtrd_action) {
528			case DTRACEACT_USYM:
529				dt_aggregate_usym(dtp,
530				    /* LINTED - alignment */
531				    (uint64_t *)&addr[roffs]);
532				break;
533
534			case DTRACEACT_UMOD:
535				dt_aggregate_umod(dtp,
536				    /* LINTED - alignment */
537				    (uint64_t *)&addr[roffs]);
538				break;
539
540			case DTRACEACT_SYM:
541				/* LINTED - alignment */
542				dt_aggregate_sym(dtp, (uint64_t *)&addr[roffs]);
543				break;
544
545			case DTRACEACT_MOD:
546				/* LINTED - alignment */
547				dt_aggregate_mod(dtp, (uint64_t *)&addr[roffs]);
548				break;
549
550			default:
551				break;
552			}
553
554			for (i = 0; i < rec->dtrd_size; i++)
555				hashval += addr[roffs + i];
556		}
557
558		ndx = hashval % hash->dtah_size;
559
560		for (h = hash->dtah_hash[ndx]; h != NULL; h = h->dtahe_next) {
561			if (h->dtahe_hashval != hashval)
562				continue;
563
564			if (h->dtahe_size != size)
565				continue;
566
567			aggdata = &h->dtahe_data;
568			data = aggdata->dtada_data;
569
570			for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
571				rec = &agg->dtagd_rec[j];
572				roffs = rec->dtrd_offset;
573
574				for (i = 0; i < rec->dtrd_size; i++)
575					if (addr[roffs + i] != data[roffs + i])
576						goto hashnext;
577			}
578
579			/*
580			 * We found it.  Now we need to apply the aggregating
581			 * action on the data here.
582			 */
583			rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
584			roffs = rec->dtrd_offset;
585			/* LINTED - alignment */
586			h->dtahe_aggregate((int64_t *)&data[roffs],
587			    /* LINTED - alignment */
588			    (int64_t *)&addr[roffs], rec->dtrd_size);
589
590			/*
591			 * If we're keeping per CPU data, apply the aggregating
592			 * action there as well.
593			 */
594			if (aggdata->dtada_percpu != NULL) {
595				data = aggdata->dtada_percpu[cpu];
596
597				/* LINTED - alignment */
598				h->dtahe_aggregate((int64_t *)data,
599				    /* LINTED - alignment */
600				    (int64_t *)&addr[roffs], rec->dtrd_size);
601			}
602
603			goto bufnext;
604hashnext:
605			continue;
606		}
607
608		/*
609		 * If we're here, we couldn't find an entry for this record.
610		 */
611		if ((h = malloc(sizeof (dt_ahashent_t))) == NULL)
612			return (dt_set_errno(dtp, EDT_NOMEM));
613		bzero(h, sizeof (dt_ahashent_t));
614		aggdata = &h->dtahe_data;
615
616		if ((aggdata->dtada_data = malloc(size)) == NULL) {
617			free(h);
618			return (dt_set_errno(dtp, EDT_NOMEM));
619		}
620
621		bcopy(addr, aggdata->dtada_data, size);
622		aggdata->dtada_size = size;
623		aggdata->dtada_desc = agg;
624		aggdata->dtada_handle = dtp;
625		(void) dt_epid_lookup(dtp, agg->dtagd_epid,
626		    &aggdata->dtada_edesc, &aggdata->dtada_pdesc);
627		aggdata->dtada_normal = 1;
628
629		h->dtahe_hashval = hashval;
630		h->dtahe_size = size;
631		(void) dt_aggregate_aggvarid(h);
632
633		rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
634
635		if (flags & DTRACE_A_PERCPU) {
636			int max_cpus = agp->dtat_maxcpu;
637			caddr_t *percpu = malloc(max_cpus * sizeof (caddr_t));
638
639			if (percpu == NULL) {
640				free(aggdata->dtada_data);
641				free(h);
642				return (dt_set_errno(dtp, EDT_NOMEM));
643			}
644
645			for (j = 0; j < max_cpus; j++) {
646				percpu[j] = malloc(rec->dtrd_size);
647
648				if (percpu[j] == NULL) {
649					while (--j >= 0)
650						free(percpu[j]);
651
652					free(aggdata->dtada_data);
653					free(h);
654					return (dt_set_errno(dtp, EDT_NOMEM));
655				}
656
657				if (j == cpu) {
658					bcopy(&addr[rec->dtrd_offset],
659					    percpu[j], rec->dtrd_size);
660				} else {
661					bzero(percpu[j], rec->dtrd_size);
662				}
663			}
664
665			aggdata->dtada_percpu = percpu;
666		}
667
668		switch (rec->dtrd_action) {
669		case DTRACEAGG_MIN:
670			h->dtahe_aggregate = dt_aggregate_min;
671			break;
672
673		case DTRACEAGG_MAX:
674			h->dtahe_aggregate = dt_aggregate_max;
675			break;
676
677		case DTRACEAGG_LQUANTIZE:
678			h->dtahe_aggregate = dt_aggregate_lquantize;
679			break;
680
681		case DTRACEAGG_LLQUANTIZE:
682			h->dtahe_aggregate = dt_aggregate_llquantize;
683			break;
684
685		case DTRACEAGG_COUNT:
686		case DTRACEAGG_SUM:
687		case DTRACEAGG_AVG:
688		case DTRACEAGG_STDDEV:
689		case DTRACEAGG_QUANTIZE:
690			h->dtahe_aggregate = dt_aggregate_count;
691			break;
692
693		default:
694			return (dt_set_errno(dtp, EDT_BADAGG));
695		}
696
697		if (hash->dtah_hash[ndx] != NULL)
698			hash->dtah_hash[ndx]->dtahe_prev = h;
699
700		h->dtahe_next = hash->dtah_hash[ndx];
701		hash->dtah_hash[ndx] = h;
702
703		if (hash->dtah_all != NULL)
704			hash->dtah_all->dtahe_prevall = h;
705
706		h->dtahe_nextall = hash->dtah_all;
707		hash->dtah_all = h;
708bufnext:
709		offs += agg->dtagd_size;
710	}
711
712	return (0);
713}
714
715int
716dtrace_aggregate_snap(dtrace_hdl_t *dtp)
717{
718	int i, rval;
719	dt_aggregate_t *agp = &dtp->dt_aggregate;
720	hrtime_t now = gethrtime();
721	dtrace_optval_t interval = dtp->dt_options[DTRACEOPT_AGGRATE];
722
723	if (dtp->dt_lastagg != 0) {
724		if (now - dtp->dt_lastagg < interval)
725			return (0);
726
727		dtp->dt_lastagg += interval;
728	} else {
729		dtp->dt_lastagg = now;
730	}
731
732	if (!dtp->dt_active)
733		return (dt_set_errno(dtp, EINVAL));
734
735	if (agp->dtat_buf.dtbd_size == 0)
736		return (0);
737
738	for (i = 0; i < agp->dtat_ncpus; i++) {
739		if ((rval = dt_aggregate_snap_cpu(dtp, agp->dtat_cpus[i])))
740			return (rval);
741	}
742
743	return (0);
744}
745
746static int
747dt_aggregate_hashcmp(const void *lhs, const void *rhs)
748{
749	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
750	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
751	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
752	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
753
754	if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
755		return (DT_LESSTHAN);
756
757	if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
758		return (DT_GREATERTHAN);
759
760	return (0);
761}
762
763static int
764dt_aggregate_varcmp(const void *lhs, const void *rhs)
765{
766	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
767	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
768	dtrace_aggvarid_t lid, rid;
769
770	lid = dt_aggregate_aggvarid(lh);
771	rid = dt_aggregate_aggvarid(rh);
772
773	if (lid < rid)
774		return (DT_LESSTHAN);
775
776	if (lid > rid)
777		return (DT_GREATERTHAN);
778
779	return (0);
780}
781
782static int
783dt_aggregate_keycmp(const void *lhs, const void *rhs)
784{
785	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
786	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
787	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
788	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
789	dtrace_recdesc_t *lrec, *rrec;
790	char *ldata, *rdata;
791	int rval, i, j, keypos, nrecs;
792
793	if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
794		return (rval);
795
796	nrecs = lagg->dtagd_nrecs - 1;
797	assert(nrecs == ragg->dtagd_nrecs - 1);
798
799	keypos = dt_keypos + 1 >= nrecs ? 0 : dt_keypos;
800
801	for (i = 1; i < nrecs; i++) {
802		uint64_t lval, rval;
803		int ndx = i + keypos;
804
805		if (ndx >= nrecs)
806			ndx = ndx - nrecs + 1;
807
808		lrec = &lagg->dtagd_rec[ndx];
809		rrec = &ragg->dtagd_rec[ndx];
810
811		ldata = lh->dtahe_data.dtada_data + lrec->dtrd_offset;
812		rdata = rh->dtahe_data.dtada_data + rrec->dtrd_offset;
813
814		if (lrec->dtrd_size < rrec->dtrd_size)
815			return (DT_LESSTHAN);
816
817		if (lrec->dtrd_size > rrec->dtrd_size)
818			return (DT_GREATERTHAN);
819
820		switch (lrec->dtrd_size) {
821		case sizeof (uint64_t):
822			/* LINTED - alignment */
823			lval = *((uint64_t *)ldata);
824			/* LINTED - alignment */
825			rval = *((uint64_t *)rdata);
826			break;
827
828		case sizeof (uint32_t):
829			/* LINTED - alignment */
830			lval = *((uint32_t *)ldata);
831			/* LINTED - alignment */
832			rval = *((uint32_t *)rdata);
833			break;
834
835		case sizeof (uint16_t):
836			/* LINTED - alignment */
837			lval = *((uint16_t *)ldata);
838			/* LINTED - alignment */
839			rval = *((uint16_t *)rdata);
840			break;
841
842		case sizeof (uint8_t):
843			lval = *((uint8_t *)ldata);
844			rval = *((uint8_t *)rdata);
845			break;
846
847		default:
848			switch (lrec->dtrd_action) {
849			case DTRACEACT_UMOD:
850			case DTRACEACT_UADDR:
851			case DTRACEACT_USYM:
852				for (j = 0; j < 2; j++) {
853					/* LINTED - alignment */
854					lval = ((uint64_t *)ldata)[j];
855					/* LINTED - alignment */
856					rval = ((uint64_t *)rdata)[j];
857
858					if (lval < rval)
859						return (DT_LESSTHAN);
860
861					if (lval > rval)
862						return (DT_GREATERTHAN);
863				}
864
865				break;
866
867			default:
868				for (j = 0; j < lrec->dtrd_size; j++) {
869					lval = ((uint8_t *)ldata)[j];
870					rval = ((uint8_t *)rdata)[j];
871
872					if (lval < rval)
873						return (DT_LESSTHAN);
874
875					if (lval > rval)
876						return (DT_GREATERTHAN);
877				}
878			}
879
880			continue;
881		}
882
883		if (lval < rval)
884			return (DT_LESSTHAN);
885
886		if (lval > rval)
887			return (DT_GREATERTHAN);
888	}
889
890	return (0);
891}
892
893static int
894dt_aggregate_valcmp(const void *lhs, const void *rhs)
895{
896	dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
897	dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
898	dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
899	dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
900	caddr_t ldata = lh->dtahe_data.dtada_data;
901	caddr_t rdata = rh->dtahe_data.dtada_data;
902	dtrace_recdesc_t *lrec, *rrec;
903	int64_t *laddr, *raddr;
904	int rval;
905
906	assert(lagg->dtagd_nrecs == ragg->dtagd_nrecs);
907
908	lrec = &lagg->dtagd_rec[lagg->dtagd_nrecs - 1];
909	rrec = &ragg->dtagd_rec[ragg->dtagd_nrecs - 1];
910
911	assert(lrec->dtrd_action == rrec->dtrd_action);
912
913	laddr = (int64_t *)(uintptr_t)(ldata + lrec->dtrd_offset);
914	raddr = (int64_t *)(uintptr_t)(rdata + rrec->dtrd_offset);
915
916	switch (lrec->dtrd_action) {
917	case DTRACEAGG_AVG:
918		rval = dt_aggregate_averagecmp(laddr, raddr);
919		break;
920
921	case DTRACEAGG_STDDEV:
922		rval = dt_aggregate_stddevcmp(laddr, raddr);
923		break;
924
925	case DTRACEAGG_QUANTIZE:
926		rval = dt_aggregate_quantizedcmp(laddr, raddr);
927		break;
928
929	case DTRACEAGG_LQUANTIZE:
930		rval = dt_aggregate_lquantizedcmp(laddr, raddr);
931		break;
932
933	case DTRACEAGG_LLQUANTIZE:
934		rval = dt_aggregate_llquantizedcmp(laddr, raddr);
935		break;
936
937	case DTRACEAGG_COUNT:
938	case DTRACEAGG_SUM:
939	case DTRACEAGG_MIN:
940	case DTRACEAGG_MAX:
941		rval = dt_aggregate_countcmp(laddr, raddr);
942		break;
943
944	default:
945		assert(0);
946	}
947
948	return (rval);
949}
950
/*
 * qsort(3C) comparison routine:  order by value, then by key.
 */
static int
dt_aggregate_valkeycmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_valcmp(lhs, rhs);

	/*
	 * Equal values leave the two aggregation elements tied.  The key
	 * layout is already known to be the same for both elements, so the
	 * keys themselves serve as the tie-breaker.
	 */
	if (rval == 0)
		rval = dt_aggregate_keycmp(lhs, rhs);

	return (rval);
}
966
/*
 * qsort(3C) comparison routine:  order by key, then by aggregation variable
 * ID.
 */
static int
dt_aggregate_keyvarcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_keycmp(lhs, rhs);

	if (rval == 0)
		rval = dt_aggregate_varcmp(lhs, rhs);

	return (rval);
}
977
/*
 * qsort(3C) comparison routine:  order by aggregation variable ID, then by
 * key.
 */
static int
dt_aggregate_varkeycmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_varcmp(lhs, rhs);

	if (rval == 0)
		rval = dt_aggregate_keycmp(lhs, rhs);

	return (rval);
}
988
/*
 * qsort(3C) comparison routine:  order by value (with the key as the
 * tie-breaker), then by aggregation variable ID.
 */
static int
dt_aggregate_valvarcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_valkeycmp(lhs, rhs);

	if (rval == 0)
		rval = dt_aggregate_varcmp(lhs, rhs);

	return (rval);
}
999
/*
 * qsort(3C) comparison routine:  order by aggregation variable ID, then by
 * value (with the key as the tie-breaker).
 */
static int
dt_aggregate_varvalcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_varcmp(lhs, rhs);

	if (rval == 0)
		rval = dt_aggregate_valkeycmp(lhs, rhs);

	return (rval);
}
1010
/*
 * Reverse of dt_aggregate_keyvarcmp():  the operands are swapped.
 */
static int
dt_aggregate_keyvarrevcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_keyvarcmp(rhs, lhs);

	return (rval);
}
1016
/*
 * Reverse of dt_aggregate_varkeycmp():  the operands are swapped.
 */
static int
dt_aggregate_varkeyrevcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_varkeycmp(rhs, lhs);

	return (rval);
}
1022
/*
 * Reverse of dt_aggregate_valvarcmp():  the operands are swapped.
 */
static int
dt_aggregate_valvarrevcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_valvarcmp(rhs, lhs);

	return (rval);
}
1028
/*
 * Reverse of dt_aggregate_varvalcmp():  the operands are swapped.
 */
static int
dt_aggregate_varvalrevcmp(const void *lhs, const void *rhs)
{
	int rval = dt_aggregate_varvalcmp(rhs, lhs);

	return (rval);
}
1034
1035static int
1036dt_aggregate_bundlecmp(const void *lhs, const void *rhs)
1037{
1038	dt_ahashent_t **lh = *((dt_ahashent_t ***)lhs);
1039	dt_ahashent_t **rh = *((dt_ahashent_t ***)rhs);
1040	int i, rval;
1041
1042	if (dt_keysort) {
1043		/*
1044		 * If we're sorting on keys, we need to scan until we find the
1045		 * last entry -- that's the representative key.  (The order of
1046		 * the bundle is values followed by key to accommodate the
1047		 * default behavior of sorting by value.)  If the keys are
1048		 * equal, we'll fall into the value comparison loop, below.
1049		 */
1050		for (i = 0; lh[i + 1] != NULL; i++)
1051			continue;
1052
1053		assert(i != 0);
1054		assert(rh[i + 1] == NULL);
1055
1056		if ((rval = dt_aggregate_keycmp(&lh[i], &rh[i])) != 0)
1057			return (rval);
1058	}
1059
1060	for (i = 0; ; i++) {
1061		if (lh[i + 1] == NULL) {
1062			/*
1063			 * All of the values are equal; if we're sorting on
1064			 * keys, then we're only here because the keys were
1065			 * found to be equal and these records are therefore
1066			 * equal.  If we're not sorting on keys, we'll use the
1067			 * key comparison from the representative key as the
1068			 * tie-breaker.
1069			 */
1070			if (dt_keysort)
1071				return (0);
1072
1073			assert(i != 0);
1074			assert(rh[i + 1] == NULL);
1075			return (dt_aggregate_keycmp(&lh[i], &rh[i]));
1076		} else {
1077			if ((rval = dt_aggregate_valcmp(&lh[i], &rh[i])) != 0)
1078				return (rval);
1079		}
1080	}
1081}
1082
1083int
1084dt_aggregate_go(dtrace_hdl_t *dtp)
1085{
1086	dt_aggregate_t *agp = &dtp->dt_aggregate;
1087	dtrace_optval_t size, cpu;
1088	dtrace_bufdesc_t *buf = &agp->dtat_buf;
1089	int rval, i;
1090
1091	assert(agp->dtat_maxcpu == 0);
1092	assert(agp->dtat_ncpu == 0);
1093	assert(agp->dtat_cpus == NULL);
1094
1095	agp->dtat_maxcpu = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
1096	agp->dtat_ncpu = dt_sysconf(dtp, _SC_NPROCESSORS_MAX);
1097	agp->dtat_cpus = malloc(agp->dtat_ncpu * sizeof (processorid_t));
1098
1099	if (agp->dtat_cpus == NULL)
1100		return (dt_set_errno(dtp, EDT_NOMEM));
1101
1102	/*
1103	 * Use the aggregation buffer size as reloaded from the kernel.
1104	 */
1105	size = dtp->dt_options[DTRACEOPT_AGGSIZE];
1106
1107	rval = dtrace_getopt(dtp, "aggsize", &size);
1108	assert(rval == 0);
1109
1110	if (size == 0 || size == DTRACEOPT_UNSET)
1111		return (0);
1112
1113	buf = &agp->dtat_buf;
1114	buf->dtbd_size = size;
1115
1116	if ((buf->dtbd_data = malloc(buf->dtbd_size)) == NULL)
1117		return (dt_set_errno(dtp, EDT_NOMEM));
1118
1119	/*
1120	 * Now query for the CPUs enabled.
1121	 */
1122	rval = dtrace_getopt(dtp, "cpu", &cpu);
1123	assert(rval == 0 && cpu != DTRACEOPT_UNSET);
1124
1125	if (cpu != DTRACE_CPUALL) {
1126		assert(cpu < agp->dtat_ncpu);
1127		agp->dtat_cpus[agp->dtat_ncpus++] = (processorid_t)cpu;
1128
1129		return (0);
1130	}
1131
1132	agp->dtat_ncpus = 0;
1133	for (i = 0; i < agp->dtat_maxcpu; i++) {
1134		if (dt_status(dtp, i) == -1)
1135			continue;
1136
1137		agp->dtat_cpus[agp->dtat_ncpus++] = i;
1138	}
1139
1140	return (0);
1141}
1142
/*
 * Act on the value returned by an aggregation walk callback for the hash
 * entry h:
 *
 *	DTRACE_AGGWALK_NEXT		nothing to do
 *	DTRACE_AGGWALK_CLEAR		zero the aggregation value, including
 *					any per-CPU copies
 *	DTRACE_AGGWALK_ERROR		fail with the current errno
 *	DTRACE_AGGWALK_ABORT		fail with EDT_DIRABORT
 *	DTRACE_AGGWALK_DENORMALIZE	reset the normalization factor to 1
 *	DTRACE_AGGWALK_NORMALIZE	reject a normalization factor of 0
 *	DTRACE_AGGWALK_REMOVE		unlink the entry and free it
 *
 * Any other value fails with EDT_BADRVAL.  Returns 0 on success and the
 * result of dt_set_errno() on failure.  After DTRACE_AGGWALK_REMOVE, h has
 * been freed and must not be used by the caller.
 */
static int
dt_aggwalk_rval(dtrace_hdl_t *dtp, dt_ahashent_t *h, int rval)
{
	dt_aggregate_t *agp = &dtp->dt_aggregate;
	dtrace_aggdata_t *data;
	dtrace_aggdesc_t *aggdesc;
	dtrace_recdesc_t *rec;
	int i;

	switch (rval) {
	case DTRACE_AGGWALK_NEXT:
		break;

	case DTRACE_AGGWALK_CLEAR: {
		uint32_t size, offs = 0;

		/*
		 * The aggregation value is the last record of the entry.
		 */
		aggdesc = h->dtahe_data.dtada_desc;
		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
		size = rec->dtrd_size;
		data = &h->dtahe_data;

		/*
		 * The first 64-bit word of an lquantize() value encodes its
		 * parameters rather than a count, so it must survive the
		 * clear.
		 *
		 * NOTE(review): llquantize() values also begin with a
		 * parameter word (dt_aggregate_llquantize() skips it), but
		 * only DTRACEAGG_LQUANTIZE is special-cased here -- confirm
		 * whether that is intended.
		 */
		if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
			offs = sizeof (uint64_t);
			size -= sizeof (uint64_t);
		}

		bzero(&data->dtada_data[rec->dtrd_offset] + offs, size);

		if (data->dtada_percpu == NULL)
			break;

		/*
		 * Each per-CPU buffer holds only the aggregation value, so
		 * no record offset is applied here.
		 */
		for (i = 0; i < dtp->dt_aggregate.dtat_maxcpu; i++)
			bzero(data->dtada_percpu[i] + offs, size);
		break;
	}

	case DTRACE_AGGWALK_ERROR:
		/*
		 * We assume that errno is already set in this case.
		 */
		return (dt_set_errno(dtp, errno));

	case DTRACE_AGGWALK_ABORT:
		return (dt_set_errno(dtp, EDT_DIRABORT));

	case DTRACE_AGGWALK_DENORMALIZE:
		h->dtahe_data.dtada_normal = 1;
		return (0);

	case DTRACE_AGGWALK_NORMALIZE:
		/*
		 * A normalization factor of zero is rejected; the factor is
		 * put back to 1 before the error is reported.
		 */
		if (h->dtahe_data.dtada_normal == 0) {
			h->dtahe_data.dtada_normal = 1;
			return (dt_set_errno(dtp, EDT_BADRVAL));
		}

		return (0);

	case DTRACE_AGGWALK_REMOVE: {
		dtrace_aggdata_t *aggdata = &h->dtahe_data;
		int max_cpus = agp->dtat_maxcpu;

		/*
		 * First, remove this hash entry from its hash chain.
		 */
		if (h->dtahe_prev != NULL) {
			h->dtahe_prev->dtahe_next = h->dtahe_next;
		} else {
			/*
			 * No predecessor:  the entry heads its bucket.
			 */
			dt_ahash_t *hash = &agp->dtat_hash;
			size_t ndx = h->dtahe_hashval % hash->dtah_size;

			assert(hash->dtah_hash[ndx] == h);
			hash->dtah_hash[ndx] = h->dtahe_next;
		}

		if (h->dtahe_next != NULL)
			h->dtahe_next->dtahe_prev = h->dtahe_prev;

		/*
		 * Now remove it from the list of all hash entries.
		 */
		if (h->dtahe_prevall != NULL) {
			h->dtahe_prevall->dtahe_nextall = h->dtahe_nextall;
		} else {
			dt_ahash_t *hash = &agp->dtat_hash;

			assert(hash->dtah_all == h);
			hash->dtah_all = h->dtahe_nextall;
		}

		if (h->dtahe_nextall != NULL)
			h->dtahe_nextall->dtahe_prevall = h->dtahe_prevall;

		/*
		 * We're unlinked.  We can safely destroy the data.
		 */
		if (aggdata->dtada_percpu != NULL) {
			for (i = 0; i < max_cpus; i++)
				free(aggdata->dtada_percpu[i]);
			free(aggdata->dtada_percpu);
		}

		free(aggdata->dtada_data);
		free(h);

		return (0);
	}

	default:
		return (dt_set_errno(dtp, EDT_BADRVAL));
	}

	return (0);
}
1256
1257void
1258dt_aggregate_qsort(dtrace_hdl_t *dtp, void *base, size_t nel, size_t width,
1259    int (*compar)(const void *, const void *))
1260{
1261	int rev = dt_revsort, key = dt_keysort, keypos = dt_keypos;
1262	dtrace_optval_t keyposopt = dtp->dt_options[DTRACEOPT_AGGSORTKEYPOS];
1263
1264	dt_revsort = (dtp->dt_options[DTRACEOPT_AGGSORTREV] != DTRACEOPT_UNSET);
1265	dt_keysort = (dtp->dt_options[DTRACEOPT_AGGSORTKEY] != DTRACEOPT_UNSET);
1266
1267	if (keyposopt != DTRACEOPT_UNSET && keyposopt <= INT_MAX) {
1268		dt_keypos = (int)keyposopt;
1269	} else {
1270		dt_keypos = 0;
1271	}
1272
1273	if (compar == NULL) {
1274		if (!dt_keysort) {
1275			compar = dt_aggregate_varvalcmp;
1276		} else {
1277			compar = dt_aggregate_varkeycmp;
1278		}
1279	}
1280
1281	qsort(base, nel, width, compar);
1282
1283	dt_revsort = rev;
1284	dt_keysort = key;
1285	dt_keypos = keypos;
1286}
1287
1288int
1289dtrace_aggregate_walk(dtrace_hdl_t *dtp, dtrace_aggregate_f *func, void *arg)
1290{
1291	dt_ahashent_t *h, *next;
1292	dt_ahash_t *hash = &dtp->dt_aggregate.dtat_hash;
1293
1294	for (h = hash->dtah_all; h != NULL; h = next) {
1295		/*
1296		 * dt_aggwalk_rval() can potentially remove the current hash
1297		 * entry; we need to load the next hash entry before calling
1298		 * into it.
1299		 */
1300		next = h->dtahe_nextall;
1301
1302		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
1303			return (-1);
1304	}
1305
1306	return (0);
1307}
1308
1309static int
1310dt_aggregate_total(dtrace_hdl_t *dtp, boolean_t clear)
1311{
1312	dt_ahashent_t *h;
1313	dtrace_aggdata_t **total;
1314	dtrace_aggid_t max = DTRACE_AGGVARIDNONE, id;
1315	dt_aggregate_t *agp = &dtp->dt_aggregate;
1316	dt_ahash_t *hash = &agp->dtat_hash;
1317	uint32_t tflags;
1318
1319	tflags = DTRACE_A_TOTAL | DTRACE_A_HASNEGATIVES | DTRACE_A_HASPOSITIVES;
1320
1321	/*
1322	 * If we need to deliver per-aggregation totals, we're going to take
1323	 * three passes over the aggregate:  one to clear everything out and
1324	 * determine our maximum aggregation ID, one to actually total
1325	 * everything up, and a final pass to assign the totals to the
1326	 * individual elements.
1327	 */
1328	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1329		dtrace_aggdata_t *aggdata = &h->dtahe_data;
1330
1331		if ((id = dt_aggregate_aggvarid(h)) > max)
1332			max = id;
1333
1334		aggdata->dtada_total = 0;
1335		aggdata->dtada_flags &= ~tflags;
1336	}
1337
1338	if (clear || max == DTRACE_AGGVARIDNONE)
1339		return (0);
1340
1341	total = dt_zalloc(dtp, (max + 1) * sizeof (dtrace_aggdata_t *));
1342
1343	if (total == NULL)
1344		return (-1);
1345
1346	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1347		dtrace_aggdata_t *aggdata = &h->dtahe_data;
1348		dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1349		dtrace_recdesc_t *rec;
1350		caddr_t data;
1351		int64_t val, *addr;
1352
1353		rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
1354		data = aggdata->dtada_data;
1355		addr = (int64_t *)(uintptr_t)(data + rec->dtrd_offset);
1356
1357		switch (rec->dtrd_action) {
1358		case DTRACEAGG_STDDEV:
1359			val = dt_stddev((uint64_t *)addr, 1);
1360			break;
1361
1362		case DTRACEAGG_SUM:
1363		case DTRACEAGG_COUNT:
1364			val = *addr;
1365			break;
1366
1367		case DTRACEAGG_AVG:
1368			val = addr[0] ? (addr[1] / addr[0]) : 0;
1369			break;
1370
1371		default:
1372			continue;
1373		}
1374
1375		if (total[agg->dtagd_varid] == NULL) {
1376			total[agg->dtagd_varid] = aggdata;
1377			aggdata->dtada_flags |= DTRACE_A_TOTAL;
1378		} else {
1379			aggdata = total[agg->dtagd_varid];
1380		}
1381
1382		if (val > 0)
1383			aggdata->dtada_flags |= DTRACE_A_HASPOSITIVES;
1384
1385		if (val < 0) {
1386			aggdata->dtada_flags |= DTRACE_A_HASNEGATIVES;
1387			val = -val;
1388		}
1389
1390		if (dtp->dt_options[DTRACEOPT_AGGZOOM] != DTRACEOPT_UNSET) {
1391			val = (int64_t)((long double)val *
1392			    (1 / DTRACE_AGGZOOM_MAX));
1393
1394			if (val > aggdata->dtada_total)
1395				aggdata->dtada_total = val;
1396		} else {
1397			aggdata->dtada_total += val;
1398		}
1399	}
1400
1401	/*
1402	 * And now one final pass to set everyone's total.
1403	 */
1404	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1405		dtrace_aggdata_t *aggdata = &h->dtahe_data, *t;
1406		dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1407
1408		if ((t = total[agg->dtagd_varid]) == NULL || aggdata == t)
1409			continue;
1410
1411		aggdata->dtada_total = t->dtada_total;
1412		aggdata->dtada_flags |= (t->dtada_flags & tflags);
1413	}
1414
1415	dt_free(dtp, total);
1416
1417	return (0);
1418}
1419
1420static int
1421dt_aggregate_minmaxbin(dtrace_hdl_t *dtp, boolean_t clear)
1422{
1423	dt_ahashent_t *h;
1424	dtrace_aggdata_t **minmax;
1425	dtrace_aggid_t max = DTRACE_AGGVARIDNONE, id;
1426	dt_aggregate_t *agp = &dtp->dt_aggregate;
1427	dt_ahash_t *hash = &agp->dtat_hash;
1428
1429	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1430		dtrace_aggdata_t *aggdata = &h->dtahe_data;
1431
1432		if ((id = dt_aggregate_aggvarid(h)) > max)
1433			max = id;
1434
1435		aggdata->dtada_minbin = 0;
1436		aggdata->dtada_maxbin = 0;
1437		aggdata->dtada_flags &= ~DTRACE_A_MINMAXBIN;
1438	}
1439
1440	if (clear || max == DTRACE_AGGVARIDNONE)
1441		return (0);
1442
1443	minmax = dt_zalloc(dtp, (max + 1) * sizeof (dtrace_aggdata_t *));
1444
1445	if (minmax == NULL)
1446		return (-1);
1447
1448	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1449		dtrace_aggdata_t *aggdata = &h->dtahe_data;
1450		dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1451		dtrace_recdesc_t *rec;
1452		caddr_t data;
1453		int64_t *addr;
1454		int minbin = -1, maxbin = -1, i;
1455		int start = 0, size;
1456
1457		rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
1458		size = rec->dtrd_size / sizeof (int64_t);
1459		data = aggdata->dtada_data;
1460		addr = (int64_t *)(uintptr_t)(data + rec->dtrd_offset);
1461
1462		switch (rec->dtrd_action) {
1463		case DTRACEAGG_LQUANTIZE:
1464			/*
1465			 * For lquantize(), we always display the entire range
1466			 * of the aggregation when aggpack is set.
1467			 */
1468			start = 1;
1469			minbin = start;
1470			maxbin = size - 1 - start;
1471			break;
1472
1473		case DTRACEAGG_QUANTIZE:
1474			for (i = start; i < size; i++) {
1475				if (!addr[i])
1476					continue;
1477
1478				if (minbin == -1)
1479					minbin = i - start;
1480
1481				maxbin = i - start;
1482			}
1483
1484			if (minbin == -1) {
1485				/*
1486				 * If we have no data (e.g., due to a clear()
1487				 * or negative increments), we'll use the
1488				 * zero bucket as both our min and max.
1489				 */
1490				minbin = maxbin = DTRACE_QUANTIZE_ZEROBUCKET;
1491			}
1492
1493			break;
1494
1495		default:
1496			continue;
1497		}
1498
1499		if (minmax[agg->dtagd_varid] == NULL) {
1500			minmax[agg->dtagd_varid] = aggdata;
1501			aggdata->dtada_flags |= DTRACE_A_MINMAXBIN;
1502			aggdata->dtada_minbin = minbin;
1503			aggdata->dtada_maxbin = maxbin;
1504			continue;
1505		}
1506
1507		if (minbin < minmax[agg->dtagd_varid]->dtada_minbin)
1508			minmax[agg->dtagd_varid]->dtada_minbin = minbin;
1509
1510		if (maxbin > minmax[agg->dtagd_varid]->dtada_maxbin)
1511			minmax[agg->dtagd_varid]->dtada_maxbin = maxbin;
1512	}
1513
1514	/*
1515	 * And now one final pass to set everyone's minbin and maxbin.
1516	 */
1517	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1518		dtrace_aggdata_t *aggdata = &h->dtahe_data, *mm;
1519		dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1520
1521		if ((mm = minmax[agg->dtagd_varid]) == NULL || aggdata == mm)
1522			continue;
1523
1524		aggdata->dtada_minbin = mm->dtada_minbin;
1525		aggdata->dtada_maxbin = mm->dtada_maxbin;
1526		aggdata->dtada_flags |= DTRACE_A_MINMAXBIN;
1527	}
1528
1529	dt_free(dtp, minmax);
1530
1531	return (0);
1532}
1533
1534static int
1535dt_aggregate_walk_sorted(dtrace_hdl_t *dtp,
1536    dtrace_aggregate_f *func, void *arg,
1537    int (*sfunc)(const void *, const void *))
1538{
1539	dt_aggregate_t *agp = &dtp->dt_aggregate;
1540	dt_ahashent_t *h, **sorted;
1541	dt_ahash_t *hash = &agp->dtat_hash;
1542	size_t i, nentries = 0;
1543	int rval = -1;
1544
1545	agp->dtat_flags &= ~(DTRACE_A_TOTAL | DTRACE_A_MINMAXBIN);
1546
1547	if (dtp->dt_options[DTRACEOPT_AGGHIST] != DTRACEOPT_UNSET) {
1548		agp->dtat_flags |= DTRACE_A_TOTAL;
1549
1550		if (dt_aggregate_total(dtp, B_FALSE) != 0)
1551			return (-1);
1552	}
1553
1554	if (dtp->dt_options[DTRACEOPT_AGGPACK] != DTRACEOPT_UNSET) {
1555		agp->dtat_flags |= DTRACE_A_MINMAXBIN;
1556
1557		if (dt_aggregate_minmaxbin(dtp, B_FALSE) != 0)
1558			return (-1);
1559	}
1560
1561	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall)
1562		nentries++;
1563
1564	sorted = dt_alloc(dtp, nentries * sizeof (dt_ahashent_t *));
1565
1566	if (sorted == NULL)
1567		goto out;
1568
1569	for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall)
1570		sorted[i++] = h;
1571
1572	(void) pthread_mutex_lock(&dt_qsort_lock);
1573
1574	if (sfunc == NULL) {
1575		dt_aggregate_qsort(dtp, sorted, nentries,
1576		    sizeof (dt_ahashent_t *), NULL);
1577	} else {
1578		/*
1579		 * If we've been explicitly passed a sorting function,
1580		 * we'll use that -- ignoring the values of the "aggsortrev",
1581		 * "aggsortkey" and "aggsortkeypos" options.
1582		 */
1583		qsort(sorted, nentries, sizeof (dt_ahashent_t *), sfunc);
1584	}
1585
1586	(void) pthread_mutex_unlock(&dt_qsort_lock);
1587
1588	for (i = 0; i < nentries; i++) {
1589		h = sorted[i];
1590
1591		if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
1592			goto out;
1593	}
1594
1595	rval = 0;
1596out:
1597	if (agp->dtat_flags & DTRACE_A_TOTAL)
1598		(void) dt_aggregate_total(dtp, B_TRUE);
1599
1600	if (agp->dtat_flags & DTRACE_A_MINMAXBIN)
1601		(void) dt_aggregate_minmaxbin(dtp, B_TRUE);
1602
1603	dt_free(dtp, sorted);
1604	return (rval);
1605}
1606
1607int
1608dtrace_aggregate_walk_sorted(dtrace_hdl_t *dtp,
1609    dtrace_aggregate_f *func, void *arg)
1610{
1611	return (dt_aggregate_walk_sorted(dtp, func, arg, NULL));
1612}
1613
1614int
1615dtrace_aggregate_walk_keysorted(dtrace_hdl_t *dtp,
1616    dtrace_aggregate_f *func, void *arg)
1617{
1618	return (dt_aggregate_walk_sorted(dtp, func,
1619	    arg, dt_aggregate_varkeycmp));
1620}
1621
1622int
1623dtrace_aggregate_walk_valsorted(dtrace_hdl_t *dtp,
1624    dtrace_aggregate_f *func, void *arg)
1625{
1626	return (dt_aggregate_walk_sorted(dtp, func,
1627	    arg, dt_aggregate_varvalcmp));
1628}
1629
1630int
1631dtrace_aggregate_walk_keyvarsorted(dtrace_hdl_t *dtp,
1632    dtrace_aggregate_f *func, void *arg)
1633{
1634	return (dt_aggregate_walk_sorted(dtp, func,
1635	    arg, dt_aggregate_keyvarcmp));
1636}
1637
1638int
1639dtrace_aggregate_walk_valvarsorted(dtrace_hdl_t *dtp,
1640    dtrace_aggregate_f *func, void *arg)
1641{
1642	return (dt_aggregate_walk_sorted(dtp, func,
1643	    arg, dt_aggregate_valvarcmp));
1644}
1645
1646int
1647dtrace_aggregate_walk_keyrevsorted(dtrace_hdl_t *dtp,
1648    dtrace_aggregate_f *func, void *arg)
1649{
1650	return (dt_aggregate_walk_sorted(dtp, func,
1651	    arg, dt_aggregate_varkeyrevcmp));
1652}
1653
1654int
1655dtrace_aggregate_walk_valrevsorted(dtrace_hdl_t *dtp,
1656    dtrace_aggregate_f *func, void *arg)
1657{
1658	return (dt_aggregate_walk_sorted(dtp, func,
1659	    arg, dt_aggregate_varvalrevcmp));
1660}
1661
1662int
1663dtrace_aggregate_walk_keyvarrevsorted(dtrace_hdl_t *dtp,
1664    dtrace_aggregate_f *func, void *arg)
1665{
1666	return (dt_aggregate_walk_sorted(dtp, func,
1667	    arg, dt_aggregate_keyvarrevcmp));
1668}
1669
1670int
1671dtrace_aggregate_walk_valvarrevsorted(dtrace_hdl_t *dtp,
1672    dtrace_aggregate_f *func, void *arg)
1673{
1674	return (dt_aggregate_walk_sorted(dtp, func,
1675	    arg, dt_aggregate_valvarrevcmp));
1676}
1677
/*
 * Walk the aggregation data for the aggregation variables named in aggvars,
 * joined by key:  hash entries that compare equal under
 * dt_aggregate_keycmp() are gathered into a "bundle", and func is invoked
 * once per bundle.
 *
 *	dtp		handle whose aggregation data is walked
 *	aggvars		array of aggregation variable IDs to join; an ID may
 *			appear more than once
 *	naggvars	number of elements in aggvars
 *	func		callback invoked per bundle with an array of
 *			(naggvars + 1) data pointers:  element 0 is the data
 *			of the bundle's representative key, and element
 *			(j + 1) is the data corresponding to aggvars[j]
 *	arg		opaque argument passed through to func
 *
 * Where an aggregation variable has no entry for a given key, func is handed
 * zero-filled data in its place.  The bundles are sorted via
 * dt_aggregate_qsort() with dt_aggregate_bundlecmp() before func is called.
 *
 * Returns 0 on success (including when no matching entries exist), or -1 on
 * failure:  if an aggregation variable ID is DTRACE_AGGVARIDNONE or negative
 * (in which case the error is set to EDT_BADAGGVAR), if an allocation fails,
 * or if func returns -1.  Any other return value from func is ignored.
 */
int
dtrace_aggregate_walk_joined(dtrace_hdl_t *dtp, dtrace_aggvarid_t *aggvars,
    int naggvars, dtrace_aggregate_walk_joined_f *func, void *arg)
{
	dt_aggregate_t *agp = &dtp->dt_aggregate;
	dt_ahashent_t *h, **sorted = NULL, ***bundle, **nbundle;
	const dtrace_aggdata_t **data;
	dt_ahashent_t *zaggdata = NULL;
	dt_ahash_t *hash = &agp->dtat_hash;
	size_t nentries = 0, nbundles = 0, start, zsize = 0, bundlesize;
	dtrace_aggvarid_t max = 0, aggvar;
	int rval = -1, *map, *remap = NULL;
	int i, j;
	dtrace_optval_t sortpos = dtp->dt_options[DTRACEOPT_AGGSORTPOS];

	/*
	 * If the sorting position is unset, or is greater than or equal to
	 * the number of aggregation variable IDs, we silently set it to 0.
	 */
	if (sortpos == DTRACEOPT_UNSET || sortpos >= naggvars)
		sortpos = 0;

	/*
	 * First we need to translate the specified aggregation variable IDs
	 * into a linear map that will allow us to translate an aggregation
	 * variable ID into its position in the specified aggvars.
	 */
	for (i = 0; i < naggvars; i++) {
		if (aggvars[i] == DTRACE_AGGVARIDNONE || aggvars[i] < 0)
			return (dt_set_errno(dtp, EDT_BADAGGVAR));

		if (aggvars[i] > max)
			max = aggvars[i];
	}

	/*
	 * The map is indexed by aggregation variable ID and stores a
	 * position plus one, allowing zero to denote "not present".
	 */
	if ((map = dt_zalloc(dtp, (max + 1) * sizeof (int))) == NULL)
		return (-1);

	zaggdata = dt_zalloc(dtp, naggvars * sizeof (dt_ahashent_t));

	if (zaggdata == NULL)
		goto out;

	for (i = 0; i < naggvars; i++) {
		/*
		 * Positions are rotated by the sort position, so that the
		 * aggregation variable at sortpos lands at position 0.
		 */
		int ndx = i + sortpos;

		if (ndx >= naggvars)
			ndx -= naggvars;

		aggvar = aggvars[ndx];
		assert(aggvar <= max);

		if (map[aggvar]) {
			/*
			 * We have an aggregation variable that is present
			 * more than once in the array of aggregation
			 * variables.  While it's unclear why one might want
			 * to do this, it's legal.  To support this construct,
			 * we will allocate a remap that will indicate the
			 * position from which this aggregation variable
			 * should be pulled.  (That is, where the remap will
			 * map from one position to another.)
			 */
			if (remap == NULL) {
				remap = dt_zalloc(dtp, naggvars * sizeof (int));

				if (remap == NULL)
					goto out;
			}

			/*
			 * Given that the variable is already present, assert
			 * that following through the mapping and adjusting
			 * for the sort position yields the same aggregation
			 * variable ID.
			 */
			assert(aggvars[(map[aggvar] - 1 + sortpos) %
			    naggvars] == aggvars[ndx]);

			remap[i] = map[aggvar];
			continue;
		}

		map[aggvar] = i + 1;
	}

	/*
	 * We need to take two passes over the data to size our allocation, so
	 * we'll use the first pass to also fill in the zero-filled data to be
	 * used to properly format a zero-valued aggregation.
	 */
	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
		dtrace_aggvarid_t id;
		int ndx;

		if ((id = dt_aggregate_aggvarid(h)) > max || !(ndx = map[id]))
			continue;

		/*
		 * The first entry seen for a position serves as its
		 * prototype.  Note that this copies the entry's dtada_data
		 * pointer; it is replaced with separately allocated
		 * zero-filled data further below.
		 */
		if (zaggdata[ndx - 1].dtahe_size == 0) {
			zaggdata[ndx - 1].dtahe_size = h->dtahe_size;
			zaggdata[ndx - 1].dtahe_data = h->dtahe_data;
		}

		nentries++;
	}

	if (nentries == 0) {
		/*
		 * We couldn't find any entries; there is nothing else to do.
		 */
		rval = 0;
		goto out;
	}

	/*
	 * Before we sort the data, we're going to look for any holes in our
	 * zero-filled data.  This will occur if an aggregation variable that
	 * we are being asked to print has not yet been assigned the result of
	 * any aggregating action for _any_ tuple.  The issue becomes that we
	 * would like a zero value to be printed for all columns for this
	 * aggregation, but without any record description, we don't know the
	 * aggregating action that corresponds to the aggregation variable.  To
	 * try to find a match, we're simply going to lookup aggregation IDs
	 * (which are guaranteed to be contiguous and to start from 1), looking
	 * for the specified aggregation variable ID.  If we find a match,
	 * we'll use that.  If we iterate over all aggregation IDs and don't
	 * find a match, then we must be an anonymous enabling.  (Anonymous
	 * enablings can't currently derive either aggregation variable IDs or
	 * aggregation variable names given only an aggregation ID.)  In this
	 * obscure case (anonymous enabling, multiple aggregation printa() with
	 * some aggregations not represented for any tuple), our defined
	 * behavior is that the zero will be printed in the format of the first
	 * aggregation variable that contains any non-zero value.
	 */
	for (i = 0; i < naggvars; i++) {
		if (zaggdata[i].dtahe_size == 0) {
			dtrace_aggvarid_t aggvar;

			aggvar = aggvars[(i - sortpos + naggvars) % naggvars];
			assert(zaggdata[i].dtahe_data.dtada_data == NULL);

			/*
			 * Iterate over aggregation IDs until the lookup
			 * fails, stopping early on the first description
			 * that matches our aggregation variable ID.
			 */
			for (j = DTRACE_AGGIDNONE + 1; ; j++) {
				dtrace_aggdesc_t *agg;
				dtrace_aggdata_t *aggdata;

				if (dt_aggid_lookup(dtp, j, &agg) != 0)
					break;

				if (agg->dtagd_varid != aggvar)
					continue;

				/*
				 * We have our description -- now we need to
				 * cons up the zaggdata entry for it.
				 */
				aggdata = &zaggdata[i].dtahe_data;
				aggdata->dtada_size = agg->dtagd_size;
				aggdata->dtada_desc = agg;
				aggdata->dtada_handle = dtp;
				(void) dt_epid_lookup(dtp, agg->dtagd_epid,
				    &aggdata->dtada_edesc,
				    &aggdata->dtada_pdesc);
				aggdata->dtada_normal = 1;
				zaggdata[i].dtahe_hashval = 0;
				zaggdata[i].dtahe_size = agg->dtagd_size;
				break;
			}

			if (zaggdata[i].dtahe_size == 0) {
				caddr_t data;

				/*
				 * We couldn't find this aggregation, meaning
				 * that we have never seen it before for any
				 * tuple _and_ this is an anonymous enabling.
				 * That is, we're in the obscure case outlined
				 * above.  In this case, our defined behavior
				 * is to format the data in the format of the
				 * first non-zero aggregation -- of which, of
				 * course, we know there to be at least one
				 * (or nentries would have been zero).
				 */
				for (j = 0; j < naggvars; j++) {
					if (zaggdata[j].dtahe_size != 0)
						break;
				}

				assert(j < naggvars);
				zaggdata[i] = zaggdata[j];

				data = zaggdata[i].dtahe_data.dtada_data;
				assert(data != NULL);
			}
		}
	}

	/*
	 * Now we need to allocate our zero-filled data for use for
	 * aggregations that don't have a value corresponding to a given key.
	 */
	for (i = 0; i < naggvars; i++) {
		dtrace_aggdata_t *aggdata = &zaggdata[i].dtahe_data;
		dtrace_aggdesc_t *aggdesc = aggdata->dtada_desc;
		dtrace_recdesc_t *rec;
		uint64_t larg;
		caddr_t zdata;

		zsize = zaggdata[i].dtahe_size;
		assert(zsize != 0);

		if ((zdata = dt_zalloc(dtp, zsize)) == NULL) {
			/*
			 * If we failed to allocate some zero-filled data, we
			 * need to zero out the remaining dtada_data pointers
			 * to prevent the wrong data from being freed below.
			 * (Those pointers still refer to prototype data that
			 * we copied from live hash entries.)
			 */
			for (j = i; j < naggvars; j++)
				zaggdata[j].dtahe_data.dtada_data = NULL;
			goto out;
		}

		aggvar = aggvars[(i - sortpos + naggvars) % naggvars];

		/*
		 * First, the easy bit.  To maintain compatibility with
		 * consumers that pull the compiler-generated ID out of the
		 * data, we put that ID at the top of the zero-filled data.
		 */
		rec = &aggdesc->dtagd_rec[0];
		/* LINTED - alignment */
		*((dtrace_aggvarid_t *)(zdata + rec->dtrd_offset)) = aggvar;

		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];

		/*
		 * Now for the more complicated part.  If (and only if) this
		 * is an lquantize() aggregating action, zero-filled data is
		 * not equivalent to an empty record:  we must also get the
		 * parameters for the lquantize().
		 */
		if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
			if (aggdata->dtada_data != NULL) {
				/*
				 * The easier case here is if we actually have
				 * some prototype data -- in which case we
				 * manually dig it out of the aggregation
				 * record.
				 */
				/* LINTED - alignment */
				larg = *((uint64_t *)(aggdata->dtada_data +
				    rec->dtrd_offset));
			} else {
				/*
				 * We don't have any prototype data.  As a
				 * result, we know that we _do_ have the
				 * compiler-generated information.  (If this
				 * were an anonymous enabling, all of our
				 * zero-filled data would have prototype data
				 * -- either directly or indirectly.) So as
				 * gross as it is, we'll grovel around in the
				 * compiler-generated information to find the
				 * lquantize() parameters.
				 */
				dtrace_stmtdesc_t *sdp;
				dt_ident_t *aid;
				dt_idsig_t *isp;

				sdp = (dtrace_stmtdesc_t *)(uintptr_t)
				    aggdesc->dtagd_rec[0].dtrd_uarg;
				aid = sdp->dtsd_aggdata;
				isp = (dt_idsig_t *)aid->di_data;
				assert(isp->dis_auxinfo != 0);
				larg = isp->dis_auxinfo;
			}

			/* LINTED - alignment */
			*((uint64_t *)(zdata + rec->dtrd_offset)) = larg;
		}

		/*
		 * From here on, this entry owns its zero-filled data; it is
		 * freed on the way out.
		 */
		aggdata->dtada_data = zdata;
	}

	/*
	 * Now that we've dealt with setting up our zero-filled data, we can
	 * allocate our sorted array, and take another pass over the data to
	 * fill it.
	 */
	sorted = dt_alloc(dtp, nentries * sizeof (dt_ahashent_t *));

	if (sorted == NULL)
		goto out;

	for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall) {
		dtrace_aggvarid_t id;

		if ((id = dt_aggregate_aggvarid(h)) > max || !map[id])
			continue;

		sorted[i++] = h;
	}

	assert(i == nentries);

	/*
	 * We've loaded our array; now we need to sort by value to allow us
	 * to create bundles of like value.  We're going to acquire the
	 * dt_qsort_lock here, and hold it across all of our subsequent
	 * comparison and sorting.
	 */
	(void) pthread_mutex_lock(&dt_qsort_lock);

	qsort(sorted, nentries, sizeof (dt_ahashent_t *),
	    dt_aggregate_keyvarcmp);

	/*
	 * Now we need to go through and create bundles.  Because the number
	 * of bundles is bounded by the size of the sorted array, we're going
	 * to reuse the underlying storage.  And note that "bundle" is an
	 * array of pointers to arrays of pointers to dt_ahashent_t -- making
	 * its type (regrettably) "dt_ahashent_t ***".  (Regrettable because
	 * '*' -- like '_' and 'X' -- should never appear in triplicate in
	 * an ideal world.)
	 */
	bundle = (dt_ahashent_t ***)sorted;

	for (i = 1, start = 0; i <= nentries; i++) {
		if (i < nentries &&
		    dt_aggregate_keycmp(&sorted[i], &sorted[i - 1]) == 0)
			continue;

		/*
		 * We have a bundle boundary.  Everything from start to
		 * (i - 1) belongs in one bundle.
		 */
		assert(i - start <= naggvars);
		bundlesize = (naggvars + 2) * sizeof (dt_ahashent_t *);

		if ((nbundle = dt_zalloc(dtp, bundlesize)) == NULL) {
			/*
			 * The lock must be dropped before we bail; the
			 * bundles created so far are freed on the way out.
			 */
			(void) pthread_mutex_unlock(&dt_qsort_lock);
			goto out;
		}

		for (j = start; j < i; j++) {
			dtrace_aggvarid_t id = dt_aggregate_aggvarid(sorted[j]);

			assert(id <= max);
			assert(map[id] != 0);
			assert(map[id] - 1 < naggvars);
			assert(nbundle[map[id] - 1] == NULL);
			nbundle[map[id] - 1] = sorted[j];

			/*
			 * The slot at index naggvars holds the bundle's
			 * representative key:  the first entry placed in
			 * the bundle.
			 */
			if (nbundle[naggvars] == NULL)
				nbundle[naggvars] = sorted[j];
		}

		for (j = 0; j < naggvars; j++) {
			if (nbundle[j] != NULL)
				continue;

			/*
			 * Before we assume that this aggregation variable
			 * isn't present (and fall back to using the
			 * zero-filled data allocated earlier), check the
			 * remap.  If we have a remapping, we'll drop it in
			 * here.  Note that we might be remapping an
			 * aggregation variable that isn't present for this
			 * key; in this case, the aggregation data that we
			 * copy will point to the zeroed data.
			 */
			if (remap != NULL && remap[j]) {
				assert(remap[j] - 1 < j);
				assert(nbundle[remap[j] - 1] != NULL);
				nbundle[j] = nbundle[remap[j] - 1];
			} else {
				nbundle[j] = &zaggdata[j];
			}
		}

		bundle[nbundles++] = nbundle;
		start = i;
	}

	/*
	 * Now we need to re-sort based on the first value.
	 */
	dt_aggregate_qsort(dtp, bundle, nbundles, sizeof (dt_ahashent_t **),
	    dt_aggregate_bundlecmp);

	(void) pthread_mutex_unlock(&dt_qsort_lock);

	/*
	 * We're done!  Now we just need to go back over the sorted bundles,
	 * calling the function.  The array handed to the function has one
	 * more element than there are aggregation variables:  the first
	 * element is reserved for the representative key.
	 */
	data = alloca((naggvars + 1) * sizeof (dtrace_aggdata_t *));

	for (i = 0; i < nbundles; i++) {
		for (j = 0; j < naggvars; j++)
			data[j + 1] = NULL;

		for (j = 0; j < naggvars; j++) {
			/*
			 * Undo the rotation by the sort position, so that
			 * data[j + 1] corresponds to aggvars[j].
			 */
			int ndx = j - sortpos;

			if (ndx < 0)
				ndx += naggvars;

			assert(bundle[i][ndx] != NULL);
			data[j + 1] = &bundle[i][ndx]->dtahe_data;
		}

		for (j = 0; j < naggvars; j++)
			assert(data[j + 1] != NULL);

		/*
		 * The representative key is the last element in the bundle.
		 * Assert that we have one, and then set it to be the first
		 * element of data.  (Note that j is equal to naggvars here.)
		 */
		assert(bundle[i][j] != NULL);
		data[0] = &bundle[i][j]->dtahe_data;

		if ((rval = func(data, naggvars + 1, arg)) == -1)
			goto out;
	}

	rval = 0;
out:
	/*
	 * Note that bundle is only dereferenced here if nbundles is
	 * non-zero, in which case it has been set to alias sorted.
	 */
	for (i = 0; i < nbundles; i++)
		dt_free(dtp, bundle[i]);

	if (zaggdata != NULL) {
		for (i = 0; i < naggvars; i++)
			dt_free(dtp, zaggdata[i].dtahe_data.dtada_data);
	}

	dt_free(dtp, zaggdata);
	dt_free(dtp, sorted);
	dt_free(dtp, remap);
	dt_free(dtp, map);

	return (rval);
}
2120
2121int
2122dtrace_aggregate_print(dtrace_hdl_t *dtp, FILE *fp,
2123    dtrace_aggregate_walk_f *func)
2124{
2125	dt_print_aggdata_t pd;
2126
2127	bzero(&pd, sizeof (pd));
2128
2129	pd.dtpa_dtp = dtp;
2130	pd.dtpa_fp = fp;
2131	pd.dtpa_allunprint = 1;
2132
2133	if (func == NULL)
2134		func = dtrace_aggregate_walk_sorted;
2135
2136	if (dtp->dt_oformat) {
2137		if ((*func)(dtp, dt_format_agg, &pd) == -1)
2138			return (dt_set_errno(dtp, dtp->dt_errno));
2139	} else {
2140		if ((*func)(dtp, dt_print_agg, &pd) == -1)
2141			return (dt_set_errno(dtp, dtp->dt_errno));
2142	}
2143
2144	return (0);
2145}
2146
2147void
2148dtrace_aggregate_clear(dtrace_hdl_t *dtp)
2149{
2150	dt_aggregate_t *agp = &dtp->dt_aggregate;
2151	dt_ahash_t *hash = &agp->dtat_hash;
2152	dt_ahashent_t *h;
2153	dtrace_aggdata_t *data;
2154	dtrace_aggdesc_t *aggdesc;
2155	dtrace_recdesc_t *rec;
2156	int i, max_cpus = agp->dtat_maxcpu;
2157
2158	for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
2159		aggdesc = h->dtahe_data.dtada_desc;
2160		rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
2161		data = &h->dtahe_data;
2162
2163		bzero(&data->dtada_data[rec->dtrd_offset], rec->dtrd_size);
2164
2165		if (data->dtada_percpu == NULL)
2166			continue;
2167
2168		for (i = 0; i < max_cpus; i++)
2169			bzero(data->dtada_percpu[i], rec->dtrd_size);
2170	}
2171}
2172
2173void
2174dt_aggregate_destroy(dtrace_hdl_t *dtp)
2175{
2176	dt_aggregate_t *agp = &dtp->dt_aggregate;
2177	dt_ahash_t *hash = &agp->dtat_hash;
2178	dt_ahashent_t *h, *next;
2179	dtrace_aggdata_t *aggdata;
2180	int i, max_cpus = agp->dtat_maxcpu;
2181
2182	if (hash->dtah_hash == NULL) {
2183		assert(hash->dtah_all == NULL);
2184	} else {
2185		free(hash->dtah_hash);
2186
2187		for (h = hash->dtah_all; h != NULL; h = next) {
2188			next = h->dtahe_nextall;
2189
2190			aggdata = &h->dtahe_data;
2191
2192			if (aggdata->dtada_percpu != NULL) {
2193				for (i = 0; i < max_cpus; i++)
2194					free(aggdata->dtada_percpu[i]);
2195				free(aggdata->dtada_percpu);
2196			}
2197
2198			free(aggdata->dtada_data);
2199			free(h);
2200		}
2201
2202		hash->dtah_hash = NULL;
2203		hash->dtah_all = NULL;
2204		hash->dtah_size = 0;
2205	}
2206
2207	free(agp->dtat_buf.dtbd_data);
2208	free(agp->dtat_cpus);
2209}
2210