dtrace.c revision 284138
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 284138 2015-06-07 21:14:48Z pfg $
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 */

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace).  The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file.  The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#include <sys/errno.h>
#if !defined(sun)
#include <sys/time.h>
#endif
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#if defined(sun)
#include <sys/ddi.h>
#include <sys/sunddi.h>
#endif
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#if defined(sun)
#include <sys/strsubr.h>
#endif
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#if defined(sun)
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#endif
#include <sys/ctf_api.h>
#if defined(sun)
#include <sys/panic.h>
#include <sys/priv_impl.h>
#endif
#include <sys/policy.h>
#if defined(sun)
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#endif
#include <sys/taskq.h>
#if defined(sun)
#include <sys/mkdev.h>
#include <sys/kdi.h>
#endif
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include "strtolctype.h"

/* FreeBSD includes: */
#if !defined(sun)
#include <sys/callout.h>
#include <sys/ctype.h>
#include <sys/eventhandler.h>
#include <sys/limits.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/dtrace_bsd.h>
#include <netinet/in.h>
#include "dtrace_cddl.h"
#include "dtrace_debug.c"
#endif

/*
 * DTrace Tunable Variables
 *
 * The following variables may be tuned by adding a line to /etc/system that
 * includes both the name of the DTrace module ("dtrace") and the name of the
 * variable.  For example:
 *
 *   set dtrace:dtrace_destructive_disallow = 1
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable.  Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively.  Further, these variables should not be tuned
 * dynamically via "mdb -kw" or other means; they should only be tuned via
 * /etc/system.
 */
int		dtrace_destructive_disallow = 0;
dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t		dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t	dtrace_dof_maxsize = (8 * 1024 * 1024);
size_t		dtrace_global_maxsize = (16 * 1024);
size_t		dtrace_actions_max = (16 * 1024);
size_t		dtrace_retain_max = 1024;
dtrace_optval_t	dtrace_helper_actions_max = 128;
dtrace_optval_t	dtrace_helper_providers_max = 32;
dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t		dtrace_strsize_default = 256;
dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_nspec_default = 1;
dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int		dtrace_msgdsize_max = 128;
hrtime_t	dtrace_chill_max = MSEC2NSEC(500);		/* 500 ms */
hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
int		dtrace_devdepth_max = 32;
int		dtrace_err_verbose;
hrtime_t	dtrace_deadman_interval = NANOSEC;
hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
hrtime_t	dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
#if !defined(sun)
int		dtrace_memstr_max = 4096;
#endif

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax.  One of these,
 * dtrace_zero, is made deliberately so:  it is provided as a source of
 * well-known, zero-filled memory.  While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */

/*
 * DTrace Internal Variables
 */
#if defined(sun)
static dev_info_t	*dtrace_devi;		/* device info */
#endif
#if defined(sun)
static vmem_t		*dtrace_arena;		/* probe ID arena */
static vmem_t		*dtrace_minor;		/* minor number arena */
#else
static taskq_t		*dtrace_taskq;		/* task queue */
static struct unrhdr	*dtrace_arena;		/* Probe ID number.     */
#endif
static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
static int		dtrace_nprobes;		/* number of probes */
static dtrace_provider_t *dtrace_provider;	/* provider list */
static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
static int		dtrace_opens;		/* number of opens */
static int		dtrace_helpers;		/* number of helpers */
static int		dtrace_getf;		/* number of unpriv getf()s */
#if defined(sun)
static void		*dtrace_softstate;	/* softstate pointer */
#endif
static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
static int		dtrace_toxranges;	/* number of toxic ranges */
static int		dtrace_toxranges_max;	/* size of toxic range array */
static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t	*dtrace_panicked;	/* panicking thread */
static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
static int		dtrace_dynvar_failclean; /* dynvars failed to clean */
#if !defined(sun)
static struct mtx	dtrace_unr_mtx;
MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
int		dtrace_in_probe;	/* non-zero if executing a probe */
#if defined(__i386__) || defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
uintptr_t	dtrace_in_probe_addr;	/* Address of invop when already in probe */
#endif
static eventhandler_tag	dtrace_kld_load_tag;
static eventhandler_tag	dtrace_kld_unload_try_tag;
#endif

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
static kmutex_t		dtrace_lock;		/* probe state lock */
static kmutex_t		dtrace_provider_lock;	/* provider state lock */
static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */

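/*
 * As an illustrative sketch of the total ordering described above (no
 * single code path necessarily takes all five locks at once), a path that
 * required every lock would acquire and release them as:
 *
 *	mutex_enter(&dtrace_meta_lock);
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&dtrace_provider_lock);
 *	mutex_enter(&mod_lock);
 *	mutex_enter(&dtrace_lock);
 *	...
 *	mutex_exit(&dtrace_lock);
 *	mutex_exit(&mod_lock);
 *	mutex_exit(&dtrace_provider_lock);
 *	mutex_exit(&cpu_lock);
 *	mutex_exit(&dtrace_meta_lock);
 */
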
#if !defined(sun)
/* XXX FreeBSD hacks. */
#define cr_suid		cr_svuid
#define cr_sgid		cr_svgid
#define	ipaddr_t	in_addr_t
#define mod_modname	pathname
#define vuprintf	vprintf
#define ttoproc(_a)	((_a)->td_proc)
#define crgetzoneid(_a)	0
#define	NCPU		MAXCPU
#define SNOCD		0
#define CPU_ON_INTR(_a)	0

#define PRIV_EFFECTIVE		(1 << 0)
#define PRIV_DTRACE_KERNEL	(1 << 1)
#define PRIV_DTRACE_PROC	(1 << 2)
#define PRIV_DTRACE_USER	(1 << 3)
#define PRIV_PROC_OWNER		(1 << 4)
#define PRIV_PROC_ZONE		(1 << 5)
#define PRIV_ALL		~0

SYSCTL_DECL(_debug_dtrace);
SYSCTL_DECL(_kern_dtrace);
#endif

#if defined(sun)
#define curcpu	CPU->cpu_id
#endif


/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t	dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static dtrace_pops_t	dtrace_provider_ops = {
	(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
	(void (*)(void *, modctl_t *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	NULL,
	NULL,
	NULL,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};

static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 *
 * These variables should be set dynamically to enable helper tracing.  The
 * only variables that should be set are dtrace_helptrace_enable (which should
 * be set to a non-zero value to allocate helper tracing buffers on the next
 * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
 * non-zero value to deallocate helper tracing buffers on the next close of
 * /dev/dtrace).  When (and only when) helper tracing is disabled, the
 * buffer size may also be set via dtrace_helptrace_bufsize.
 */
int			dtrace_helptrace_enable = 0;
int			dtrace_helptrace_disable = 0;
int			dtrace_helptrace_bufsize = 16 * 1024 * 1024;
uint32_t		dtrace_helptrace_nlocals;
static dtrace_helptrace_t *dtrace_helptrace_buffer;
static uint32_t		dtrace_helptrace_next = 0;
static int		dtrace_helptrace_wrapped = 0;

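/*
 * For example, on a live Solaris system the enable variable above may be
 * written with the kernel debugger (a sketch; the exact invocation may
 * vary):
 *
 *	# echo 'dtrace_helptrace_enable/W 1' | mdb -kw
 *
 * The buffers themselves are then allocated on the next open of
 * /dev/dtrace, per the comment above.
 */
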
/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table.  This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation.  The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#ifdef DEBUG
static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation.  There is no real structure to this cpp
 * mishmash -- but is there ever?
 */
#define	DTRACE_HASHSTR(hash, probe)	\
	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))

#define	DTRACE_HASHNEXT(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)

#define	DTRACE_HASHPREV(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)

#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)

#define	DTRACE_AGGHASHSIZE_SLEW		17

#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier.  This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables.  To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables.  That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#if defined(sun)
#define	DTRACE_TLS_THRKEY(where) { \
	uint_t intr = 0; \
	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#define	DTRACE_TLS_THRKEY(where) { \
	solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
	uint_t intr = 0; \
	uint_t actv = _c->cpu_intr_actv; \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#endif

#define	DT_BSWAP_8(x)	((x) & 0xff)
#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))

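/*
 * For example, under the definitions above, DT_BSWAP_16(0x1234) evaluates
 * to 0x3412 and DT_BSWAP_32(0x11223344) to 0x44332211:  each macro swaps
 * the two halves produced by the next-smaller macro.
 */
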
#define	DT_MASK_LO 0x00000000FFFFFFFFULL

#define	DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#ifndef __x86
#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
	if (addr & (size - 1)) {					\
		*flags |= CPU_DTRACE_BADALIGN;				\
		cpu_core[curcpu].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}
#else
#define	DTRACE_ALIGNCHECK(addr, size, flags)
#endif

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by baseaddr, basesz.  We take care
 * to avoid problems with overflow and underflow of the unsigned quantities,
 * and disallow all negative sizes.  Ranges of size 0 are allowed.
 */
#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
	((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
	(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
	(testaddr) + (testsz) >= (testaddr))

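/*
 * As a worked example (illustrative values only):  for baseaddr 0xfff0,
 * basesz 0x20, testaddr 0x10000 and testsz 0x8, the macro computes
 * 0x10000 - 0xfff0 = 0x10 < 0x20 and 0x10 + 0x8 <= 0x20, accepting the
 * range.  A naive "testaddr + testsz <= baseaddr + basesz" comparison
 * could instead wrap around when baseaddr + basesz overflows a uintptr_t.
 */
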
/*
 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it.  This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range.  Allocations of size zero are allowed.
 */
#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
	(mstate)->dtms_scratch_ptr >= (alloc_sz))

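/*
 * For example (illustrative values only):  with 0x100 bytes of scratch
 * remaining, a request for alloc_sz = (size_t)-1 fails here because
 * 0x100 >= 0xffff...ff is false, whereas the naive comparison
 * "dtms_scratch_ptr + alloc_sz <= dtms_scratch_base + dtms_scratch_size"
 * would wrap and could falsely succeed.
 */
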
#define	DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval;						\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[curcpu].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[curcpu].cpuc_dtrace_illval = addr;		\
		return (0);						\
	}								\
									\
	*flags |= CPU_DTRACE_NOFAULT;					\
	/*CSTYLED*/							\
	rval = *((volatile uint##bits##_t *)addr);			\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
									\
	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
}

#ifdef _LP64
#define	dtrace_loadptr	dtrace_load64
#else
#define	dtrace_loadptr	dtrace_load32
#endif

#define	DTRACE_DYNHASH_FREE	0
#define	DTRACE_DYNHASH_SINK	1
#define	DTRACE_DYNHASH_VALID	2

#define	DTRACE_MATCH_NEXT	0
#define	DTRACE_MATCH_DONE	1
#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
#define	DTRACE_STATE_ALIGN	64

#define	DTRACE_FLAGS2FLT(flags)						\
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
	DTRACEFLT_UNKNOWN)

#define	DTRACEACT_ISSTRING(act)						\
	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)

/* Function prototype definitions: */
static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static void dtrace_enabling_reap(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
uint16_t dtrace_load16(uintptr_t);
uint32_t dtrace_load32(uintptr_t);
uint64_t dtrace_load64(uintptr_t);
uint8_t dtrace_load8(uintptr_t);
void dtrace_dynvar_clean(dtrace_dstate_t *);
dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
    size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_priv_proc(dtrace_state_t *);
static void dtrace_getf_barrier(void);

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context.  Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note:  not called from probe context."
 */
void
dtrace_panic(const char *format, ...)
{
	va_list alist;

	va_start(alist, format);
#ifdef __FreeBSD__
	vpanic(format, alist);
#else
	dtrace_vpanic(format, alist);
#endif
	va_end(alist);
}

int
dtrace_assfail(const char *a, const char *f, int l)
{
	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
	/*
	 * Most counters stored to in probe context are per-CPU counters.
	 * However, there are some error conditions that are sufficiently
	 * arcane that they don't merit per-CPU storage.  If these counters
	 * are incremented concurrently on different CPUs, scalability will be
	 * adversely affected -- but we don't expect them to be white-hot in a
	 * correctly constructed enabling...
	 */
	uint32_t oval, nval;

	do {
		oval = *counter;

		if ((nval = oval + 1) == 0) {
			/*
			 * If the counter would wrap, set it to 1 -- assuring
			 * that the counter is never zero when we have seen
			 * errors.  (The counter must be 32-bits because we
			 * aren't guaranteed a 64-bit compare&swap operation.)
			 * To save this code both the infamy of being fingered
			 * by a priggish news story and the indignity of being
			 * the target of a neo-puritan witch trial, we're
			 * carefully avoiding any colorful description of the
			 * likelihood of this condition -- but suffice it to
			 * say that it is only slightly more likely than the
			 * overflow of predicate cache IDs, as discussed in
			 * dtrace_predicate_create().
			 */
			nval = 1;
		}
	} while (dtrace_cas32(counter, oval, nval) != oval);
}

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)

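/*
 * A minimal usage sketch (assuming a hypothetical kernel structure
 * "struct foo" with a "next" member):  probe-context code must walk
 * untrusted memory through these accessors rather than dereferencing
 * pointers directly, e.g.:
 *
 *	uintptr_t next = dtrace_loadptr(addr + offsetof(struct foo, next));
 *
 * A toxic or faulting address then sets CPU_DTRACE_BADADDR (or a related
 * fault flag) and yields 0 instead of panicking the kernel.
 */
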
static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
	if (dest < mstate->dtms_scratch_base)
		return (0);

	if (dest + size < dest)
		return (0);

	if (dest + size > mstate->dtms_scratch_ptr)
		return (0);

	return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz,
    dtrace_statvar_t **svars, int nsvars)
{
	int i;

	for (i = 0; i < nsvars; i++) {
		dtrace_statvar_t *svar = svars[i];

		if (svar == NULL || svar->dtsv_size == 0)
			continue;

		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
			return (1);
	}

	return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 * region.  The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	/*
	 * First, check to see if the address is in scratch space...
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
	    mstate->dtms_scratch_size))
		return (1);

	/*
	 * Now check to see if it's a dynamic variable.  This check will pick
	 * up both thread-local variables and any global dynamically-allocated
	 * variables.
	 */
	if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
	    vstate->dtvs_dynvars.dtds_size)) {
		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
		uintptr_t base = (uintptr_t)dstate->dtds_base +
		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
		uintptr_t chunkoffs;

		/*
		 * Before we assume that we can store here, we need to make
		 * sure that it isn't in our metadata -- storing to our
		 * dynamic variable metadata would corrupt our state.  For
		 * the range to not include any dynamic variable metadata,
		 * it must:
		 *
		 *	(1) Start above the hash table that is at the base of
		 *	the dynamic variable space
		 *
		 *	(2) Have a starting chunk offset that is beyond the
		 *	dtrace_dynvar_t that is at the base of every chunk
		 *
		 *	(3) Not span a chunk boundary
		 *
		 */
		if (addr < base)
			return (0);

		chunkoffs = (addr - base) % dstate->dtds_chunksize;

		if (chunkoffs < sizeof (dtrace_dynvar_t))
			return (0);

		if (chunkoffs + sz > dstate->dtds_chunksize)
			return (0);

		return (1);
	}

	/*
	 * Finally, check the static local and global variables.  These checks
	 * take the longest, so we perform them last.
	 */
	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_locals, vstate->dtvs_nlocals))
		return (1);

	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_globals, vstate->dtvs_nglobals))
		return (1);

	return (0);
}


/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
	file_t *fp;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	/*
	 * You can obviously read that which you can store.
	 */
	if (dtrace_canstore(addr, sz, mstate, vstate))
		return (1);

	/*
	 * We're allowed to read from our own string table.
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
	    mstate->dtms_difo->dtdo_strlen))
		return (1);

	if (vstate->dtvs_state != NULL &&
	    dtrace_priv_proc(vstate->dtvs_state)) {
		proc_t *p;

		/*
		 * When we have privileges to the current process, there are
		 * several context-related kernel structures that are safe to
		 * read, even absent the privilege to read from kernel memory.
		 * These reads are safe because these structures contain only
		 * state that (1) we're permitted to read, (2) is harmless or
		 * (3) contains pointers to additional kernel state that we're
		 * not permitted to read (and as such, do not present an
		 * opportunity for privilege escalation).  Finally (and
		 * critically), because of the nature of their relation with
		 * the current thread context, the memory associated with these
		 * structures cannot change over the duration of probe context,
		 * and it is therefore impossible for this memory to be
		 * deallocated and reallocated as something else while it's
		 * being operated upon.
		 */
		if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
			return (1);

		if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
		    sz, curthread->t_procp, sizeof (proc_t))) {
			return (1);
		}

		if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
		    curthread->t_cred, sizeof (cred_t))) {
			return (1);
		}

#if defined(sun)
		if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
		    &(p->p_pidp->pid_id), sizeof (pid_t))) {
			return (1);
		}

		if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
		    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
			return (1);
		}
#endif
	}

	if ((fp = mstate->dtms_getf) != NULL) {
		uintptr_t psz = sizeof (void *);
		vnode_t *vp;
		vnodeops_t *op;

		/*
		 * When getf() returns a file_t, the enabling is implicitly
		 * granted the (transient) right to read the returned file_t
		 * as well as the v_path and v_op->vnop_name of the underlying
		 * vnode.  These accesses are allowed after a successful
		 * getf() because the members that they refer to cannot change
		 * once set -- and the barrier logic in the kernel's closef()
		 * path assures that the file_t and its referenced vnode_t
		 * cannot themselves be stale (that is, it is impossible for
		 * either dtms_getf itself or its f_vnode member to reference
		 * freed memory).
		 */
		if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
			return (1);

		if ((vp = fp->f_vnode) != NULL) {
#if defined(sun)
			if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
				return (1);
			if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
			    vp->v_path, strlen(vp->v_path) + 1)) {
				return (1);
			}
#endif

			if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
				return (1);

#if defined(sun)
			if ((op = vp->v_op) != NULL &&
			    DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
				return (1);
			}

			if (op != NULL && op->vnop_name != NULL &&
			    DTRACE_INRANGE(addr, sz, op->vnop_name,
			    strlen(op->vnop_name) + 1)) {
				return (1);
			}
#endif
		}
	}

	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
	*illval = addr;
	return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	size_t strsz;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
	if (dtrace_canload(addr, strsz, mstate, vstate))
		return (1);

	return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	size_t sz;
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	if (type->dtdt_kind == DIF_TYPE_STRING)
		sz = dtrace_strlen(src,
		    vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
	else
		sz = type->dtdt_size;

	return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
}

/*
 * Convert a string to a signed integer using safe loads.
 *
 * NOTE: This function uses various macros from strtolctype.h to manipulate
 * digit values, etc -- these have all been checked to ensure they make
 * no additional function calls.
 */
static int64_t
dtrace_strtoll(char *input, int base, size_t limit)
{
	uintptr_t pos = (uintptr_t)input;
	int64_t val = 0;
	int x;
	boolean_t neg = B_FALSE;
	char c, cc, ccc;
	uintptr_t end = pos + limit;

	/*
	 * Consume any whitespace preceding digits.
	 */
	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
		pos++;

	/*
	 * Handle an explicit sign if one is present.
	 */
	if (c == '-' || c == '+') {
		if (c == '-')
			neg = B_TRUE;
		c = dtrace_load8(++pos);
	}

	/*
	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
	 * if present.
	 */
	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
		pos += 2;
		c = ccc;
	}

	/*
	 * Read in contiguous digits until the first non-digit character.
	 */
	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
	    c = dtrace_load8(++pos))
		val = val * base + x;

	return (neg ? -val : val);
}

/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
	uint8_t c1, c2;
	volatile uint16_t *flags;

	if (s1 == s2 || limit == 0)
		return (0);

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	do {
		if (s1 == NULL) {
			c1 = '\0';
		} else {
			c1 = dtrace_load8((uintptr_t)s1++);
		}

		if (s2 == NULL) {
			c2 = '\0';
		} else {
			c2 = dtrace_load8((uintptr_t)s2++);
		}

		if (c1 != c2)
			return (c1 - c2);
	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

	return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * len parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	uint_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
	uintptr_t taddr, tsize;
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		taddr = dtrace_toxrange[i].dtt_base;
		tsize = dtrace_toxrange[i].dtt_limit - taddr;

		if (kaddr - taddr < tsize) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
			return (1);
		}

		if (taddr - kaddr < size) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = taddr;
			return (1);
		}
	}

	return (0);
}

/*
 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 * memory specified by the DIF program.  The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace.  As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst;
		const uint8_t *s2 = src;

		if (s1 <= s2) {
			do {
				*s1++ = dtrace_load8((uintptr_t)s2++);
			} while (--len != 0);
		} else {
			s2 += len;
			s1 += len;

			do {
				*--s1 = dtrace_load8((uintptr_t)--s2);
			} while (--len != 0);
		}
	}
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered.  The src is assumed to
 * be unsafe memory specified by the DIF program.  The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst, c;
		const uint8_t *s2 = src;

		do {
			*s1++ = c = dtrace_load8((uintptr_t)s2++);
		} while (--len != 0 && c != '\0');
	}
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type.  The src is assumed to be unsafe memory specified by the DIF
 * program.  The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to it directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
{
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_strcpy(src, dst, type->dtdt_size);
	} else {
		dtrace_bcopy(src, dst, type->dtdt_size);
	}
}

/*
 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
	volatile uint16_t *flags;

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	if (s1 == s2)
		return (0);

	if (s1 == NULL || s2 == NULL)
		return (1);

	if (s1 != s2 && len != 0) {
		const uint8_t *ps1 = s1;
		const uint8_t *ps2 = s2;

		do {
			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
				return (1);
		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
	}
	return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop.  Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uchar_t *cp;

	for (cp = dst; len != 0; len--)
		*cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t result[2];

	result[0] = addend1[0] + addend2[0];
	result[1] = addend1[1] + addend2[1] +
	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

	sum[0] = result[0];
	sum[1] = result[1];
}

/*
 * Shift the 128-bit value in a by b. If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	uint64_t mask;

	if (b == 0)
		return;

	if (b < 0) {
		b = -b;
		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			a[0] >>= b;
			mask = 1LL << (64 - b);
			mask -= 1;
			a[0] |= ((a[1] & mask) << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			a[1] <<= b;
			mask = a[0] >> (64 - b);
			a[1] |= mask;
			a[0] <<= b;
		}
	}
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
	uint64_t hi1, hi2, lo1, lo2;
	uint64_t tmp[2];

	hi1 = factor1 >> 32;
	hi2 = factor2 >> 32;

	lo1 = factor1 & DT_MASK_LO;
	lo2 = factor2 & DT_MASK_LO;

	product[0] = lo1 * lo2;
	product[1] = hi1 * hi2;

	tmp[0] = hi1 * lo2;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);

	tmp[0] = hi2 * lo1;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);
}

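/*
 * As a quick sanity check of the decomposition above (illustrative values):
 * squaring 2^32 + 1 gives lo1 * lo2 = 1, hi1 * hi2 = 1, and two cross terms
 * of 1 << 32, so the result is product[1] = 1 and product[0] = 2^33 + 1 --
 * that is, 2^64 + 2^33 + 1, as expected.
 */
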
1307/*
1308 * This privilege check should be used by actions and subroutines to
1309 * verify that the user credentials of the process that enabled the
1310 * invoking ECB match the target credentials
1311 */
1312static int
1313dtrace_priv_proc_common_user(dtrace_state_t *state)
1314{
1315	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1316
1317	/*
1318	 * We should always have a non-NULL state cred here, since if cred
1319	 * is null (anonymous tracing), we fast-path bypass this routine.
1320	 */
1321	ASSERT(s_cr != NULL);
1322
1323	if ((cr = CRED()) != NULL &&
1324	    s_cr->cr_uid == cr->cr_uid &&
1325	    s_cr->cr_uid == cr->cr_ruid &&
1326	    s_cr->cr_uid == cr->cr_suid &&
1327	    s_cr->cr_gid == cr->cr_gid &&
1328	    s_cr->cr_gid == cr->cr_rgid &&
1329	    s_cr->cr_gid == cr->cr_sgid)
1330		return (1);
1331
1332	return (0);
1333}
1334
1335/*
1336 * This privilege check should be used by actions and subroutines to
1337 * verify that the zone of the process that enabled the invoking ECB
1338 * matches the target credentials
1339 */
1340static int
1341dtrace_priv_proc_common_zone(dtrace_state_t *state)
1342{
1343#if defined(sun)
1344	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1345
1346	/*
1347	 * We should always have a non-NULL state cred here, since if cred
1348	 * is null (anonymous tracing), we fast-path bypass this routine.
1349	 */
1350	ASSERT(s_cr != NULL);
1351
1352	if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
1353		return (1);
1354
1355	return (0);
1356#else
1357	return (1);
1358#endif
1359}
1360
1361/*
1362 * This privilege check should be used by actions and subroutines to
1363 * verify that the process has not setuid or changed credentials.
1364 */
1365static int
1366dtrace_priv_proc_common_nocd(void)
1367{
1368	proc_t *proc;
1369
1370	if ((proc = ttoproc(curthread)) != NULL &&
1371	    !(proc->p_flag & SNOCD))
1372		return (1);
1373
1374	return (0);
1375}
1376
1377static int
1378dtrace_priv_proc_destructive(dtrace_state_t *state)
1379{
1380	int action = state->dts_cred.dcr_action;
1381
1382	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1383	    dtrace_priv_proc_common_zone(state) == 0)
1384		goto bad;
1385
1386	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1387	    dtrace_priv_proc_common_user(state) == 0)
1388		goto bad;
1389
1390	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1391	    dtrace_priv_proc_common_nocd() == 0)
1392		goto bad;
1393
1394	return (1);
1395
1396bad:
1397	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1398
1399	return (0);
1400}
1401
1402static int
1403dtrace_priv_proc_control(dtrace_state_t *state)
1404{
1405	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1406		return (1);
1407
1408	if (dtrace_priv_proc_common_zone(state) &&
1409	    dtrace_priv_proc_common_user(state) &&
1410	    dtrace_priv_proc_common_nocd())
1411		return (1);
1412
1413	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1414
1415	return (0);
1416}
1417
1418static int
1419dtrace_priv_proc(dtrace_state_t *state)
1420{
1421	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1422		return (1);
1423
1424	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1425
1426	return (0);
1427}
1428
1429static int
1430dtrace_priv_kernel(dtrace_state_t *state)
1431{
1432	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1433		return (1);
1434
1435	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1436
1437	return (0);
1438}
1439
1440static int
1441dtrace_priv_kernel_destructive(dtrace_state_t *state)
1442{
1443	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1444		return (1);
1445
1446	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1447
1448	return (0);
1449}
1450
1451/*
1452 * Determine if the dte_cond of the specified ECB allows for processing of
1453 * the current probe to continue.  Note that this routine may allow continued
1454 * processing, but with access(es) stripped from the mstate's dtms_access
1455 * field.
1456 */
1457static int
1458dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
1459    dtrace_ecb_t *ecb)
1460{
1461	dtrace_probe_t *probe = ecb->dte_probe;
1462	dtrace_provider_t *prov = probe->dtpr_provider;
1463	dtrace_pops_t *pops = &prov->dtpv_pops;
1464	int mode = DTRACE_MODE_NOPRIV_DROP;
1465
1466	ASSERT(ecb->dte_cond);
1467
1468#if defined(sun)
1469	if (pops->dtps_mode != NULL) {
1470		mode = pops->dtps_mode(prov->dtpv_arg,
1471		    probe->dtpr_id, probe->dtpr_arg);
1472
1473		ASSERT((mode & DTRACE_MODE_USER) ||
1474		    (mode & DTRACE_MODE_KERNEL));
1475		ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
1476		    (mode & DTRACE_MODE_NOPRIV_DROP));
1477	}
1478
1479	/*
1480	 * If the dte_cond bits indicate that this consumer is only allowed to
1481	 * see user-mode firings of this probe, call the provider's dtps_mode()
1482	 * entry point to check that the probe was fired while in a user
1483	 * context.  If that's not the case, use the policy specified by the
1484	 * provider to determine if we drop the probe or merely restrict
1485	 * operation.
1486	 */
1487	if (ecb->dte_cond & DTRACE_COND_USERMODE) {
1488		ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
1489
1490		if (!(mode & DTRACE_MODE_USER)) {
1491			if (mode & DTRACE_MODE_NOPRIV_DROP)
1492				return (0);
1493
1494			mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1495		}
1496	}
1497#endif
1498
1499	/*
1500	 * This is more subtle than it looks. We have to be absolutely certain
1501	 * that CRED() isn't going to change out from under us so it's only
1502	 * legit to examine that structure if we're in constrained situations.
1503	 * Currently, the only times we'll this check is if a non-super-user
1504	 * has enabled the profile or syscall providers -- providers that
1505	 * allow visibility of all processes. For the profile case, the check
1506	 * above will ensure that we're examining a user context.
1507	 */
1508	if (ecb->dte_cond & DTRACE_COND_OWNER) {
1509		cred_t *cr;
1510		cred_t *s_cr = state->dts_cred.dcr_cred;
1511		proc_t *proc;
1512
1513		ASSERT(s_cr != NULL);
1514
1515		if ((cr = CRED()) == NULL ||
1516		    s_cr->cr_uid != cr->cr_uid ||
1517		    s_cr->cr_uid != cr->cr_ruid ||
1518		    s_cr->cr_uid != cr->cr_suid ||
1519		    s_cr->cr_gid != cr->cr_gid ||
1520		    s_cr->cr_gid != cr->cr_rgid ||
1521		    s_cr->cr_gid != cr->cr_sgid ||
1522		    (proc = ttoproc(curthread)) == NULL ||
1523		    (proc->p_flag & SNOCD)) {
1524			if (mode & DTRACE_MODE_NOPRIV_DROP)
1525				return (0);
1526
1527#if defined(sun)
1528			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1529#endif
1530		}
1531	}
1532
1533#if defined(sun)
1534	/*
1535	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1536	 * in our zone, check to see if our mode policy is to restrict rather
1537	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1538	 * and DTRACE_ACCESS_ARGS
1539	 */
1540	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1541		cred_t *cr;
1542		cred_t *s_cr = state->dts_cred.dcr_cred;
1543
1544		ASSERT(s_cr != NULL);
1545
1546		if ((cr = CRED()) == NULL ||
1547		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1548			if (mode & DTRACE_MODE_NOPRIV_DROP)
1549				return (0);
1550
1551			mstate->dtms_access &=
1552			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1553		}
1554	}
1555#endif
1556
1557	return (1);
1558}
1559
1560/*
1561 * Note:  not called from probe context.  This function is called
1562 * asynchronously (and at a regular interval) from outside of probe context to
1563 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1564 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1565 */
1566void
1567dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1568{
1569	dtrace_dynvar_t *dirty;
1570	dtrace_dstate_percpu_t *dcpu;
1571	dtrace_dynvar_t **rinsep;
1572	int i, j, work = 0;
1573
1574	for (i = 0; i < NCPU; i++) {
1575		dcpu = &dstate->dtds_percpu[i];
1576		rinsep = &dcpu->dtdsc_rinsing;
1577
1578		/*
1579		 * If the dirty list is NULL, there is no dirty work to do.
1580		 */
1581		if (dcpu->dtdsc_dirty == NULL)
1582			continue;
1583
1584		if (dcpu->dtdsc_rinsing != NULL) {
1585			/*
1586			 * If the rinsing list is non-NULL, then it is because
1587			 * this CPU was selected to accept another CPU's
1588			 * dirty list -- and since that time, dirty buffers
1589			 * have accumulated.  This is a highly unlikely
1590			 * condition, but we choose to ignore the dirty
1591			 * buffers -- they'll be picked up a future cleanse.
1592			 */
1593			continue;
1594		}
1595
1596		if (dcpu->dtdsc_clean != NULL) {
1597			/*
1598			 * If the clean list is non-NULL, then we're in a
1599			 * situation where a CPU has done deallocations (we
1600			 * have a non-NULL dirty list) but no allocations (we
1601			 * also have a non-NULL clean list).  We can't simply
1602			 * move the dirty list into the clean list on this
1603			 * CPU, yet we also don't want to allow this condition
1604			 * to persist, lest a short clean list prevent a
1605			 * massive dirty list from being cleaned (which in
1606			 * turn could lead to otherwise avoidable dynamic
1607			 * drops).  To deal with this, we look for some CPU
1608			 * with a NULL clean list, NULL dirty list, and NULL
1609			 * rinsing list -- and then we borrow this CPU to
1610			 * rinse our dirty list.
1611			 */
1612			for (j = 0; j < NCPU; j++) {
1613				dtrace_dstate_percpu_t *rinser;
1614
1615				rinser = &dstate->dtds_percpu[j];
1616
1617				if (rinser->dtdsc_rinsing != NULL)
1618					continue;
1619
1620				if (rinser->dtdsc_dirty != NULL)
1621					continue;
1622
1623				if (rinser->dtdsc_clean != NULL)
1624					continue;
1625
1626				rinsep = &rinser->dtdsc_rinsing;
1627				break;
1628			}
1629
1630			if (j == NCPU) {
1631				/*
1632				 * We were unable to find another CPU that
1633				 * could accept this dirty list -- we are
1634				 * therefore unable to clean it now.
1635				 */
1636				dtrace_dynvar_failclean++;
1637				continue;
1638			}
1639		}
1640
1641		work = 1;
1642
1643		/*
1644		 * Atomically move the dirty list aside.
1645		 */
1646		do {
1647			dirty = dcpu->dtdsc_dirty;
1648
1649			/*
1650			 * Before we zap the dirty list, set the rinsing list.
1651			 * (This allows for a potential assertion in
1652			 * dtrace_dynvar():  if a free dynamic variable appears
1653			 * on a hash chain, either the dirty list or the
1654			 * rinsing list for some CPU must be non-NULL.)
1655			 */
1656			*rinsep = dirty;
1657			dtrace_membar_producer();
1658		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
1659		    dirty, NULL) != dirty);
1660	}
1661
1662	if (!work) {
1663		/*
1664		 * We have no work to do; we can simply return.
1665		 */
1666		return;
1667	}
1668
1669	dtrace_sync();
1670
1671	for (i = 0; i < NCPU; i++) {
1672		dcpu = &dstate->dtds_percpu[i];
1673
1674		if (dcpu->dtdsc_rinsing == NULL)
1675			continue;
1676
1677		/*
1678		 * We are now guaranteed that no hash chain contains a pointer
1679		 * into this dirty list; we can make it clean.
1680		 */
1681		ASSERT(dcpu->dtdsc_clean == NULL);
1682		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1683		dcpu->dtdsc_rinsing = NULL;
1684	}
1685
1686	/*
1687	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1688	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1689	 * This prevents a race whereby a CPU incorrectly decides that
1690	 * the state should be something other than DTRACE_DSTATE_CLEAN
1691	 * after dtrace_dynvar_clean() has completed.
1692	 */
1693	dtrace_sync();
1694
1695	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1696}
1697
1698/*
1699 * Depending on the value of the op parameter, this function looks-up,
1700 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1701 * allocation is requested, this function will return a pointer to a
1702 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1703 * variable can be allocated.  If NULL is returned, the appropriate counter
1704 * will be incremented.
1705 */
1706dtrace_dynvar_t *
1707dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1708    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1709    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1710{
1711	uint64_t hashval = DTRACE_DYNHASH_VALID;
1712	dtrace_dynhash_t *hash = dstate->dtds_hash;
1713	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1714	processorid_t me = curcpu, cpu = me;
1715	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1716	size_t bucket, ksize;
1717	size_t chunksize = dstate->dtds_chunksize;
1718	uintptr_t kdata, lock, nstate;
1719	uint_t i;
1720
1721	ASSERT(nkeys != 0);
1722
1723	/*
1724	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1725	 * algorithm.  For the by-value portions, we perform the algorithm in
1726	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1727	 * bit, and seems to have only a minute effect on distribution.  For
1728	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1729	 * over each referenced byte.  It's painful to do this, but it's much
1730	 * better than pathological hash distribution.  The efficacy of the
1731	 * hashing algorithm (and a comparison with other algorithms) may be
1732	 * found by running the ::dtrace_dynstat MDB dcmd.
1733	 */
1734	for (i = 0; i < nkeys; i++) {
1735		if (key[i].dttk_size == 0) {
1736			uint64_t val = key[i].dttk_value;
1737
1738			hashval += (val >> 48) & 0xffff;
1739			hashval += (hashval << 10);
1740			hashval ^= (hashval >> 6);
1741
1742			hashval += (val >> 32) & 0xffff;
1743			hashval += (hashval << 10);
1744			hashval ^= (hashval >> 6);
1745
1746			hashval += (val >> 16) & 0xffff;
1747			hashval += (hashval << 10);
1748			hashval ^= (hashval >> 6);
1749
1750			hashval += val & 0xffff;
1751			hashval += (hashval << 10);
1752			hashval ^= (hashval >> 6);
1753		} else {
1754			/*
1755			 * This is incredibly painful, but it beats the hell
1756			 * out of the alternative.
1757			 */
1758			uint64_t j, size = key[i].dttk_size;
1759			uintptr_t base = (uintptr_t)key[i].dttk_value;
1760
1761			if (!dtrace_canload(base, size, mstate, vstate))
1762				break;
1763
1764			for (j = 0; j < size; j++) {
1765				hashval += dtrace_load8(base + j);
1766				hashval += (hashval << 10);
1767				hashval ^= (hashval >> 6);
1768			}
1769		}
1770	}
1771
1772	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1773		return (NULL);
1774
1775	hashval += (hashval << 3);
1776	hashval ^= (hashval >> 11);
1777	hashval += (hashval << 15);
1778
1779	/*
1780	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1781	 * comes out to be one of our two sentinel hash values.  If this
1782	 * actually happens, we set the hashval to be a value known to be a
1783	 * non-sentinel value.
1784	 */
1785	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1786		hashval = DTRACE_DYNHASH_VALID;
1787
1788	/*
1789	 * Yes, it's painful to do a divide here.  If the cycle count becomes
1790	 * important here, tricks can be pulled to reduce it.  (However, it's
1791	 * critical that hash collisions be kept to an absolute minimum;
1792	 * they're much more painful than a divide.)  It's better to have a
1793	 * solution that generates few collisions and still keeps things
1794	 * relatively simple.
1795	 */
1796	bucket = hashval % dstate->dtds_hashsize;
1797
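	/*
	 * The per-bucket lock word doubles as a version counter:  its low
	 * bit is set while a deallocation holds the bucket, and each
	 * acquire/release pair increments the word by two.  Lock-free
	 * readers (the NOALLOC path below) snapshot dtdh_lock before
	 * walking the chain and re-check it afterwards to detect a
	 * concurrent deallocation.
	 */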
1798	if (op == DTRACE_DYNVAR_DEALLOC) {
1799		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1800
1801		for (;;) {
1802			while ((lock = *lockp) & 1)
1803				continue;
1804
1805			if (dtrace_casptr((volatile void *)lockp,
1806			    (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
1807				break;
1808		}
1809
1810		dtrace_membar_producer();
1811	}
1812
1813top:
1814	prev = NULL;
1815	lock = hash[bucket].dtdh_lock;
1816
1817	dtrace_membar_consumer();
1818
1819	start = hash[bucket].dtdh_chain;
1820	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1821	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1822	    op != DTRACE_DYNVAR_DEALLOC));
1823
1824	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1825		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1826		dtrace_key_t *dkey = &dtuple->dtt_key[0];
1827
1828		if (dvar->dtdv_hashval != hashval) {
1829			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1830				/*
1831				 * We've reached the sink, and therefore the
1832				 * end of the hash chain; we can kick out of
1833				 * the loop knowing that we have seen a valid
1834				 * snapshot of state.
1835				 */
1836				ASSERT(dvar->dtdv_next == NULL);
1837				ASSERT(dvar == &dtrace_dynhash_sink);
1838				break;
1839			}
1840
1841			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1842				/*
1843				 * We've gone off the rails:  somewhere along
1844				 * the line, one of the members of this hash
1845				 * chain was deleted.  Note that we could also
1846				 * detect this by simply letting this loop run
1847				 * to completion, as we would eventually hit
1848				 * the end of the dirty list.  However, we
1849				 * want to avoid running the length of the
1850				 * dirty list unnecessarily (it might be quite
1851				 * long), so we catch this as early as
1852				 * possible by detecting the hash marker.  In
1853				 * this case, we simply set dvar to NULL and
1854				 * break; the conditional after the loop will
1855				 * send us back to top.
1856				 */
1857				dvar = NULL;
1858				break;
1859			}
1860
1861			goto next;
1862		}
1863
1864		if (dtuple->dtt_nkeys != nkeys)
1865			goto next;
1866
1867		for (i = 0; i < nkeys; i++, dkey++) {
1868			if (dkey->dttk_size != key[i].dttk_size)
1869				goto next; /* size or type mismatch */
1870
1871			if (dkey->dttk_size != 0) {
1872				if (dtrace_bcmp(
1873				    (void *)(uintptr_t)key[i].dttk_value,
1874				    (void *)(uintptr_t)dkey->dttk_value,
1875				    dkey->dttk_size))
1876					goto next;
1877			} else {
1878				if (dkey->dttk_value != key[i].dttk_value)
1879					goto next;
1880			}
1881		}
1882
1883		if (op != DTRACE_DYNVAR_DEALLOC)
1884			return (dvar);
1885
1886		ASSERT(dvar->dtdv_next == NULL ||
1887		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1888
1889		if (prev != NULL) {
1890			ASSERT(hash[bucket].dtdh_chain != dvar);
1891			ASSERT(start != dvar);
1892			ASSERT(prev->dtdv_next == dvar);
1893			prev->dtdv_next = dvar->dtdv_next;
1894		} else {
1895			if (dtrace_casptr(&hash[bucket].dtdh_chain,
1896			    start, dvar->dtdv_next) != start) {
1897				/*
1898				 * We have failed to atomically swing the
1899				 * hash table head pointer, presumably because
1900				 * of a conflicting allocation on another CPU.
1901				 * We need to reread the hash chain and try
1902				 * again.
1903				 */
1904				goto top;
1905			}
1906		}
1907
1908		dtrace_membar_producer();
1909
1910		/*
1911		 * Now set the hash value to indicate that it's free.
1912		 */
1913		ASSERT(hash[bucket].dtdh_chain != dvar);
1914		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1915
1916		dtrace_membar_producer();
1917
1918		/*
1919		 * Set the next pointer to point at the dirty list, and
1920		 * atomically swing the dirty pointer to the newly freed dvar.
1921		 */
1922		do {
1923			next = dcpu->dtdsc_dirty;
1924			dvar->dtdv_next = next;
1925		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1926
1927		/*
1928		 * Finally, unlock this hash bucket.
1929		 */
1930		ASSERT(hash[bucket].dtdh_lock == lock);
1931		ASSERT(lock & 1);
1932		hash[bucket].dtdh_lock++;
1933
1934		return (NULL);
1935next:
1936		prev = dvar;
1937		continue;
1938	}
1939
1940	if (dvar == NULL) {
1941		/*
1942		 * If dvar is NULL, it is because we went off the rails:
1943		 * one of the elements that we traversed in the hash chain
1944		 * was deleted while we were traversing it.  In this case,
1945		 * we assert that we aren't doing a dealloc (deallocs lock
1946		 * the hash bucket to prevent themselves from racing with
1947		 * one another), and retry the hash chain traversal.
1948		 */
1949		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1950		goto top;
1951	}
1952
1953	if (op != DTRACE_DYNVAR_ALLOC) {
1954		/*
1955		 * If we are not to allocate a new variable, we want to
1956		 * return NULL now.  Before we return, check that the value
1957		 * of the lock word hasn't changed.  If it has, we may have
1958		 * seen an inconsistent snapshot.
1959		 */
1960		if (op == DTRACE_DYNVAR_NOALLOC) {
1961			if (hash[bucket].dtdh_lock != lock)
1962				goto top;
1963		} else {
1964			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1965			ASSERT(hash[bucket].dtdh_lock == lock);
1966			ASSERT(lock & 1);
1967			hash[bucket].dtdh_lock++;
1968		}
1969
1970		return (NULL);
1971	}
1972
1973	/*
1974	 * We need to allocate a new dynamic variable.  The size we need is the
1975	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1976	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1977	 * the size of any referred-to data (dsize).  We then round the final
1978	 * size up to the chunksize for allocation.
1979	 */
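	/*
	 * For example, with two keys -- one by-value key (dttk_size of 0)
	 * and one 5-byte by-reference key -- ksize works out to
	 * 0 + P2ROUNDUP(5, 8) = 8, and the chunk must accommodate
	 * sizeof (dtrace_dynvar_t) (which embeds one dtrace_key_t), one
	 * additional dtrace_key_t, 8 bytes of key data and dsize bytes of
	 * variable data.
	 */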
1980	for (ksize = 0, i = 0; i < nkeys; i++)
1981		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1982
1983	/*
1984	 * This should be pretty much impossible, but could happen if, say,
1985	 * strange DIF specified the tuple.  Ideally, this should be an
1986	 * assertion and not an error condition -- but that requires that the
1987	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1988	 * bullet-proof.  (That is, it must not be able to be fooled by
1989	 * malicious DIF.)  Given the lack of backwards branches in DIF,
1990	 * solving this would presumably not amount to solving the Halting
1991	 * Problem -- but it still seems awfully hard.
1992	 */
1993	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1994	    ksize + dsize > chunksize) {
1995		dcpu->dtdsc_drops++;
1996		return (NULL);
1997	}
1998
1999	nstate = DTRACE_DSTATE_EMPTY;
2000
2001	do {
2002retry:
2003		free = dcpu->dtdsc_free;
2004
2005		if (free == NULL) {
2006			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2007			void *rval;
2008
2009			if (clean == NULL) {
2010				/*
2011				 * We're out of dynamic variable space on
2012				 * this CPU.  Unless we have tried all CPUs,
2013				 * we'll try to allocate from a different
2014				 * CPU.
2015				 */
2016				switch (dstate->dtds_state) {
2017				case DTRACE_DSTATE_CLEAN: {
2018					void *sp = &dstate->dtds_state;
2019
2020					if (++cpu >= NCPU)
2021						cpu = 0;
2022
2023					if (dcpu->dtdsc_dirty != NULL &&
2024					    nstate == DTRACE_DSTATE_EMPTY)
2025						nstate = DTRACE_DSTATE_DIRTY;
2026
2027					if (dcpu->dtdsc_rinsing != NULL)
2028						nstate = DTRACE_DSTATE_RINSING;
2029
2030					dcpu = &dstate->dtds_percpu[cpu];
2031
2032					if (cpu != me)
2033						goto retry;
2034
2035					(void) dtrace_cas32(sp,
2036					    DTRACE_DSTATE_CLEAN, nstate);
2037
2038					/*
2039					 * To increment the correct bean
2040					 * counter, take another lap.
2041					 */
2042					goto retry;
2043				}
2044
2045				case DTRACE_DSTATE_DIRTY:
2046					dcpu->dtdsc_dirty_drops++;
2047					break;
2048
2049				case DTRACE_DSTATE_RINSING:
2050					dcpu->dtdsc_rinsing_drops++;
2051					break;
2052
2053				case DTRACE_DSTATE_EMPTY:
2054					dcpu->dtdsc_drops++;
2055					break;
2056				}
2057
2058				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2059				return (NULL);
2060			}
2061
2062			/*
2063			 * The clean list appears to be non-empty.  We want to
2064			 * move the clean list to the free list; we start by
2065			 * moving the clean pointer aside.
2066			 */
2067			if (dtrace_casptr(&dcpu->dtdsc_clean,
2068			    clean, NULL) != clean) {
2069				/*
2070				 * We are in one of two situations:
2071				 *
2072				 *  (a)	The clean list was switched to the
2073				 *	free list by another CPU.
2074				 *
2075				 *  (b)	The clean list was added to by the
2076				 *	cleansing cyclic.
2077				 *
2078				 * In either of these situations, we can
2079				 * just reattempt the free list allocation.
2080				 */
2081				goto retry;
2082			}
2083
2084			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2085
2086			/*
2087			 * Now we'll move the clean list to our free list.
2088			 * It's impossible for this to fail:  the only way
2089			 * the free list can be updated is through this
2090			 * code path, and only one CPU can own the clean list.
2091			 * Thus, it would only be possible for this to fail if
2092			 * this code were racing with dtrace_dynvar_clean().
2093			 * (That is, if dtrace_dynvar_clean() updated the clean
2094			 * list, and we ended up racing to update the free
2095			 * list.)  This race is prevented by the dtrace_sync()
2096			 * in dtrace_dynvar_clean() -- which flushes the
2097			 * owners of the clean lists out before resetting
2098			 * the clean lists.
2099			 */
2100			dcpu = &dstate->dtds_percpu[me];
2101			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2102			ASSERT(rval == NULL);
2103			goto retry;
2104		}
2105
2106		dvar = free;
2107		new_free = dvar->dtdv_next;
2108	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2109
2110	/*
2111	 * We have now allocated a new chunk.  We copy the tuple keys into the
2112	 * tuple array and copy any referenced key data into the data space
2113	 * following the tuple array.  As we do this, we relocate dttk_value
2114	 * in the final tuple to point to the key data address in the chunk.
2115	 */
2116	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2117	dvar->dtdv_data = (void *)(kdata + ksize);
2118	dvar->dtdv_tuple.dtt_nkeys = nkeys;
2119
2120	for (i = 0; i < nkeys; i++) {
2121		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2122		size_t kesize = key[i].dttk_size;
2123
2124		if (kesize != 0) {
2125			dtrace_bcopy(
2126			    (const void *)(uintptr_t)key[i].dttk_value,
2127			    (void *)kdata, kesize);
2128			dkey->dttk_value = kdata;
2129			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2130		} else {
2131			dkey->dttk_value = key[i].dttk_value;
2132		}
2133
2134		dkey->dttk_size = kesize;
2135	}
2136
2137	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2138	dvar->dtdv_hashval = hashval;
2139	dvar->dtdv_next = start;
2140
2141	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2142		return (dvar);
2143
2144	/*
2145	 * The cas has failed.  Either another CPU is adding an element to
2146	 * this hash chain, or another CPU is deleting an element from this
2147	 * hash chain.  The simplest way to deal with both of these cases
2148	 * (though not necessarily the most efficient) is to free our
2149	 * allocated block and tail-call ourselves.  Note that the free is
2150	 * to the dirty list and _not_ to the free list.  This is to prevent
2151	 * races with allocators, above.
2152	 */
2153	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2154
2155	dtrace_membar_producer();
2156
2157	do {
2158		free = dcpu->dtdsc_dirty;
2159		dvar->dtdv_next = free;
2160	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2161
2162	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2163}
2164
2165/*ARGSUSED*/
2166static void
2167dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2168{
2169	if ((int64_t)nval < (int64_t)*oval)
2170		*oval = nval;
2171}
2172
2173/*ARGSUSED*/
2174static void
2175dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2176{
2177	if ((int64_t)nval > (int64_t)*oval)
2178		*oval = nval;
2179}
2180
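/*
 * quantize() uses a fixed array of power-of-two buckets centered on zero:
 * DTRACE_QUANTIZE_BUCKETVAL(i) yields ..., -4, -2, -1, 0, 1, 2, 4, ...,
 * with DTRACE_QUANTIZE_ZEROBUCKET indexing the zero bucket.  A value of 10,
 * for example, is counted in the bucket whose value is 8 -- that is, the
 * bucket covering [8, 16).
 */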
2181static void
2182dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2183{
2184	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2185	int64_t val = (int64_t)nval;
2186
2187	if (val < 0) {
2188		for (i = 0; i < zero; i++) {
2189			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2190				quanta[i] += incr;
2191				return;
2192			}
2193		}
2194	} else {
2195		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2196			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2197				quanta[i - 1] += incr;
2198				return;
2199			}
2200		}
2201
2202		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2203		return;
2204	}
2205
2206	ASSERT(0);
2207}
2208
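/*
 * lquantize() divides the range [base, base + levels * step) into "levels"
 * linear buckets, with an underflow bucket (lquanta[0]) below and an
 * overflow bucket (lquanta[levels + 1]) above.  For example, with a base
 * of 0, a step of 10 and 10 levels, a value of 35 yields level
 * (35 - 0) / 10 = 3 and is counted in lquanta[4].
 */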
2209static void
2210dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2211{
2212	uint64_t arg = *lquanta++;
2213	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2214	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2215	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2216	int32_t val = (int32_t)nval, level;
2217
2218	ASSERT(step != 0);
2219	ASSERT(levels != 0);
2220
2221	if (val < base) {
2222		/*
2223		 * This is an underflow.
2224		 */
2225		lquanta[0] += incr;
2226		return;
2227	}
2228
2229	level = (val - base) / step;
2230
2231	if (level < levels) {
2232		lquanta[level + 1] += incr;
2233		return;
2234	}
2235
2236	/*
2237	 * This is an overflow.
2238	 */
2239	lquanta[levels + 1] += incr;
2240}
2241
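/*
 * To illustrate the bucket arithmetic below:  with a factor of 10, a low
 * of 0, a high of 2 and 10 steps, values below 1 land in bucket 0, and
 * each order of magnitude ([1, 10), [10, 100) and [100, 1000)) then
 * contributes nsteps - nsteps / factor = 9 buckets.  A value of 42 falls
 * within the [10, 100) order, so its bucket is
 * base + (42 - 10) / (100 / 10) = 10 + 3 = 13.
 */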
2242static int
2243dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
2244    uint16_t high, uint16_t nsteps, int64_t value)
2245{
2246	int64_t this = 1, last, next;
2247	int base = 1, order;
2248
2249	ASSERT(factor <= nsteps);
2250	ASSERT(nsteps % factor == 0);
2251
2252	for (order = 0; order < low; order++)
2253		this *= factor;
2254
2255	/*
2256	 * If our value is less than our factor taken to the power of the
2257	 * low order of magnitude, it goes into the zeroth bucket.
2258	 */
2259	if (value < (last = this))
2260		return (0);
2261
2262	for (this *= factor; order <= high; order++) {
2263		int nbuckets = this > nsteps ? nsteps : this;
2264
2265		if ((next = this * factor) < this) {
2266			/*
2267			 * We should not generally get log/linear quantizations
2268				 * with a high magnitude that allows 64 bits to
2269			 * overflow, but we nonetheless protect against this
2270			 * by explicitly checking for overflow, and clamping
2271			 * our value accordingly.
2272			 */
2273			value = this - 1;
2274		}
2275
2276		if (value < this) {
2277			/*
2278			 * If our value lies within this order of magnitude,
2279			 * determine its position by taking the offset within
2280			 * the order of magnitude, dividing by the bucket
2281			 * width, and adding to our (accumulated) base.
2282			 */
2283			return (base + (value - last) / (this / nbuckets));
2284		}
2285
2286		base += nbuckets - (nbuckets / factor);
2287		last = this;
2288		this = next;
2289	}
2290
2291	/*
2292	 * Our value is greater than or equal to our factor taken to the
2293	 * power of one plus the high magnitude -- return the top bucket.
2294	 */
2295	return (base);
2296}
2297
2298static void
2299dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2300{
2301	uint64_t arg = *llquanta++;
2302	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2303	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2304	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2305	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2306
2307	llquanta[dtrace_aggregate_llquantize_bucket(factor,
2308	    low, high, nsteps, nval)] += incr;
2309}
2310
2311/*ARGSUSED*/
2312static void
2313dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2314{
2315	data[0]++;
2316	data[1] += nval;
2317}
2318
2319/*ARGSUSED*/
2320static void
2321dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2322{
2323	int64_t snval = (int64_t)nval;
2324	uint64_t tmp[2];
2325
2326	data[0]++;
2327	data[1] += nval;
2328
2329	/*
2330	 * What we want to say here is:
2331	 *
2332	 * data[2] += nval * nval;
2333	 *
2334	 * But given that nval is 64-bit, we could easily overflow, so
2335	 * we do this as 128-bit arithmetic.
2336	 */
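	/*
	 * For example, an nval of 2^33 squares to 2^66, which cannot be
	 * represented in 64 bits; dtrace_multiply_128() instead leaves the
	 * full product in the two-word tmp, which dtrace_add_128() then
	 * adds into the running 128-bit total kept at data[2] and data[3].
	 */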
2337	if (snval < 0)
2338		snval = -snval;
2339
2340	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2341	dtrace_add_128(data + 2, tmp, data + 2);
2342}
2343
2344/*ARGSUSED*/
2345static void
2346dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2347{
2348	*oval = *oval + 1;
2349}
2350
2351/*ARGSUSED*/
2352static void
2353dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2354{
2355	*oval += nval;
2356}
2357
2358/*
2359 * Aggregate given the tuple in the principal data buffer, and the aggregating
2360 * action denoted by the specified dtrace_aggregation_t.  The aggregation
2361 * buffer is specified as the buf parameter.  This routine does not return
2362 * failure; if there is no space in the aggregation buffer, the data will be
2363 * dropped, and a corresponding counter incremented.
2364 */
2365static void
2366dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2367    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2368{
2369	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2370	uint32_t i, ndx, size, fsize;
2371	uint32_t align = sizeof (uint64_t) - 1;
2372	dtrace_aggbuffer_t *agb;
2373	dtrace_aggkey_t *key;
2374	uint32_t hashval = 0, limit, isstr;
2375	caddr_t tomax, data, kdata;
2376	dtrace_actkind_t action;
2377	dtrace_action_t *act;
2378	uintptr_t offs;
2379
2380	if (buf == NULL)
2381		return;
2382
2383	if (!agg->dtag_hasarg) {
2384		/*
2385		 * Currently, only quantize() and lquantize() take additional
2386		 * arguments, and they have the same semantics:  an increment
2387		 * value that defaults to 1 when not present.  If additional
2388		 * aggregating actions take arguments, the setting of the
2389		 * default argument value will presumably have to become more
2390		 * sophisticated...
2391		 */
2392		arg = 1;
2393	}
2394
2395	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2396	size = rec->dtrd_offset - agg->dtag_base;
2397	fsize = size + rec->dtrd_size;
2398
2399	ASSERT(dbuf->dtb_tomax != NULL);
2400	data = dbuf->dtb_tomax + offset + agg->dtag_base;
2401
2402	if ((tomax = buf->dtb_tomax) == NULL) {
2403		dtrace_buffer_drop(buf);
2404		return;
2405	}
2406
2407	/*
2408	 * The metastructure is always at the bottom of the buffer.
2409	 */
2410	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2411	    sizeof (dtrace_aggbuffer_t));
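	/*
	 * Schematically, the aggregation buffer is laid out as follows,
	 * with record data growing up from the bottom and dtrace_aggkey_t's
	 * being carved off of dtagb_free, growing down from the hash table:
	 *
	 *	+--------------------------------+ <- tomax + dtb_size
	 *	| dtrace_aggbuffer_t             |
	 *	+--------------------------------+ <- agb
	 *	| hash chain pointers            |
	 *	+--------------------------------+ <- dtagb_hash
	 *	| dtrace_aggkey_t's (grow down)  |
	 *	+--------------------------------+ <- dtagb_free
	 *	|           ...free...           |
	 *	+--------------------------------+ <- tomax + dtb_offset
	 *	| key/value data (grows up)      |
	 *	+--------------------------------+ <- tomax
	 */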
2412
2413	if (buf->dtb_offset == 0) {
2414		/*
2415		 * We just kludge up approximately 1/8th of the size to be
2416		 * buckets.  If this guess ends up being routinely
2417		 * off-the-mark, we may need to dynamically readjust this
2418		 * based on past performance.
2419		 */
2420		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2421
2422		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2423		    (uintptr_t)tomax || hashsize == 0) {
2424			/*
2425			 * We've been given a ludicrously small buffer;
2426			 * increment our drop count and leave.
2427			 */
2428			dtrace_buffer_drop(buf);
2429			return;
2430		}
2431
2432		/*
2433		 * And now, a pathetic attempt to get an odd (or
2434		 * perchance, a prime) hash size for better hash distribution.
2435		 */
2436		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2437			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2438
2439		agb->dtagb_hashsize = hashsize;
2440		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2441		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2442		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2443
2444		for (i = 0; i < agb->dtagb_hashsize; i++)
2445			agb->dtagb_hash[i] = NULL;
2446	}
2447
2448	ASSERT(agg->dtag_first != NULL);
2449	ASSERT(agg->dtag_first->dta_intuple);
2450
2451	/*
2452	 * Calculate the hash value based on the key.  Note that we _don't_
2453	 * include the aggid in the hashing (but we will store it as part of
2454	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2455	 * algorithm: a simple, quick algorithm that has no known funnels, and
2456	 * gets good distribution in practice.  The efficacy of the hashing
2457	 * algorithm (and a comparison with other algorithms) may be found by
2458	 * running the ::dtrace_aggstat MDB dcmd.
2459	 */
2460	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2461		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2462		limit = i + act->dta_rec.dtrd_size;
2463		ASSERT(limit <= size);
2464		isstr = DTRACEACT_ISSTRING(act);
2465
2466		for (; i < limit; i++) {
2467			hashval += data[i];
2468			hashval += (hashval << 10);
2469			hashval ^= (hashval >> 6);
2470
2471			if (isstr && data[i] == '\0')
2472				break;
2473		}
2474	}
2475
2476	hashval += (hashval << 3);
2477	hashval ^= (hashval >> 11);
2478	hashval += (hashval << 15);
2479
2480	/*
2481	 * Yes, the divide here is expensive -- but it's generally the least
2482	 * of the performance issues given the amount of data that we iterate
2483	 * over to compute hash values, compare data, etc.
2484	 */
2485	ndx = hashval % agb->dtagb_hashsize;
2486
2487	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2488		ASSERT((caddr_t)key >= tomax);
2489		ASSERT((caddr_t)key < tomax + buf->dtb_size);
2490
2491		if (hashval != key->dtak_hashval || key->dtak_size != size)
2492			continue;
2493
2494		kdata = key->dtak_data;
2495		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2496
2497		for (act = agg->dtag_first; act->dta_intuple;
2498		    act = act->dta_next) {
2499			i = act->dta_rec.dtrd_offset - agg->dtag_base;
2500			limit = i + act->dta_rec.dtrd_size;
2501			ASSERT(limit <= size);
2502			isstr = DTRACEACT_ISSTRING(act);
2503
2504			for (; i < limit; i++) {
2505				if (kdata[i] != data[i])
2506					goto next;
2507
2508				if (isstr && data[i] == '\0')
2509					break;
2510			}
2511		}
2512
2513		if (action != key->dtak_action) {
2514			/*
2515			 * We are aggregating on the same value in the same
2516			 * aggregation with two different aggregating actions.
2517			 * (This should have been picked up in the compiler,
2518			 * so we may be dealing with errant or devious DIF.)
2519			 * This is an error condition; we indicate as much,
2520			 * and return.
2521			 */
2522			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2523			return;
2524		}
2525
2526		/*
2527		 * This is a hit:  we need to apply the aggregator to
2528		 * the value at this key.
2529		 */
2530		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2531		return;
2532next:
2533		continue;
2534	}
2535
2536	/*
2537	 * We didn't find it.  We need to allocate some zero-filled space,
2538	 * link it into the hash table appropriately, and apply the aggregator
2539	 * to the (zero-filled) value.
2540	 */
2541	offs = buf->dtb_offset;
2542	while (offs & (align - 1))
2543		offs += sizeof (uint32_t);
2544
2545	/*
2546	 * If we don't have enough room to both allocate a new key _and_
2547	 * its associated data, increment the drop count and return.
2548	 */
2549	if ((uintptr_t)tomax + offs + fsize >
2550	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2551		dtrace_buffer_drop(buf);
2552		return;
2553	}
2554
2555	/*CONSTCOND*/
2556	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2557	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2558	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2559
2560	key->dtak_data = kdata = tomax + offs;
2561	buf->dtb_offset = offs + fsize;
2562
2563	/*
2564	 * Now copy the data across.
2565	 */
2566	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
2567
2568	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2569		kdata[i] = data[i];
2570
2571	/*
2572	 * Because strings are not zeroed out by default, we need to iterate
2573	 * looking for actions that store strings, and we need to explicitly
2574	 * pad these strings out with zeroes.
2575	 */
2576	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2577		int nul;
2578
2579		if (!DTRACEACT_ISSTRING(act))
2580			continue;
2581
2582		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2583		limit = i + act->dta_rec.dtrd_size;
2584		ASSERT(limit <= size);
2585
2586		for (nul = 0; i < limit; i++) {
2587			if (nul) {
2588				kdata[i] = '\0';
2589				continue;
2590			}
2591
2592			if (data[i] != '\0')
2593				continue;
2594
2595			nul = 1;
2596		}
2597	}
2598
2599	for (i = size; i < fsize; i++)
2600		kdata[i] = 0;
2601
2602	key->dtak_hashval = hashval;
2603	key->dtak_size = size;
2604	key->dtak_action = action;
2605	key->dtak_next = agb->dtagb_hash[ndx];
2606	agb->dtagb_hash[ndx] = key;
2607
2608	/*
2609	 * Finally, apply the aggregator.
2610	 */
2611	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2612	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2613}
2614
2615/*
2616 * Given consumer state, this routine finds a speculation in the INACTIVE
2617 * state and transitions it into the ACTIVE state.  If there is no speculation
2618 * in the INACTIVE state, 0 is returned and the busy/unavail statistic is
2619 * bumped -- it is up to the caller to take appropriate action.
2620 */
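/*
 * Note that the identifier returned here is one-based:  speculation i is
 * returned as i + 1, with 0 reserved to mean "no speculation available."
 * The returned value is what a clause subsequently passes as the "which"
 * argument to dtrace_speculation_buffer(), dtrace_speculation_commit() and
 * dtrace_speculation_discard(), each of which rejects 0 immediately.
 */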
2621static int
2622dtrace_speculation(dtrace_state_t *state)
2623{
2624	int i = 0;
2625	dtrace_speculation_state_t current;
2626	uint32_t *stat = &state->dts_speculations_unavail, count;
2627
2628	while (i < state->dts_nspeculations) {
2629		dtrace_speculation_t *spec = &state->dts_speculations[i];
2630
2631		current = spec->dtsp_state;
2632
2633		if (current != DTRACESPEC_INACTIVE) {
2634			if (current == DTRACESPEC_COMMITTINGMANY ||
2635			    current == DTRACESPEC_COMMITTING ||
2636			    current == DTRACESPEC_DISCARDING)
2637				stat = &state->dts_speculations_busy;
2638			i++;
2639			continue;
2640		}
2641
2642		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2643		    current, DTRACESPEC_ACTIVE) == current)
2644			return (i + 1);
2645	}
2646
2647	/*
2648	 * We couldn't find a speculation.  If we found as much as a single
2649	 * busy speculation buffer, we'll attribute this failure as "busy"
2650	 * instead of "unavail".
2651	 */
2652	do {
2653		count = *stat;
2654	} while (dtrace_cas32(stat, count, count + 1) != count);
2655
2656	return (0);
2657}
2658
2659/*
2660 * This routine commits an active speculation.  If the specified speculation
2661 * is not in a valid state to perform a commit(), this routine will silently do
2662 * nothing.  The state of the specified speculation is transitioned according
2663 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2664 */
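/*
 * In summary, the commit-side transitions effected below are:
 *
 *	ACTIVE		-> COMMITTING
 *	ACTIVEONE	-> COMMITTING (if our CPU's buffer holds the data)
 *	ACTIVEONE	-> COMMITTINGMANY (otherwise)
 *	ACTIVEMANY	-> COMMITTINGMANY
 *	COMMITTING	-> INACTIVE (once the copy or drop is complete)
 *
 * COMMITTINGMANY speculations are instead cleaned asynchronously by
 * dtrace_speculation_clean().
 */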
2665static void
2666dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2667    dtrace_specid_t which)
2668{
2669	dtrace_speculation_t *spec;
2670	dtrace_buffer_t *src, *dest;
2671	uintptr_t daddr, saddr, dlimit, slimit;
2672	dtrace_speculation_state_t current, new = 0;
2673	intptr_t offs;
2674	uint64_t timestamp;
2675
2676	if (which == 0)
2677		return;
2678
2679	if (which > state->dts_nspeculations) {
2680		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2681		return;
2682	}
2683
2684	spec = &state->dts_speculations[which - 1];
2685	src = &spec->dtsp_buffer[cpu];
2686	dest = &state->dts_buffer[cpu];
2687
2688	do {
2689		current = spec->dtsp_state;
2690
2691		if (current == DTRACESPEC_COMMITTINGMANY)
2692			break;
2693
2694		switch (current) {
2695		case DTRACESPEC_INACTIVE:
2696		case DTRACESPEC_DISCARDING:
2697			return;
2698
2699		case DTRACESPEC_COMMITTING:
2700			/*
2701			 * This is only possible if we are (a) commit()'ing
2702			 * without having done a prior speculate() on this CPU
2703			 * and (b) racing with another commit() on a different
2704			 * CPU.  There's nothing to do -- we just assert that
2705			 * our offset is 0.
2706			 */
2707			ASSERT(src->dtb_offset == 0);
2708			return;
2709
2710		case DTRACESPEC_ACTIVE:
2711			new = DTRACESPEC_COMMITTING;
2712			break;
2713
2714		case DTRACESPEC_ACTIVEONE:
2715			/*
2716			 * This speculation is active on one CPU.  If our
2717			 * buffer offset is non-zero, we know that the one CPU
2718			 * must be us.  Otherwise, we are committing on a
2719			 * different CPU from the speculate(), and we must
2720			 * rely on being asynchronously cleaned.
2721			 */
2722			if (src->dtb_offset != 0) {
2723				new = DTRACESPEC_COMMITTING;
2724				break;
2725			}
2726			/*FALLTHROUGH*/
2727
2728		case DTRACESPEC_ACTIVEMANY:
2729			new = DTRACESPEC_COMMITTINGMANY;
2730			break;
2731
2732		default:
2733			ASSERT(0);
2734		}
2735	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2736	    current, new) != current);
2737
2738	/*
2739	 * We have set the state to indicate that we are committing this
2740	 * speculation.  Now reserve the necessary space in the destination
2741	 * buffer.
2742	 */
2743	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2744	    sizeof (uint64_t), state, NULL)) < 0) {
2745		dtrace_buffer_drop(dest);
2746		goto out;
2747	}
2748
2749	/*
2750	 * We have sufficient space to copy the speculative buffer into the
2751	 * primary buffer.  First, modify the speculative buffer, filling
2752	 * in the timestamp of all entries with the current time.  The data
2753	 * must have the commit() time rather than the time it was traced,
2754	 * so that all entries in the primary buffer are in timestamp order.
2755	 */
2756	timestamp = dtrace_gethrtime();
2757	saddr = (uintptr_t)src->dtb_tomax;
2758	slimit = saddr + src->dtb_offset;
2759	while (saddr < slimit) {
2760		size_t size;
2761		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2762
2763		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2764			saddr += sizeof (dtrace_epid_t);
2765			continue;
2766		}
2767		ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2768		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2769
2770		ASSERT3U(saddr + size, <=, slimit);
2771		ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2772		ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2773
2774		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2775
2776		saddr += size;
2777	}
2778
2779	/*
2780	 * Copy the buffer across.  (Note that this is a
2781	 * highly suboptimal bcopy(); in the unlikely event that this becomes
2782	 * a serious performance issue, a high-performance DTrace-specific
2783	 * bcopy() should obviously be invented.)
2784	 */
2785	daddr = (uintptr_t)dest->dtb_tomax + offs;
2786	dlimit = daddr + src->dtb_offset;
2787	saddr = (uintptr_t)src->dtb_tomax;
2788
2789	/*
2790	 * First, the aligned portion.
2791	 */
2792	while (dlimit - daddr >= sizeof (uint64_t)) {
2793		*((uint64_t *)daddr) = *((uint64_t *)saddr);
2794
2795		daddr += sizeof (uint64_t);
2796		saddr += sizeof (uint64_t);
2797	}
2798
2799	/*
2800	 * Now any left-over bit...
2801	 */
2802	while (dlimit - daddr)
2803		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2804
2805	/*
2806	 * Finally, commit the reserved space in the destination buffer.
2807	 */
2808	dest->dtb_offset = offs + src->dtb_offset;
2809
2810out:
2811	/*
2812	 * If we're lucky enough to be the only active CPU on this speculation
2813	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2814	 */
2815	if (current == DTRACESPEC_ACTIVE ||
2816	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2817		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2818		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2819
2820		ASSERT(rval == DTRACESPEC_COMMITTING);
2821	}
2822
2823	src->dtb_offset = 0;
2824	src->dtb_xamot_drops += src->dtb_drops;
2825	src->dtb_drops = 0;
2826}
2827
2828/*
2829 * This routine discards an active speculation.  If the specified speculation
2830 * is not in a valid state to perform a discard(), this routine will silently
2831 * do nothing.  The state of the specified speculation is transitioned
2832 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
2833 */
2834static void
2835dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2836    dtrace_specid_t which)
2837{
2838	dtrace_speculation_t *spec;
2839	dtrace_speculation_state_t current, new = 0;
2840	dtrace_buffer_t *buf;
2841
2842	if (which == 0)
2843		return;
2844
2845	if (which > state->dts_nspeculations) {
2846		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2847		return;
2848	}
2849
2850	spec = &state->dts_speculations[which - 1];
2851	buf = &spec->dtsp_buffer[cpu];
2852
2853	do {
2854		current = spec->dtsp_state;
2855
2856		switch (current) {
2857		case DTRACESPEC_INACTIVE:
2858		case DTRACESPEC_COMMITTINGMANY:
2859		case DTRACESPEC_COMMITTING:
2860		case DTRACESPEC_DISCARDING:
2861			return;
2862
2863		case DTRACESPEC_ACTIVE:
2864		case DTRACESPEC_ACTIVEMANY:
2865			new = DTRACESPEC_DISCARDING;
2866			break;
2867
2868		case DTRACESPEC_ACTIVEONE:
2869			if (buf->dtb_offset != 0) {
2870				new = DTRACESPEC_INACTIVE;
2871			} else {
2872				new = DTRACESPEC_DISCARDING;
2873			}
2874			break;
2875
2876		default:
2877			ASSERT(0);
2878		}
2879	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2880	    current, new) != current);
2881
2882	buf->dtb_offset = 0;
2883	buf->dtb_drops = 0;
2884}
2885
2886/*
2887 * Note:  not called from probe context.  This function is called
2888 * asynchronously from cross call context to clean any speculations that are
2889 * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2890 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2891 * speculation.
2892 */
2893static void
2894dtrace_speculation_clean_here(dtrace_state_t *state)
2895{
2896	dtrace_icookie_t cookie;
2897	processorid_t cpu = curcpu;
2898	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2899	dtrace_specid_t i;
2900
2901	cookie = dtrace_interrupt_disable();
2902
2903	if (dest->dtb_tomax == NULL) {
2904		dtrace_interrupt_enable(cookie);
2905		return;
2906	}
2907
2908	for (i = 0; i < state->dts_nspeculations; i++) {
2909		dtrace_speculation_t *spec = &state->dts_speculations[i];
2910		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2911
2912		if (src->dtb_tomax == NULL)
2913			continue;
2914
2915		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2916			src->dtb_offset = 0;
2917			continue;
2918		}
2919
2920		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2921			continue;
2922
2923		if (src->dtb_offset == 0)
2924			continue;
2925
2926		dtrace_speculation_commit(state, cpu, i + 1);
2927	}
2928
2929	dtrace_interrupt_enable(cookie);
2930}
2931
2932/*
2933 * Note:  not called from probe context.  This function is called
2934 * asynchronously (and at a regular interval) to clean any speculations that
2935 * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2936 * is work to be done, it cross calls all CPUs to perform that work;
2937 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
2938 * INACTIVE state until they have been cleaned by all CPUs.
2939 */
2940static void
2941dtrace_speculation_clean(dtrace_state_t *state)
2942{
2943	int work = 0, rv;
2944	dtrace_specid_t i;
2945
2946	for (i = 0; i < state->dts_nspeculations; i++) {
2947		dtrace_speculation_t *spec = &state->dts_speculations[i];
2948
2949		ASSERT(!spec->dtsp_cleaning);
2950
2951		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2952		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2953			continue;
2954
2955		work++;
2956		spec->dtsp_cleaning = 1;
2957	}
2958
2959	if (!work)
2960		return;
2961
2962	dtrace_xcall(DTRACE_CPUALL,
2963	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2964
2965	/*
2966	 * We now know that all CPUs have committed or discarded their
2967	 * speculation buffers, as appropriate.  We can now set the state
2968	 * to inactive.
2969	 */
2970	for (i = 0; i < state->dts_nspeculations; i++) {
2971		dtrace_speculation_t *spec = &state->dts_speculations[i];
2972		dtrace_speculation_state_t current, new;
2973
2974		if (!spec->dtsp_cleaning)
2975			continue;
2976
2977		current = spec->dtsp_state;
2978		ASSERT(current == DTRACESPEC_DISCARDING ||
2979		    current == DTRACESPEC_COMMITTINGMANY);
2980
2981		new = DTRACESPEC_INACTIVE;
2982
2983		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2984		ASSERT(rv == current);
2985		spec->dtsp_cleaning = 0;
2986	}
2987}
2988
2989/*
2990 * Called as part of a speculate() to get the speculative buffer associated
2991 * with a given speculation.  Returns NULL if the specified speculation is not
2992 * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2993 * the active CPU is not the specified CPU -- the speculation will be
2994 * atomically transitioned into the ACTIVEMANY state.
2995 */
2996static dtrace_buffer_t *
2997dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2998    dtrace_specid_t which)
2999{
3000	dtrace_speculation_t *spec;
3001	dtrace_speculation_state_t current, new = 0;
3002	dtrace_buffer_t *buf;
3003
3004	if (which == 0)
3005		return (NULL);
3006
3007	if (which > state->dts_nspeculations) {
3008		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3009		return (NULL);
3010	}
3011
3012	spec = &state->dts_speculations[which - 1];
3013	buf = &spec->dtsp_buffer[cpuid];
3014
3015	do {
3016		current = spec->dtsp_state;
3017
3018		switch (current) {
3019		case DTRACESPEC_INACTIVE:
3020		case DTRACESPEC_COMMITTINGMANY:
3021		case DTRACESPEC_DISCARDING:
3022			return (NULL);
3023
3024		case DTRACESPEC_COMMITTING:
3025			ASSERT(buf->dtb_offset == 0);
3026			return (NULL);
3027
3028		case DTRACESPEC_ACTIVEONE:
3029			/*
3030			 * This speculation is currently active on one CPU.
3031			 * Check the offset in the buffer; if it's non-zero,
3032			 * that CPU must be us (and we leave the state alone).
3033			 * If it's zero, assume that we're starting on a new
3034			 * CPU -- and change the state to indicate that the
3035			 * speculation is active on more than one CPU.
3036			 */
3037			if (buf->dtb_offset != 0)
3038				return (buf);
3039
3040			new = DTRACESPEC_ACTIVEMANY;
3041			break;
3042
3043		case DTRACESPEC_ACTIVEMANY:
3044			return (buf);
3045
3046		case DTRACESPEC_ACTIVE:
3047			new = DTRACESPEC_ACTIVEONE;
3048			break;
3049
3050		default:
3051			ASSERT(0);
3052		}
3053	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3054	    current, new) != current);
3055
3056	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3057	return (buf);
3058}
3059
3060/*
3061 * Return a string.  In the event that the user lacks the privilege to access
3062 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3063 * don't fail access checking.
3064 *
3065 * dtrace_dif_variable() uses this routine as a helper for various
3066 * builtin values such as 'execname' and 'probefunc'.
3067 */
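/*
 * For example, copying out the string "sshd" consumes 5 bytes of scratch
 * (dtrace_strlen() plus the terminating NUL).  Scratch is bump-allocated:
 * dtms_scratch_ptr simply advances, and the region is reset for each
 * probe firing rather than freed piecemeal.
 */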
3068uintptr_t
3069dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3070    dtrace_mstate_t *mstate)
3071{
3072	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3073	uintptr_t ret;
3074	size_t strsz;
3075
3076	/*
3077	 * The easy case: this probe is allowed to read all of memory, so
3078	 * we can just return this as a vanilla pointer.
3079	 */
3080	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3081		return (addr);
3082
3083	/*
3084	 * This is the tougher case: we copy the string in question from
3085	 * kernel memory into scratch memory and return it that way: this
3086	 * ensures that we won't trip up when access checking tests the
3087	 * BYREF return value.
3088	 */
3089	strsz = dtrace_strlen((char *)addr, size) + 1;
3090
3091	if (mstate->dtms_scratch_ptr + strsz >
3092	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3093		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3094		return (0);
3095	}
3096
3097	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3098	    strsz);
3099	ret = mstate->dtms_scratch_ptr;
3100	mstate->dtms_scratch_ptr += strsz;
3101	return (ret);
3102}
3103
3104/*
3105 * Return a string from a memory address which is known to have one or
3106 * more concatenated, individually zero-terminated, sub-strings.
3107 * In the event that the user lacks the privilege to access
3108 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3109 * don't fail access checking.
3110 *
3111 * dtrace_dif_variable() uses this routine as a helper for various
3112 * builtin values such as 'execargs'.
3113 */
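/*
 * For example, the packed argument vector "ls\0-la\0/tmp\0" (strsz of 12)
 * is copied to scratch and rewritten in place as "ls -la /tmp":  each
 * embedded NUL except the final terminator is replaced with a space.
 */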
3114static uintptr_t
3115dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
3116    dtrace_mstate_t *mstate)
3117{
3118	char *p;
3119	size_t i;
3120	uintptr_t ret;
3121
3122	if (mstate->dtms_scratch_ptr + strsz >
3123	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3124		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3125		return (0);
3126	}
3127
3128	dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3129	    strsz);
3130
3131	/* Replace sub-string termination characters with a space. */
3132	for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
3133	    p++, i++)
3134		if (*p == '\0')
3135			*p = ' ';
3136
3137	ret = mstate->dtms_scratch_ptr;
3138	mstate->dtms_scratch_ptr += strsz;
3139	return (ret);
3140}
3141
3142/*
3143 * This function implements the DIF emulator's variable lookups.  The emulator
3144 * passes a reserved variable identifier and optional built-in array index.
3145 */
3146static uint64_t
3147dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3148    uint64_t ndx)
3149{
3150	/*
3151	 * If we're accessing one of the uncached arguments, we'll turn this
3152	 * into a reference in the args array.
3153	 */
3154	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3155		ndx = v - DIF_VAR_ARG0;
3156		v = DIF_VAR_ARGS;
3157	}
3158
3159	switch (v) {
3160	case DIF_VAR_ARGS:
3161		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3162		if (ndx >= sizeof (mstate->dtms_arg) /
3163		    sizeof (mstate->dtms_arg[0])) {
3164			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3165			dtrace_provider_t *pv;
3166			uint64_t val;
3167
3168			pv = mstate->dtms_probe->dtpr_provider;
3169			if (pv->dtpv_pops.dtps_getargval != NULL)
3170				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3171				    mstate->dtms_probe->dtpr_id,
3172				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
3173			else
3174				val = dtrace_getarg(ndx, aframes);
3175
3176			/*
3177			 * This is regrettably required to keep the compiler
3178			 * from tail-optimizing the call to dtrace_getarg().
3179			 * The condition always evaluates to true, but the
3180			 * compiler has no way of figuring that out a priori.
3181			 * (None of this would be necessary if the compiler
3182			 * could be relied upon to _always_ tail-optimize
3183			 * the call to dtrace_getarg() -- but it can't.)
3184			 */
3185			if (mstate->dtms_probe != NULL)
3186				return (val);
3187
3188			ASSERT(0);
3189		}
3190
3191		return (mstate->dtms_arg[ndx]);
3192
3193#if defined(sun)
3194	case DIF_VAR_UREGS: {
3195		klwp_t *lwp;
3196
3197		if (!dtrace_priv_proc(state))
3198			return (0);
3199
3200		if ((lwp = curthread->t_lwp) == NULL) {
3201			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3202			cpu_core[curcpu].cpuc_dtrace_illval = 0;
3203			return (0);
3204		}
3205
3206		return (dtrace_getreg(lwp->lwp_regs, ndx));
3208	}
3209#else
3210	case DIF_VAR_UREGS: {
3211		struct trapframe *tframe;
3212
3213		if (!dtrace_priv_proc(state))
3214			return (0);
3215
3216		if ((tframe = curthread->td_frame) == NULL) {
3217			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3218			cpu_core[curcpu].cpuc_dtrace_illval = 0;
3219			return (0);
3220		}
3221
3222		return (dtrace_getreg(tframe, ndx));
3223	}
3224#endif
3225
3226	case DIF_VAR_CURTHREAD:
3227		if (!dtrace_priv_proc(state))
3228			return (0);
3229		return ((uint64_t)(uintptr_t)curthread);
3230
3231	case DIF_VAR_TIMESTAMP:
3232		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3233			mstate->dtms_timestamp = dtrace_gethrtime();
3234			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3235		}
3236		return (mstate->dtms_timestamp);
3237
3238	case DIF_VAR_VTIMESTAMP:
3239		ASSERT(dtrace_vtime_references != 0);
3240		return (curthread->t_dtrace_vtime);
3241
3242	case DIF_VAR_WALLTIMESTAMP:
3243		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3244			mstate->dtms_walltimestamp = dtrace_gethrestime();
3245			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3246		}
3247		return (mstate->dtms_walltimestamp);
3248
3249#if defined(sun)
3250	case DIF_VAR_IPL:
3251		if (!dtrace_priv_kernel(state))
3252			return (0);
3253		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3254			mstate->dtms_ipl = dtrace_getipl();
3255			mstate->dtms_present |= DTRACE_MSTATE_IPL;
3256		}
3257		return (mstate->dtms_ipl);
3258#endif
3259
3260	case DIF_VAR_EPID:
3261		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3262		return (mstate->dtms_epid);
3263
3264	case DIF_VAR_ID:
3265		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3266		return (mstate->dtms_probe->dtpr_id);
3267
3268	case DIF_VAR_STACKDEPTH:
3269		if (!dtrace_priv_kernel(state))
3270			return (0);
3271		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3272			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3273
3274			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3275			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3276		}
3277		return (mstate->dtms_stackdepth);
3278
3279	case DIF_VAR_USTACKDEPTH:
3280		if (!dtrace_priv_proc(state))
3281			return (0);
3282		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3283			/*
3284			 * See comment in DIF_VAR_PID.
3285			 */
3286			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3287			    CPU_ON_INTR(CPU)) {
3288				mstate->dtms_ustackdepth = 0;
3289			} else {
3290				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3291				mstate->dtms_ustackdepth =
3292				    dtrace_getustackdepth();
3293				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3294			}
3295			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3296		}
3297		return (mstate->dtms_ustackdepth);
3298
3299	case DIF_VAR_CALLER:
3300		if (!dtrace_priv_kernel(state))
3301			return (0);
3302		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3303			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3304
3305			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3306				/*
3307				 * If this is an unanchored probe, we are
3308				 * required to go through the slow path:
3309				 * dtrace_caller() only guarantees correct
3310				 * results for anchored probes.
3311				 */
3312				pc_t caller[2] = {0, 0};
3313
3314				dtrace_getpcstack(caller, 2, aframes,
3315				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3316				mstate->dtms_caller = caller[1];
3317			} else if ((mstate->dtms_caller =
3318			    dtrace_caller(aframes)) == -1) {
3319				/*
3320				 * We have failed to do this the quick way;
3321				 * we must resort to the slower approach of
3322				 * calling dtrace_getpcstack().
3323				 */
3324				pc_t caller = 0;
3325
3326				dtrace_getpcstack(&caller, 1, aframes, NULL);
3327				mstate->dtms_caller = caller;
3328			}
3329
3330			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3331		}
3332		return (mstate->dtms_caller);
3333
3334	case DIF_VAR_UCALLER:
3335		if (!dtrace_priv_proc(state))
3336			return (0);
3337
3338		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3339			uint64_t ustack[3];
3340
3341			/*
3342			 * dtrace_getupcstack() fills in the first uint64_t
3343			 * with the current PID.  The second uint64_t will
3344			 * be the program counter at user-level.  The third
3345			 * uint64_t will contain the caller, which is what
3346			 * we're after.
3347			 */
3348			ustack[2] = 0;
3349			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3350			dtrace_getupcstack(ustack, 3);
3351			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3352			mstate->dtms_ucaller = ustack[2];
3353			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3354		}
3355
3356		return (mstate->dtms_ucaller);
3357
3358	case DIF_VAR_PROBEPROV:
3359		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3360		return (dtrace_dif_varstr(
3361		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3362		    state, mstate));
3363
3364	case DIF_VAR_PROBEMOD:
3365		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3366		return (dtrace_dif_varstr(
3367		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
3368		    state, mstate));
3369
3370	case DIF_VAR_PROBEFUNC:
3371		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3372		return (dtrace_dif_varstr(
3373		    (uintptr_t)mstate->dtms_probe->dtpr_func,
3374		    state, mstate));
3375
3376	case DIF_VAR_PROBENAME:
3377		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3378		return (dtrace_dif_varstr(
3379		    (uintptr_t)mstate->dtms_probe->dtpr_name,
3380		    state, mstate));
3381
3382	case DIF_VAR_PID:
3383		if (!dtrace_priv_proc(state))
3384			return (0);
3385
3386#if defined(sun)
3387		/*
3388		 * Note that we are assuming that an unanchored probe is
3389		 * always due to a high-level interrupt.  (And we're assuming
3390		 * that there is only a single high level interrupt.)
3391		 */
3392		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3393			return (pid0.pid_id);
3394
3395		/*
3396		 * It is always safe to dereference one's own t_procp pointer:
3397		 * it always points to a valid, allocated proc structure.
3398		 * Further, it is always safe to dereference the p_pidp member
3399		 * of one's own proc structure.  (These are truisms because
3400		 * threads and processes don't clean up their own state --
3401		 * they leave that task to whomever reaps them.)
3402		 */
3403		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3404#else
3405		return ((uint64_t)curproc->p_pid);
3406#endif
3407
3408	case DIF_VAR_PPID:
3409		if (!dtrace_priv_proc(state))
3410			return (0);
3411
3412#if defined(sun)
3413		/*
3414		 * See comment in DIF_VAR_PID.
3415		 */
3416		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3417			return (pid0.pid_id);
3418
3419		/*
3420		 * It is always safe to dereference one's own t_procp pointer:
3421		 * it always points to a valid, allocated proc structure.
3422		 * (This is true because threads don't clean up their own
3423		 * state -- they leave that task to whomever reaps them.)
3424		 */
3425		return ((uint64_t)curthread->t_procp->p_ppid);
3426#else
3427		if (curproc->p_pid == proc0.p_pid)
3428			return (curproc->p_pid);
3429		else
3430			return (curproc->p_pptr->p_pid);
3431#endif
3432
3433	case DIF_VAR_TID:
3434#if defined(sun)
3435		/*
3436		 * See comment in DIF_VAR_PID.
3437		 */
3438		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3439			return (0);
3440#endif
3441
3442		return ((uint64_t)curthread->t_tid);
3443
3444	case DIF_VAR_EXECARGS: {
3445		struct pargs *p_args = curthread->td_proc->p_args;
3446
3447		if (p_args == NULL)
3448			return (0);
3449
3450		return (dtrace_dif_varstrz(
3451		    (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
3452	}
3453
3454	case DIF_VAR_EXECNAME:
3455#if defined(sun)
3456		if (!dtrace_priv_proc(state))
3457			return (0);
3458
3459		/*
3460		 * See comment in DIF_VAR_PID.
3461		 */
3462		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3463			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3464
3465		/*
3466		 * It is always safe to dereference one's own t_procp pointer:
3467		 * it always points to a valid, allocated proc structure.
3468		 * (This is true because threads don't clean up their own
3469		 * state -- they leave that task to whomever reaps them.)
3470		 */
3471		return (dtrace_dif_varstr(
3472		    (uintptr_t)curthread->t_procp->p_user.u_comm,
3473		    state, mstate));
3474#else
3475		return (dtrace_dif_varstr(
3476		    (uintptr_t) curthread->td_proc->p_comm, state, mstate));
3477#endif
3478
3479	case DIF_VAR_ZONENAME:
3480#if defined(sun)
3481		if (!dtrace_priv_proc(state))
3482			return (0);
3483
3484		/*
3485		 * See comment in DIF_VAR_PID.
3486		 */
3487		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3488			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3489
3490		/*
3491		 * It is always safe to dereference one's own t_procp pointer:
3492		 * it always points to a valid, allocated proc structure.
3493		 * (This is true because threads don't clean up their own
3494		 * state -- they leave that task to whomever reaps them.)
3495		 */
3496		return (dtrace_dif_varstr(
3497		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
3498		    state, mstate));
3499#else
3500		return (0);
3501#endif
3502
3503	case DIF_VAR_UID:
3504		if (!dtrace_priv_proc(state))
3505			return (0);
3506
3507#if defined(sun)
3508		/*
3509		 * See comment in DIF_VAR_PID.
3510		 */
3511		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3512			return ((uint64_t)p0.p_cred->cr_uid);
3513#endif
3514
3515		/*
3516		 * It is always safe to dereference one's own t_procp pointer:
3517		 * it always points to a valid, allocated proc structure.
3518		 * (This is true because threads don't clean up their own
3519		 * state -- they leave that task to whomever reaps them.)
3520		 *
3521		 * Additionally, it is safe to dereference one's own process
3522		 * credential, since this is never NULL after process birth.
3523		 */
3524		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3525
3526	case DIF_VAR_GID:
3527		if (!dtrace_priv_proc(state))
3528			return (0);
3529
3530#if defined(sun)
3531		/*
3532		 * See comment in DIF_VAR_PID.
3533		 */
3534		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3535			return ((uint64_t)p0.p_cred->cr_gid);
3536#endif
3537
3538		/*
3539		 * It is always safe to dereference one's own t_procp pointer:
3540		 * it always points to a valid, allocated proc structure.
3541		 * (This is true because threads don't clean up their own
3542		 * state -- they leave that task to whoever reaps them.)
3543		 *
3544		 * Additionally, it is safe to dereference one's own process
3545		 * credential, since this is never NULL after process birth.
3546		 */
3547		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3548
3549	case DIF_VAR_ERRNO: {
3550#if defined(sun)
3551		klwp_t *lwp;
3552		if (!dtrace_priv_proc(state))
3553			return (0);
3554
3555		/*
3556		 * See comment in DIF_VAR_PID.
3557		 */
3558		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3559			return (0);
3560
3561		/*
3562		 * It is always safe to dereference one's own t_lwp pointer in
3563		 * the event that this pointer is non-NULL.  (This is true
3564		 * because threads and lwps don't clean up their own state --
3565		 * they leave that task to whoever reaps them.)
3566		 */
3567		if ((lwp = curthread->t_lwp) == NULL)
3568			return (0);
3569
3570		return ((uint64_t)lwp->lwp_errno);
3571#else
3572		return (curthread->td_errno);
3573#endif
3574	}
3575#if !defined(sun)
3576	case DIF_VAR_CPU: {
3577		return (curcpu);
3578	}
3579#endif
3580	default:
3581		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3582		return (0);
3583	}
3584}
3585
3586
3587typedef enum dtrace_json_state {
3588	DTRACE_JSON_REST = 1,
3589	DTRACE_JSON_OBJECT,
3590	DTRACE_JSON_STRING,
3591	DTRACE_JSON_STRING_ESCAPE,
3592	DTRACE_JSON_STRING_ESCAPE_UNICODE,
3593	DTRACE_JSON_COLON,
3594	DTRACE_JSON_COMMA,
3595	DTRACE_JSON_VALUE,
3596	DTRACE_JSON_IDENTIFIER,
3597	DTRACE_JSON_NUMBER,
3598	DTRACE_JSON_NUMBER_FRAC,
3599	DTRACE_JSON_NUMBER_EXP,
3600	DTRACE_JSON_COLLECT_OBJECT
3601} dtrace_json_state_t;
3602
3603/*
3604 * This function possesses just enough knowledge about JSON to extract a single
3605 * value from a JSON string and store it in the scratch buffer.  It is able
3606 * to extract nested object values and members of arrays by index.
3607 *
3608 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3609 * be looked up as we descend into the object tree.  e.g.
3610 *
3611 *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3612 *       with nelems = 5.
3613 *
3614 * The run time of this function must be bounded above by strsize to limit the
3615 * amount of work done in probe context.  As such, it is implemented as a
3616 * simple state machine, reading one character at a time using safe loads
3617 * until we find the requested element, hit a parsing error or run off the
3618 * end of the object or string.
3619 *
3620 * As there is no way for a subroutine to return an error without interrupting
3621 * clause execution, we simply return NULL in the event of a missing key or any
3622 * other error condition.  Each NULL return in this function is commented with
3623 * the error condition it represents -- parsing or otherwise.
3624 *
3625 * The set of states for the state machine closely matches the JSON
3626 * specification (http://json.org/).  Briefly:
3627 *
3628 *   DTRACE_JSON_REST:
3629 *     Skip whitespace until we find either a top-level Object, moving
3630 *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3631 *
3632 *   DTRACE_JSON_OBJECT:
3633 *     Locate the next key String in an Object.  Sets a flag to denote
3634 *     the next String as a key string and moves to DTRACE_JSON_STRING.
3635 *
3636 *   DTRACE_JSON_COLON:
3637 *     Skip whitespace until we find the colon that separates key Strings
3638 *     from their values.  Once found, move to DTRACE_JSON_VALUE.
3639 *
3640 *   DTRACE_JSON_VALUE:
3641 *     Detects the type of the next value (String, Number, Identifier, Object
3642 *     or Array) and routes to the states that process that type.  Here we also
3643 *     deal with the element selector list if we are requested to traverse down
3644 *     into the object tree.
3645 *
3646 *   DTRACE_JSON_COMMA:
3647 *     Skip whitespace until we find the comma that separates key-value pairs
3648 *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3649 *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
3650 *     states return to this state at the end of their value, unless otherwise
3651 *     noted.
3652 *
3653 *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3654 *     Processes a Number literal from the JSON, including any exponent
3655 *     component that may be present.  Numbers are returned as strings, which
3656 *     may be passed to strtoll() if an integer is required.
3657 *
3658 *   DTRACE_JSON_IDENTIFIER:
3659 *     Processes a "true", "false" or "null" literal in the JSON.
3660 *
3661 *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3662 *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
3663 *     Processes a String literal from the JSON, whether the String denotes
3664 *     a key, a value or part of a larger Object.  Handles all escape sequences
3665 *     present in the specification, including four-digit unicode characters,
3666 *     but merely includes the escape sequence without converting it to the
3667 *     actual escaped character.  If the String is flagged as a key, we
3668 *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3669 *
3670 *   DTRACE_JSON_COLLECT_OBJECT:
3671 *     This state collects an entire Object (or Array), correctly handling
3672 *     embedded strings.  If the full element selector list matches this nested
3673 *     object, we return the Object in full as a string.  If not, we use this
3674 *     state to skip to the next value at this level and continue processing.
3675 *
3676 * NOTE: This function uses various macros from strtolctype.h to manipulate
3677 * digit values, etc. -- these have all been checked to ensure they make
3678 * no additional function calls.
3679 */
3680static char *
3681dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3682    char *dest)
3683{
3684	dtrace_json_state_t state = DTRACE_JSON_REST;
3685	int64_t array_elem = INT64_MIN;
3686	int64_t array_pos = 0;
3687	uint8_t escape_unicount = 0;
3688	boolean_t string_is_key = B_FALSE;
3689	boolean_t collect_object = B_FALSE;
3690	boolean_t found_key = B_FALSE;
3691	boolean_t in_array = B_FALSE;
3692	uint32_t braces = 0, brackets = 0;
3693	char *elem = elemlist;
3694	char *dd = dest;
3695	uintptr_t cur;
3696
3697	for (cur = json; cur < json + size; cur++) {
3698		char cc = dtrace_load8(cur);
3699		if (cc == '\0')
3700			return (NULL);
3701
3702		switch (state) {
3703		case DTRACE_JSON_REST:
3704			if (isspace(cc))
3705				break;
3706
3707			if (cc == '{') {
3708				state = DTRACE_JSON_OBJECT;
3709				break;
3710			}
3711
3712			if (cc == '[') {
3713				in_array = B_TRUE;
3714				array_pos = 0;
3715				array_elem = dtrace_strtoll(elem, 10, size);
3716				found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3717				state = DTRACE_JSON_VALUE;
3718				break;
3719			}
3720
3721			/*
3722			 * ERROR: expected to find a top-level object or array.
3723			 */
3724			return (NULL);
3725		case DTRACE_JSON_OBJECT:
3726			if (isspace(cc))
3727				break;
3728
3729			if (cc == '"') {
3730				state = DTRACE_JSON_STRING;
3731				string_is_key = B_TRUE;
3732				break;
3733			}
3734
3735			/*
3736			 * ERROR: either the object did not start with a key
3737			 * string, or we've run off the end of the object
3738			 * without finding the requested key.
3739			 */
3740			return (NULL);
3741		case DTRACE_JSON_STRING:
3742			if (cc == '\\') {
3743				*dd++ = '\\';
3744				state = DTRACE_JSON_STRING_ESCAPE;
3745				break;
3746			}
3747
3748			if (cc == '"') {
3749				if (collect_object) {
3750					/*
3751					 * We don't reset the dest here, as
3752					 * the string is part of a larger
3753					 * object being collected.
3754					 */
3755					*dd++ = cc;
3756					collect_object = B_FALSE;
3757					state = DTRACE_JSON_COLLECT_OBJECT;
3758					break;
3759				}
3760				*dd = '\0';
3761				dd = dest; /* reset string buffer */
3762				if (string_is_key) {
3763					if (dtrace_strncmp(dest, elem,
3764					    size) == 0)
3765						found_key = B_TRUE;
3766				} else if (found_key) {
3767					if (nelems > 1) {
3768						/*
3769						 * We expected an object, not
3770						 * this string.
3771						 */
3772						return (NULL);
3773					}
3774					return (dest);
3775				}
3776				state = string_is_key ? DTRACE_JSON_COLON :
3777				    DTRACE_JSON_COMMA;
3778				string_is_key = B_FALSE;
3779				break;
3780			}
3781
3782			*dd++ = cc;
3783			break;
3784		case DTRACE_JSON_STRING_ESCAPE:
3785			*dd++ = cc;
3786			if (cc == 'u') {
3787				escape_unicount = 0;
3788				state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3789			} else {
3790				state = DTRACE_JSON_STRING;
3791			}
3792			break;
3793		case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3794			if (!isxdigit(cc)) {
3795				/*
3796				 * ERROR: invalid unicode escape, expected
3797				 * four valid hexadecimal digits.
3798				 */
3799				return (NULL);
3800			}
3801
3802			*dd++ = cc;
3803			if (++escape_unicount == 4)
3804				state = DTRACE_JSON_STRING;
3805			break;
3806		case DTRACE_JSON_COLON:
3807			if (isspace(cc))
3808				break;
3809
3810			if (cc == ':') {
3811				state = DTRACE_JSON_VALUE;
3812				break;
3813			}
3814
3815			/*
3816			 * ERROR: expected a colon.
3817			 */
3818			return (NULL);
3819		case DTRACE_JSON_COMMA:
3820			if (isspace(cc))
3821				break;
3822
3823			if (cc == ',') {
3824				if (in_array) {
3825					state = DTRACE_JSON_VALUE;
3826					if (++array_pos == array_elem)
3827						found_key = B_TRUE;
3828				} else {
3829					state = DTRACE_JSON_OBJECT;
3830				}
3831				break;
3832			}
3833
3834			/*
3835			 * ERROR: either we hit an unexpected character, or
3836			 * we reached the end of the object or array without
3837			 * finding the requested key.
3838			 */
3839			return (NULL);
3840		case DTRACE_JSON_IDENTIFIER:
3841			if (islower(cc)) {
3842				*dd++ = cc;
3843				break;
3844			}
3845
3846			*dd = '\0';
3847			dd = dest; /* reset string buffer */
3848
3849			if (dtrace_strncmp(dest, "true", 5) == 0 ||
3850			    dtrace_strncmp(dest, "false", 6) == 0 ||
3851			    dtrace_strncmp(dest, "null", 5) == 0) {
3852				if (found_key) {
3853					if (nelems > 1) {
3854						/*
3855						 * ERROR: We expected an object,
3856						 * not this identifier.
3857						 */
3858						return (NULL);
3859					}
3860					return (dest);
3861				} else {
3862					cur--;
3863					state = DTRACE_JSON_COMMA;
3864					break;
3865				}
3866			}
3867
3868			/*
3869			 * ERROR: we did not recognise the identifier as one
3870			 * of those in the JSON specification.
3871			 */
3872			return (NULL);
3873		case DTRACE_JSON_NUMBER:
3874			if (cc == '.') {
3875				*dd++ = cc;
3876				state = DTRACE_JSON_NUMBER_FRAC;
3877				break;
3878			}
3879
3880			if (cc == 'x' || cc == 'X') {
3881				/*
3882				 * ERROR: specification explicitly excludes
3883				 * hexadecimal or octal numbers.
3884				 */
3885				return (NULL);
3886			}
3887
3888			/* FALLTHRU */
3889		case DTRACE_JSON_NUMBER_FRAC:
3890			if (cc == 'e' || cc == 'E') {
3891				*dd++ = cc;
3892				state = DTRACE_JSON_NUMBER_EXP;
3893				break;
3894			}
3895
3896			if (cc == '+' || cc == '-') {
3897				/*
3898				 * ERROR: expect sign as part of exponent only.
3899				 */
3900				return (NULL);
3901			}
3902			/* FALLTHRU */
3903		case DTRACE_JSON_NUMBER_EXP:
3904			if (isdigit(cc) || cc == '+' || cc == '-') {
3905				*dd++ = cc;
3906				break;
3907			}
3908
3909			*dd = '\0';
3910			dd = dest; /* reset string buffer */
3911			if (found_key) {
3912				if (nelems > 1) {
3913					/*
3914					 * ERROR: We expected an object, not
3915					 * this number.
3916					 */
3917					return (NULL);
3918				}
3919				return (dest);
3920			}
3921
3922			cur--;
3923			state = DTRACE_JSON_COMMA;
3924			break;
3925		case DTRACE_JSON_VALUE:
3926			if (isspace(cc))
3927				break;
3928
3929			if (cc == '{' || cc == '[') {
3930				if (nelems > 1 && found_key) {
3931					in_array = cc == '[' ? B_TRUE : B_FALSE;
3932					/*
3933					 * If our element selector directs us
3934					 * to descend into this nested object,
3935					 * then move to the next selector
3936					 * element in the list and restart the
3937					 * state machine.
3938					 */
3939					while (*elem != '\0')
3940						elem++;
3941					elem++; /* skip the inter-element NUL */
3942					nelems--;
3943					dd = dest;
3944					if (in_array) {
3945						state = DTRACE_JSON_VALUE;
3946						array_pos = 0;
3947						array_elem = dtrace_strtoll(
3948						    elem, 10, size);
3949						found_key = array_elem == 0 ?
3950						    B_TRUE : B_FALSE;
3951					} else {
3952						found_key = B_FALSE;
3953						state = DTRACE_JSON_OBJECT;
3954					}
3955					break;
3956				}
3957
3958				/*
3959				 * Otherwise, we wish to either skip this
3960				 * nested object or return it in full.
3961				 */
3962				if (cc == '[')
3963					brackets = 1;
3964				else
3965					braces = 1;
3966				*dd++ = cc;
3967				state = DTRACE_JSON_COLLECT_OBJECT;
3968				break;
3969			}
3970
3971			if (cc == '"') {
3972				state = DTRACE_JSON_STRING;
3973				break;
3974			}
3975
3976			if (islower(cc)) {
3977				/*
3978				 * Here we deal with true, false and null.
3979				 */
3980				*dd++ = cc;
3981				state = DTRACE_JSON_IDENTIFIER;
3982				break;
3983			}
3984
3985			if (cc == '-' || isdigit(cc)) {
3986				*dd++ = cc;
3987				state = DTRACE_JSON_NUMBER;
3988				break;
3989			}
3990
3991			/*
3992			 * ERROR: unexpected character at start of value.
3993			 */
3994			return (NULL);
3995		case DTRACE_JSON_COLLECT_OBJECT:
3996			if (cc == '\0')
3997				/*
3998				 * ERROR: unexpected end of input.
3999				 */
4000				return (NULL);
4001
4002			*dd++ = cc;
4003			if (cc == '"') {
4004				collect_object = B_TRUE;
4005				state = DTRACE_JSON_STRING;
4006				break;
4007			}
4008
4009			if (cc == ']') {
4010				if (brackets-- == 0) {
4011					/*
4012					 * ERROR: unbalanced brackets.
4013					 */
4014					return (NULL);
4015				}
4016			} else if (cc == '}') {
4017				if (braces-- == 0) {
4018					/*
4019					 * ERROR: unbalanced braces.
4020					 */
4021					return (NULL);
4022				}
4023			} else if (cc == '{') {
4024				braces++;
4025			} else if (cc == '[') {
4026				brackets++;
4027			}
4028
4029			if (brackets == 0 && braces == 0) {
4030				if (found_key) {
4031					*dd = '\0';
4032					return (dest);
4033				}
4034				dd = dest; /* reset string buffer */
4035				state = DTRACE_JSON_COMMA;
4036			}
4037			break;
4038		}
4039	}
4040	return (NULL);
4041}
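
/*
 * Illustrative sketch (not part of the original source): how a caller might
 * pack the element selector "foo[0].bar" into the elemlist format described
 * above and hand it to dtrace_json().  The wrapper name and its arguments
 * are hypothetical; in probe context this packing is performed by
 * DIF_SUBR_JSON below using safe loads.
 */
#if 0
static char *
dtrace_json_example(uint64_t strsize, uintptr_t json, char *scratch)
{
	/* "foo[0].bar" --> "foo" NUL "0" NUL "bar" NUL, with nelems = 3 */
	char elemlist[] = "foo\0" "0\0" "bar";

	return (dtrace_json(strsize, json, elemlist, 3, scratch));
}
#endif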
4042
4043/*
4044 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
4045 * Notice that we don't bother validating the proper number of arguments or
4046 * their types in the tuple stack.  This isn't needed because all argument
4047 * interpretation is safe thanks to our load safety -- the worst that can
4048 * happen is that a bogus program obtains bogus results.
4049 */
4050static void
4051dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4052    dtrace_key_t *tupregs, int nargs,
4053    dtrace_mstate_t *mstate, dtrace_state_t *state)
4054{
4055	volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
4056	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
4057	dtrace_vstate_t *vstate = &state->dts_vstate;
4058
4059#if defined(sun)
4060	union {
4061		mutex_impl_t mi;
4062		uint64_t mx;
4063	} m;
4064
4065	union {
4066		krwlock_t ri;
4067		uintptr_t rw;
4068	} r;
4069#else
4070	struct thread *lowner;
4071	union {
4072		struct lock_object *li;
4073		uintptr_t lx;
4074	} l;
4075#endif
4076
4077	switch (subr) {
4078	case DIF_SUBR_RAND:
4079		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
4080		break;
4081
4082#if defined(sun)
4083	case DIF_SUBR_MUTEX_OWNED:
4084		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4085		    mstate, vstate)) {
4086			regs[rd] = 0;
4087			break;
4088		}
4089
4090		m.mx = dtrace_load64(tupregs[0].dttk_value);
4091		if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4092			regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4093		else
4094			regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4095		break;
4096
4097	case DIF_SUBR_MUTEX_OWNER:
4098		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4099		    mstate, vstate)) {
4100			regs[rd] = 0;
4101			break;
4102		}
4103
4104		m.mx = dtrace_load64(tupregs[0].dttk_value);
4105		if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4106		    MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4107			regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4108		else
4109			regs[rd] = 0;
4110		break;
4111
4112	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4113		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4114		    mstate, vstate)) {
4115			regs[rd] = 0;
4116			break;
4117		}
4118
4119		m.mx = dtrace_load64(tupregs[0].dttk_value);
4120		regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4121		break;
4122
4123	case DIF_SUBR_MUTEX_TYPE_SPIN:
4124		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4125		    mstate, vstate)) {
4126			regs[rd] = 0;
4127			break;
4128		}
4129
4130		m.mx = dtrace_load64(tupregs[0].dttk_value);
4131		regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4132		break;
4133
4134	case DIF_SUBR_RW_READ_HELD: {
4135		uintptr_t tmp;
4136
4137		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4138		    mstate, vstate)) {
4139			regs[rd] = 0;
4140			break;
4141		}
4142
4143		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4144		regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4145		break;
4146	}
4147
4148	case DIF_SUBR_RW_WRITE_HELD:
4149		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4150		    mstate, vstate)) {
4151			regs[rd] = 0;
4152			break;
4153		}
4154
4155		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4156		regs[rd] = _RW_WRITE_HELD(&r.ri);
4157		break;
4158
4159	case DIF_SUBR_RW_ISWRITER:
4160		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4161		    mstate, vstate)) {
4162			regs[rd] = 0;
4163			break;
4164		}
4165
4166		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4167		regs[rd] = _RW_ISWRITER(&r.ri);
4168		break;
4169
4170#else
4171	case DIF_SUBR_MUTEX_OWNED:
4172		if (!dtrace_canload(tupregs[0].dttk_value,
4173			sizeof (struct lock_object), mstate, vstate)) {
4174			regs[rd] = 0;
4175			break;
4176		}
4177		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4178		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4179		break;
4180
4181	case DIF_SUBR_MUTEX_OWNER:
4182		if (!dtrace_canload(tupregs[0].dttk_value,
4183			sizeof (struct lock_object), mstate, vstate)) {
4184			regs[rd] = 0;
4185			break;
4186		}
4187		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4188		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4189		regs[rd] = (uintptr_t)lowner;
4190		break;
4191
4192	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4193		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4194		    mstate, vstate)) {
4195			regs[rd] = 0;
4196			break;
4197		}
4198		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4199		/* XXX - should be only LC_SLEEPABLE? */
4200		regs[rd] = (LOCK_CLASS(l.li)->lc_flags &
4201		    (LC_SLEEPLOCK | LC_SLEEPABLE)) != 0;
4202		break;
4203
4204	case DIF_SUBR_MUTEX_TYPE_SPIN:
4205		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4206		    mstate, vstate)) {
4207			regs[rd] = 0;
4208			break;
4209		}
4210		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4211		regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
4212		break;
4213
4214	case DIF_SUBR_RW_READ_HELD:
4215	case DIF_SUBR_SX_SHARED_HELD:
4216		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4217		    mstate, vstate)) {
4218			regs[rd] = 0;
4219			break;
4220		}
4221		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4222		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4223		    lowner == NULL;
4224		break;
4225
4226	case DIF_SUBR_RW_WRITE_HELD:
4227	case DIF_SUBR_SX_EXCLUSIVE_HELD:
4228		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4229		    mstate, vstate)) {
4230			regs[rd] = 0;
4231			break;
4232		}
4233		l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4234		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4235		regs[rd] = (lowner == curthread);
4236		break;
4237
4238	case DIF_SUBR_RW_ISWRITER:
4239	case DIF_SUBR_SX_ISEXCLUSIVE:
4240		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4241		    mstate, vstate)) {
4242			regs[rd] = 0;
4243			break;
4244		}
4245		l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4246		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4247		    lowner != NULL;
4248		break;
4249#endif /* ! defined(sun) */
4250
4251	case DIF_SUBR_BCOPY: {
4252		/*
4253		 * We need to be sure that the destination is in the scratch
4254		 * region -- no other region is allowed.
4255		 */
4256		uintptr_t src = tupregs[0].dttk_value;
4257		uintptr_t dest = tupregs[1].dttk_value;
4258		size_t size = tupregs[2].dttk_value;
4259
4260		if (!dtrace_inscratch(dest, size, mstate)) {
4261			*flags |= CPU_DTRACE_BADADDR;
4262			*illval = regs[rd];
4263			break;
4264		}
4265
4266		if (!dtrace_canload(src, size, mstate, vstate)) {
4267			regs[rd] = 0;
4268			break;
4269		}
4270
4271		dtrace_bcopy((void *)src, (void *)dest, size);
4272		break;
4273	}
4274
4275	case DIF_SUBR_ALLOCA:
4276	case DIF_SUBR_COPYIN: {
4277		uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4278		uint64_t size =
4279		    tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4280		size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4281
4282		/*
4283		 * This action doesn't require any credential checks since
4284		 * probes will not activate in user contexts to which the
4285		 * enabling user does not have permissions.
4286		 */
4287
4288		/*
4289		 * Rounding up the user allocation size could have overflowed
4290		 * a large, bogus allocation (like -1ULL) to 0.
4291		 */
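		/*
		 * Worked example (illustrative): with size = -1ULL and one
		 * byte of alignment padding, scratch_size wraps around to 0
		 * -- smaller than size, which is exactly what the first
		 * half of the check below catches.
		 */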
4292		if (scratch_size < size ||
4293		    !DTRACE_INSCRATCH(mstate, scratch_size)) {
4294			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4295			regs[rd] = 0;
4296			break;
4297		}
4298
4299		if (subr == DIF_SUBR_COPYIN) {
4300			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4301			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4302			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4303		}
4304
4305		mstate->dtms_scratch_ptr += scratch_size;
4306		regs[rd] = dest;
4307		break;
4308	}
4309
4310	case DIF_SUBR_COPYINTO: {
4311		uint64_t size = tupregs[1].dttk_value;
4312		uintptr_t dest = tupregs[2].dttk_value;
4313
4314		/*
4315		 * This action doesn't require any credential checks since
4316		 * probes will not activate in user contexts to which the
4317		 * enabling user does not have permissions.
4318		 */
4319		if (!dtrace_inscratch(dest, size, mstate)) {
4320			*flags |= CPU_DTRACE_BADADDR;
4321			*illval = regs[rd];
4322			break;
4323		}
4324
4325		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4326		dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4327		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4328		break;
4329	}
4330
4331	case DIF_SUBR_COPYINSTR: {
4332		uintptr_t dest = mstate->dtms_scratch_ptr;
4333		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4334
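		/*
		 * If an optional second argument gives a smaller limit, one
		 * extra byte is added to it to leave room for the
		 * terminating NUL forced below.
		 */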
4335		if (nargs > 1 && tupregs[1].dttk_value < size)
4336			size = tupregs[1].dttk_value + 1;
4337
4338		/*
4339		 * This action doesn't require any credential checks since
4340		 * probes will not activate in user contexts to which the
4341		 * enabling user does not have permissions.
4342		 */
4343		if (!DTRACE_INSCRATCH(mstate, size)) {
4344			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4345			regs[rd] = 0;
4346			break;
4347		}
4348
4349		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4350		dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4351		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4352
4353		((char *)dest)[size - 1] = '\0';
4354		mstate->dtms_scratch_ptr += size;
4355		regs[rd] = dest;
4356		break;
4357	}
4358
4359#if defined(sun)
4360	case DIF_SUBR_MSGSIZE:
4361	case DIF_SUBR_MSGDSIZE: {
4362		uintptr_t baddr = tupregs[0].dttk_value, daddr;
4363		uintptr_t wptr, rptr;
4364		size_t count = 0;
4365		int cont = 0;
4366
4367		while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
4368
4369			if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
4370			    vstate)) {
4371				regs[rd] = 0;
4372				break;
4373			}
4374
4375			wptr = dtrace_loadptr(baddr +
4376			    offsetof(mblk_t, b_wptr));
4377
4378			rptr = dtrace_loadptr(baddr +
4379			    offsetof(mblk_t, b_rptr));
4380
4381			if (wptr < rptr) {
4382				*flags |= CPU_DTRACE_BADADDR;
4383				*illval = tupregs[0].dttk_value;
4384				break;
4385			}
4386
4387			daddr = dtrace_loadptr(baddr +
4388			    offsetof(mblk_t, b_datap));
4389
4390			baddr = dtrace_loadptr(baddr +
4391			    offsetof(mblk_t, b_cont));
4392
4393			/*
4394			 * We want to protect against denial-of-service here,
4395			 * so we only search the list for
4396			 * dtrace_msgdsize_max mblks.
4397			 */
4398			if (cont++ > dtrace_msgdsize_max) {
4399				*flags |= CPU_DTRACE_ILLOP;
4400				break;
4401			}
4402
4403			if (subr == DIF_SUBR_MSGDSIZE) {
4404				if (dtrace_load8(daddr +
4405				    offsetof(dblk_t, db_type)) != M_DATA)
4406					continue;
4407			}
4408
4409			count += wptr - rptr;
4410		}
4411
4412		if (!(*flags & CPU_DTRACE_FAULT))
4413			regs[rd] = count;
4414
4415		break;
4416	}
4417#endif
4418
4419	case DIF_SUBR_PROGENYOF: {
4420		pid_t pid = tupregs[0].dttk_value;
4421		proc_t *p;
4422		int rval = 0;
4423
4424		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4425
4426		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
4427#if defined(sun)
4428			if (p->p_pidp->pid_id == pid) {
4429#else
4430			if (p->p_pid == pid) {
4431#endif
4432				rval = 1;
4433				break;
4434			}
4435		}
4436
4437		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4438
4439		regs[rd] = rval;
4440		break;
4441	}
4442
4443	case DIF_SUBR_SPECULATION:
4444		regs[rd] = dtrace_speculation(state);
4445		break;
4446
4447	case DIF_SUBR_COPYOUT: {
4448		uintptr_t kaddr = tupregs[0].dttk_value;
4449		uintptr_t uaddr = tupregs[1].dttk_value;
4450		uint64_t size = tupregs[2].dttk_value;
4451
4452		if (!dtrace_destructive_disallow &&
4453		    dtrace_priv_proc_control(state) &&
4454		    !dtrace_istoxic(kaddr, size)) {
4455			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4456			dtrace_copyout(kaddr, uaddr, size, flags);
4457			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4458		}
4459		break;
4460	}
4461
4462	case DIF_SUBR_COPYOUTSTR: {
4463		uintptr_t kaddr = tupregs[0].dttk_value;
4464		uintptr_t uaddr = tupregs[1].dttk_value;
4465		uint64_t size = tupregs[2].dttk_value;
4466
4467		if (!dtrace_destructive_disallow &&
4468		    dtrace_priv_proc_control(state) &&
4469		    !dtrace_istoxic(kaddr, size)) {
4470			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4471			dtrace_copyoutstr(kaddr, uaddr, size, flags);
4472			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4473		}
4474		break;
4475	}
4476
4477	case DIF_SUBR_STRLEN: {
4478		size_t sz;
4479		uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4480		sz = dtrace_strlen((char *)addr,
4481		    state->dts_options[DTRACEOPT_STRSIZE]);
4482
4483		if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
4484			regs[rd] = 0;
4485			break;
4486		}
4487
4488		regs[rd] = sz;
4489
4490		break;
4491	}
4492
4493	case DIF_SUBR_STRCHR:
4494	case DIF_SUBR_STRRCHR: {
4495		/*
4496		 * We're going to iterate over the string looking for the
4497		 * specified character.  We will iterate until we have reached
4498		 * the string length or we have found the character.  If this
4499		 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4500		 * of the specified character instead of the first.
4501		 */
4502		uintptr_t saddr = tupregs[0].dttk_value;
4503		uintptr_t addr = tupregs[0].dttk_value;
4504		uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
4505		char c, target = (char)tupregs[1].dttk_value;
4506
4507		for (regs[rd] = 0; addr < limit; addr++) {
4508			if ((c = dtrace_load8(addr)) == target) {
4509				regs[rd] = addr;
4510
4511				if (subr == DIF_SUBR_STRCHR)
4512					break;
4513			}
4514
4515			if (c == '\0')
4516				break;
4517		}
4518
4519		if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
4520			regs[rd] = 0;
4521			break;
4522		}
4523
4524		break;
4525	}
4526
4527	case DIF_SUBR_STRSTR:
4528	case DIF_SUBR_INDEX:
4529	case DIF_SUBR_RINDEX: {
4530		/*
4531		 * We're going to iterate over the string looking for the
4532		 * specified string.  We will iterate until we have reached
4533		 * the string length or we have found the string.  (Yes, this
4534		 * is done in the most naive way possible -- but considering
4535		 * that the string we're searching for is likely to be
4536		 * relatively short, the complexity of Rabin-Karp or similar
4537		 * hardly seems merited.)
4538		 */
4539		char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4540		char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4541		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4542		size_t len = dtrace_strlen(addr, size);
4543		size_t sublen = dtrace_strlen(substr, size);
4544		char *limit = addr + len, *orig = addr;
4545		int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4546		int inc = 1;
4547
4548		regs[rd] = notfound;
4549
4550		if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4551			regs[rd] = 0;
4552			break;
4553		}
4554
4555		if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4556		    vstate)) {
4557			regs[rd] = 0;
4558			break;
4559		}
4560
4561		/*
4562		 * strstr() and index()/rindex() have similar semantics if
4563		 * both strings are the empty string: strstr() returns a
4564		 * pointer to the (empty) string, and index() and rindex()
4565		 * both return index 0 (regardless of any position argument).
4566		 */
4567		if (sublen == 0 && len == 0) {
4568			if (subr == DIF_SUBR_STRSTR)
4569				regs[rd] = (uintptr_t)addr;
4570			else
4571				regs[rd] = 0;
4572			break;
4573		}
4574
4575		if (subr != DIF_SUBR_STRSTR) {
4576			if (subr == DIF_SUBR_RINDEX) {
4577				limit = orig - 1;
4578				addr += len;
4579				inc = -1;
4580			}
4581
4582			/*
4583			 * Both index() and rindex() take an optional position
4584			 * argument that denotes the starting position.
4585			 */
4586			if (nargs == 3) {
4587				int64_t pos = (int64_t)tupregs[2].dttk_value;
4588
4589				/*
4590				 * If the position argument to index() is
4591				 * negative, Perl implicitly clamps it at
4592				 * zero.  This semantic is a little surprising
4593				 * given the special meaning of negative
4594				 * positions to similar Perl functions like
4595				 * substr(), but it appears to reflect a
4596				 * notion that index() can start from a
4597				 * negative index and increment its way up to
4598				 * the string.  Given this notion, Perl's
4599				 * rindex() is at least self-consistent in
4600				 * that it implicitly clamps positions greater
4601				 * than the string length to be the string
4602				 * length.  Where Perl completely loses
4603				 * coherence, however, is when the specified
4604				 * substring is the empty string ("").  In
4605				 * this case, even if the position is
4606				 * negative, rindex() returns 0 -- and even if
4607				 * the position is greater than the length,
4608				 * index() returns the string length.  These
4609				 * semantics violate the notion that index()
4610				 * should never return a value less than the
4611				 * specified position and that rindex() should
4612				 * never return a value greater than the
4613				 * specified position.  (One assumes that
4614				 * these semantics are artifacts of Perl's
4615				 * implementation and not the results of
4616				 * deliberate design -- it beggars belief that
4617				 * even Larry Wall could desire such oddness.)
4618				 * While in the abstract one would wish for
4619				 * consistent position semantics across
4620				 * substr(), index() and rindex() -- or at the
4621				 * very least self-consistent position
4622				 * semantics for index() and rindex() -- we
4623				 * instead opt to keep with the extant Perl
4624				 * semantics, in all their broken glory.  (Do
4625				 * we have more desire to maintain Perl's
4626				 * semantics than Perl does?  Probably.)
4627				 */
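				/*
				 * Concretely (illustrative): under these
				 * semantics index("abc", "", 10) returns 3
				 * (the string length) and rindex("abc", "",
				 * -5) returns 0, which is what the clamping
				 * below implements.
				 */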
4628				if (subr == DIF_SUBR_RINDEX) {
4629					if (pos < 0) {
4630						if (sublen == 0)
4631							regs[rd] = 0;
4632						break;
4633					}
4634
4635					if (pos > len)
4636						pos = len;
4637				} else {
4638					if (pos < 0)
4639						pos = 0;
4640
4641					if (pos >= len) {
4642						if (sublen == 0)
4643							regs[rd] = len;
4644						break;
4645					}
4646				}
4647
4648				addr = orig + pos;
4649			}
4650		}
4651
4652		for (regs[rd] = notfound; addr != limit; addr += inc) {
4653			if (dtrace_strncmp(addr, substr, sublen) == 0) {
4654				if (subr != DIF_SUBR_STRSTR) {
4655					/*
4656					 * As D index() and rindex() are
4657					 * modeled on Perl (and not on awk),
4658					 * we return a zero-based (and not a
4659					 * one-based) index.  (For you Perl
4660					 * weenies: no, we're not going to add
4661					 * $[ -- and shouldn't you be at a con
4662					 * or something?)
4663					 */
4664					regs[rd] = (uintptr_t)(addr - orig);
4665					break;
4666				}
4667
4668				ASSERT(subr == DIF_SUBR_STRSTR);
4669				regs[rd] = (uintptr_t)addr;
4670				break;
4671			}
4672		}
4673
4674		break;
4675	}
4676
4677	case DIF_SUBR_STRTOK: {
4678		uintptr_t addr = tupregs[0].dttk_value;
4679		uintptr_t tokaddr = tupregs[1].dttk_value;
4680		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4681		uintptr_t limit, toklimit = tokaddr + size;
4682		uint8_t c = 0, tokmap[32];	 /* 256 / 8 */
4683		char *dest = (char *)mstate->dtms_scratch_ptr;
4684		int i;
4685
4686		/*
4687		 * Check both the token buffer and (later) the input buffer,
4688		 * since both could be non-scratch addresses.
4689		 */
4690		if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
4691			regs[rd] = 0;
4692			break;
4693		}
4694
4695		if (!DTRACE_INSCRATCH(mstate, size)) {
4696			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4697			regs[rd] = 0;
4698			break;
4699		}
4700
4701		if (addr == 0) {
4702			/*
4703			 * If the address specified is NULL, we use our saved
4704			 * strtok pointer from the mstate.  Note that this
4705			 * means that the saved strtok pointer is _only_
4706			 * valid within multiple enablings of the same probe --
4707			 * it behaves like an implicit clause-local variable.
4708			 */
4709			addr = mstate->dtms_strtok;
4710		} else {
4711			/*
4712			 * If the user-specified address is non-NULL, we must
4713			 * access-check it.  This is the only time we have
4714			 * a chance to do so, since this address may reside
4715			 * in the string table of this clause -- future calls
4716			 * (when we fetch addr from mstate->dtms_strtok)
4717			 * would fail this access check.
4718			 */
4719			if (!dtrace_strcanload(addr, size, mstate, vstate)) {
4720				regs[rd] = 0;
4721				break;
4722			}
4723		}
4724
4725		/*
4726		 * First, zero the token map, and then process the token
4727		 * string -- setting a bit in the map for every character
4728		 * found in the token string.
4729		 */
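		/*
		 * For example (illustrative): a token character 'a' (0x61)
		 * sets bit 1 (0x61 & 0x7) of tokmap byte 12 (0x61 >> 3).
		 */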
4730		for (i = 0; i < sizeof (tokmap); i++)
4731			tokmap[i] = 0;
4732
4733		for (; tokaddr < toklimit; tokaddr++) {
4734			if ((c = dtrace_load8(tokaddr)) == '\0')
4735				break;
4736
4737			ASSERT((c >> 3) < sizeof (tokmap));
4738			tokmap[c >> 3] |= (1 << (c & 0x7));
4739		}
4740
4741		for (limit = addr + size; addr < limit; addr++) {
4742			/*
4743			 * We're looking for a character that is _not_ contained
4744			 * in the token string.
4745			 */
4746			if ((c = dtrace_load8(addr)) == '\0')
4747				break;
4748
4749			if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4750				break;
4751		}
4752
4753		if (c == '\0') {
4754			/*
4755			 * We reached the end of the string without finding
4756			 * any character that was not in the token string.
4757			 * We return NULL in this case, and we set the saved
4758			 * address to NULL as well.
4759			 */
4760			regs[rd] = 0;
4761			mstate->dtms_strtok = 0;
4762			break;
4763		}
4764
4765		/*
4766		 * From here on, we're copying into the destination string.
4767		 */
4768		for (i = 0; addr < limit && i < size - 1; addr++) {
4769			if ((c = dtrace_load8(addr)) == '\0')
4770				break;
4771
4772			if (tokmap[c >> 3] & (1 << (c & 0x7)))
4773				break;
4774
4775			ASSERT(i < size);
4776			dest[i++] = c;
4777		}
4778
4779		ASSERT(i < size);
4780		dest[i] = '\0';
4781		regs[rd] = (uintptr_t)dest;
4782		mstate->dtms_scratch_ptr += size;
4783		mstate->dtms_strtok = addr;
4784		break;
4785	}
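
	/*
	 * Illustrative D usage of the strtok() semantics described above
	 * (hypothetical script fragment, not part of the original source):
	 *
	 *	this->tok = strtok(this->str, "/");	(seeds dtms_strtok)
	 *	this->tok = strtok(NULL, "/");	(resumes the saved pointer)
	 *
	 * As noted above, the saved pointer behaves like an implicit
	 * clause-local variable.
	 */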
4786
4787	case DIF_SUBR_SUBSTR: {
4788		uintptr_t s = tupregs[0].dttk_value;
4789		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4790		char *d = (char *)mstate->dtms_scratch_ptr;
4791		int64_t index = (int64_t)tupregs[1].dttk_value;
4792		int64_t remaining = (int64_t)tupregs[2].dttk_value;
4793		size_t len = dtrace_strlen((char *)s, size);
4794		int64_t i;
4795
4796		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4797			regs[rd] = 0;
4798			break;
4799		}
4800
4801		if (!DTRACE_INSCRATCH(mstate, size)) {
4802			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4803			regs[rd] = 0;
4804			break;
4805		}
4806
4807		if (nargs <= 2)
4808			remaining = (int64_t)size;
4809
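		/*
		 * Normalize the parameters per substr() semantics: a
		 * negative index counts back from the end of the string.
		 * For example (illustrative), substr("hello", -3, 2)
		 * resolves to index 2 and yields "ll".
		 */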
4810		if (index < 0) {
4811			index += len;
4812
4813			if (index < 0 && index + remaining > 0) {
4814				remaining += index;
4815				index = 0;
4816			}
4817		}
4818
4819		if (index >= len || index < 0) {
4820			remaining = 0;
4821		} else if (remaining < 0) {
4822			remaining += len - index;
4823		} else if (index + remaining > size) {
4824			remaining = size - index;
4825		}
4826
4827		for (i = 0; i < remaining; i++) {
4828			if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4829				break;
4830		}
4831
4832		d[i] = '\0';
4833
4834		mstate->dtms_scratch_ptr += size;
4835		regs[rd] = (uintptr_t)d;
4836		break;
4837	}
4838
4839	case DIF_SUBR_JSON: {
4840		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4841		uintptr_t json = tupregs[0].dttk_value;
4842		size_t jsonlen = dtrace_strlen((char *)json, size);
4843		uintptr_t elem = tupregs[1].dttk_value;
4844		size_t elemlen = dtrace_strlen((char *)elem, size);
4845
4846		char *dest = (char *)mstate->dtms_scratch_ptr;
4847		char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
4848		char *ee = elemlist;
4849		int nelems = 1;
4850		uintptr_t cur;
4851
4852		if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
4853		    !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
4854			regs[rd] = 0;
4855			break;
4856		}
4857
4858		if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
4859			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4860			regs[rd] = 0;
4861			break;
4862		}
4863
4864		/*
4865		 * Read the element selector and split it up into a packed list
4866		 * of strings.
4867		 */
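		/*
		 * For example (illustrative): "foo[0].bar" becomes
		 * "foo" NUL "0" NUL "bar" NUL with nelems = 3, and a
		 * leading array index such as "[2].name" becomes
		 * "2" NUL "name" NUL with nelems = 2.
		 */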
4868		for (cur = elem; cur < elem + elemlen; cur++) {
4869			char cc = dtrace_load8(cur);
4870
4871			if (cur == elem && cc == '[') {
4872				/*
4873				 * If the first element selector key is
4874				 * actually an array index then ignore the
4875				 * bracket.
4876				 */
4877				continue;
4878			}
4879
4880			if (cc == ']')
4881				continue;
4882
4883			if (cc == '.' || cc == '[') {
4884				nelems++;
4885				cc = '\0';
4886			}
4887
4888			*ee++ = cc;
4889		}
4890		*ee++ = '\0';
4891
4892		if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
4893		    nelems, dest)) != 0)
4894			mstate->dtms_scratch_ptr += jsonlen + 1;
4895		break;
4896	}
4897
4898	case DIF_SUBR_TOUPPER:
4899	case DIF_SUBR_TOLOWER: {
4900		uintptr_t s = tupregs[0].dttk_value;
4901		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4902		char *dest = (char *)mstate->dtms_scratch_ptr, c;
4903		size_t len = dtrace_strlen((char *)s, size);
4904		char lower, upper, convert;
4905		int64_t i;
4906
4907		if (subr == DIF_SUBR_TOUPPER) {
4908			lower = 'a';
4909			upper = 'z';
4910			convert = 'A';
4911		} else {
4912			lower = 'A';
4913			upper = 'Z';
4914			convert = 'a';
4915		}
4916
4917		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4918			regs[rd] = 0;
4919			break;
4920		}
4921
4922		if (!DTRACE_INSCRATCH(mstate, size)) {
4923			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4924			regs[rd] = 0;
4925			break;
4926		}
4927
4928		for (i = 0; i < size - 1; i++) {
4929			if ((c = dtrace_load8(s + i)) == '\0')
4930				break;
4931
4932			if (c >= lower && c <= upper)
4933				c = convert + (c - lower);
4934
4935			dest[i] = c;
4936		}
4937
4938		ASSERT(i < size);
4939		dest[i] = '\0';
4940		regs[rd] = (uintptr_t)dest;
4941		mstate->dtms_scratch_ptr += size;
4942		break;
4943	}
4944
4945#if defined(sun)
4946	case DIF_SUBR_GETMAJOR:
4947#ifdef _LP64
4948		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4949#else
4950		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4951#endif
4952		break;
4953
4954	case DIF_SUBR_GETMINOR:
4955#ifdef _LP64
4956		regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4957#else
4958		regs[rd] = tupregs[0].dttk_value & MAXMIN;
4959#endif
4960		break;
4961
4962	case DIF_SUBR_DDI_PATHNAME: {
4963		/*
4964		 * This one is a galactic mess.  We are going to roughly
4965		 * emulate ddi_pathname(), but it's made more complicated
4966		 * by the fact that we (a) want to include the minor name and
4967		 * (b) must proceed iteratively instead of recursively.
4968		 */
4969		uintptr_t dest = mstate->dtms_scratch_ptr;
4970		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4971		char *start = (char *)dest, *end = start + size - 1;
4972		uintptr_t daddr = tupregs[0].dttk_value;
4973		int64_t minor = (int64_t)tupregs[1].dttk_value;
4974		char *s;
4975		int i, len, depth = 0;
4976
4977		/*
4978		 * Due to all the pointer jumping we do and context we must
4979		 * rely upon, we just mandate that the user must have kernel
4980		 * read privileges to use this routine.
4981		 */
4982		if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4983			*flags |= CPU_DTRACE_KPRIV;
4984			*illval = daddr;
4985			regs[rd] = 0;
4986		}
4987
4988		if (!DTRACE_INSCRATCH(mstate, size)) {
4989			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4990			regs[rd] = 0;
4991			break;
4992		}
4993
4994		*end = '\0';
4995
4996		/*
4997		 * We want to have a name for the minor.  In order to do this,
4998		 * we need to walk the minor list from the devinfo.  We want
4999		 * to be sure that we don't infinitely walk a circular list,
5000		 * so we check for circularity by sending a scout pointer
5001		 * ahead two elements for every element that we iterate over;
5002		 * if the list is circular, these will ultimately point to the
5003		 * same element.  You may recognize this little trick as the
5004		 * answer to a stupid interview question -- one that always
5005		 * seems to be asked by those who had to have it laboriously
5006		 * explained to them, and who can't even concisely describe
5007		 * the conditions under which one would be forced to resort to
5008		 * this technique.  Needless to say, those conditions are
5009		 * found here -- and probably only here.  Is this the only use
5010		 * of this infamous trick in shipping, production code?  If it
5011		 * isn't, it probably should be...
5012		 */
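		/*
		 * (For the record, the trick in question is Floyd's
		 * cycle-detection algorithm: the scout advances two links
		 * for each link the walker advances, so on a circular list
		 * the two must eventually meet.)
		 */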
5013		if (minor != -1) {
5014			uintptr_t maddr = dtrace_loadptr(daddr +
5015			    offsetof(struct dev_info, devi_minor));
5016
5017			uintptr_t next = offsetof(struct ddi_minor_data, next);
5018			uintptr_t name = offsetof(struct ddi_minor_data,
5019			    d_minor) + offsetof(struct ddi_minor, name);
5020			uintptr_t dev = offsetof(struct ddi_minor_data,
5021			    d_minor) + offsetof(struct ddi_minor, dev);
5022			uintptr_t scout;
5023
5024			if (maddr != NULL)
5025				scout = dtrace_loadptr(maddr + next);
5026
5027			while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5028				uint64_t m;
5029#ifdef _LP64
5030				m = dtrace_load64(maddr + dev) & MAXMIN64;
5031#else
5032				m = dtrace_load32(maddr + dev) & MAXMIN;
5033#endif
5034				if (m != minor) {
5035					maddr = dtrace_loadptr(maddr + next);
5036
5037					if (scout == NULL)
5038						continue;
5039
5040					scout = dtrace_loadptr(scout + next);
5041
5042					if (scout == NULL)
5043						continue;
5044
5045					scout = dtrace_loadptr(scout + next);
5046
5047					if (scout == NULL)
5048						continue;
5049
5050					if (scout == maddr) {
5051						*flags |= CPU_DTRACE_ILLOP;
5052						break;
5053					}
5054
5055					continue;
5056				}
5057
5058				/*
5059				 * We have the minor data.  Now we need to
5060				 * copy the minor's name into the end of the
5061				 * pathname.
5062				 */
5063				s = (char *)dtrace_loadptr(maddr + name);
5064				len = dtrace_strlen(s, size);
5065
5066				if (*flags & CPU_DTRACE_FAULT)
5067					break;
5068
5069				if (len != 0) {
5070					if ((end -= (len + 1)) < start)
5071						break;
5072
5073					*end = ':';
5074				}
5075
5076				for (i = 1; i <= len; i++)
5077					end[i] = dtrace_load8((uintptr_t)s++);
5078				break;
5079			}
5080		}
5081
5082		while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5083			ddi_node_state_t devi_state;
5084
5085			devi_state = dtrace_load32(daddr +
5086			    offsetof(struct dev_info, devi_node_state));
5087
5088			if (*flags & CPU_DTRACE_FAULT)
5089				break;
5090
5091			if (devi_state >= DS_INITIALIZED) {
5092				s = (char *)dtrace_loadptr(daddr +
5093				    offsetof(struct dev_info, devi_addr));
5094				len = dtrace_strlen(s, size);
5095
5096				if (*flags & CPU_DTRACE_FAULT)
5097					break;
5098
5099				if (len != 0) {
5100					if ((end -= (len + 1)) < start)
5101						break;
5102
5103					*end = '@';
5104				}
5105
5106				for (i = 1; i <= len; i++)
5107					end[i] = dtrace_load8((uintptr_t)s++);
5108			}
5109
5110			/*
5111			 * Now for the node name...
5112			 */
5113			s = (char *)dtrace_loadptr(daddr +
5114			    offsetof(struct dev_info, devi_node_name));
5115
5116			daddr = dtrace_loadptr(daddr +
5117			    offsetof(struct dev_info, devi_parent));
5118
5119			/*
5120			 * If our parent is NULL (that is, if we're the root
5121			 * node), we're going to use the special path
5122			 * "devices".
5123			 */
5124			if (daddr == 0)
5125				s = "devices";
5126
5127			len = dtrace_strlen(s, size);
5128			if (*flags & CPU_DTRACE_FAULT)
5129				break;
5130
5131			if ((end -= (len + 1)) < start)
5132				break;
5133
5134			for (i = 1; i <= len; i++)
5135				end[i] = dtrace_load8((uintptr_t)s++);
5136			*end = '/';
5137
5138			if (depth++ > dtrace_devdepth_max) {
5139				*flags |= CPU_DTRACE_ILLOP;
5140				break;
5141			}
5142		}
5143
5144		if (end < start)
5145			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5146
5147		if (daddr == 0) {
5148			regs[rd] = (uintptr_t)end;
5149			mstate->dtms_scratch_ptr += size;
5150		}
5151
5152		break;
5153	}
5154#endif
5155
5156	case DIF_SUBR_STRJOIN: {
5157		char *d = (char *)mstate->dtms_scratch_ptr;
5158		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5159		uintptr_t s1 = tupregs[0].dttk_value;
5160		uintptr_t s2 = tupregs[1].dttk_value;
5161		int i = 0;
5162
5163		if (!dtrace_strcanload(s1, size, mstate, vstate) ||
5164		    !dtrace_strcanload(s2, size, mstate, vstate)) {
5165			regs[rd] = 0;
5166			break;
5167		}
5168
5169		if (!DTRACE_INSCRATCH(mstate, size)) {
5170			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5171			regs[rd] = 0;
5172			break;
5173		}
5174
5175		for (;;) {
5176			if (i >= size) {
5177				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5178				regs[rd] = 0;
5179				break;
5180			}
5181
5182			if ((d[i++] = dtrace_load8(s1++)) == '\0') {
5183				i--;
5184				break;
5185			}
5186		}
5187
5188		for (;;) {
5189			if (i >= size) {
5190				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5191				regs[rd] = 0;
5192				break;
5193			}
5194
5195			if ((d[i++] = dtrace_load8(s2++)) == '\0')
5196				break;
5197		}
5198
5199		if (i < size) {
5200			mstate->dtms_scratch_ptr += i;
5201			regs[rd] = (uintptr_t)d;
5202		}
5203
5204		break;
5205	}
5206
5207	case DIF_SUBR_STRTOLL: {
5208		uintptr_t s = tupregs[0].dttk_value;
5209		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5210		int base = 10;
5211
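		/*
		 * An explicitly supplied base must lie in the range 2..36,
		 * the bases expressible with the digits '0'-'9' plus
		 * 'a'-'z'.
		 */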
5212		if (nargs > 1) {
5213			if ((base = tupregs[1].dttk_value) <= 1 ||
5214			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5215				*flags |= CPU_DTRACE_ILLOP;
5216				break;
5217			}
5218		}
5219
5220		if (!dtrace_strcanload(s, size, mstate, vstate)) {
5221			regs[rd] = INT64_MIN;
5222			break;
5223		}
5224
5225		regs[rd] = dtrace_strtoll((char *)s, base, size);
5226		break;
5227	}
5228
5229	case DIF_SUBR_LLTOSTR: {
5230		int64_t i = (int64_t)tupregs[0].dttk_value;
5231		uint64_t val, digit;
5232		uint64_t size = 65;	/* enough room for 2^64 in binary */
5233		char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
5234		int base = 10;
5235
5236		if (nargs > 1) {
5237			if ((base = tupregs[1].dttk_value) <= 1 ||
5238			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5239				*flags |= CPU_DTRACE_ILLOP;
5240				break;
5241			}
5242		}
5243
5244		val = (base == 10 && i < 0) ? i * -1 : i;
5245
5246		if (!DTRACE_INSCRATCH(mstate, size)) {
5247			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5248			regs[rd] = 0;
5249			break;
5250		}
5251
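		/*
		 * The digits are emitted back-to-front.  For example
		 * (illustrative): i = 255 with base = 16 produces "ff"
		 * here, after which the "0x" prefix is prepended below,
		 * yielding "0xff".
		 */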
5252		for (*end-- = '\0'; val; val /= base) {
5253			if ((digit = val % base) <= '9' - '0') {
5254				*end-- = '0' + digit;
5255			} else {
5256				*end-- = 'a' + (digit - ('9' - '0') - 1);
5257			}
5258		}
5259
5260		if (i == 0 && base == 16)
5261			*end-- = '0';
5262
5263		if (base == 16)
5264			*end-- = 'x';
5265
5266		if (i == 0 || base == 8 || base == 16)
5267			*end-- = '0';
5268
5269		if (i < 0 && base == 10)
5270			*end-- = '-';
5271
5272		regs[rd] = (uintptr_t)end + 1;
5273		mstate->dtms_scratch_ptr += size;
5274		break;
5275	}
5276
5277	case DIF_SUBR_HTONS:
5278	case DIF_SUBR_NTOHS:
5279#if BYTE_ORDER == BIG_ENDIAN
5280		regs[rd] = (uint16_t)tupregs[0].dttk_value;
5281#else
5282		regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5283#endif
5284		break;
5285
5286
5287	case DIF_SUBR_HTONL:
5288	case DIF_SUBR_NTOHL:
5289#if BYTE_ORDER == BIG_ENDIAN
5290		regs[rd] = (uint32_t)tupregs[0].dttk_value;
5291#else
5292		regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5293#endif
5294		break;
5295
5296
5297	case DIF_SUBR_HTONLL:
5298	case DIF_SUBR_NTOHLL:
5299#if BYTE_ORDER == BIG_ENDIAN
5300		regs[rd] = (uint64_t)tupregs[0].dttk_value;
5301#else
5302		regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5303#endif
5304		break;
5305
5306
5307	case DIF_SUBR_DIRNAME:
5308	case DIF_SUBR_BASENAME: {
5309		char *dest = (char *)mstate->dtms_scratch_ptr;
5310		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5311		uintptr_t src = tupregs[0].dttk_value;
5312		int i, j, len = dtrace_strlen((char *)src, size);
5313		int lastbase = -1, firstbase = -1, lastdir = -1;
5314		int start, end;
5315
5316		if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5317			regs[rd] = 0;
5318			break;
5319		}
5320
5321		if (!DTRACE_INSCRATCH(mstate, size)) {
5322			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5323			regs[rd] = 0;
5324			break;
5325		}
5326
5327		/*
5328		 * The basename and dirname of a zero-length string are
5329		 * defined to be ".".
5330		 */
5331		if (len == 0) {
5332			len = 1;
5333			src = (uintptr_t)".";
5334		}
5335
5336		/*
5337		 * Start from the back of the string, moving back toward the
5338		 * front until we see a character that isn't a slash.  That
5339		 * character is the last character in the basename.
5340		 */
5341		for (i = len - 1; i >= 0; i--) {
5342			if (dtrace_load8(src + i) != '/')
5343				break;
5344		}
5345
5346		if (i >= 0)
5347			lastbase = i;
5348
5349		/*
5350		 * Starting from the last character in the basename, move
5351		 * towards the front until we find a slash.  The character
5352		 * that we processed immediately before that is the first
5353		 * character in the basename.
5354		 */
5355		for (; i >= 0; i--) {
5356			if (dtrace_load8(src + i) == '/')
5357				break;
5358		}
5359
5360		if (i >= 0)
5361			firstbase = i + 1;
5362
5363		/*
5364		 * Now keep going until we find a non-slash character.  That
5365		 * character is the last character in the dirname.
5366		 */
5367		for (; i >= 0; i--) {
5368			if (dtrace_load8(src + i) != '/')
5369				break;
5370		}
5371
5372		if (i >= 0)
5373			lastdir = i;
5374
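		/*
		 * Worked example (illustrative): for src = "/usr//lib/",
		 * the scans above leave lastbase = 8, firstbase = 6 and
		 * lastdir = 3 -- a basename of "lib" and a dirname of
		 * "/usr".
		 */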
5375		ASSERT(!(lastbase == -1 && firstbase != -1));
5376		ASSERT(!(firstbase == -1 && lastdir != -1));
5377
5378		if (lastbase == -1) {
5379			/*
5380			 * We didn't find a non-slash character.  We know that
5381			 * the length is non-zero, so the whole string must be
5382			 * slashes.  In either the dirname or the basename
5383			 * case, we return '/'.
5384			 */
5385			ASSERT(firstbase == -1);
5386			firstbase = lastbase = lastdir = 0;
5387		}
5388
5389		if (firstbase == -1) {
5390			/*
5391			 * The entire string consists only of a basename
5392			 * component.  If we're looking for dirname, we need
5393			 * to change our string to be just "."; if we're
5394			 * looking for a basename, we'll just set the first
5395			 * character of the basename to be 0.
5396			 */
5397			if (subr == DIF_SUBR_DIRNAME) {
5398				ASSERT(lastdir == -1);
5399				src = (uintptr_t)".";
5400				lastdir = 0;
5401			} else {
5402				firstbase = 0;
5403			}
5404		}
5405
5406		if (subr == DIF_SUBR_DIRNAME) {
5407			if (lastdir == -1) {
5408				/*
5409				 * We know that we have a slash in the name --
5410				 * or lastdir would be set to 0, above.  And
5411				 * because lastdir is -1, we know that this
5412				 * slash must be the first character.  (That
5413				 * is, the full string must be of the form
5414				 * "/basename".)  In this case, the last
5415				 * character of the directory name is 0.
5416				 */
5417				lastdir = 0;
5418			}
5419
5420			start = 0;
5421			end = lastdir;
5422		} else {
5423			ASSERT(subr == DIF_SUBR_BASENAME);
5424			ASSERT(firstbase != -1 && lastbase != -1);
5425			start = firstbase;
5426			end = lastbase;
5427		}
5428
5429		for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
5430			dest[j] = dtrace_load8(src + i);
5431
5432		dest[j] = '\0';
5433		regs[rd] = (uintptr_t)dest;
5434		mstate->dtms_scratch_ptr += size;
5435		break;
5436	}
5437
5438	case DIF_SUBR_GETF: {
5439		uintptr_t fd = tupregs[0].dttk_value;
5440		struct filedesc *fdp;
5441		file_t *fp;
5442
5443		if (!dtrace_priv_proc(state)) {
5444			regs[rd] = 0;
5445			break;
5446		}
5447		fdp = curproc->p_fd;
5448		FILEDESC_SLOCK(fdp);
5449		fp = fget_locked(fdp, fd);
5450		mstate->dtms_getf = fp;
5451		regs[rd] = (uintptr_t)fp;
5452		FILEDESC_SUNLOCK(fdp);
5453		break;
5454	}
5455
5456	case DIF_SUBR_CLEANPATH: {
5457		char *dest = (char *)mstate->dtms_scratch_ptr, c;
5458		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5459		uintptr_t src = tupregs[0].dttk_value;
5460		int i = 0, j = 0;
5461#if defined(sun)
5462		zone_t *z;
5463#endif
5464
5465		if (!dtrace_strcanload(src, size, mstate, vstate)) {
5466			regs[rd] = 0;
5467			break;
5468		}
5469
5470		if (!DTRACE_INSCRATCH(mstate, size)) {
5471			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5472			regs[rd] = 0;
5473			break;
5474		}
5475
5476		/*
5477		 * Move forward, loading each character.
5478		 */
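		/*
		 * Illustrative example: this loop rewrites
		 * "/foo//bar/./baz/../qux" as "/foo/bar/qux", collapsing
		 * duplicate slashes and resolving "." and ".." components
		 * lexically.
		 */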
5479		do {
5480			c = dtrace_load8(src + i++);
5481next:
5482			if (j + 5 >= size)	/* 5 = sizeof("/..c") */
5483				break;
5484
5485			if (c != '/') {
5486				dest[j++] = c;
5487				continue;
5488			}
5489
5490			c = dtrace_load8(src + i++);
5491
5492			if (c == '/') {
5493				/*
5494				 * We have two slashes -- we can just advance
5495				 * to the next character.
5496				 */
5497				goto next;
5498			}
5499
5500			if (c != '.') {
5501				/*
5502				 * This is not "." and it's not ".." -- we can
5503				 * just store the "/" and this character and
5504				 * drive on.
5505				 */
5506				dest[j++] = '/';
5507				dest[j++] = c;
5508				continue;
5509			}
5510
5511			c = dtrace_load8(src + i++);
5512
5513			if (c == '/') {
5514				/*
5515				 * This is a "/./" component.  We're not going
5516				 * to store anything in the destination buffer;
5517				 * we're just going to go to the next component.
5518				 */
5519				goto next;
5520			}
5521
5522			if (c != '.') {
5523				/*
5524				 * This is not ".." -- we can just store the
5525				 * "/." and this character and continue
5526				 * processing.
5527				 */
5528				dest[j++] = '/';
5529				dest[j++] = '.';
5530				dest[j++] = c;
5531				continue;
5532			}
5533
5534			c = dtrace_load8(src + i++);
5535
5536			if (c != '/' && c != '\0') {
5537				/*
5538				 * This is not ".." -- it's "..[mumble]".
5539				 * We'll store the "/.." and this character
5540				 * and continue processing.
5541				 */
5542				dest[j++] = '/';
5543				dest[j++] = '.';
5544				dest[j++] = '.';
5545				dest[j++] = c;
5546				continue;
5547			}
5548
5549			/*
5550			 * This is "/../" or "/..\0".  We need to back up
5551			 * our destination pointer until we find a "/".
5552			 */
5553			i--;
5554			while (j != 0 && dest[--j] != '/')
5555				continue;
5556
5557			if (c == '\0')
5558				dest[++j] = '/';
5559		} while (c != '\0');
5560
5561		dest[j] = '\0';
5562
5563#if defined(sun)
5564		if (mstate->dtms_getf != NULL &&
5565		    !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
5566		    (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
5567			/*
5568			 * If we've done a getf() as a part of this ECB and we
5569			 * don't have kernel access (and we're not in the global
5570			 * zone), check if the path we cleaned up begins with
5571			 * the zone's root path, and trim it off if so.  Note
5572			 * that this is an output cleanliness issue, not a
5573			 * security issue: knowing one's zone root path does
5574			 * not enable privilege escalation.
5575			 */
5576			if (strstr(dest, z->zone_rootpath) == dest)
5577				dest += strlen(z->zone_rootpath) - 1;
5578		}
5579#endif
5580
5581		regs[rd] = (uintptr_t)dest;
5582		mstate->dtms_scratch_ptr += size;
5583		break;
5584	}
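
	/*
	 * The state machine above is easier to follow outside of probe
	 * context.  What follows is a minimal user-space sketch of the same
	 * canonicalization -- plain loads instead of dtrace_load8(), no
	 * scratch accounting or size guard, and an assumed well-formed,
	 * NUL-terminated source -- not the kernel code itself:
	 *
	 *	#include <stdio.h>
	 *
	 *	static void
	 *	cleanpath(char *dest, const char *src)
	 *	{
	 *		int i = 0, j = 0;
	 *		char c;
	 *
	 *		do {
	 *			c = src[i++];
	 *	next:
	 *			if (c != '/') {
	 *				dest[j++] = c;
	 *				continue;
	 *			}
	 *
	 *			c = src[i++];
	 *			if (c == '/')		// collapse "//"
	 *				goto next;
	 *
	 *			if (c != '.') {
	 *				dest[j++] = '/';
	 *				dest[j++] = c;
	 *				continue;
	 *			}
	 *
	 *			c = src[i++];
	 *			if (c == '/')		// skip "/./"
	 *				goto next;
	 *
	 *			if (c != '.') {
	 *				dest[j++] = '/';
	 *				dest[j++] = '.';
	 *				dest[j++] = c;
	 *				continue;
	 *			}
	 *
	 *			c = src[i++];
	 *			if (c != '/' && c != '\0') {	// "..mumble"
	 *				dest[j++] = '/';
	 *				dest[j++] = '.';
	 *				dest[j++] = '.';
	 *				dest[j++] = c;
	 *				continue;
	 *			}
	 *
	 *			// "/../" or a trailing "/..": back up to "/"
	 *			i--;
	 *			while (j != 0 && dest[--j] != '/')
	 *				continue;
	 *
	 *			if (c == '\0')
	 *				dest[++j] = '/';
	 *		} while (c != '\0');
	 *
	 *		dest[j] = '\0';
	 *	}
	 *
	 *	int
	 *	main(void)
	 *	{
	 *		char buf[64];
	 *
	 *		cleanpath(buf, "/foo//bar/./../baz/");
	 *		printf("%s\n", buf);	// prints "/foo/baz/"
	 *		return (0);
	 *	}
	 */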
5585
5586	case DIF_SUBR_INET_NTOA:
5587	case DIF_SUBR_INET_NTOA6:
5588	case DIF_SUBR_INET_NTOP: {
5589		size_t size;
5590		int af, argi, i;
5591		char *base, *end;
5592
5593		if (subr == DIF_SUBR_INET_NTOP) {
5594			af = (int)tupregs[0].dttk_value;
5595			argi = 1;
5596		} else {
5597			af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5598			argi = 0;
5599		}
5600
5601		if (af == AF_INET) {
5602			ipaddr_t ip4;
5603			uint8_t *ptr8, val;
5604
5605			/*
5606			 * Safely load the IPv4 address.
5607			 */
5608			ip4 = dtrace_load32(tupregs[argi].dttk_value);
5609
5610			/*
			 * Check that an IPv4 string will fit in scratch.
5612			 */
5613			size = INET_ADDRSTRLEN;
5614			if (!DTRACE_INSCRATCH(mstate, size)) {
5615				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5616				regs[rd] = 0;
5617				break;
5618			}
5619			base = (char *)mstate->dtms_scratch_ptr;
5620			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5621
5622			/*
5623			 * Stringify as a dotted decimal quad.
5624			 */
5625			*end-- = '\0';
5626			ptr8 = (uint8_t *)&ip4;
5627			for (i = 3; i >= 0; i--) {
5628				val = ptr8[i];
5629
5630				if (val == 0) {
5631					*end-- = '0';
5632				} else {
5633					for (; val; val /= 10) {
5634						*end-- = '0' + (val % 10);
5635					}
5636				}
5637
5638				if (i > 0)
5639					*end-- = '.';
5640			}
5641			ASSERT(end + 1 >= base);
5642
5643		} else if (af == AF_INET6) {
5644			struct in6_addr ip6;
5645			int firstzero, tryzero, numzero, v6end;
5646			uint16_t val;
5647			const char digits[] = "0123456789abcdef";
5648
5649			/*
			 * Stringify using RFC 1884 convention 2 -- 16-bit
			 * hexadecimal values with zero-run compression.
			 * Lowercase hexadecimal digits are used,
			 * 	e.g., fe80::214:4fff:fe0b:76c8.
			 * The IPv4-embedded form is returned for inet_ntop;
			 * just the IPv4 string is returned for inet_ntoa6.
5656			 */
5657
5658			/*
5659			 * Safely load the IPv6 address.
5660			 */
5661			dtrace_bcopy(
5662			    (void *)(uintptr_t)tupregs[argi].dttk_value,
5663			    (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5664
5665			/*
			 * Check that an IPv6 string will fit in scratch.
5667			 */
5668			size = INET6_ADDRSTRLEN;
5669			if (!DTRACE_INSCRATCH(mstate, size)) {
5670				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5671				regs[rd] = 0;
5672				break;
5673			}
5674			base = (char *)mstate->dtms_scratch_ptr;
5675			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5676			*end-- = '\0';
5677
5678			/*
5679			 * Find the longest run of 16 bit zero values
5680			 * for the single allowed zero compression - "::".
5681			 */
5682			firstzero = -1;
5683			tryzero = -1;
5684			numzero = 1;
5685			for (i = 0; i < sizeof (struct in6_addr); i++) {
5686#if defined(sun)
5687				if (ip6._S6_un._S6_u8[i] == 0 &&
5688#else
5689				if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5690#endif
5691				    tryzero == -1 && i % 2 == 0) {
5692					tryzero = i;
5693					continue;
5694				}
5695
5696				if (tryzero != -1 &&
5697#if defined(sun)
5698				    (ip6._S6_un._S6_u8[i] != 0 ||
5699#else
5700				    (ip6.__u6_addr.__u6_addr8[i] != 0 ||
5701#endif
5702				    i == sizeof (struct in6_addr) - 1)) {
5703
5704					if (i - tryzero <= numzero) {
5705						tryzero = -1;
5706						continue;
5707					}
5708
5709					firstzero = tryzero;
5710					numzero = i - i % 2 - tryzero;
5711					tryzero = -1;
5712
5713#if defined(sun)
5714					if (ip6._S6_un._S6_u8[i] == 0 &&
5715#else
5716					if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5717#endif
5718					    i == sizeof (struct in6_addr) - 1)
5719						numzero += 2;
5720				}
5721			}
5722			ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
5723
5724			/*
5725			 * Check for an IPv4 embedded address.
5726			 */
5727			v6end = sizeof (struct in6_addr) - 2;
5728			if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5729			    IN6_IS_ADDR_V4COMPAT(&ip6)) {
5730				for (i = sizeof (struct in6_addr) - 1;
5731				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
5732					ASSERT(end >= base);
5733
5734#if defined(sun)
5735					val = ip6._S6_un._S6_u8[i];
5736#else
5737					val = ip6.__u6_addr.__u6_addr8[i];
5738#endif
5739
5740					if (val == 0) {
5741						*end-- = '0';
5742					} else {
5743						for (; val; val /= 10) {
5744							*end-- = '0' + val % 10;
5745						}
5746					}
5747
5748					if (i > DTRACE_V4MAPPED_OFFSET)
5749						*end-- = '.';
5750				}
5751
5752				if (subr == DIF_SUBR_INET_NTOA6)
5753					goto inetout;
5754
5755				/*
5756				 * Set v6end to skip the IPv4 address that
5757				 * we have already stringified.
5758				 */
5759				v6end = 10;
5760			}
5761
5762			/*
5763			 * Build the IPv6 string by working through the
5764			 * address in reverse.
5765			 */
5766			for (i = v6end; i >= 0; i -= 2) {
5767				ASSERT(end >= base);
5768
5769				if (i == firstzero + numzero - 2) {
5770					*end-- = ':';
5771					*end-- = ':';
5772					i -= numzero - 2;
5773					continue;
5774				}
5775
5776				if (i < 14 && i != firstzero - 2)
5777					*end-- = ':';
5778
5779#if defined(sun)
5780				val = (ip6._S6_un._S6_u8[i] << 8) +
5781				    ip6._S6_un._S6_u8[i + 1];
5782#else
5783				val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
5784				    ip6.__u6_addr.__u6_addr8[i + 1];
5785#endif
5786
5787				if (val == 0) {
5788					*end-- = '0';
5789				} else {
5790					for (; val; val /= 16) {
5791						*end-- = digits[val % 16];
5792					}
5793				}
5794			}
5795			ASSERT(end + 1 >= base);
5796
5797		} else {
5798			/*
			 * The user didn't use AF_INET or AF_INET6.
5800			 */
5801			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5802			regs[rd] = 0;
5803			break;
5804		}
5805
5806inetout:	regs[rd] = (uintptr_t)end + 1;
5807		mstate->dtms_scratch_ptr += size;
5808		break;
5809	}
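
	/*
	 * Both address families above are stringified from the end of the
	 * scratch region backward, since the digits of each group are most
	 * easily generated least-significant first.  A user-space sketch of
	 * the dotted-quad case (ordinary memory instead of scratch space;
	 * not the kernel code itself):
	 *
	 *	#include <stdio.h>
	 *	#include <stdint.h>
	 *
	 *	int
	 *	main(void)
	 *	{
	 *		uint8_t quad[4] = { 192, 168, 0, 1 };	// network order
	 *		char buf[16];		// INET_ADDRSTRLEN
	 *		char *end = &buf[sizeof (buf) - 1];
	 *		int i;
	 *
	 *		*end-- = '\0';
	 *		for (i = 3; i >= 0; i--) {
	 *			uint8_t val = quad[i];
	 *
	 *			if (val == 0) {
	 *				*end-- = '0';
	 *			} else {
	 *				for (; val; val /= 10)
	 *					*end-- = '0' + (val % 10);
	 *			}
	 *			if (i > 0)
	 *				*end-- = '.';
	 *		}
	 *		printf("%s\n", end + 1);	// prints "192.168.0.1"
	 *		return (0);
	 *	}
	 */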
5810
5811	case DIF_SUBR_MEMREF: {
5812		uintptr_t size = 2 * sizeof(uintptr_t);
5813		uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
		size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size;

		/*
		 * Check that the aligned pair of pointers fits in scratch;
		 * without this check the stores below could run past the
		 * end of the scratch region.
		 */
		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			regs[rd] = 0;
			break;
		}

		/* address and length */
		memref[0] = tupregs[0].dttk_value;
		memref[1] = tupregs[1].dttk_value;
5819
5820		regs[rd] = (uintptr_t) memref;
5821		mstate->dtms_scratch_ptr += scratch_size;
5822		break;
5823	}
5824
5825#if !defined(sun)
5826	case DIF_SUBR_MEMSTR: {
5827		char *str = (char *)mstate->dtms_scratch_ptr;
5828		uintptr_t mem = tupregs[0].dttk_value;
5829		char c = tupregs[1].dttk_value;
5830		size_t size = tupregs[2].dttk_value;
5831		uint8_t n;
5832		int i;
5833
5834		regs[rd] = 0;
5835
5836		if (size == 0)
5837			break;
5838
5839		if (!dtrace_canload(mem, size - 1, mstate, vstate))
5840			break;
5841
5842		if (!DTRACE_INSCRATCH(mstate, size)) {
5843			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5844			break;
5845		}
5846
5847		if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {
5848			*flags |= CPU_DTRACE_ILLOP;
5849			break;
5850		}
5851
5852		for (i = 0; i < size - 1; i++) {
5853			n = dtrace_load8(mem++);
5854			str[i] = (n == 0) ? c : n;
5855		}
5856		str[size - 1] = 0;
5857
5858		regs[rd] = (uintptr_t)str;
5859		mstate->dtms_scratch_ptr += size;
5860		break;
5861	}
5862#endif
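
	/*
	 * memstr() lets a D script render a buffer containing embedded NUL
	 * bytes (an argv-style image, for example) as a single string, by
	 * mapping each interior NUL to a delimiter character.  A user-space
	 * sketch of the transformation (plain loads instead of
	 * dtrace_load8(); not the kernel code itself):
	 *
	 *	#include <stdio.h>
	 *	#include <stddef.h>
	 *
	 *	static void
	 *	memstr(char *dst, const char *src, char delim, size_t size)
	 *	{
	 *		size_t i;
	 *
	 *		for (i = 0; i < size - 1; i++)
	 *			dst[i] = (src[i] == '\0') ? delim : src[i];
	 *		dst[size - 1] = '\0';
	 *	}
	 *
	 *	int
	 *	main(void)
	 *	{
	 *		char image[] = "ls\0-l\0/tmp";	// 11 bytes, final NUL
	 *		char buf[sizeof (image)];
	 *
	 *		memstr(buf, image, ' ', sizeof (image));
	 *		printf("%s\n", buf);		// prints "ls -l /tmp"
	 *		return (0);
	 *	}
	 */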
5863
5864	case DIF_SUBR_TYPEREF: {
5865		uintptr_t size = 4 * sizeof(uintptr_t);
5866		uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5867		size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size;
		size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size;

		/* As in the memref case above, verify that the tuple fits. */
		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			regs[rd] = 0;
			break;
		}

		/* address, num_elements, type_str, type_len */
		typeref[0] = tupregs[0].dttk_value;
		typeref[1] = tupregs[1].dttk_value;
		typeref[2] = tupregs[2].dttk_value;
		typeref[3] = tupregs[3].dttk_value;
5875		regs[rd] = (uintptr_t) typeref;
5876		mstate->dtms_scratch_ptr += scratch_size;
5877		break;
5878	}
5879	}
5880}
5881
5882/*
5883 * Emulate the execution of DTrace IR instructions specified by the given
5884 * DIF object.  This function is deliberately void of assertions as all of
5885 * the necessary checks are handled by a call to dtrace_difo_validate().
5886 */
5887static uint64_t
5888dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5889    dtrace_vstate_t *vstate, dtrace_state_t *state)
5890{
5891	const dif_instr_t *text = difo->dtdo_buf;
5892	const uint_t textlen = difo->dtdo_len;
5893	const char *strtab = difo->dtdo_strtab;
5894	const uint64_t *inttab = difo->dtdo_inttab;
5895
5896	uint64_t rval = 0;
5897	dtrace_statvar_t *svar;
5898	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5899	dtrace_difv_t *v;
5900	volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
5901	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
5902
5903	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5904	uint64_t regs[DIF_DIR_NREGS];
5905	uint64_t *tmp;
5906
5907	uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5908	int64_t cc_r;
5909	uint_t pc = 0, id, opc = 0;
5910	uint8_t ttop = 0;
5911	dif_instr_t instr;
5912	uint_t r1, r2, rd;
5913
5914	/*
5915	 * We stash the current DIF object into the machine state: we need it
5916	 * for subsequent access checking.
5917	 */
5918	mstate->dtms_difo = difo;
5919
5920	regs[DIF_REG_R0] = 0; 		/* %r0 is fixed at zero */
5921
5922	while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5923		opc = pc;
5924
5925		instr = text[pc++];
5926		r1 = DIF_INSTR_R1(instr);
5927		r2 = DIF_INSTR_R2(instr);
5928		rd = DIF_INSTR_RD(instr);
5929
5930		switch (DIF_INSTR_OP(instr)) {
5931		case DIF_OP_OR:
5932			regs[rd] = regs[r1] | regs[r2];
5933			break;
5934		case DIF_OP_XOR:
5935			regs[rd] = regs[r1] ^ regs[r2];
5936			break;
5937		case DIF_OP_AND:
5938			regs[rd] = regs[r1] & regs[r2];
5939			break;
5940		case DIF_OP_SLL:
5941			regs[rd] = regs[r1] << regs[r2];
5942			break;
5943		case DIF_OP_SRL:
5944			regs[rd] = regs[r1] >> regs[r2];
5945			break;
5946		case DIF_OP_SUB:
5947			regs[rd] = regs[r1] - regs[r2];
5948			break;
5949		case DIF_OP_ADD:
5950			regs[rd] = regs[r1] + regs[r2];
5951			break;
5952		case DIF_OP_MUL:
5953			regs[rd] = regs[r1] * regs[r2];
5954			break;
5955		case DIF_OP_SDIV:
5956			if (regs[r2] == 0) {
5957				regs[rd] = 0;
5958				*flags |= CPU_DTRACE_DIVZERO;
5959			} else {
5960				regs[rd] = (int64_t)regs[r1] /
5961				    (int64_t)regs[r2];
5962			}
5963			break;
5964
5965		case DIF_OP_UDIV:
5966			if (regs[r2] == 0) {
5967				regs[rd] = 0;
5968				*flags |= CPU_DTRACE_DIVZERO;
5969			} else {
5970				regs[rd] = regs[r1] / regs[r2];
5971			}
5972			break;
5973
5974		case DIF_OP_SREM:
5975			if (regs[r2] == 0) {
5976				regs[rd] = 0;
5977				*flags |= CPU_DTRACE_DIVZERO;
5978			} else {
5979				regs[rd] = (int64_t)regs[r1] %
5980				    (int64_t)regs[r2];
5981			}
5982			break;
5983
5984		case DIF_OP_UREM:
5985			if (regs[r2] == 0) {
5986				regs[rd] = 0;
5987				*flags |= CPU_DTRACE_DIVZERO;
5988			} else {
5989				regs[rd] = regs[r1] % regs[r2];
5990			}
5991			break;
5992
5993		case DIF_OP_NOT:
5994			regs[rd] = ~regs[r1];
5995			break;
5996		case DIF_OP_MOV:
5997			regs[rd] = regs[r1];
5998			break;
5999		case DIF_OP_CMP:
6000			cc_r = regs[r1] - regs[r2];
6001			cc_n = cc_r < 0;
6002			cc_z = cc_r == 0;
6003			cc_v = 0;
6004			cc_c = regs[r1] < regs[r2];
6005			break;
6006		case DIF_OP_TST:
6007			cc_n = cc_v = cc_c = 0;
6008			cc_z = regs[r1] == 0;
6009			break;
6010		case DIF_OP_BA:
6011			pc = DIF_INSTR_LABEL(instr);
6012			break;
6013		case DIF_OP_BE:
6014			if (cc_z)
6015				pc = DIF_INSTR_LABEL(instr);
6016			break;
6017		case DIF_OP_BNE:
6018			if (cc_z == 0)
6019				pc = DIF_INSTR_LABEL(instr);
6020			break;
6021		case DIF_OP_BG:
6022			if ((cc_z | (cc_n ^ cc_v)) == 0)
6023				pc = DIF_INSTR_LABEL(instr);
6024			break;
6025		case DIF_OP_BGU:
6026			if ((cc_c | cc_z) == 0)
6027				pc = DIF_INSTR_LABEL(instr);
6028			break;
6029		case DIF_OP_BGE:
6030			if ((cc_n ^ cc_v) == 0)
6031				pc = DIF_INSTR_LABEL(instr);
6032			break;
6033		case DIF_OP_BGEU:
6034			if (cc_c == 0)
6035				pc = DIF_INSTR_LABEL(instr);
6036			break;
6037		case DIF_OP_BL:
6038			if (cc_n ^ cc_v)
6039				pc = DIF_INSTR_LABEL(instr);
6040			break;
6041		case DIF_OP_BLU:
6042			if (cc_c)
6043				pc = DIF_INSTR_LABEL(instr);
6044			break;
6045		case DIF_OP_BLE:
6046			if (cc_z | (cc_n ^ cc_v))
6047				pc = DIF_INSTR_LABEL(instr);
6048			break;
6049		case DIF_OP_BLEU:
6050			if (cc_c | cc_z)
6051				pc = DIF_INSTR_LABEL(instr);
6052			break;
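
		/*
		 * The branch opcodes above consume SPARC-like condition
		 * codes set by DIF_OP_CMP and DIF_OP_TST: cc_n (negative),
		 * cc_z (zero), cc_c (carry, i.e. unsigned borrow) and cc_v
		 * (overflow, hardwired to zero by this emulator).  Keeping
		 * separate signed (BG/BGE/BL/BLE) and unsigned
		 * (BGU/BGEU/BLU/BLEU) branches matters; a user-space sketch
		 * of why, using the same flag computations as DIF_OP_CMP
		 * (not the kernel code itself):
		 *
		 *	#include <stdio.h>
		 *	#include <stdint.h>
		 *
		 *	int
		 *	main(void)
		 *	{
		 *		uint64_t r1 = 1, r2 = -1ULL;
		 *		int64_t cc_r = r1 - r2;
		 *		int cc_n = cc_r < 0;
		 *		int cc_z = cc_r == 0;
		 *		int cc_v = 0;
		 *		int cc_c = r1 < r2;
		 *
		 *		// BG ("signed >") is taken when
		 *		// (cc_z | (cc_n ^ cc_v)) == 0.
		 *		printf("%d\n", (cc_z | (cc_n ^ cc_v)) == 0);
		 *		// prints 1: 1 > -1 as signed values
		 *
		 *		// BGU ("unsigned >") is taken when
		 *		// (cc_c | cc_z) == 0.
		 *		printf("%d\n", (cc_c | cc_z) == 0);
		 *		// prints 0: 1 < ~0 as unsigned values
		 *		return (0);
		 *	}
		 */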
6053		case DIF_OP_RLDSB:
6054			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6055				break;
6056			/*FALLTHROUGH*/
6057		case DIF_OP_LDSB:
6058			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
6059			break;
6060		case DIF_OP_RLDSH:
6061			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6062				break;
6063			/*FALLTHROUGH*/
6064		case DIF_OP_LDSH:
6065			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
6066			break;
6067		case DIF_OP_RLDSW:
6068			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6069				break;
6070			/*FALLTHROUGH*/
6071		case DIF_OP_LDSW:
6072			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
6073			break;
6074		case DIF_OP_RLDUB:
6075			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6076				break;
6077			/*FALLTHROUGH*/
6078		case DIF_OP_LDUB:
6079			regs[rd] = dtrace_load8(regs[r1]);
6080			break;
6081		case DIF_OP_RLDUH:
6082			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6083				break;
6084			/*FALLTHROUGH*/
6085		case DIF_OP_LDUH:
6086			regs[rd] = dtrace_load16(regs[r1]);
6087			break;
6088		case DIF_OP_RLDUW:
6089			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6090				break;
6091			/*FALLTHROUGH*/
6092		case DIF_OP_LDUW:
6093			regs[rd] = dtrace_load32(regs[r1]);
6094			break;
6095		case DIF_OP_RLDX:
6096			if (!dtrace_canload(regs[r1], 8, mstate, vstate))
6097				break;
6098			/*FALLTHROUGH*/
6099		case DIF_OP_LDX:
6100			regs[rd] = dtrace_load64(regs[r1]);
6101			break;
6102		case DIF_OP_ULDSB:
6103			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6104			regs[rd] = (int8_t)
6105			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6106			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6107			break;
6108		case DIF_OP_ULDSH:
6109			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6110			regs[rd] = (int16_t)
6111			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6112			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6113			break;
6114		case DIF_OP_ULDSW:
6115			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6116			regs[rd] = (int32_t)
6117			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6118			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6119			break;
6120		case DIF_OP_ULDUB:
6121			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6122			regs[rd] =
6123			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6124			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6125			break;
6126		case DIF_OP_ULDUH:
6127			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6128			regs[rd] =
6129			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6130			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6131			break;
6132		case DIF_OP_ULDUW:
6133			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6134			regs[rd] =
6135			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6136			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6137			break;
6138		case DIF_OP_ULDX:
6139			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6140			regs[rd] =
6141			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
6142			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6143			break;
6144		case DIF_OP_RET:
6145			rval = regs[rd];
6146			pc = textlen;
6147			break;
6148		case DIF_OP_NOP:
6149			break;
6150		case DIF_OP_SETX:
6151			regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6152			break;
6153		case DIF_OP_SETS:
6154			regs[rd] = (uint64_t)(uintptr_t)
6155			    (strtab + DIF_INSTR_STRING(instr));
6156			break;
6157		case DIF_OP_SCMP: {
6158			size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6159			uintptr_t s1 = regs[r1];
6160			uintptr_t s2 = regs[r2];
6161
6162			if (s1 != 0 &&
6163			    !dtrace_strcanload(s1, sz, mstate, vstate))
6164				break;
6165			if (s2 != 0 &&
6166			    !dtrace_strcanload(s2, sz, mstate, vstate))
6167				break;
6168
6169			cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
6170
6171			cc_n = cc_r < 0;
6172			cc_z = cc_r == 0;
6173			cc_v = cc_c = 0;
6174			break;
6175		}
6176		case DIF_OP_LDGA:
6177			regs[rd] = dtrace_dif_variable(mstate, state,
6178			    r1, regs[r2]);
6179			break;
6180		case DIF_OP_LDGS:
6181			id = DIF_INSTR_VAR(instr);
6182
6183			if (id >= DIF_VAR_OTHER_UBASE) {
6184				uintptr_t a;
6185
6186				id -= DIF_VAR_OTHER_UBASE;
6187				svar = vstate->dtvs_globals[id];
6188				ASSERT(svar != NULL);
6189				v = &svar->dtsv_var;
6190
6191				if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6192					regs[rd] = svar->dtsv_data;
6193					break;
6194				}
6195
6196				a = (uintptr_t)svar->dtsv_data;
6197
6198				if (*(uint8_t *)a == UINT8_MAX) {
6199					/*
6200					 * If the 0th byte is set to UINT8_MAX
6201					 * then this is to be treated as a
6202					 * reference to a NULL variable.
6203					 */
6204					regs[rd] = 0;
6205				} else {
6206					regs[rd] = a + sizeof (uint64_t);
6207				}
6208
6209				break;
6210			}
6211
6212			regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6213			break;
6214
6215		case DIF_OP_STGS:
6216			id = DIF_INSTR_VAR(instr);
6217
6218			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6219			id -= DIF_VAR_OTHER_UBASE;
6220
6221			svar = vstate->dtvs_globals[id];
6222			ASSERT(svar != NULL);
6223			v = &svar->dtsv_var;
6224
6225			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6226				uintptr_t a = (uintptr_t)svar->dtsv_data;
6227
6228				ASSERT(a != 0);
6229				ASSERT(svar->dtsv_size != 0);
6230
6231				if (regs[rd] == 0) {
6232					*(uint8_t *)a = UINT8_MAX;
6233					break;
6234				} else {
6235					*(uint8_t *)a = 0;
6236					a += sizeof (uint64_t);
6237				}
6238				if (!dtrace_vcanload(
6239				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6240				    mstate, vstate))
6241					break;
6242
6243				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6244				    (void *)a, &v->dtdv_type);
6245				break;
6246			}
6247
6248			svar->dtsv_data = regs[rd];
6249			break;
6250
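		/*
		 * The UINT8_MAX convention used by DIF_OP_LDGS/DIF_OP_STGS
		 * above lets a by-ref static distinguish a NULL assignment
		 * from an empty value: the variable's storage is prefixed by
		 * one uint64_t of metadata, and the first byte of that word
		 * is UINT8_MAX if and only if the variable is NULL.  A
		 * user-space sketch of the encoding (hypothetical var_store
		 * and var_load helpers; not the kernel code itself):
		 *
		 *	#include <stdio.h>
		 *	#include <stdint.h>
		 *	#include <string.h>
		 *
		 *	static uint64_t storage[4];	// metadata word + data
		 *
		 *	static void
		 *	var_store(const char *s)
		 *	{
		 *		uint8_t *a = (uint8_t *)storage;
		 *
		 *		if (s == NULL) {
		 *			*a = UINT8_MAX;	// mark the variable NULL
		 *			return;
		 *		}
		 *		*a = 0;
		 *		strcpy((char *)(a + sizeof (uint64_t)), s);
		 *	}
		 *
		 *	static const char *
		 *	var_load(void)
		 *	{
		 *		uint8_t *a = (uint8_t *)storage;
		 *
		 *		if (*a == UINT8_MAX)
		 *			return (NULL);
		 *		return ((const char *)(a + sizeof (uint64_t)));
		 *	}
		 *
		 *	int
		 *	main(void)
		 *	{
		 *		var_store("abc");
		 *		printf("%s\n", var_load());	// prints "abc"
		 *		var_store(NULL);
		 *		printf("%d\n", var_load() == NULL);	// prints 1
		 *		return (0);
		 *	}
		 */
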
6251		case DIF_OP_LDTA:
6252			/*
6253			 * There are no DTrace built-in thread-local arrays at
6254			 * present.  This opcode is saved for future work.
6255			 */
6256			*flags |= CPU_DTRACE_ILLOP;
6257			regs[rd] = 0;
6258			break;
6259
6260		case DIF_OP_LDLS:
6261			id = DIF_INSTR_VAR(instr);
6262
6263			if (id < DIF_VAR_OTHER_UBASE) {
6264				/*
6265				 * For now, this has no meaning.
6266				 */
6267				regs[rd] = 0;
6268				break;
6269			}
6270
6271			id -= DIF_VAR_OTHER_UBASE;
6272
6273			ASSERT(id < vstate->dtvs_nlocals);
6274			ASSERT(vstate->dtvs_locals != NULL);
6275
6276			svar = vstate->dtvs_locals[id];
6277			ASSERT(svar != NULL);
6278			v = &svar->dtsv_var;
6279
6280			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6281				uintptr_t a = (uintptr_t)svar->dtsv_data;
6282				size_t sz = v->dtdv_type.dtdt_size;
6283
6284				sz += sizeof (uint64_t);
6285				ASSERT(svar->dtsv_size == NCPU * sz);
6286				a += curcpu * sz;
6287
6288				if (*(uint8_t *)a == UINT8_MAX) {
6289					/*
6290					 * If the 0th byte is set to UINT8_MAX
6291					 * then this is to be treated as a
6292					 * reference to a NULL variable.
6293					 */
6294					regs[rd] = 0;
6295				} else {
6296					regs[rd] = a + sizeof (uint64_t);
6297				}
6298
6299				break;
6300			}
6301
6302			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6303			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6304			regs[rd] = tmp[curcpu];
6305			break;
6306
6307		case DIF_OP_STLS:
6308			id = DIF_INSTR_VAR(instr);
6309
6310			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6311			id -= DIF_VAR_OTHER_UBASE;
6312			ASSERT(id < vstate->dtvs_nlocals);
6313
6314			ASSERT(vstate->dtvs_locals != NULL);
6315			svar = vstate->dtvs_locals[id];
6316			ASSERT(svar != NULL);
6317			v = &svar->dtsv_var;
6318
6319			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6320				uintptr_t a = (uintptr_t)svar->dtsv_data;
6321				size_t sz = v->dtdv_type.dtdt_size;
6322
6323				sz += sizeof (uint64_t);
6324				ASSERT(svar->dtsv_size == NCPU * sz);
6325				a += curcpu * sz;
6326
6327				if (regs[rd] == 0) {
6328					*(uint8_t *)a = UINT8_MAX;
6329					break;
6330				} else {
6331					*(uint8_t *)a = 0;
6332					a += sizeof (uint64_t);
6333				}
6334
6335				if (!dtrace_vcanload(
6336				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6337				    mstate, vstate))
6338					break;
6339
6340				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6341				    (void *)a, &v->dtdv_type);
6342				break;
6343			}
6344
6345			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6346			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6347			tmp[curcpu] = regs[rd];
6348			break;
6349
6350		case DIF_OP_LDTS: {
6351			dtrace_dynvar_t *dvar;
6352			dtrace_key_t *key;
6353
6354			id = DIF_INSTR_VAR(instr);
6355			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6356			id -= DIF_VAR_OTHER_UBASE;
6357			v = &vstate->dtvs_tlocals[id];
6358
6359			key = &tupregs[DIF_DTR_NREGS];
6360			key[0].dttk_value = (uint64_t)id;
6361			key[0].dttk_size = 0;
6362			DTRACE_TLS_THRKEY(key[1].dttk_value);
6363			key[1].dttk_size = 0;
6364
6365			dvar = dtrace_dynvar(dstate, 2, key,
6366			    sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6367			    mstate, vstate);
6368
6369			if (dvar == NULL) {
6370				regs[rd] = 0;
6371				break;
6372			}
6373
6374			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6375				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6376			} else {
6377				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6378			}
6379
6380			break;
6381		}
6382
6383		case DIF_OP_STTS: {
6384			dtrace_dynvar_t *dvar;
6385			dtrace_key_t *key;
6386
6387			id = DIF_INSTR_VAR(instr);
6388			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6389			id -= DIF_VAR_OTHER_UBASE;
6390
6391			key = &tupregs[DIF_DTR_NREGS];
6392			key[0].dttk_value = (uint64_t)id;
6393			key[0].dttk_size = 0;
6394			DTRACE_TLS_THRKEY(key[1].dttk_value);
6395			key[1].dttk_size = 0;
6396			v = &vstate->dtvs_tlocals[id];
6397
6398			dvar = dtrace_dynvar(dstate, 2, key,
6399			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6400			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6401			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6402			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6403
6404			/*
6405			 * Given that we're storing to thread-local data,
6406			 * we need to flush our predicate cache.
6407			 */
6408			curthread->t_predcache = 0;
6409
6410			if (dvar == NULL)
6411				break;
6412
6413			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6414				if (!dtrace_vcanload(
6415				    (void *)(uintptr_t)regs[rd],
6416				    &v->dtdv_type, mstate, vstate))
6417					break;
6418
6419				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6420				    dvar->dtdv_data, &v->dtdv_type);
6421			} else {
6422				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6423			}
6424
6425			break;
6426		}
6427
6428		case DIF_OP_SRA:
6429			regs[rd] = (int64_t)regs[r1] >> regs[r2];
6430			break;
6431
6432		case DIF_OP_CALL:
6433			dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6434			    regs, tupregs, ttop, mstate, state);
6435			break;
6436
6437		case DIF_OP_PUSHTR:
6438			if (ttop == DIF_DTR_NREGS) {
6439				*flags |= CPU_DTRACE_TUPOFLOW;
6440				break;
6441			}
6442
6443			if (r1 == DIF_TYPE_STRING) {
6444				/*
6445				 * If this is a string type and the size is 0,
6446				 * we'll use the system-wide default string
6447				 * size.  Note that we are _not_ looking at
6448				 * the value of the DTRACEOPT_STRSIZE option;
6449				 * had this been set, we would expect to have
6450				 * a non-zero size value in the "pushtr".
6451				 */
6452				tupregs[ttop].dttk_size =
6453				    dtrace_strlen((char *)(uintptr_t)regs[rd],
6454				    regs[r2] ? regs[r2] :
6455				    dtrace_strsize_default) + 1;
6456			} else {
6457				tupregs[ttop].dttk_size = regs[r2];
6458			}
6459
6460			tupregs[ttop++].dttk_value = regs[rd];
6461			break;
6462
6463		case DIF_OP_PUSHTV:
6464			if (ttop == DIF_DTR_NREGS) {
6465				*flags |= CPU_DTRACE_TUPOFLOW;
6466				break;
6467			}
6468
6469			tupregs[ttop].dttk_value = regs[rd];
6470			tupregs[ttop++].dttk_size = 0;
6471			break;
6472
6473		case DIF_OP_POPTS:
6474			if (ttop != 0)
6475				ttop--;
6476			break;
6477
6478		case DIF_OP_FLUSHTS:
6479			ttop = 0;
6480			break;
6481
6482		case DIF_OP_LDGAA:
6483		case DIF_OP_LDTAA: {
6484			dtrace_dynvar_t *dvar;
6485			dtrace_key_t *key = tupregs;
6486			uint_t nkeys = ttop;
6487
6488			id = DIF_INSTR_VAR(instr);
6489			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6490			id -= DIF_VAR_OTHER_UBASE;
6491
6492			key[nkeys].dttk_value = (uint64_t)id;
6493			key[nkeys++].dttk_size = 0;
6494
6495			if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6496				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6497				key[nkeys++].dttk_size = 0;
6498				v = &vstate->dtvs_tlocals[id];
6499			} else {
6500				v = &vstate->dtvs_globals[id]->dtsv_var;
6501			}
6502
6503			dvar = dtrace_dynvar(dstate, nkeys, key,
6504			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6505			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6506			    DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6507
6508			if (dvar == NULL) {
6509				regs[rd] = 0;
6510				break;
6511			}
6512
6513			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6514				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6515			} else {
6516				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6517			}
6518
6519			break;
6520		}
6521
6522		case DIF_OP_STGAA:
6523		case DIF_OP_STTAA: {
6524			dtrace_dynvar_t *dvar;
6525			dtrace_key_t *key = tupregs;
6526			uint_t nkeys = ttop;
6527
6528			id = DIF_INSTR_VAR(instr);
6529			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6530			id -= DIF_VAR_OTHER_UBASE;
6531
6532			key[nkeys].dttk_value = (uint64_t)id;
6533			key[nkeys++].dttk_size = 0;
6534
6535			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6536				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6537				key[nkeys++].dttk_size = 0;
6538				v = &vstate->dtvs_tlocals[id];
6539			} else {
6540				v = &vstate->dtvs_globals[id]->dtsv_var;
6541			}
6542
6543			dvar = dtrace_dynvar(dstate, nkeys, key,
6544			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6545			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6546			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6547			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6548
6549			if (dvar == NULL)
6550				break;
6551
6552			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6553				if (!dtrace_vcanload(
6554				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6555				    mstate, vstate))
6556					break;
6557
6558				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6559				    dvar->dtdv_data, &v->dtdv_type);
6560			} else {
6561				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6562			}
6563
6564			break;
6565		}
6566
6567		case DIF_OP_ALLOCS: {
6568			uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6569			size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6570
6571			/*
			 * Rounding up the user allocation size could have
			 * overflowed a large, bogus allocation (like -1ULL)
			 * to 0.
6575			 */
6576			if (size < regs[r1] ||
6577			    !DTRACE_INSCRATCH(mstate, size)) {
6578				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6579				regs[rd] = 0;
6580				break;
6581			}
6582
6583			dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6584			mstate->dtms_scratch_ptr += size;
6585			regs[rd] = ptr;
6586			break;
6587		}
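
		/*
		 * The "size < regs[r1]" test above is the wrap-around check
		 * that the comment describes.  A user-space sketch of the
		 * failure mode it guards against, using the usual P2ROUNDUP
		 * definition (not the kernel code itself):
		 *
		 *	#include <stdio.h>
		 *	#include <stdint.h>
		 *
		 *	#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))
		 *
		 *	int
		 *	main(void)
		 *	{
		 *		uint64_t scratch = 17;	// unaligned scratch ptr
		 *		uint64_t req = -1ULL;	// bogus allocation size
		 *		uint64_t ptr = P2ROUNDUP(scratch, 8);
		 *		uint64_t size = ptr - scratch + req;
		 *
		 *		printf("%d\n", size < req);	// prints 1: wrapped
		 *		return (0);
		 *	}
		 */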
6588
6589		case DIF_OP_COPYS:
6590			if (!dtrace_canstore(regs[rd], regs[r2],
6591			    mstate, vstate)) {
6592				*flags |= CPU_DTRACE_BADADDR;
6593				*illval = regs[rd];
6594				break;
6595			}
6596
6597			if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6598				break;
6599
6600			dtrace_bcopy((void *)(uintptr_t)regs[r1],
6601			    (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6602			break;
6603
6604		case DIF_OP_STB:
6605			if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6606				*flags |= CPU_DTRACE_BADADDR;
6607				*illval = regs[rd];
6608				break;
6609			}
6610			*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6611			break;
6612
6613		case DIF_OP_STH:
6614			if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6615				*flags |= CPU_DTRACE_BADADDR;
6616				*illval = regs[rd];
6617				break;
6618			}
6619			if (regs[rd] & 1) {
6620				*flags |= CPU_DTRACE_BADALIGN;
6621				*illval = regs[rd];
6622				break;
6623			}
6624			*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6625			break;
6626
6627		case DIF_OP_STW:
6628			if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6629				*flags |= CPU_DTRACE_BADADDR;
6630				*illval = regs[rd];
6631				break;
6632			}
6633			if (regs[rd] & 3) {
6634				*flags |= CPU_DTRACE_BADALIGN;
6635				*illval = regs[rd];
6636				break;
6637			}
6638			*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6639			break;
6640
6641		case DIF_OP_STX:
6642			if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6643				*flags |= CPU_DTRACE_BADADDR;
6644				*illval = regs[rd];
6645				break;
6646			}
6647			if (regs[rd] & 7) {
6648				*flags |= CPU_DTRACE_BADALIGN;
6649				*illval = regs[rd];
6650				break;
6651			}
6652			*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6653			break;
6654		}
6655	}
6656
6657	if (!(*flags & CPU_DTRACE_FAULT))
6658		return (rval);
6659
6660	mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6661	mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6662
6663	return (0);
6664}
6665
6666static void
6667dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6668{
6669	dtrace_probe_t *probe = ecb->dte_probe;
6670	dtrace_provider_t *prov = probe->dtpr_provider;
6671	char c[DTRACE_FULLNAMELEN + 80], *str;
6672	char *msg = "dtrace: breakpoint action at probe ";
6673	char *ecbmsg = " (ecb ";
6674	uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6675	uintptr_t val = (uintptr_t)ecb;
6676	int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6677
6678	if (dtrace_destructive_disallow)
6679		return;
6680
6681	/*
6682	 * It's impossible to be taking action on the NULL probe.
6683	 */
6684	ASSERT(probe != NULL);
6685
6686	/*
6687	 * This is a poor man's (destitute man's?) sprintf():  we want to
6688	 * print the provider name, module name, function name and name of
6689	 * the probe, along with the hex address of the ECB with the breakpoint
6690	 * action -- all of which we must place in the character buffer by
6691	 * hand.
6692	 */
6693	while (*msg != '\0')
6694		c[i++] = *msg++;
6695
6696	for (str = prov->dtpv_name; *str != '\0'; str++)
6697		c[i++] = *str;
6698	c[i++] = ':';
6699
6700	for (str = probe->dtpr_mod; *str != '\0'; str++)
6701		c[i++] = *str;
6702	c[i++] = ':';
6703
6704	for (str = probe->dtpr_func; *str != '\0'; str++)
6705		c[i++] = *str;
6706	c[i++] = ':';
6707
6708	for (str = probe->dtpr_name; *str != '\0'; str++)
6709		c[i++] = *str;
6710
6711	while (*ecbmsg != '\0')
6712		c[i++] = *ecbmsg++;
6713
6714	while (shift >= 0) {
6715		mask = (uintptr_t)0xf << shift;
6716
6717		if (val >= ((uintptr_t)1 << shift))
6718			c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6719		shift -= 4;
6720	}
6721
6722	c[i++] = ')';
6723	c[i] = '\0';
6724
6725#if defined(sun)
6726	debug_enter(c);
6727#else
6728	kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
6729#endif
6730}
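
/*
 * The nibble loop above is all that probe context allows; sprintf() and
 * friends are off limits here.  The same technique as a stand-alone
 * user-space program, with NBBY written as 8 (not the kernel code itself):
 *
 *	#include <stdio.h>
 *	#include <stdint.h>
 *
 *	int
 *	main(void)
 *	{
 *		uintptr_t val = (uintptr_t)0xdeadbeef;
 *		int shift = (sizeof (uintptr_t) * 8) - 4;	// NBBY == 8
 *		char c[2 * sizeof (uintptr_t) + 1];
 *		int i = 0;
 *
 *		while (shift >= 0) {
 *			uintptr_t mask = (uintptr_t)0xf << shift;
 *
 *			// Leading zero nibbles are suppressed; once the
 *			// first significant nibble is seen, every nibble
 *			// after it is emitted.
 *			if (val >= ((uintptr_t)1 << shift))
 *				c[i++] = "0123456789abcdef"
 *				    [(val & mask) >> shift];
 *			shift -= 4;
 *		}
 *		c[i] = '\0';
 *		printf("%s\n", c);	// prints "deadbeef"
 *		return (0);
 *	}
 */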
6731
6732static void
6733dtrace_action_panic(dtrace_ecb_t *ecb)
6734{
6735	dtrace_probe_t *probe = ecb->dte_probe;
6736
6737	/*
6738	 * It's impossible to be taking action on the NULL probe.
6739	 */
6740	ASSERT(probe != NULL);
6741
6742	if (dtrace_destructive_disallow)
6743		return;
6744
6745	if (dtrace_panicked != NULL)
6746		return;
6747
6748	if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
6749		return;
6750
6751	/*
6752	 * We won the right to panic.  (We want to be sure that only one
6753	 * thread calls panic() from dtrace_probe(), and that panic() is
6754	 * called exactly once.)
6755	 */
6756	dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6757	    probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6758	    probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6759}
6760
6761static void
6762dtrace_action_raise(uint64_t sig)
6763{
6764	if (dtrace_destructive_disallow)
6765		return;
6766
6767	if (sig >= NSIG) {
6768		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6769		return;
6770	}
6771
6772#if defined(sun)
6773	/*
6774	 * raise() has a queue depth of 1 -- we ignore all subsequent
6775	 * invocations of the raise() action.
6776	 */
6777	if (curthread->t_dtrace_sig == 0)
6778		curthread->t_dtrace_sig = (uint8_t)sig;
6779
6780	curthread->t_sig_check = 1;
6781	aston(curthread);
6782#else
6783	struct proc *p = curproc;
6784	PROC_LOCK(p);
6785	kern_psignal(p, sig);
6786	PROC_UNLOCK(p);
6787#endif
6788}
6789
6790static void
6791dtrace_action_stop(void)
6792{
6793	if (dtrace_destructive_disallow)
6794		return;
6795
6796#if defined(sun)
6797	if (!curthread->t_dtrace_stop) {
6798		curthread->t_dtrace_stop = 1;
6799		curthread->t_sig_check = 1;
6800		aston(curthread);
6801	}
6802#else
6803	struct proc *p = curproc;
6804	PROC_LOCK(p);
6805	kern_psignal(p, SIGSTOP);
6806	PROC_UNLOCK(p);
6807#endif
6808}
6809
6810static void
6811dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6812{
6813	hrtime_t now;
6814	volatile uint16_t *flags;
6815#if defined(sun)
6816	cpu_t *cpu = CPU;
6817#else
6818	cpu_t *cpu = &solaris_cpu[curcpu];
6819#endif
6820
6821	if (dtrace_destructive_disallow)
6822		return;
6823
6824	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6825
6826	now = dtrace_gethrtime();
6827
6828	if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6829		/*
6830		 * We need to advance the mark to the current time.
6831		 */
6832		cpu->cpu_dtrace_chillmark = now;
6833		cpu->cpu_dtrace_chilled = 0;
6834	}
6835
6836	/*
6837	 * Now check to see if the requested chill time would take us over
6838	 * the maximum amount of time allowed in the chill interval.  (Or
6839	 * worse, if the calculation itself induces overflow.)
6840	 */
6841	if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6842	    cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6843		*flags |= CPU_DTRACE_ILLOP;
6844		return;
6845	}
6846
6847	while (dtrace_gethrtime() - now < val)
6848		continue;
6849
6850	/*
6851	 * Normally, we assure that the value of the variable "timestamp" does
6852	 * not change within an ECB.  The presence of chill() represents an
6853	 * exception to this rule, however.
6854	 */
6855	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6856	cpu->cpu_dtrace_chilled += val;
6857}
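
/*
 * The second half of the chill-time test above is an overflow guard: if
 * adding val to the accumulated chill time wraps, the sum compares less
 * than the original accumulator.  The idiom in isolation, shown with
 * unsigned arithmetic where wrap-around is well defined (not kernel code):
 *
 *	#include <stdio.h>
 *	#include <stdint.h>
 *
 *	int
 *	main(void)
 *	{
 *		uint64_t chilled = UINT64_MAX - 5;
 *		uint64_t val = 10;
 *
 *		printf("%d\n", chilled + val < chilled);	// prints 1
 *		return (0);
 *	}
 */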
6858
6859static void
6860dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6861    uint64_t *buf, uint64_t arg)
6862{
6863	int nframes = DTRACE_USTACK_NFRAMES(arg);
6864	int strsize = DTRACE_USTACK_STRSIZE(arg);
6865	uint64_t *pcs = &buf[1], *fps;
6866	char *str = (char *)&pcs[nframes];
6867	int size, offs = 0, i, j;
6868	uintptr_t old = mstate->dtms_scratch_ptr, saved;
6869	uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
6870	char *sym;
6871
6872	/*
6873	 * Should be taking a faster path if string space has not been
6874	 * allocated.
6875	 */
6876	ASSERT(strsize != 0);
6877
6878	/*
6879	 * We will first allocate some temporary space for the frame pointers.
6880	 */
6881	fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6882	size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6883	    (nframes * sizeof (uint64_t));
6884
6885	if (!DTRACE_INSCRATCH(mstate, size)) {
6886		/*
6887		 * Not enough room for our frame pointers -- need to indicate
6888		 * that we ran out of scratch space.
6889		 */
6890		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6891		return;
6892	}
6893
6894	mstate->dtms_scratch_ptr += size;
6895	saved = mstate->dtms_scratch_ptr;
6896
6897	/*
6898	 * Now get a stack with both program counters and frame pointers.
6899	 */
6900	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6901	dtrace_getufpstack(buf, fps, nframes + 1);
6902	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6903
6904	/*
6905	 * If that faulted, we're cooked.
6906	 */
6907	if (*flags & CPU_DTRACE_FAULT)
6908		goto out;
6909
6910	/*
6911	 * Now we want to walk up the stack, calling the USTACK helper.  For
6912	 * each iteration, we restore the scratch pointer.
6913	 */
6914	for (i = 0; i < nframes; i++) {
6915		mstate->dtms_scratch_ptr = saved;
6916
6917		if (offs >= strsize)
6918			break;
6919
6920		sym = (char *)(uintptr_t)dtrace_helper(
6921		    DTRACE_HELPER_ACTION_USTACK,
6922		    mstate, state, pcs[i], fps[i]);
6923
6924		/*
6925		 * If we faulted while running the helper, we're going to
6926		 * clear the fault and null out the corresponding string.
6927		 */
6928		if (*flags & CPU_DTRACE_FAULT) {
6929			*flags &= ~CPU_DTRACE_FAULT;
6930			str[offs++] = '\0';
6931			continue;
6932		}
6933
6934		if (sym == NULL) {
6935			str[offs++] = '\0';
6936			continue;
6937		}
6938
6939		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6940
6941		/*
6942		 * Now copy in the string that the helper returned to us.
6943		 */
6944		for (j = 0; offs + j < strsize; j++) {
6945			if ((str[offs + j] = sym[j]) == '\0')
6946				break;
6947		}
6948
6949		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6950
6951		offs += j + 1;
6952	}
6953
6954	if (offs >= strsize) {
6955		/*
6956		 * If we didn't have room for all of the strings, we don't
6957		 * abort processing -- this needn't be a fatal error -- but we
6958		 * still want to increment a counter (dts_stkstroverflows) to
6959		 * allow this condition to be warned about.  (If this is from
6960		 * a jstack() action, it is easily tuned via jstackstrsize.)
6961		 */
6962		dtrace_error(&state->dts_stkstroverflows);
6963	}
6964
6965	while (offs < strsize)
6966		str[offs++] = '\0';
6967
6968out:
6969	mstate->dtms_scratch_ptr = old;
6970}
6971
6972static void
6973dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6974    size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6975{
6976	volatile uint16_t *flags;
6977	uint64_t val = *valp;
6978	size_t valoffs = *valoffsp;
6979
6980	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6981	ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6982
6983	/*
6984	 * If this is a string, we're going to only load until we find the zero
6985	 * byte -- after which we'll store zero bytes.
6986	 */
6987	if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6988		char c = '\0' + 1;
6989		size_t s;
6990
6991		for (s = 0; s < size; s++) {
6992			if (c != '\0' && dtkind == DIF_TF_BYREF) {
6993				c = dtrace_load8(val++);
6994			} else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6995				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6996				c = dtrace_fuword8((void *)(uintptr_t)val++);
6997				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6998				if (*flags & CPU_DTRACE_FAULT)
6999					break;
7000			}
7001
7002			DTRACE_STORE(uint8_t, tomax, valoffs++, c);
7003
7004			if (c == '\0' && intuple)
7005				break;
7006		}
7007	} else {
7008		uint8_t c;
7009		while (valoffs < end) {
7010			if (dtkind == DIF_TF_BYREF) {
7011				c = dtrace_load8(val++);
7012			} else if (dtkind == DIF_TF_BYUREF) {
7013				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7014				c = dtrace_fuword8((void *)(uintptr_t)val++);
7015				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7016				if (*flags & CPU_DTRACE_FAULT)
7017					break;
7018			}
7019
7020			DTRACE_STORE(uint8_t, tomax,
7021			    valoffs++, c);
7022		}
7023	}
7024
7025	*valp = val;
7026	*valoffsp = valoffs;
7027}
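
/*
 * For the string case above, note that once the terminating NUL has been
 * loaded, 'c' stays NUL and the loop keeps storing it -- so short strings
 * are zero-filled out to the record size (unless we are in a tuple, where
 * we stop at the NUL).  A user-space sketch of that behavior, minus the
 * NOFAULT machinery and using a hypothetical store_string() helper (not
 * the kernel code itself):
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	static void
 *	store_string(char *tomax, size_t valoffs, const char *val,
 *	    size_t size)
 *	{
 *		char c = '\0' + 1;	// anything but NUL, to prime the loop
 *		size_t s;
 *
 *		for (s = 0; s < size; s++) {
 *			if (c != '\0')
 *				c = *val++;
 *			tomax[valoffs++] = c;
 *		}
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		char rec[8];
 *
 *		memset(rec, 'X', sizeof (rec));
 *		store_string(rec, 0, "hi", sizeof (rec));
 *		printf("%zu\n", strlen(rec));	// prints 2; rest is NULs
 *		return (0);
 *	}
 */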
7028
7029/*
7030 * If you're looking for the epicenter of DTrace, you just found it.  This
7031 * is the function called by the provider to fire a probe -- from which all
7032 * subsequent probe-context DTrace activity emanates.
7033 */
7034void
7035dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
7036    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
7037{
7038	processorid_t cpuid;
7039	dtrace_icookie_t cookie;
7040	dtrace_probe_t *probe;
7041	dtrace_mstate_t mstate;
7042	dtrace_ecb_t *ecb;
7043	dtrace_action_t *act;
7044	intptr_t offs;
7045	size_t size;
7046	int vtime, onintr;
7047	volatile uint16_t *flags;
7048	hrtime_t now;
7049
7050	if (panicstr != NULL)
7051		return;
7052
7053#if defined(sun)
7054	/*
7055	 * Kick out immediately if this CPU is still being born (in which case
7056	 * curthread will be set to -1) or the current thread can't allow
7057	 * probes in its current context.
7058	 */
7059	if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
7060		return;
7061#endif
7062
7063	cookie = dtrace_interrupt_disable();
7064	probe = dtrace_probes[id - 1];
7065	cpuid = curcpu;
7066	onintr = CPU_ON_INTR(CPU);
7067
7068	if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
7069	    probe->dtpr_predcache == curthread->t_predcache) {
7070		/*
7071		 * We have hit in the predicate cache; we know that
7072		 * this predicate would evaluate to be false.
7073		 */
7074		dtrace_interrupt_enable(cookie);
7075		return;
7076	}
7077
7078#if defined(sun)
7079	if (panic_quiesce) {
7080#else
7081	if (panicstr != NULL) {
7082#endif
7083		/*
7084		 * We don't trace anything if we're panicking.
7085		 */
7086		dtrace_interrupt_enable(cookie);
7087		return;
7088	}
7089
7090	now = mstate.dtms_timestamp = dtrace_gethrtime();
7091	mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7092	vtime = dtrace_vtime_references != 0;
7093
7094	if (vtime && curthread->t_dtrace_start)
7095		curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
7096
7097	mstate.dtms_difo = NULL;
7098	mstate.dtms_probe = probe;
7099	mstate.dtms_strtok = 0;
7100	mstate.dtms_arg[0] = arg0;
7101	mstate.dtms_arg[1] = arg1;
7102	mstate.dtms_arg[2] = arg2;
7103	mstate.dtms_arg[3] = arg3;
7104	mstate.dtms_arg[4] = arg4;
7105
7106	flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7107
7108	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7109		dtrace_predicate_t *pred = ecb->dte_predicate;
7110		dtrace_state_t *state = ecb->dte_state;
7111		dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7112		dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7113		dtrace_vstate_t *vstate = &state->dts_vstate;
7114		dtrace_provider_t *prov = probe->dtpr_provider;
7115		uint64_t tracememsize = 0;
7116		int committed = 0;
7117		caddr_t tomax;
7118
7119		/*
7120		 * A little subtlety with the following (seemingly innocuous)
7121		 * declaration of the automatic 'val':  by looking at the
7122		 * code, you might think that it could be declared in the
7123		 * action processing loop, below.  (That is, it's only used in
7124		 * the action processing loop.)  However, it must be declared
7125		 * out of that scope because in the case of DIF expression
7126		 * arguments to aggregating actions, one iteration of the
7127		 * action loop will use the last iteration's value.
7128		 */
7129		uint64_t val = 0;
7130
7131		mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7132		mstate.dtms_getf = NULL;
7133
7134		*flags &= ~CPU_DTRACE_ERROR;
7135
7136		if (prov == dtrace_provider) {
7137			/*
7138			 * If dtrace itself is the provider of this probe,
7139			 * we're only going to continue processing the ECB if
7140			 * arg0 (the dtrace_state_t) is equal to the ECB's
7141			 * creating state.  (This prevents disjoint consumers
7142			 * from seeing one another's metaprobes.)
7143			 */
7144			if (arg0 != (uint64_t)(uintptr_t)state)
7145				continue;
7146		}
7147
7148		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7149			/*
7150			 * We're not currently active.  If our provider isn't
7151			 * the dtrace pseudo provider, we're not interested.
7152			 */
7153			if (prov != dtrace_provider)
7154				continue;
7155
7156			/*
7157			 * Now we must further check if we are in the BEGIN
7158			 * probe.  If we are, we will only continue processing
7159			 * if we're still in WARMUP -- if one BEGIN enabling
7160			 * has invoked the exit() action, we don't want to
7161			 * evaluate subsequent BEGIN enablings.
7162			 */
7163			if (probe->dtpr_id == dtrace_probeid_begin &&
7164			    state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7165				ASSERT(state->dts_activity ==
7166				    DTRACE_ACTIVITY_DRAINING);
7167				continue;
7168			}
7169		}
7170
7171		if (ecb->dte_cond) {
7172			/*
7173			 * If the dte_cond bits indicate that this
7174			 * consumer is only allowed to see user-mode firings
7175			 * of this probe, call the provider's dtps_usermode()
7176			 * entry point to check that the probe was fired
7177			 * while in a user context. Skip this ECB if that's
7178			 * not the case.
7179			 */
7180			if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7181			    prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7182			    probe->dtpr_id, probe->dtpr_arg) == 0)
7183				continue;
7184
7185#if defined(sun)
7186			/*
7187			 * This is more subtle than it looks. We have to be
7188			 * absolutely certain that CRED() isn't going to
7189			 * change out from under us so it's only legit to
7190			 * examine that structure if we're in constrained
			 * situations. Currently, the only time we'll do this
			 * check is when a non-super-user has enabled the
7193			 * profile or syscall providers -- providers that
7194			 * allow visibility of all processes. For the
7195			 * profile case, the check above will ensure that
7196			 * we're examining a user context.
7197			 */
7198			if (ecb->dte_cond & DTRACE_COND_OWNER) {
7199				cred_t *cr;
7200				cred_t *s_cr =
7201				    ecb->dte_state->dts_cred.dcr_cred;
7202				proc_t *proc;
7203
7204				ASSERT(s_cr != NULL);
7205
7206				if ((cr = CRED()) == NULL ||
7207				    s_cr->cr_uid != cr->cr_uid ||
7208				    s_cr->cr_uid != cr->cr_ruid ||
7209				    s_cr->cr_uid != cr->cr_suid ||
7210				    s_cr->cr_gid != cr->cr_gid ||
7211				    s_cr->cr_gid != cr->cr_rgid ||
7212				    s_cr->cr_gid != cr->cr_sgid ||
7213				    (proc = ttoproc(curthread)) == NULL ||
7214				    (proc->p_flag & SNOCD))
7215					continue;
7216			}
7217
7218			if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7219				cred_t *cr;
7220				cred_t *s_cr =
7221				    ecb->dte_state->dts_cred.dcr_cred;
7222
7223				ASSERT(s_cr != NULL);
7224
7225				if ((cr = CRED()) == NULL ||
7226				    s_cr->cr_zone->zone_id !=
7227				    cr->cr_zone->zone_id)
7228					continue;
7229			}
7230#endif
7231		}
7232
7233		if (now - state->dts_alive > dtrace_deadman_timeout) {
7234			/*
7235			 * We seem to be dead.  Unless we (a) have kernel
7236			 * destructive permissions (b) have explicitly enabled
7237			 * destructive actions and (c) destructive actions have
7238			 * not been disabled, we're going to transition into
7239			 * the KILLED state, from which no further processing
7240			 * on this state will be performed.
7241			 */
7242			if (!dtrace_priv_kernel_destructive(state) ||
7243			    !state->dts_cred.dcr_destructive ||
7244			    dtrace_destructive_disallow) {
7245				void *activity = &state->dts_activity;
7246				dtrace_activity_t current;
7247
7248				do {
7249					current = state->dts_activity;
7250				} while (dtrace_cas32(activity, current,
7251				    DTRACE_ACTIVITY_KILLED) != current);
7252
7253				continue;
7254			}
7255		}
7256
7257		if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7258		    ecb->dte_alignment, state, &mstate)) < 0)
7259			continue;
7260
7261		tomax = buf->dtb_tomax;
7262		ASSERT(tomax != NULL);
7263
7264		if (ecb->dte_size != 0) {
7265			dtrace_rechdr_t dtrh;
7266			if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7267				mstate.dtms_timestamp = dtrace_gethrtime();
7268				mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7269			}
7270			ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
7271			dtrh.dtrh_epid = ecb->dte_epid;
7272			DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
7273			    mstate.dtms_timestamp);
7274			*((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
7275		}
7276
7277		mstate.dtms_epid = ecb->dte_epid;
7278		mstate.dtms_present |= DTRACE_MSTATE_EPID;
7279
7280		if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7281			mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7282		else
7283			mstate.dtms_access = 0;
7284
7285		if (pred != NULL) {
7286			dtrace_difo_t *dp = pred->dtp_difo;
7287			int rval;
7288
7289			rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7290
7291			if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7292				dtrace_cacheid_t cid = probe->dtpr_predcache;
7293
7294				if (cid != DTRACE_CACHEIDNONE && !onintr) {
7295					/*
7296					 * Update the predicate cache...
7297					 */
7298					ASSERT(cid == pred->dtp_cacheid);
7299					curthread->t_predcache = cid;
7300				}
7301
7302				continue;
7303			}
7304		}
7305
7306		for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7307		    act != NULL; act = act->dta_next) {
7308			size_t valoffs;
7309			dtrace_difo_t *dp;
7310			dtrace_recdesc_t *rec = &act->dta_rec;
7311
7312			size = rec->dtrd_size;
7313			valoffs = offs + rec->dtrd_offset;
7314
7315			if (DTRACEACT_ISAGG(act->dta_kind)) {
7316				uint64_t v = 0xbad;
7317				dtrace_aggregation_t *agg;
7318
7319				agg = (dtrace_aggregation_t *)act;
7320
7321				if ((dp = act->dta_difo) != NULL)
7322					v = dtrace_dif_emulate(dp,
7323					    &mstate, vstate, state);
7324
7325				if (*flags & CPU_DTRACE_ERROR)
7326					continue;
7327
7328				/*
7329				 * Note that we always pass the expression
7330				 * value from the previous iteration of the
7331				 * action loop.  This value will only be used
7332				 * if there is an expression argument to the
7333				 * aggregating action, denoted by the
7334				 * dtag_hasarg field.
7335				 */
7336				dtrace_aggregate(agg, buf,
7337				    offs, aggbuf, v, val);
7338				continue;
7339			}
7340
7341			switch (act->dta_kind) {
7342			case DTRACEACT_STOP:
7343				if (dtrace_priv_proc_destructive(state))
7344					dtrace_action_stop();
7345				continue;
7346
7347			case DTRACEACT_BREAKPOINT:
7348				if (dtrace_priv_kernel_destructive(state))
7349					dtrace_action_breakpoint(ecb);
7350				continue;
7351
7352			case DTRACEACT_PANIC:
7353				if (dtrace_priv_kernel_destructive(state))
7354					dtrace_action_panic(ecb);
7355				continue;
7356
7357			case DTRACEACT_STACK:
7358				if (!dtrace_priv_kernel(state))
7359					continue;
7360
7361				dtrace_getpcstack((pc_t *)(tomax + valoffs),
7362				    size / sizeof (pc_t), probe->dtpr_aframes,
7363				    DTRACE_ANCHORED(probe) ? NULL :
7364				    (uint32_t *)arg0);
7365				continue;
7366
7367			case DTRACEACT_JSTACK:
7368			case DTRACEACT_USTACK:
7369				if (!dtrace_priv_proc(state))
7370					continue;
7371
7372				/*
7373				 * See comment in DIF_VAR_PID.
7374				 */
7375				if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7376				    CPU_ON_INTR(CPU)) {
7377					int depth = DTRACE_USTACK_NFRAMES(
7378					    rec->dtrd_arg) + 1;
7379
7380					dtrace_bzero((void *)(tomax + valoffs),
7381					    DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7382					    + depth * sizeof (uint64_t));
7383
7384					continue;
7385				}
7386
7387				if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7388				    curproc->p_dtrace_helpers != NULL) {
7389					/*
7390					 * This is the slow path -- we have
7391					 * allocated string space, and we're
7392					 * getting the stack of a process that
7393					 * has helpers.  Call into a separate
7394					 * routine to perform this processing.
7395					 */
7396					dtrace_action_ustack(&mstate, state,
7397					    (uint64_t *)(tomax + valoffs),
7398					    rec->dtrd_arg);
7399					continue;
7400				}
7401
7402				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7403				dtrace_getupcstack((uint64_t *)
7404				    (tomax + valoffs),
7405				    DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7406				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7407				continue;
7408
7409			default:
7410				break;
7411			}
7412
7413			dp = act->dta_difo;
7414			ASSERT(dp != NULL);
7415
7416			val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7417
7418			if (*flags & CPU_DTRACE_ERROR)
7419				continue;
7420
7421			switch (act->dta_kind) {
7422			case DTRACEACT_SPECULATE: {
7423				dtrace_rechdr_t *dtrh;
7424
7425				ASSERT(buf == &state->dts_buffer[cpuid]);
7426				buf = dtrace_speculation_buffer(state,
7427				    cpuid, val);
7428
7429				if (buf == NULL) {
7430					*flags |= CPU_DTRACE_DROP;
7431					continue;
7432				}
7433
7434				offs = dtrace_buffer_reserve(buf,
7435				    ecb->dte_needed, ecb->dte_alignment,
7436				    state, NULL);
7437
7438				if (offs < 0) {
7439					*flags |= CPU_DTRACE_DROP;
7440					continue;
7441				}
7442
7443				tomax = buf->dtb_tomax;
7444				ASSERT(tomax != NULL);
7445
7446				if (ecb->dte_size == 0)
7447					continue;
7448
7449				ASSERT3U(ecb->dte_size, >=,
7450				    sizeof (dtrace_rechdr_t));
7451				dtrh = ((void *)(tomax + offs));
7452				dtrh->dtrh_epid = ecb->dte_epid;
7453				/*
7454				 * When the speculation is committed, all of
7455				 * the records in the speculative buffer will
7456				 * have their timestamps set to the commit
7457				 * time.  Until then, it is set to a sentinel
7458				 * value, for debugability.
				 * value, for debuggability.
7460				DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7461				continue;
7462			}
7463
7464			case DTRACEACT_PRINTM: {
7465				/* The DIF returns a 'memref'. */
7466				uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
7467
7468				/* Get the size from the memref. */
7469				size = memref[1];
7470
7471				/*
7472				 * Check if the size exceeds the allocated
7473				 * buffer size.
7474				 */
7475				if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7476					/* Flag a drop! */
7477					*flags |= CPU_DTRACE_DROP;
7478					continue;
7479				}
7480
7481				/* Store the size in the buffer first. */
7482				DTRACE_STORE(uintptr_t, tomax,
7483				    valoffs, size);
7484
7485				/*
7486				 * Offset the buffer address to the start
7487				 * of the data.
7488				 */
7489				valoffs += sizeof(uintptr_t);
7490
7491				/*
7492				 * Reset to the memory address rather than
7493				 * the memref array, then let the BYREF
7494				 * code below do the work to store the
7495				 * memory data in the buffer.
7496				 */
7497				val = memref[0];
7498				break;
7499			}
7500
7501			case DTRACEACT_PRINTT: {
7502				/* The DIF returns a 'typeref'. */
7503				uintptr_t *typeref = (uintptr_t *)(uintptr_t) val;
7504				char c = '\0' + 1;
7505				size_t s;
7506
7507				/*
7508				 * Get the type string length and round it
7509				 * up so that the data that follows is
7510				 * aligned for easy access.
7511				 */
7512				size_t typs = strlen((char *) typeref[2]) + 1;
7513				typs = roundup(typs,  sizeof(uintptr_t));
7514
7515				/*
				 * Get the size from the typeref using the
7517				 * number of elements and the type size.
7518				 */
7519				size = typeref[1] * typeref[3];
7520
7521				/*
7522				 * Check if the size exceeds the allocated
7523				 * buffer size.
7524				 */
				if (size + typs + 2 * sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
					/* Flag a drop! */
					*flags |= CPU_DTRACE_DROP;
					continue;
				}
7530
7531				/* Store the size in the buffer first. */
7532				DTRACE_STORE(uintptr_t, tomax,
7533				    valoffs, size);
7534				valoffs += sizeof(uintptr_t);
7535
7536				/* Store the type size in the buffer. */
7537				DTRACE_STORE(uintptr_t, tomax,
7538				    valoffs, typeref[3]);
7539				valoffs += sizeof(uintptr_t);
7540
7541				val = typeref[2];
7542
7543				for (s = 0; s < typs; s++) {
7544					if (c != '\0')
7545						c = dtrace_load8(val++);
7546
7547					DTRACE_STORE(uint8_t, tomax,
7548					    valoffs++, c);
7549				}
7550
7551				/*
7552				 * Reset to the memory address rather than
7553				 * the typeref array, then let the BYREF
7554				 * code below do the work to store the
7555				 * memory data in the buffer.
7556				 */
7557				val = typeref[0];
7558				break;
7559			}
7560
7561			case DTRACEACT_CHILL:
7562				if (dtrace_priv_kernel_destructive(state))
7563					dtrace_action_chill(&mstate, val);
7564				continue;
7565
7566			case DTRACEACT_RAISE:
7567				if (dtrace_priv_proc_destructive(state))
7568					dtrace_action_raise(val);
7569				continue;
7570
7571			case DTRACEACT_COMMIT:
7572				ASSERT(!committed);
7573
7574				/*
7575				 * We need to commit our buffer state.
7576				 */
7577				if (ecb->dte_size)
7578					buf->dtb_offset = offs + ecb->dte_size;
7579				buf = &state->dts_buffer[cpuid];
7580				dtrace_speculation_commit(state, cpuid, val);
7581				committed = 1;
7582				continue;
7583
7584			case DTRACEACT_DISCARD:
7585				dtrace_speculation_discard(state, cpuid, val);
7586				continue;
7587
7588			case DTRACEACT_DIFEXPR:
7589			case DTRACEACT_LIBACT:
7590			case DTRACEACT_PRINTF:
7591			case DTRACEACT_PRINTA:
7592			case DTRACEACT_SYSTEM:
7593			case DTRACEACT_FREOPEN:
7594			case DTRACEACT_TRACEMEM:
7595				break;
7596
7597			case DTRACEACT_TRACEMEM_DYNSIZE:
7598				tracememsize = val;
7599				break;
7600
7601			case DTRACEACT_SYM:
7602			case DTRACEACT_MOD:
7603				if (!dtrace_priv_kernel(state))
7604					continue;
7605				break;
7606
7607			case DTRACEACT_USYM:
7608			case DTRACEACT_UMOD:
7609			case DTRACEACT_UADDR: {
7610#if defined(sun)
7611				struct pid *pid = curthread->t_procp->p_pidp;
7612#endif
7613
7614				if (!dtrace_priv_proc(state))
7615					continue;
7616
7617				DTRACE_STORE(uint64_t, tomax,
7618#if defined(sun)
7619				    valoffs, (uint64_t)pid->pid_id);
7620#else
7621				    valoffs, (uint64_t) curproc->p_pid);
7622#endif
7623				DTRACE_STORE(uint64_t, tomax,
7624				    valoffs + sizeof (uint64_t), val);
7625
7626				continue;
7627			}
7628
7629			case DTRACEACT_EXIT: {
7630				/*
7631				 * For the exit action, we are going to attempt
7632				 * to atomically set our activity to be
7633				 * draining.  If this fails (either because
7634				 * another CPU has beat us to the exit action,
7635				 * or because our current activity is something
7636				 * other than ACTIVE or WARMUP), we will
7637				 * continue.  This assures that the exit action
7638				 * can be successfully recorded at most once
7639				 * when we're in the ACTIVE state.  If we're
7640				 * encountering the exit() action while in
7641				 * COOLDOWN, however, we want to honor the new
7642				 * status code.  (We know that we're the only
7643				 * thread in COOLDOWN, so there is no race.)
7644				 */
7645				void *activity = &state->dts_activity;
7646				dtrace_activity_t current = state->dts_activity;
7647
7648				if (current == DTRACE_ACTIVITY_COOLDOWN)
7649					break;
7650
7651				if (current != DTRACE_ACTIVITY_WARMUP)
7652					current = DTRACE_ACTIVITY_ACTIVE;
7653
7654				if (dtrace_cas32(activity, current,
7655				    DTRACE_ACTIVITY_DRAINING) != current) {
7656					*flags |= CPU_DTRACE_DROP;
7657					continue;
7658				}
7659
7660				break;
7661			}
7662
7663			default:
7664				ASSERT(0);
7665			}
7666
7667			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
7668			    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
7669				uintptr_t end = valoffs + size;
7670
7671				if (tracememsize != 0 &&
7672				    valoffs + tracememsize < end) {
7673					end = valoffs + tracememsize;
7674					tracememsize = 0;
7675				}
7676
7677				if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7678				    !dtrace_vcanload((void *)(uintptr_t)val,
7679				    &dp->dtdo_rtype, &mstate, vstate))
7680					continue;
7681
7682				dtrace_store_by_ref(dp, tomax, size, &valoffs,
7683				    &val, end, act->dta_intuple,
7684				    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7685				    DIF_TF_BYREF : DIF_TF_BYUREF);
7686				continue;
7687			}
7688
7689			switch (size) {
7690			case 0:
7691				break;
7692
7693			case sizeof (uint8_t):
7694				DTRACE_STORE(uint8_t, tomax, valoffs, val);
7695				break;
7696			case sizeof (uint16_t):
7697				DTRACE_STORE(uint16_t, tomax, valoffs, val);
7698				break;
7699			case sizeof (uint32_t):
7700				DTRACE_STORE(uint32_t, tomax, valoffs, val);
7701				break;
7702			case sizeof (uint64_t):
7703				DTRACE_STORE(uint64_t, tomax, valoffs, val);
7704				break;
7705			default:
7706				/*
7707				 * Any other size should have been returned by
7708				 * reference, not by value.
7709				 */
7710				ASSERT(0);
7711				break;
7712			}
7713		}
7714
7715		if (*flags & CPU_DTRACE_DROP)
7716			continue;
7717
7718		if (*flags & CPU_DTRACE_FAULT) {
7719			int ndx;
7720			dtrace_action_t *err;
7721
7722			buf->dtb_errors++;
7723
7724			if (probe->dtpr_id == dtrace_probeid_error) {
7725				/*
7726				 * There's nothing we can do -- we had an
7727				 * error on the error probe.  We bump an
7728				 * error counter to at least indicate that
7729				 * this condition happened.
7730				 */
7731				dtrace_error(&state->dts_dblerrors);
7732				continue;
7733			}
7734
7735			if (vtime) {
7736				/*
7737				 * Before recursing on dtrace_probe(), we
7738				 * need to explicitly clear out our start
7739				 * time to prevent it from being accumulated
7740				 * into t_dtrace_vtime.
7741				 */
7742				curthread->t_dtrace_start = 0;
7743			}
7744
7745			/*
7746			 * Iterate over the actions to figure out which action
7747			 * we were processing when we experienced the error.
7748			 * Note that act points _past_ the faulting action; if
7749			 * act is ecb->dte_action, the fault was in the
7750			 * predicate; if it's ecb->dte_action->dta_next, it's
7751			 * in action #1; and so on.
7752			 */
7753			for (err = ecb->dte_action, ndx = 0;
7754			    err != act; err = err->dta_next, ndx++)
7755				continue;
7756
7757			dtrace_probe_error(state, ecb->dte_epid, ndx,
7758			    (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7759			    mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7760			    cpu_core[cpuid].cpuc_dtrace_illval);
7761
7762			continue;
7763		}
7764
7765		if (!committed)
7766			buf->dtb_offset = offs + ecb->dte_size;
7767	}
7768
7769	if (vtime)
7770		curthread->t_dtrace_start = dtrace_gethrtime();
7771
7772	dtrace_interrupt_enable(cookie);
7773}
7774
7775/*
7776 * DTrace Probe Hashing Functions
7777 *
7778 * The functions in this section (and indeed, the functions in remaining
7779 * sections) are not _called_ from probe context.  (Any exceptions to this are
7780 * marked with a "Note:".)  Rather, they are called from elsewhere in the
7781 * DTrace framework to look up probes in, add probes to, and remove probes from
7782 * the DTrace probe hashes.  (Each probe is hashed by each element of the
7783 * probe tuple -- allowing for fast lookups, regardless of what was
7784 * specified.)
7785 */
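
/*
 * To make the per-element hashing concrete: the framework keeps one hash
 * per tuple element, each keyed by a string member of dtrace_probe_t and
 * chained through dedicated next/prev pointers in the probe itself.  The
 * call below is an illustrative sketch of how the by-module hash is built
 * (the local variable name is hypothetical):
 *
 *	dtrace_hash_t *bymod = dtrace_hash_create(
 *	    offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));
 */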
7786static uint_t
7787dtrace_hash_str(const char *p)
7788{
7789	unsigned int g;
7790	uint_t hval = 0;
7791
7792	while (*p) {
7793		hval = (hval << 4) + *p++;
7794		if ((g = (hval & 0xf0000000)) != 0)
7795			hval ^= g >> 24;
7796		hval &= ~g;
7797	}
7798	return (hval);
7799}
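
/*
 * This is the classic ELF-style string hash: each character shifts the
 * running value left by four bits, and any bits carried into the top
 * nibble are folded back in and cleared.  A small worked example: for
 * "fbt" the successive values of hval are 0x66, 0x6c2, and 0x6c94 (the
 * top nibble never fills for a string this short, so the folding step
 * never fires).
 */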
7800
7801static dtrace_hash_t *
7802dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
7803{
7804	dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7805
7806	hash->dth_stroffs = stroffs;
7807	hash->dth_nextoffs = nextoffs;
7808	hash->dth_prevoffs = prevoffs;
7809
7810	hash->dth_size = 1;
7811	hash->dth_mask = hash->dth_size - 1;
7812
7813	hash->dth_tab = kmem_zalloc(hash->dth_size *
7814	    sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7815
7816	return (hash);
7817}
7818
7819static void
7820dtrace_hash_destroy(dtrace_hash_t *hash)
7821{
7822#ifdef DEBUG
7823	int i;
7824
7825	for (i = 0; i < hash->dth_size; i++)
7826		ASSERT(hash->dth_tab[i] == NULL);
7827#endif
7828
7829	kmem_free(hash->dth_tab,
7830	    hash->dth_size * sizeof (dtrace_hashbucket_t *));
7831	kmem_free(hash, sizeof (dtrace_hash_t));
7832}
7833
7834static void
7835dtrace_hash_resize(dtrace_hash_t *hash)
7836{
7837	int size = hash->dth_size, i, ndx;
7838	int new_size = hash->dth_size << 1;
7839	int new_mask = new_size - 1;
7840	dtrace_hashbucket_t **new_tab, *bucket, *next;
7841
7842	ASSERT((new_size & new_mask) == 0);
7843
7844	new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7845
7846	for (i = 0; i < size; i++) {
7847		for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7848			dtrace_probe_t *probe = bucket->dthb_chain;
7849
7850			ASSERT(probe != NULL);
7851			ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
7852
7853			next = bucket->dthb_next;
7854			bucket->dthb_next = new_tab[ndx];
7855			new_tab[ndx] = bucket;
7856		}
7857	}
7858
7859	kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7860	hash->dth_tab = new_tab;
7861	hash->dth_size = new_size;
7862	hash->dth_mask = new_mask;
7863}
7864
7865static void
7866dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
7867{
7868	int hashval = DTRACE_HASHSTR(hash, new);
7869	int ndx = hashval & hash->dth_mask;
7870	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7871	dtrace_probe_t **nextp, **prevp;
7872
7873	for (; bucket != NULL; bucket = bucket->dthb_next) {
7874		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7875			goto add;
7876	}
7877
7878	if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7879		dtrace_hash_resize(hash);
7880		dtrace_hash_add(hash, new);
7881		return;
7882	}
7883
7884	bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7885	bucket->dthb_next = hash->dth_tab[ndx];
7886	hash->dth_tab[ndx] = bucket;
7887	hash->dth_nbuckets++;
7888
7889add:
7890	nextp = DTRACE_HASHNEXT(hash, new);
7891	ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7892	*nextp = bucket->dthb_chain;
7893
7894	if (bucket->dthb_chain != NULL) {
7895		prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7896		ASSERT(*prevp == NULL);
7897		*prevp = new;
7898	}
7899
7900	bucket->dthb_chain = new;
7901	bucket->dthb_len++;
7902}
7903
7904static dtrace_probe_t *
7905dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
7906{
7907	int hashval = DTRACE_HASHSTR(hash, template);
7908	int ndx = hashval & hash->dth_mask;
7909	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7910
7911	for (; bucket != NULL; bucket = bucket->dthb_next) {
7912		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7913			return (bucket->dthb_chain);
7914	}
7915
7916	return (NULL);
7917}
7918
7919static int
7920dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
7921{
7922	int hashval = DTRACE_HASHSTR(hash, template);
7923	int ndx = hashval & hash->dth_mask;
7924	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7925
7926	for (; bucket != NULL; bucket = bucket->dthb_next) {
7927		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7928			return (bucket->dthb_len);
7929	}
7930
7931	return (0);
7932}
7933
7934static void
7935dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
7936{
7937	int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
7938	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7939
7940	dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
7941	dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
7942
7943	/*
7944	 * Find the bucket that we're removing this probe from.
7945	 */
7946	for (; bucket != NULL; bucket = bucket->dthb_next) {
7947		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
7948			break;
7949	}
7950
7951	ASSERT(bucket != NULL);
7952
7953	if (*prevp == NULL) {
7954		if (*nextp == NULL) {
7955			/*
7956			 * The removed probe was the only probe on this
7957			 * bucket; we need to remove the bucket.
7958			 */
7959			dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7960
7961			ASSERT(bucket->dthb_chain == probe);
7962			ASSERT(b != NULL);
7963
7964			if (b == bucket) {
7965				hash->dth_tab[ndx] = bucket->dthb_next;
7966			} else {
7967				while (b->dthb_next != bucket)
7968					b = b->dthb_next;
7969				b->dthb_next = bucket->dthb_next;
7970			}
7971
7972			ASSERT(hash->dth_nbuckets > 0);
7973			hash->dth_nbuckets--;
7974			kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7975			return;
7976		}
7977
7978		bucket->dthb_chain = *nextp;
7979	} else {
7980		*(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7981	}
7982
7983	if (*nextp != NULL)
7984		*(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7985}
7986
7987/*
7988 * DTrace Utility Functions
7989 *
7990 * These are random utility functions that are _not_ called from probe context.
7991 */
7992static int
7993dtrace_badattr(const dtrace_attribute_t *a)
7994{
7995	return (a->dtat_name > DTRACE_STABILITY_MAX ||
7996	    a->dtat_data > DTRACE_STABILITY_MAX ||
7997	    a->dtat_class > DTRACE_CLASS_MAX);
7998}
7999
8000/*
8001 * Return a copy of the specified string.  If the string is NULL,
8002 * this function returns a zero-length string.
8003 */
8004static char *
8005dtrace_strdup(const char *str)
8006{
8007	char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
8008
8009	if (str != NULL)
8010		(void) strcpy(new, str);
8011
8012	return (new);
8013}
8014
8015#define	DTRACE_ISALPHA(c)	\
8016	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
8017
8018static int
8019dtrace_badname(const char *s)
8020{
8021	char c;
8022
8023	if (s == NULL || (c = *s++) == '\0')
8024		return (0);
8025
8026	if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
8027		return (1);
8028
8029	while ((c = *s++) != '\0') {
8030		if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
8031		    c != '-' && c != '_' && c != '.' && c != '`')
8032			return (1);
8033	}
8034
8035	return (0);
8036}
8037
8038static void
8039dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
8040{
8041	uint32_t priv;
8042
8043#if defined(sun)
8044	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
8045		/*
8046		 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
8047		 */
8048		priv = DTRACE_PRIV_ALL;
8049	} else {
8050		*uidp = crgetuid(cr);
8051		*zoneidp = crgetzoneid(cr);
8052
8053		priv = 0;
8054		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
8055			priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
8056		else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8057			priv |= DTRACE_PRIV_USER;
8058		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8059			priv |= DTRACE_PRIV_PROC;
8060		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8061			priv |= DTRACE_PRIV_OWNER;
8062		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8063			priv |= DTRACE_PRIV_ZONEOWNER;
8064	}
8065#else
8066	priv = DTRACE_PRIV_ALL;
8067#endif
8068
8069	*privp = priv;
8070}
8071
8072#ifdef DTRACE_ERRDEBUG
8073static void
8074dtrace_errdebug(const char *str)
8075{
8076	int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
8077	int occupied = 0;
8078
8079	mutex_enter(&dtrace_errlock);
8080	dtrace_errlast = str;
8081	dtrace_errthread = curthread;
8082
8083	while (occupied++ < DTRACE_ERRHASHSZ) {
8084		if (dtrace_errhash[hval].dter_msg == str) {
8085			dtrace_errhash[hval].dter_count++;
8086			goto out;
8087		}
8088
8089		if (dtrace_errhash[hval].dter_msg != NULL) {
8090			hval = (hval + 1) % DTRACE_ERRHASHSZ;
8091			continue;
8092		}
8093
8094		dtrace_errhash[hval].dter_msg = str;
8095		dtrace_errhash[hval].dter_count = 1;
8096		goto out;
8097	}
8098
8099	panic("dtrace: undersized error hash");
8100out:
8101	mutex_exit(&dtrace_errlock);
8102}
8103#endif
8104
8105/*
8106 * DTrace Matching Functions
8107 *
8108 * These functions are used to match groups of probes, given some elements of
8109 * a probe tuple, or some globbed expressions for elements of a probe tuple.
8110 */
8111static int
8112dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8113    zoneid_t zoneid)
8114{
8115	if (priv != DTRACE_PRIV_ALL) {
8116		uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8117		uint32_t match = priv & ppriv;
8118
8119		/*
8120		 * No PRIV_DTRACE_* privileges...
8121		 */
8122		if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8123		    DTRACE_PRIV_KERNEL)) == 0)
8124			return (0);
8125
8126		/*
8127		 * No matching bits, but there were bits to match...
8128		 */
8129		if (match == 0 && ppriv != 0)
8130			return (0);
8131
8132		/*
8133		 * Need to have permissions to the process, but don't...
8134		 */
8135		if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8136		    uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8137			return (0);
8138		}
8139
8140		/*
8141		 * Need to be in the same zone unless we possess the
8142		 * privilege to examine all zones.
8143		 */
8144		if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8145		    zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8146			return (0);
8147		}
8148	}
8149
8150	return (1);
8151}
8152
8153/*
8154 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8155 * consists of input pattern strings and an ops-vector to evaluate them.
8156 * This function returns >0 for match, 0 for no match, and <0 for error.
8157 */
8158static int
8159dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8160    uint32_t priv, uid_t uid, zoneid_t zoneid)
8161{
8162	dtrace_provider_t *pvp = prp->dtpr_provider;
8163	int rv;
8164
8165	if (pvp->dtpv_defunct)
8166		return (0);
8167
8168	if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8169		return (rv);
8170
8171	if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8172		return (rv);
8173
8174	if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8175		return (rv);
8176
8177	if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8178		return (rv);
8179
8180	if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8181		return (0);
8182
8183	return (rv);
8184}
8185
8186/*
8187 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8188 * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
8189 * libc's version, the kernel version only applies to 8-bit ASCII strings.
8190 * In addition, all of the recursion cases except for '*' matching have been
8191 * unwound.  For '*', we still implement recursive evaluation, but a depth
8192 * counter is maintained and matching is aborted if we recurse too deep.
8193 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8194 */
8195static int
8196dtrace_match_glob(const char *s, const char *p, int depth)
8197{
8198	const char *olds;
8199	char s1, c;
8200	int gs;
8201
8202	if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8203		return (-1);
8204
8205	if (s == NULL)
8206		s = ""; /* treat NULL as empty string */
8207
8208top:
8209	olds = s;
8210	s1 = *s++;
8211
8212	if (p == NULL)
8213		return (0);
8214
8215	if ((c = *p++) == '\0')
8216		return (s1 == '\0');
8217
8218	switch (c) {
8219	case '[': {
8220		int ok = 0, notflag = 0;
8221		char lc = '\0';
8222
8223		if (s1 == '\0')
8224			return (0);
8225
8226		if (*p == '!') {
8227			notflag = 1;
8228			p++;
8229		}
8230
8231		if ((c = *p++) == '\0')
8232			return (0);
8233
8234		do {
8235			if (c == '-' && lc != '\0' && *p != ']') {
8236				if ((c = *p++) == '\0')
8237					return (0);
8238				if (c == '\\' && (c = *p++) == '\0')
8239					return (0);
8240
8241				if (notflag) {
8242					if (s1 < lc || s1 > c)
8243						ok++;
8244					else
8245						return (0);
8246				} else if (lc <= s1 && s1 <= c)
8247					ok++;
8248
8249			} else if (c == '\\' && (c = *p++) == '\0')
8250				return (0);
8251
8252			lc = c; /* save left-hand 'c' for next iteration */
8253
8254			if (notflag) {
8255				if (s1 != c)
8256					ok++;
8257				else
8258					return (0);
8259			} else if (s1 == c)
8260				ok++;
8261
8262			if ((c = *p++) == '\0')
8263				return (0);
8264
8265		} while (c != ']');
8266
8267		if (ok)
8268			goto top;
8269
8270		return (0);
8271	}
8272
8273	case '\\':
8274		if ((c = *p++) == '\0')
8275			return (0);
8276		/*FALLTHRU*/
8277
8278	default:
8279		if (c != s1)
8280			return (0);
8281		/*FALLTHRU*/
8282
8283	case '?':
8284		if (s1 != '\0')
8285			goto top;
8286		return (0);
8287
8288	case '*':
8289		while (*p == '*')
8290			p++; /* consecutive *'s are identical to a single one */
8291
8292		if (*p == '\0')
8293			return (1);
8294
8295		for (s = olds; *s != '\0'; s++) {
8296			if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
8297				return (gs);
8298		}
8299
8300		return (0);
8301	}
8302}
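
/*
 * A few illustrative evaluations (hypothetical direct calls, starting at
 * a depth of 0):
 *
 *	dtrace_match_glob("read", "re*", 0)      returns 1
 *	dtrace_match_glob("read", "r?a[cd]", 0)  returns 1
 *	dtrace_match_glob("read", "write", 0)    returns 0
 */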
8303
8304/*ARGSUSED*/
8305static int
8306dtrace_match_string(const char *s, const char *p, int depth)
8307{
8308	return (s != NULL && strcmp(s, p) == 0);
8309}
8310
8311/*ARGSUSED*/
8312static int
8313dtrace_match_nul(const char *s, const char *p, int depth)
8314{
8315	return (1); /* always match the empty pattern */
8316}
8317
8318/*ARGSUSED*/
8319static int
8320dtrace_match_nonzero(const char *s, const char *p, int depth)
8321{
8322	return (s != NULL && s[0] != '\0');
8323}
8324
8325static int
8326dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8327    zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
8328{
8329	dtrace_probe_t template, *probe;
8330	dtrace_hash_t *hash = NULL;
8331	int len, best = INT_MAX, nmatched = 0;
8332	dtrace_id_t i;
8333
8334	ASSERT(MUTEX_HELD(&dtrace_lock));
8335
8336	/*
8337	 * If the probe ID is specified in the key, just lookup by ID and
8338	 * invoke the match callback once if a matching probe is found.
8339	 */
8340	if (pkp->dtpk_id != DTRACE_IDNONE) {
8341		if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8342		    dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8343			(void) (*matched)(probe, arg);
8344			nmatched++;
8345		}
8346		return (nmatched);
8347	}
8348
8349	template.dtpr_mod = (char *)pkp->dtpk_mod;
8350	template.dtpr_func = (char *)pkp->dtpk_func;
8351	template.dtpr_name = (char *)pkp->dtpk_name;
8352
8353	/*
8354	 * We want to find the most distinct of the module name, function
8355	 * name, and name.  So for each one that is not a glob pattern or
8356	 * empty string, we perform a lookup in the corresponding hash and
8357	 * use the hash table with the fewest collisions to do our search.
8358	 */
8359	if (pkp->dtpk_mmatch == &dtrace_match_string &&
8360	    (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8361		best = len;
8362		hash = dtrace_bymod;
8363	}
8364
8365	if (pkp->dtpk_fmatch == &dtrace_match_string &&
8366	    (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8367		best = len;
8368		hash = dtrace_byfunc;
8369	}
8370
8371	if (pkp->dtpk_nmatch == &dtrace_match_string &&
8372	    (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8373		best = len;
8374		hash = dtrace_byname;
8375	}
8376
8377	/*
8378	 * If we did not select a hash table, iterate over every probe and
8379	 * invoke our callback for each one that matches our input probe key.
8380	 */
8381	if (hash == NULL) {
8382		for (i = 0; i < dtrace_nprobes; i++) {
8383			if ((probe = dtrace_probes[i]) == NULL ||
8384			    dtrace_match_probe(probe, pkp, priv, uid,
8385			    zoneid) <= 0)
8386				continue;
8387
8388			nmatched++;
8389
8390			if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8391				break;
8392		}
8393
8394		return (nmatched);
8395	}
8396
8397	/*
8398	 * If we selected a hash table, iterate over each probe of the same key
8399	 * name and invoke the callback for every probe that matches the other
8400	 * attributes of our input probe key.
8401	 */
8402	for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8403	    probe = *(DTRACE_HASHNEXT(hash, probe))) {
8404
8405		if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8406			continue;
8407
8408		nmatched++;
8409
8410		if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8411			break;
8412	}
8413
8414	return (nmatched);
8415}
8416
8417/*
8418 * Return the function pointer dtrace_match_probe() should use to compare the
8419 * specified pattern with a string.  For NULL or empty patterns, we select
8420 * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
8421 * For non-empty non-glob strings, we use dtrace_match_string().
8422 */
8423static dtrace_probekey_f *
8424dtrace_probekey_func(const char *p)
8425{
8426	char c;
8427
8428	if (p == NULL || *p == '\0')
8429		return (&dtrace_match_nul);
8430
8431	while ((c = *p++) != '\0') {
8432		if (c == '[' || c == '?' || c == '*' || c == '\\')
8433			return (&dtrace_match_glob);
8434	}
8435
8436	return (&dtrace_match_string);
8437}
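
/*
 * For example: dtrace_probekey_func(NULL) and dtrace_probekey_func("")
 * both select dtrace_match_nul(); dtrace_probekey_func("fbt") selects
 * dtrace_match_string(); and dtrace_probekey_func("sys*") selects
 * dtrace_match_glob().
 */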
8438
8439/*
8440 * Build a probe comparison key for use with dtrace_match_probe() from the
8441 * given probe description.  By convention, a null key only matches anchored
8442 * probes: if each field is the empty string, reset dtpk_fmatch to
8443 * dtrace_match_nonzero().
8444 */
8445static void
8446dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8447{
8448	pkp->dtpk_prov = pdp->dtpd_provider;
8449	pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8450
8451	pkp->dtpk_mod = pdp->dtpd_mod;
8452	pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
8453
8454	pkp->dtpk_func = pdp->dtpd_func;
8455	pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8456
8457	pkp->dtpk_name = pdp->dtpd_name;
8458	pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8459
8460	pkp->dtpk_id = pdp->dtpd_id;
8461
8462	if (pkp->dtpk_id == DTRACE_IDNONE &&
8463	    pkp->dtpk_pmatch == &dtrace_match_nul &&
8464	    pkp->dtpk_mmatch == &dtrace_match_nul &&
8465	    pkp->dtpk_fmatch == &dtrace_match_nul &&
8466	    pkp->dtpk_nmatch == &dtrace_match_nul)
8467		pkp->dtpk_fmatch = &dtrace_match_nonzero;
8468}
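
/*
 * As a worked example: an entirely empty description (every field "" and
 * dtpd_id equal to DTRACE_IDNONE) initially yields dtrace_match_nul() for
 * all four elements; the final clause above then swaps in
 * dtrace_match_nonzero() for the function element, so the resulting key
 * matches exactly the anchored probes -- those with a non-empty function
 * name.
 */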
8469
8470/*
8471 * DTrace Provider-to-Framework API Functions
8472 *
8473 * These functions implement much of the Provider-to-Framework API, as
8474 * described in <sys/dtrace.h>.  The parts of the API not in this section are
8475 * the functions in the API for probe management (found below), and
8476 * dtrace_probe() itself (found above).
8477 */
8478
8479/*
8480 * Register the calling provider with the DTrace framework.  This should
8481 * generally be called by DTrace providers in their attach(9E) entry point.
8482 */
8483int
8484dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8485    cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8486{
8487	dtrace_provider_t *provider;
8488
8489	if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8490		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8491		    "arguments", name ? name : "<NULL>");
8492		return (EINVAL);
8493	}
8494
8495	if (name[0] == '\0' || dtrace_badname(name)) {
8496		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8497		    "provider name", name);
8498		return (EINVAL);
8499	}
8500
8501	if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8502	    pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8503	    pops->dtps_destroy == NULL ||
8504	    ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8505		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8506		    "provider ops", name);
8507		return (EINVAL);
8508	}
8509
8510	if (dtrace_badattr(&pap->dtpa_provider) ||
8511	    dtrace_badattr(&pap->dtpa_mod) ||
8512	    dtrace_badattr(&pap->dtpa_func) ||
8513	    dtrace_badattr(&pap->dtpa_name) ||
8514	    dtrace_badattr(&pap->dtpa_args)) {
8515		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8516		    "provider attributes", name);
8517		return (EINVAL);
8518	}
8519
8520	if (priv & ~DTRACE_PRIV_ALL) {
8521		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8522		    "privilege attributes", name);
8523		return (EINVAL);
8524	}
8525
8526	if ((priv & DTRACE_PRIV_KERNEL) &&
8527	    (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8528	    pops->dtps_usermode == NULL) {
8529		cmn_err(CE_WARN, "failed to register provider '%s': need "
8530		    "dtps_usermode() op for given privilege attributes", name);
8531		return (EINVAL);
8532	}
8533
8534	provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8535	provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8536	(void) strcpy(provider->dtpv_name, name);
8537
8538	provider->dtpv_attr = *pap;
8539	provider->dtpv_priv.dtpp_flags = priv;
8540	if (cr != NULL) {
8541		provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8542		provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8543	}
8544	provider->dtpv_pops = *pops;
8545
8546	if (pops->dtps_provide == NULL) {
8547		ASSERT(pops->dtps_provide_module != NULL);
8548		provider->dtpv_pops.dtps_provide =
8549		    (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
8550	}
8551
8552	if (pops->dtps_provide_module == NULL) {
8553		ASSERT(pops->dtps_provide != NULL);
8554		provider->dtpv_pops.dtps_provide_module =
8555		    (void (*)(void *, modctl_t *))dtrace_nullop;
8556	}
8557
8558	if (pops->dtps_suspend == NULL) {
8559		ASSERT(pops->dtps_resume == NULL);
8560		provider->dtpv_pops.dtps_suspend =
8561		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8562		provider->dtpv_pops.dtps_resume =
8563		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8564	}
8565
8566	provider->dtpv_arg = arg;
8567	*idp = (dtrace_provider_id_t)provider;
8568
8569	if (pops == &dtrace_provider_ops) {
8570		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8571		ASSERT(MUTEX_HELD(&dtrace_lock));
8572		ASSERT(dtrace_anon.dta_enabling == NULL);
8573
8574		/*
8575		 * We make sure that the DTrace provider is at the head of
8576		 * the provider chain.
8577		 */
8578		provider->dtpv_next = dtrace_provider;
8579		dtrace_provider = provider;
8580		return (0);
8581	}
8582
8583	mutex_enter(&dtrace_provider_lock);
8584	mutex_enter(&dtrace_lock);
8585
8586	/*
8587	 * If there is at least one provider registered, we'll add this
8588	 * provider after the first provider.
8589	 */
8590	if (dtrace_provider != NULL) {
8591		provider->dtpv_next = dtrace_provider->dtpv_next;
8592		dtrace_provider->dtpv_next = provider;
8593	} else {
8594		dtrace_provider = provider;
8595	}
8596
8597	if (dtrace_retained != NULL) {
8598		dtrace_enabling_provide(provider);
8599
8600		/*
8601		 * Now we need to call dtrace_enabling_matchall() -- which
8602		 * will acquire cpu_lock and dtrace_lock.  We therefore need
8603		 * to drop all of our locks before calling into it...
8604		 */
8605		mutex_exit(&dtrace_lock);
8606		mutex_exit(&dtrace_provider_lock);
8607		dtrace_enabling_matchall();
8608
8609		return (0);
8610	}
8611
8612	mutex_exit(&dtrace_lock);
8613	mutex_exit(&dtrace_provider_lock);
8614
8615	return (0);
8616}
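
/*
 * A minimal registration sketch.  Everything named foo_* below is
 * hypothetical -- the callbacks, the dtrace_pattr_t attribute table
 * foo_attr, and the id are placeholders, not part of this file.  The ops
 * shown satisfy the checks above: dtps_provide, dtps_enable, dtps_disable
 * and dtps_destroy are present, and dtps_suspend and dtps_resume are both
 * absent.
 *
 *	static dtrace_pops_t foo_pops = {
 *		.dtps_provide = foo_provide,
 *		.dtps_enable = foo_enable,
 *		.dtps_disable = foo_disable,
 *		.dtps_destroy = foo_destroy
 *	};
 *	static dtrace_provider_id_t foo_id;
 *
 *	error = dtrace_register("foo", &foo_attr, DTRACE_PRIV_KERNEL,
 *	    NULL, &foo_pops, NULL, &foo_id);
 */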
8617
8618/*
8619 * Unregister the specified provider from the DTrace framework.  This should
8620 * generally be called by DTrace providers in their detach(9E) entry point.
8621 */
8622int
8623dtrace_unregister(dtrace_provider_id_t id)
8624{
8625	dtrace_provider_t *old = (dtrace_provider_t *)id;
8626	dtrace_provider_t *prev = NULL;
8627	int i, self = 0, noreap = 0;
8628	dtrace_probe_t *probe, *first = NULL;
8629
8630	if (old->dtpv_pops.dtps_enable ==
8631	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
8632		/*
8633		 * If DTrace itself is the provider, we're called with locks
8634		 * already held.
8635		 */
8636		ASSERT(old == dtrace_provider);
8637#if defined(sun)
8638		ASSERT(dtrace_devi != NULL);
8639#endif
8640		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8641		ASSERT(MUTEX_HELD(&dtrace_lock));
8642		self = 1;
8643
8644		if (dtrace_provider->dtpv_next != NULL) {
8645			/*
8646			 * There's another provider here; return failure.
8647			 */
8648			return (EBUSY);
8649		}
8650	} else {
8651		mutex_enter(&dtrace_provider_lock);
8652#if defined(sun)
8653		mutex_enter(&mod_lock);
8654#endif
8655		mutex_enter(&dtrace_lock);
8656	}
8657
8658	/*
8659	 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8660	 * probes, we refuse to let providers slither away, unless this
8661	 * provider has already been explicitly invalidated.
8662	 */
8663	if (!old->dtpv_defunct &&
8664	    (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8665	    dtrace_anon.dta_state->dts_necbs > 0))) {
8666		if (!self) {
8667			mutex_exit(&dtrace_lock);
8668#if defined(sun)
8669			mutex_exit(&mod_lock);
8670#endif
8671			mutex_exit(&dtrace_provider_lock);
8672		}
8673		return (EBUSY);
8674	}
8675
8676	/*
8677	 * Attempt to destroy the probes associated with this provider.
8678	 */
8679	for (i = 0; i < dtrace_nprobes; i++) {
8680		if ((probe = dtrace_probes[i]) == NULL)
8681			continue;
8682
8683		if (probe->dtpr_provider != old)
8684			continue;
8685
8686		if (probe->dtpr_ecb == NULL)
8687			continue;
8688
8689		/*
8690		 * If we are trying to unregister a defunct provider, and the
8691		 * provider was made defunct within the interval dictated by
8692		 * dtrace_unregister_defunct_reap, we'll (asynchronously)
8693		 * attempt to reap our enablings.  To denote that the provider
8694		 * should reattempt to unregister itself at some point in the
8695		 * future, we will return a differentiable error code (EAGAIN
8696		 * instead of EBUSY) in this case.
8697		 */
8698		if (dtrace_gethrtime() - old->dtpv_defunct >
8699		    dtrace_unregister_defunct_reap)
8700			noreap = 1;
8701
8702		if (!self) {
8703			mutex_exit(&dtrace_lock);
8704#if defined(sun)
8705			mutex_exit(&mod_lock);
8706#endif
8707			mutex_exit(&dtrace_provider_lock);
8708		}
8709
8710		if (noreap)
8711			return (EBUSY);
8712
8713		(void) taskq_dispatch(dtrace_taskq,
8714		    (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
8715
8716		return (EAGAIN);
8717	}
8718
8719	/*
8720	 * All of the probes for this provider are disabled; we can safely
8721	 * remove all of them from their hash chains and from the probe array.
8722	 */
8723	for (i = 0; i < dtrace_nprobes; i++) {
8724		if ((probe = dtrace_probes[i]) == NULL)
8725			continue;
8726
8727		if (probe->dtpr_provider != old)
8728			continue;
8729
8730		dtrace_probes[i] = NULL;
8731
8732		dtrace_hash_remove(dtrace_bymod, probe);
8733		dtrace_hash_remove(dtrace_byfunc, probe);
8734		dtrace_hash_remove(dtrace_byname, probe);
8735
8736		if (first == NULL) {
8737			first = probe;
8738			probe->dtpr_nextmod = NULL;
8739		} else {
8740			probe->dtpr_nextmod = first;
8741			first = probe;
8742		}
8743	}
8744
8745	/*
8746	 * The provider's probes have been removed from the hash chains and
8747	 * from the probe array.  Now issue a dtrace_sync() to be sure that
8748	 * everyone has cleared out from any probe array processing.
8749	 */
8750	dtrace_sync();
8751
8752	for (probe = first; probe != NULL; probe = first) {
8753		first = probe->dtpr_nextmod;
8754
8755		old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8756		    probe->dtpr_arg);
8757		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8758		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8759		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8760#if defined(sun)
8761		vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8762#else
8763		free_unr(dtrace_arena, probe->dtpr_id);
8764#endif
8765		kmem_free(probe, sizeof (dtrace_probe_t));
8766	}
8767
8768	if ((prev = dtrace_provider) == old) {
8769#if defined(sun)
8770		ASSERT(self || dtrace_devi == NULL);
8771		ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8772#endif
8773		dtrace_provider = old->dtpv_next;
8774	} else {
8775		while (prev != NULL && prev->dtpv_next != old)
8776			prev = prev->dtpv_next;
8777
8778		if (prev == NULL) {
8779			panic("attempt to unregister non-existent "
8780			    "dtrace provider %p\n", (void *)id);
8781		}
8782
8783		prev->dtpv_next = old->dtpv_next;
8784	}
8785
8786	if (!self) {
8787		mutex_exit(&dtrace_lock);
8788#if defined(sun)
8789		mutex_exit(&mod_lock);
8790#endif
8791		mutex_exit(&dtrace_provider_lock);
8792	}
8793
8794	kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
8795	kmem_free(old, sizeof (dtrace_provider_t));
8796
8797	return (0);
8798}
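
/*
 * Typical use from a provider's detach routine (a sketch; foo_id is the
 * handle obtained from dtrace_register()).  The call fails with EBUSY or
 * EAGAIN while consumers or enabled probes remain, in which case the
 * provider must stay attached:
 *
 *	if ((error = dtrace_unregister(foo_id)) != 0)
 *		return (error);
 */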
8799
8800/*
8801 * Invalidate the specified provider.  All subsequent probe lookups for the
8802 * specified provider will fail, but its probes will not be removed.
8803 */
8804void
8805dtrace_invalidate(dtrace_provider_id_t id)
8806{
8807	dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8808
8809	ASSERT(pvp->dtpv_pops.dtps_enable !=
8810	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8811
8812	mutex_enter(&dtrace_provider_lock);
8813	mutex_enter(&dtrace_lock);
8814
8815	pvp->dtpv_defunct = dtrace_gethrtime();
8816
8817	mutex_exit(&dtrace_lock);
8818	mutex_exit(&dtrace_provider_lock);
8819}
8820
8821/*
8822 * Indicate whether or not DTrace has attached.
8823 */
8824int
8825dtrace_attached(void)
8826{
8827	/*
8828	 * dtrace_provider will be non-NULL iff the DTrace driver has
8829	 * attached.  (It's non-NULL because DTrace is always itself a
8830	 * provider.)
8831	 */
8832	return (dtrace_provider != NULL);
8833}
8834
8835/*
8836 * Remove all the unenabled probes for the given provider.  This function is
8837 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8838 * -- just as many of its associated probes as it can.
8839 */
8840int
8841dtrace_condense(dtrace_provider_id_t id)
8842{
8843	dtrace_provider_t *prov = (dtrace_provider_t *)id;
8844	int i;
8845	dtrace_probe_t *probe;
8846
8847	/*
8848	 * Make sure this isn't the dtrace provider itself.
8849	 */
8850	ASSERT(prov->dtpv_pops.dtps_enable !=
8851	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8852
8853	mutex_enter(&dtrace_provider_lock);
8854	mutex_enter(&dtrace_lock);
8855
8856	/*
8857	 * Attempt to destroy the probes associated with this provider.
8858	 */
8859	for (i = 0; i < dtrace_nprobes; i++) {
8860		if ((probe = dtrace_probes[i]) == NULL)
8861			continue;
8862
8863		if (probe->dtpr_provider != prov)
8864			continue;
8865
8866		if (probe->dtpr_ecb != NULL)
8867			continue;
8868
8869		dtrace_probes[i] = NULL;
8870
8871		dtrace_hash_remove(dtrace_bymod, probe);
8872		dtrace_hash_remove(dtrace_byfunc, probe);
8873		dtrace_hash_remove(dtrace_byname, probe);
8874
8875		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
8876		    probe->dtpr_arg);
8877		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8878		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8879		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8880		kmem_free(probe, sizeof (dtrace_probe_t));
8881#if defined(sun)
8882		vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
8883#else
8884		free_unr(dtrace_arena, i + 1);
8885#endif
8886	}
8887
8888	mutex_exit(&dtrace_lock);
8889	mutex_exit(&dtrace_provider_lock);
8890
8891	return (0);
8892}
8893
8894/*
8895 * DTrace Probe Management Functions
8896 *
8897 * The functions in this section perform the DTrace probe management,
8898 * including functions to create probes, look up probes, and call into the
8899 * providers to request that probes be provided.  Some of these functions are
8900 * in the Provider-to-Framework API; these functions can be identified by the
8901 * fact that they are not declared "static".
8902 */
8903
8904/*
8905 * Create a probe with the specified module name, function name, and name.
8906 */
8907dtrace_id_t
8908dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8909    const char *func, const char *name, int aframes, void *arg)
8910{
8911	dtrace_probe_t *probe, **probes;
8912	dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8913	dtrace_id_t id;
8914
8915	if (provider == dtrace_provider) {
8916		ASSERT(MUTEX_HELD(&dtrace_lock));
8917	} else {
8918		mutex_enter(&dtrace_lock);
8919	}
8920
8921#if defined(sun)
8922	id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8923	    VM_BESTFIT | VM_SLEEP);
8924#else
8925	id = alloc_unr(dtrace_arena);
8926#endif
8927	probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
8928
8929	probe->dtpr_id = id;
8930	probe->dtpr_gen = dtrace_probegen++;
8931	probe->dtpr_mod = dtrace_strdup(mod);
8932	probe->dtpr_func = dtrace_strdup(func);
8933	probe->dtpr_name = dtrace_strdup(name);
8934	probe->dtpr_arg = arg;
8935	probe->dtpr_aframes = aframes;
8936	probe->dtpr_provider = provider;
8937
8938	dtrace_hash_add(dtrace_bymod, probe);
8939	dtrace_hash_add(dtrace_byfunc, probe);
8940	dtrace_hash_add(dtrace_byname, probe);
8941
8942	if (id - 1 >= dtrace_nprobes) {
8943		size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8944		size_t nsize = osize << 1;
8945
8946		if (nsize == 0) {
8947			ASSERT(osize == 0);
8948			ASSERT(dtrace_probes == NULL);
8949			nsize = sizeof (dtrace_probe_t *);
8950		}
8951
8952		probes = kmem_zalloc(nsize, KM_SLEEP);
8953
8954		if (dtrace_probes == NULL) {
8955			ASSERT(osize == 0);
8956			dtrace_probes = probes;
8957			dtrace_nprobes = 1;
8958		} else {
8959			dtrace_probe_t **oprobes = dtrace_probes;
8960
8961			bcopy(oprobes, probes, osize);
8962			dtrace_membar_producer();
8963			dtrace_probes = probes;
8964
8965			dtrace_sync();
8966
8967			/*
8968			 * All CPUs are now seeing the new probes array; we can
8969			 * safely free the old array.
8970			 */
8971			kmem_free(oprobes, osize);
8972			dtrace_nprobes <<= 1;
8973		}
8974
8975		ASSERT(id - 1 < dtrace_nprobes);
8976	}
8977
8978	ASSERT(dtrace_probes[id - 1] == NULL);
8979	dtrace_probes[id - 1] = probe;
8980
8981	if (provider != dtrace_provider)
8982		mutex_exit(&dtrace_lock);
8983
8984	return (id);
8985}
8986
8987static dtrace_probe_t *
8988dtrace_probe_lookup_id(dtrace_id_t id)
8989{
8990	ASSERT(MUTEX_HELD(&dtrace_lock));
8991
8992	if (id == 0 || id > dtrace_nprobes)
8993		return (NULL);
8994
8995	return (dtrace_probes[id - 1]);
8996}
8997
8998static int
8999dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
9000{
9001	*((dtrace_id_t *)arg) = probe->dtpr_id;
9002
9003	return (DTRACE_MATCH_DONE);
9004}
9005
9006/*
9007 * Look up a probe based on provider and one or more of module name, function
9008 * name and probe name.
9009 */
9010dtrace_id_t
9011dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
9012    char *func, char *name)
9013{
9014	dtrace_probekey_t pkey;
9015	dtrace_id_t id;
9016	int match;
9017
9018	pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
9019	pkey.dtpk_pmatch = &dtrace_match_string;
9020	pkey.dtpk_mod = mod;
9021	pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9022	pkey.dtpk_func = func;
9023	pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9024	pkey.dtpk_name = name;
9025	pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9026	pkey.dtpk_id = DTRACE_IDNONE;
9027
9028	mutex_enter(&dtrace_lock);
9029	match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9030	    dtrace_probe_lookup_match, &id);
9031	mutex_exit(&dtrace_lock);
9032
9033	ASSERT(match == 1 || match == 0);
9034	return (match ? id : 0);
9035}
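
/*
 * A common provider idiom (a sketch; foo_id and the probe tuple below are
 * hypothetical) is to look up before creating in the dtps_provide() entry
 * point, so that repeated provide callbacks do not create duplicate
 * probes:
 *
 *	if (dtrace_probe_lookup(foo_id, "foo", "open", "entry") == 0)
 *		(void) dtrace_probe_create(foo_id, "foo", "open",
 *		    "entry", 0, NULL);
 */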
9036
9037/*
9038 * Returns the probe argument associated with the specified probe.
9039 */
9040void *
9041dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9042{
9043	dtrace_probe_t *probe;
9044	void *rval = NULL;
9045
9046	mutex_enter(&dtrace_lock);
9047
9048	if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9049	    probe->dtpr_provider == (dtrace_provider_t *)id)
9050		rval = probe->dtpr_arg;
9051
9052	mutex_exit(&dtrace_lock);
9053
9054	return (rval);
9055}
9056
9057/*
9058 * Copy a probe into a probe description.
9059 */
9060static void
9061dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
9062{
9063	bzero(pdp, sizeof (dtrace_probedesc_t));
9064	pdp->dtpd_id = prp->dtpr_id;
9065
9066	(void) strncpy(pdp->dtpd_provider,
9067	    prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
9068
9069	(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
9070	(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
9071	(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
9072}
9073
9074/*
9075 * Called to indicate that a probe -- or probes -- should be provided by a
9076 * specified provider.  If the specified description is NULL, the provider will
9077 * be told to provide all of its probes.  (This is done whenever a new
9078 * consumer comes along, or whenever a retained enabling is to be matched.) If
9079 * the specified description is non-NULL, the provider is given the
9080 * opportunity to dynamically provide the specified probe, allowing providers
9081 * to support the creation of probes on-the-fly.  (So-called _autocreated_
9082 * probes.)  If the provider is NULL, the operations will be applied to all
9083 * providers; if the provider is non-NULL the operations will only be applied
9084 * providers; if the provider is non-NULL, the operations will only be applied
9085 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9086 * will need to grab the dtrace_lock when it reenters the framework through
9087 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9088 */
9089static void
9090dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
9091{
9092#if defined(sun)
9093	modctl_t *ctl;
9094#endif
9095	int all = 0;
9096
9097	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
9098
9099	if (prv == NULL) {
9100		all = 1;
9101		prv = dtrace_provider;
9102	}
9103
9104	do {
9105		/*
9106		 * First, call the blanket provide operation.
9107		 */
9108		prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9109
9110#if defined(sun)
9111		/*
9112		 * Now call the per-module provide operation.  We will grab
9113		 * mod_lock to prevent the list from being modified.  Note
9114		 * that this also prevents the mod_busy bits from changing.
9115		 * (mod_busy can only be changed with mod_lock held.)
9116		 */
9117		mutex_enter(&mod_lock);
9118
9119		ctl = &modules;
9120		do {
9121			if (ctl->mod_busy || ctl->mod_mp == NULL)
9122				continue;
9123
9124			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9125
9126		} while ((ctl = ctl->mod_next) != &modules);
9127
9128		mutex_exit(&mod_lock);
9129#endif
9130	} while (all && (prv = prv->dtpv_next) != NULL);
9131}
9132
9133#if defined(sun)
9134/*
9135 * Iterate over each probe, and call the Framework-to-Provider API function
9136 * denoted by offs.
9137 */
9138static void
9139dtrace_probe_foreach(uintptr_t offs)
9140{
9141	dtrace_provider_t *prov;
9142	void (*func)(void *, dtrace_id_t, void *);
9143	dtrace_probe_t *probe;
9144	dtrace_icookie_t cookie;
9145	int i;
9146
9147	/*
9148	 * We disable interrupts to walk through the probe array.  This is
9149	 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
9150	 * won't see stale data.
9151	 */
9152	cookie = dtrace_interrupt_disable();
9153
9154	for (i = 0; i < dtrace_nprobes; i++) {
9155		if ((probe = dtrace_probes[i]) == NULL)
9156			continue;
9157
9158		if (probe->dtpr_ecb == NULL) {
9159			/*
9160			 * This probe isn't enabled -- don't call the function.
9161			 */
9162			continue;
9163		}
9164
9165		prov = probe->dtpr_provider;
9166		func = *((void(**)(void *, dtrace_id_t, void *))
9167		    ((uintptr_t)&prov->dtpv_pops + offs));
9168
9169		func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
9170	}
9171
9172	dtrace_interrupt_enable(cookie);
9173}
9174#endif
9175
9176static int
9177dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
9178{
9179	dtrace_probekey_t pkey;
9180	uint32_t priv;
9181	uid_t uid;
9182	zoneid_t zoneid;
9183
9184	ASSERT(MUTEX_HELD(&dtrace_lock));
9185	dtrace_ecb_create_cache = NULL;
9186
9187	if (desc == NULL) {
9188		/*
9189		 * If we're passed a NULL description, we're being asked to
9190		 * create an ECB with a NULL probe.
9191		 */
9192		(void) dtrace_ecb_create_enable(NULL, enab);
9193		return (0);
9194	}
9195
9196	dtrace_probekey(desc, &pkey);
9197	dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9198	    &priv, &uid, &zoneid);
9199
9200	return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
9201	    enab));
9202}
9203
9204/*
9205 * DTrace Helper Provider Functions
9206 */
9207static void
9208dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9209{
9210	attr->dtat_name = DOF_ATTR_NAME(dofattr);
9211	attr->dtat_data = DOF_ATTR_DATA(dofattr);
9212	attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9213}
9214
9215static void
9216dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9217    const dof_provider_t *dofprov, char *strtab)
9218{
9219	hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9220	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9221	    dofprov->dofpv_provattr);
9222	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9223	    dofprov->dofpv_modattr);
9224	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9225	    dofprov->dofpv_funcattr);
9226	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9227	    dofprov->dofpv_nameattr);
9228	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9229	    dofprov->dofpv_argsattr);
9230}
9231
9232static void
9233dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9234{
9235	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9236	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9237	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9238	dof_provider_t *provider;
9239	dof_probe_t *probe;
9240	uint32_t *off, *enoff;
9241	uint8_t *arg;
9242	char *strtab;
9243	uint_t i, nprobes;
9244	dtrace_helper_provdesc_t dhpv;
9245	dtrace_helper_probedesc_t dhpb;
9246	dtrace_meta_t *meta = dtrace_meta_pid;
9247	dtrace_mops_t *mops = &meta->dtm_mops;
9248	void *parg;
9249
9250	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9251	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9252	    provider->dofpv_strtab * dof->dofh_secsize);
9253	prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9254	    provider->dofpv_probes * dof->dofh_secsize);
9255	arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9256	    provider->dofpv_prargs * dof->dofh_secsize);
9257	off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9258	    provider->dofpv_proffs * dof->dofh_secsize);
9259
9260	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9261	off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9262	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9263	enoff = NULL;
9264
9265	/*
9266	 * See dtrace_helper_provider_validate().
9267	 */
9268	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9269	    provider->dofpv_prenoffs != DOF_SECT_NONE) {
9270		enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9271		    provider->dofpv_prenoffs * dof->dofh_secsize);
9272		enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9273	}
9274
9275	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9276
9277	/*
9278	 * Create the provider.
9279	 */
9280	dtrace_dofprov2hprov(&dhpv, provider, strtab);
9281
9282	if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
9283		return;
9284
9285	meta->dtm_count++;
9286
9287	/*
9288	 * Create the probes.
9289	 */
9290	for (i = 0; i < nprobes; i++) {
9291		probe = (dof_probe_t *)(uintptr_t)(daddr +
9292		    prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9293
9294		dhpb.dthpb_mod = dhp->dofhp_mod;
9295		dhpb.dthpb_func = strtab + probe->dofpr_func;
9296		dhpb.dthpb_name = strtab + probe->dofpr_name;
9297		dhpb.dthpb_base = probe->dofpr_addr;
9298		dhpb.dthpb_offs = off + probe->dofpr_offidx;
9299		dhpb.dthpb_noffs = probe->dofpr_noffs;
9300		if (enoff != NULL) {
9301			dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
9302			dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9303		} else {
9304			dhpb.dthpb_enoffs = NULL;
9305			dhpb.dthpb_nenoffs = 0;
9306		}
9307		dhpb.dthpb_args = arg + probe->dofpr_argidx;
9308		dhpb.dthpb_nargc = probe->dofpr_nargc;
9309		dhpb.dthpb_xargc = probe->dofpr_xargc;
9310		dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9311		dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9312
9313		mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9314	}
9315}
9316
9317static void
9318dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
9319{
9320	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9321	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9322	int i;
9323
9324	ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9325
9326	for (i = 0; i < dof->dofh_secnum; i++) {
9327		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9328		    dof->dofh_secoff + i * dof->dofh_secsize);
9329
9330		if (sec->dofs_type != DOF_SECT_PROVIDER)
9331			continue;
9332
9333		dtrace_helper_provide_one(dhp, sec, pid);
9334	}
9335
9336	/*
9337	 * We may have just created probes, so we must now rematch against
9338	 * any retained enablings.  Note that this call will acquire both
9339	 * cpu_lock and dtrace_lock; the fact that we are holding
9340	 * dtrace_meta_lock now is what defines the ordering with respect to
9341	 * these three locks.
9342	 */
9343	dtrace_enabling_matchall();
9344}
9345
9346static void
9347dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9348{
9349	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9350	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9351	dof_sec_t *str_sec;
9352	dof_provider_t *provider;
9353	char *strtab;
9354	dtrace_helper_provdesc_t dhpv;
9355	dtrace_meta_t *meta = dtrace_meta_pid;
9356	dtrace_mops_t *mops = &meta->dtm_mops;
9357
9358	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9359	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9360	    provider->dofpv_strtab * dof->dofh_secsize);
9361
9362	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9363
9364	/*
9365	 * Create the provider.
9366	 */
9367	dtrace_dofprov2hprov(&dhpv, provider, strtab);
9368
9369	mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
9370
9371	meta->dtm_count--;
9372}
9373
9374static void
9375dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
9376{
9377	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9378	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9379	int i;
9380
9381	ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9382
9383	for (i = 0; i < dof->dofh_secnum; i++) {
9384		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9385		    dof->dofh_secoff + i * dof->dofh_secsize);
9386
9387		if (sec->dofs_type != DOF_SECT_PROVIDER)
9388			continue;
9389
9390		dtrace_helper_provider_remove_one(dhp, sec, pid);
9391	}
9392}
9393
9394/*
9395 * DTrace Meta Provider-to-Framework API Functions
9396 *
9397 * These functions implement the Meta Provider-to-Framework API, as described
9398 * in <sys/dtrace.h>.
9399 */
9400int
9401dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9402    dtrace_meta_provider_id_t *idp)
9403{
9404	dtrace_meta_t *meta;
9405	dtrace_helpers_t *help, *next;
9406	int i;
9407
9408	*idp = DTRACE_METAPROVNONE;
9409
9410	/*
9411	 * We strictly don't need the name, but we hold onto it for
9412	 * debuggability. All hail error queues!
9413	 */
9414	if (name == NULL) {
9415		cmn_err(CE_WARN, "failed to register meta-provider: "
9416		    "invalid name");
9417		return (EINVAL);
9418	}
9419
9420	if (mops == NULL ||
9421	    mops->dtms_create_probe == NULL ||
9422	    mops->dtms_provide_pid == NULL ||
9423	    mops->dtms_remove_pid == NULL) {
9424		cmn_err(CE_WARN, "failed to register meta-provider %s: "
9425		    "invalid ops", name);
9426		return (EINVAL);
9427	}
9428
9429	meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9430	meta->dtm_mops = *mops;
9431	meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
9432	(void) strcpy(meta->dtm_name, name);
9433	meta->dtm_arg = arg;
9434
9435	mutex_enter(&dtrace_meta_lock);
9436	mutex_enter(&dtrace_lock);
9437
9438	if (dtrace_meta_pid != NULL) {
9439		mutex_exit(&dtrace_lock);
9440		mutex_exit(&dtrace_meta_lock);
9441		cmn_err(CE_WARN, "failed to register meta-provider %s: "
9442		    "user-land meta-provider exists", name);
9443		kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
9444		kmem_free(meta, sizeof (dtrace_meta_t));
9445		return (EINVAL);
9446	}
9447
9448	dtrace_meta_pid = meta;
9449	*idp = (dtrace_meta_provider_id_t)meta;
9450
9451	/*
9452	 * If there are providers and probes ready to go, pass them
9453	 * off to the new meta provider now.
9454	 */
9455
9456	help = dtrace_deferred_pid;
9457	dtrace_deferred_pid = NULL;
9458
9459	mutex_exit(&dtrace_lock);
9460
9461	while (help != NULL) {
9462		for (i = 0; i < help->dthps_nprovs; i++) {
9463			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9464			    help->dthps_pid);
9465		}
9466
9467		next = help->dthps_next;
9468		help->dthps_next = NULL;
9469		help->dthps_prev = NULL;
9470		help->dthps_deferred = 0;
9471		help = next;
9472	}
9473
9474	mutex_exit(&dtrace_meta_lock);
9475
9476	return (0);
9477}
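
/*
 * Registration sketch for a user-land meta-provider (the foo_* ops are
 * hypothetical; all three are required, per the check above):
 *
 *	static dtrace_mops_t foo_mops = {
 *		.dtms_create_probe = foo_create_probe,
 *		.dtms_provide_pid = foo_provide_pid,
 *		.dtms_remove_pid = foo_remove_pid
 *	};
 *	static dtrace_meta_provider_id_t foo_meta_id;
 *
 *	error = dtrace_meta_register("foo", &foo_mops, NULL, &foo_meta_id);
 */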
9478
9479int
9480dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9481{
9482	dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9483
9484	mutex_enter(&dtrace_meta_lock);
9485	mutex_enter(&dtrace_lock);
9486
9487	if (old == dtrace_meta_pid) {
9488		pp = &dtrace_meta_pid;
9489	} else {
9490		panic("attempt to unregister non-existent "
9491		    "dtrace meta-provider %p\n", (void *)old);
9492	}
9493
9494	if (old->dtm_count != 0) {
9495		mutex_exit(&dtrace_lock);
9496		mutex_exit(&dtrace_meta_lock);
9497		return (EBUSY);
9498	}
9499
9500	*pp = NULL;
9501
9502	mutex_exit(&dtrace_lock);
9503	mutex_exit(&dtrace_meta_lock);
9504
9505	kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
9506	kmem_free(old, sizeof (dtrace_meta_t));
9507
9508	return (0);
9509}
9510
9511
9512/*
9513 * DTrace DIF Object Functions
9514 */
9515static int
9516dtrace_difo_err(uint_t pc, const char *format, ...)
9517{
9518	if (dtrace_err_verbose) {
9519		va_list alist;
9520
9521		(void) uprintf("dtrace DIF object error: [%u]: ", pc);
9522		va_start(alist, format);
9523		(void) vuprintf(format, alist);
9524		va_end(alist);
9525	}
9526
9527#ifdef DTRACE_ERRDEBUG
9528	dtrace_errdebug(format);
9529#endif
9530	return (1);
9531}
9532
9533/*
9534 * Validate a DTrace DIF object by checking the IR instructions.  The following
9535 * rules are currently enforced by dtrace_difo_validate():
9536 *
9537 * 1. Each instruction must have a valid opcode
9538 * 2. Each register, string, variable, or subroutine reference must be valid
9539 * 3. No instruction can modify register %r0 (must be zero)
9540 * 4. All instruction reserved bits must be set to zero
9541 * 5. The last instruction must be a "ret" instruction
9542 * 6. All branch targets must reference a valid instruction _after_ the branch
9543 */
9544static int
9545dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9546    cred_t *cr)
9547{
9548	int err = 0, i;
9549	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9550	int kcheckload;
9551	uint_t pc;
9552
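	/*
	 * If the consumer cannot see kernel memory (i.e. its credentials
	 * lack DTRACE_CRV_KERNEL visibility), each unchecked load must be
	 * rewritten as its checked (RLD*) counterpart -- see the
	 * DIF_OP_LD* cases below.
	 */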
9553	kcheckload = cr == NULL ||
9554	    (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9555
9556	dp->dtdo_destructive = 0;
9557
9558	for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9559		dif_instr_t instr = dp->dtdo_buf[pc];
9560
9561		uint_t r1 = DIF_INSTR_R1(instr);
9562		uint_t r2 = DIF_INSTR_R2(instr);
9563		uint_t rd = DIF_INSTR_RD(instr);
9564		uint_t rs = DIF_INSTR_RS(instr);
9565		uint_t label = DIF_INSTR_LABEL(instr);
9566		uint_t v = DIF_INSTR_VAR(instr);
9567		uint_t subr = DIF_INSTR_SUBR(instr);
9568		uint_t type = DIF_INSTR_TYPE(instr);
9569		uint_t op = DIF_INSTR_OP(instr);
9570
9571		switch (op) {
9572		case DIF_OP_OR:
9573		case DIF_OP_XOR:
9574		case DIF_OP_AND:
9575		case DIF_OP_SLL:
9576		case DIF_OP_SRL:
9577		case DIF_OP_SRA:
9578		case DIF_OP_SUB:
9579		case DIF_OP_ADD:
9580		case DIF_OP_MUL:
9581		case DIF_OP_SDIV:
9582		case DIF_OP_UDIV:
9583		case DIF_OP_SREM:
9584		case DIF_OP_UREM:
9585		case DIF_OP_COPYS:
9586			if (r1 >= nregs)
9587				err += efunc(pc, "invalid register %u\n", r1);
9588			if (r2 >= nregs)
9589				err += efunc(pc, "invalid register %u\n", r2);
9590			if (rd >= nregs)
9591				err += efunc(pc, "invalid register %u\n", rd);
9592			if (rd == 0)
9593				err += efunc(pc, "cannot write to %r0\n");
9594			break;
9595		case DIF_OP_NOT:
9596		case DIF_OP_MOV:
9597		case DIF_OP_ALLOCS:
9598			if (r1 >= nregs)
9599				err += efunc(pc, "invalid register %u\n", r1);
9600			if (r2 != 0)
9601				err += efunc(pc, "non-zero reserved bits\n");
9602			if (rd >= nregs)
9603				err += efunc(pc, "invalid register %u\n", rd);
9604			if (rd == 0)
9605				err += efunc(pc, "cannot write to %r0\n");
9606			break;
9607		case DIF_OP_LDSB:
9608		case DIF_OP_LDSH:
9609		case DIF_OP_LDSW:
9610		case DIF_OP_LDUB:
9611		case DIF_OP_LDUH:
9612		case DIF_OP_LDUW:
9613		case DIF_OP_LDX:
9614			if (r1 >= nregs)
9615				err += efunc(pc, "invalid register %u\n", r1);
9616			if (r2 != 0)
9617				err += efunc(pc, "non-zero reserved bits\n");
9618			if (rd >= nregs)
9619				err += efunc(pc, "invalid register %u\n", rd);
9620			if (rd == 0)
9621				err += efunc(pc, "cannot write to %r0\n");
9622			if (kcheckload)
9623				dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9624				    DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9625			break;
9626		case DIF_OP_RLDSB:
9627		case DIF_OP_RLDSH:
9628		case DIF_OP_RLDSW:
9629		case DIF_OP_RLDUB:
9630		case DIF_OP_RLDUH:
9631		case DIF_OP_RLDUW:
9632		case DIF_OP_RLDX:
9633			if (r1 >= nregs)
9634				err += efunc(pc, "invalid register %u\n", r1);
9635			if (r2 != 0)
9636				err += efunc(pc, "non-zero reserved bits\n");
9637			if (rd >= nregs)
9638				err += efunc(pc, "invalid register %u\n", rd);
9639			if (rd == 0)
9640				err += efunc(pc, "cannot write to %r0\n");
9641			break;
9642		case DIF_OP_ULDSB:
9643		case DIF_OP_ULDSH:
9644		case DIF_OP_ULDSW:
9645		case DIF_OP_ULDUB:
9646		case DIF_OP_ULDUH:
9647		case DIF_OP_ULDUW:
9648		case DIF_OP_ULDX:
9649			if (r1 >= nregs)
9650				err += efunc(pc, "invalid register %u\n", r1);
9651			if (r2 != 0)
9652				err += efunc(pc, "non-zero reserved bits\n");
9653			if (rd >= nregs)
9654				err += efunc(pc, "invalid register %u\n", rd);
9655			if (rd == 0)
9656				err += efunc(pc, "cannot write to %r0\n");
9657			break;
9658		case DIF_OP_STB:
9659		case DIF_OP_STH:
9660		case DIF_OP_STW:
9661		case DIF_OP_STX:
9662			if (r1 >= nregs)
9663				err += efunc(pc, "invalid register %u\n", r1);
9664			if (r2 != 0)
9665				err += efunc(pc, "non-zero reserved bits\n");
9666			if (rd >= nregs)
9667				err += efunc(pc, "invalid register %u\n", rd);
9668			if (rd == 0)
9669				err += efunc(pc, "cannot write to 0 address\n");
9670			break;
9671		case DIF_OP_CMP:
9672		case DIF_OP_SCMP:
9673			if (r1 >= nregs)
9674				err += efunc(pc, "invalid register %u\n", r1);
9675			if (r2 >= nregs)
9676				err += efunc(pc, "invalid register %u\n", r2);
9677			if (rd != 0)
9678				err += efunc(pc, "non-zero reserved bits\n");
9679			break;
9680		case DIF_OP_TST:
9681			if (r1 >= nregs)
9682				err += efunc(pc, "invalid register %u\n", r1);
9683			if (r2 != 0 || rd != 0)
9684				err += efunc(pc, "non-zero reserved bits\n");
9685			break;
9686		case DIF_OP_BA:
9687		case DIF_OP_BE:
9688		case DIF_OP_BNE:
9689		case DIF_OP_BG:
9690		case DIF_OP_BGU:
9691		case DIF_OP_BGE:
9692		case DIF_OP_BGEU:
9693		case DIF_OP_BL:
9694		case DIF_OP_BLU:
9695		case DIF_OP_BLE:
9696		case DIF_OP_BLEU:
9697			if (label >= dp->dtdo_len) {
9698				err += efunc(pc, "invalid branch target %u\n",
9699				    label);
9700			}
9701			if (label <= pc) {
9702				err += efunc(pc, "backward branch to %u\n",
9703				    label);
9704			}
9705			break;
9706		case DIF_OP_RET:
9707			if (r1 != 0 || r2 != 0)
9708				err += efunc(pc, "non-zero reserved bits\n");
9709			if (rd >= nregs)
9710				err += efunc(pc, "invalid register %u\n", rd);
9711			break;
9712		case DIF_OP_NOP:
9713		case DIF_OP_POPTS:
9714		case DIF_OP_FLUSHTS:
9715			if (r1 != 0 || r2 != 0 || rd != 0)
9716				err += efunc(pc, "non-zero reserved bits\n");
9717			break;
9718		case DIF_OP_SETX:
9719			if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9720				err += efunc(pc, "invalid integer ref %u\n",
9721				    DIF_INSTR_INTEGER(instr));
9722			}
9723			if (rd >= nregs)
9724				err += efunc(pc, "invalid register %u\n", rd);
9725			if (rd == 0)
9726				err += efunc(pc, "cannot write to %r0\n");
9727			break;
9728		case DIF_OP_SETS:
9729			if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9730				err += efunc(pc, "invalid string ref %u\n",
9731				    DIF_INSTR_STRING(instr));
9732			}
9733			if (rd >= nregs)
9734				err += efunc(pc, "invalid register %u\n", rd);
9735			if (rd == 0)
9736				err += efunc(pc, "cannot write to %r0\n");
9737			break;
9738		case DIF_OP_LDGA:
9739		case DIF_OP_LDTA:
9740			if (r1 > DIF_VAR_ARRAY_MAX)
9741				err += efunc(pc, "invalid array %u\n", r1);
9742			if (r2 >= nregs)
9743				err += efunc(pc, "invalid register %u\n", r2);
9744			if (rd >= nregs)
9745				err += efunc(pc, "invalid register %u\n", rd);
9746			if (rd == 0)
9747				err += efunc(pc, "cannot write to %r0\n");
9748			break;
9749		case DIF_OP_LDGS:
9750		case DIF_OP_LDTS:
9751		case DIF_OP_LDLS:
9752		case DIF_OP_LDGAA:
9753		case DIF_OP_LDTAA:
9754			if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9755				err += efunc(pc, "invalid variable %u\n", v);
9756			if (rd >= nregs)
9757				err += efunc(pc, "invalid register %u\n", rd);
9758			if (rd == 0)
9759				err += efunc(pc, "cannot write to %r0\n");
9760			break;
9761		case DIF_OP_STGS:
9762		case DIF_OP_STTS:
9763		case DIF_OP_STLS:
9764		case DIF_OP_STGAA:
9765		case DIF_OP_STTAA:
9766			if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9767				err += efunc(pc, "invalid variable %u\n", v);
9768			if (rs >= nregs)
9769				err += efunc(pc, "invalid register %u\n", rd);
9770			break;
9771		case DIF_OP_CALL:
9772			if (subr > DIF_SUBR_MAX)
9773				err += efunc(pc, "invalid subr %u\n", subr);
9774			if (rd >= nregs)
9775				err += efunc(pc, "invalid register %u\n", rd);
9776			if (rd == 0)
9777				err += efunc(pc, "cannot write to %r0\n");
9778
9779			if (subr == DIF_SUBR_COPYOUT ||
9780			    subr == DIF_SUBR_COPYOUTSTR) {
9781				dp->dtdo_destructive = 1;
9782			}
9783
9784			if (subr == DIF_SUBR_GETF) {
9785				/*
9786				 * If we have a getf() we need to record that
9787				 * in our state.  Note that our state can be
9788				 * NULL if this is a helper -- but in that
9789				 * case, the call to getf() is itself illegal,
9790				 * and will be caught (slightly later) when
9791				 * the helper is validated.
9792				 */
9793				if (vstate->dtvs_state != NULL)
9794					vstate->dtvs_state->dts_getf++;
9795			}
9796
9797			break;
9798		case DIF_OP_PUSHTR:
9799			if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9800				err += efunc(pc, "invalid ref type %u\n", type);
9801			if (r2 >= nregs)
9802				err += efunc(pc, "invalid register %u\n", r2);
9803			if (rs >= nregs)
9804				err += efunc(pc, "invalid register %u\n", rs);
9805			break;
9806		case DIF_OP_PUSHTV:
9807			if (type != DIF_TYPE_CTF)
9808				err += efunc(pc, "invalid val type %u\n", type);
9809			if (r2 >= nregs)
9810				err += efunc(pc, "invalid register %u\n", r2);
9811			if (rs >= nregs)
9812				err += efunc(pc, "invalid register %u\n", rs);
9813			break;
9814		default:
9815			err += efunc(pc, "invalid opcode %u\n",
9816			    DIF_INSTR_OP(instr));
9817		}
9818	}
9819
9820	if (dp->dtdo_len != 0 &&
9821	    DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9822		err += efunc(dp->dtdo_len - 1,
9823		    "expected 'ret' as last DIF instruction\n");
9824	}
9825
9826	if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9827		/*
9828		 * If we're not returning by reference, the size must be either
9829		 * 0 or the size of one of the base types.
9830		 */
9831		switch (dp->dtdo_rtype.dtdt_size) {
9832		case 0:
9833		case sizeof (uint8_t):
9834		case sizeof (uint16_t):
9835		case sizeof (uint32_t):
9836		case sizeof (uint64_t):
9837			break;
9838
9839		default:
9840			err += efunc(dp->dtdo_len - 1, "bad return size\n");
9841		}
9842	}
9843
9844	for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9845		dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9846		dtrace_diftype_t *vt, *et;
9847		uint_t id, ndx;
9848
9849		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9850		    v->dtdv_scope != DIFV_SCOPE_THREAD &&
9851		    v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9852			err += efunc(i, "unrecognized variable scope %d\n",
9853			    v->dtdv_scope);
9854			break;
9855		}
9856
9857		if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9858		    v->dtdv_kind != DIFV_KIND_SCALAR) {
9859			err += efunc(i, "unrecognized variable type %d\n",
9860			    v->dtdv_kind);
9861			break;
9862		}
9863
9864		if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9865			err += efunc(i, "%d exceeds variable id limit\n", id);
9866			break;
9867		}
9868
9869		if (id < DIF_VAR_OTHER_UBASE)
9870			continue;
9871
9872		/*
9873		 * For user-defined variables, we need to check that this
9874		 * definition is identical to any previous definition that we
9875		 * encountered.
9876		 */
9877		ndx = id - DIF_VAR_OTHER_UBASE;
9878
9879		switch (v->dtdv_scope) {
9880		case DIFV_SCOPE_GLOBAL:
9881			if (ndx < vstate->dtvs_nglobals) {
9882				dtrace_statvar_t *svar;
9883
9884				if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9885					existing = &svar->dtsv_var;
9886			}
9887
9888			break;
9889
9890		case DIFV_SCOPE_THREAD:
9891			if (ndx < vstate->dtvs_ntlocals)
9892				existing = &vstate->dtvs_tlocals[ndx];
9893			break;
9894
9895		case DIFV_SCOPE_LOCAL:
9896			if (ndx < vstate->dtvs_nlocals) {
9897				dtrace_statvar_t *svar;
9898
9899				if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9900					existing = &svar->dtsv_var;
9901			}
9902
9903			break;
9904		}
9905
9906		vt = &v->dtdv_type;
9907
9908		if (vt->dtdt_flags & DIF_TF_BYREF) {
9909			if (vt->dtdt_size == 0) {
9910				err += efunc(i, "zero-sized variable\n");
9911				break;
9912			}
9913
9914			if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
9915			    vt->dtdt_size > dtrace_global_maxsize) {
9916				err += efunc(i, "oversized by-ref global\n");
9917				break;
9918			}
9919		}
9920
9921		if (existing == NULL || existing->dtdv_id == 0)
9922			continue;
9923
9924		ASSERT(existing->dtdv_id == v->dtdv_id);
9925		ASSERT(existing->dtdv_scope == v->dtdv_scope);
9926
9927		if (existing->dtdv_kind != v->dtdv_kind)
9928			err += efunc(i, "%d changed variable kind\n", id);
9929
9930		et = &existing->dtdv_type;
9931
9932		if (vt->dtdt_flags != et->dtdt_flags) {
9933			err += efunc(i, "%d changed variable type flags\n", id);
9934			break;
9935		}
9936
9937		if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9938			err += efunc(i, "%d changed variable type size\n", id);
9939			break;
9940		}
9941	}
9942
9943	return (err);
9944}
9945
9946/*
9947 * Validate a DTrace DIF object that is to be used as a helper.  Helpers
9948 * are much more constrained than normal DIFOs.  Specifically, they may
9949 * not:
9950 *
9951 * 1. Make calls to subroutines other than copyin(), copyinstr() or
9952 *    miscellaneous string, memory, and byte-order conversion routines.
9953 * 2. Access DTrace variables other than the args[] array and the curthread,
9954 *    pid, ppid, tid, execargs, execname, zonename, uid and gid variables.
9955 * 3. Have thread-local variables.
9956 * 4. Have dynamic variables.
9957 */
9958static int
9959dtrace_difo_validate_helper(dtrace_difo_t *dp)
9960{
9961	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9962	int err = 0;
9963	uint_t pc;
9964
9965	for (pc = 0; pc < dp->dtdo_len; pc++) {
9966		dif_instr_t instr = dp->dtdo_buf[pc];
9967
9968		uint_t v = DIF_INSTR_VAR(instr);
9969		uint_t subr = DIF_INSTR_SUBR(instr);
9970		uint_t op = DIF_INSTR_OP(instr);
9971
9972		switch (op) {
9973		case DIF_OP_OR:
9974		case DIF_OP_XOR:
9975		case DIF_OP_AND:
9976		case DIF_OP_SLL:
9977		case DIF_OP_SRL:
9978		case DIF_OP_SRA:
9979		case DIF_OP_SUB:
9980		case DIF_OP_ADD:
9981		case DIF_OP_MUL:
9982		case DIF_OP_SDIV:
9983		case DIF_OP_UDIV:
9984		case DIF_OP_SREM:
9985		case DIF_OP_UREM:
9986		case DIF_OP_COPYS:
9987		case DIF_OP_NOT:
9988		case DIF_OP_MOV:
9989		case DIF_OP_RLDSB:
9990		case DIF_OP_RLDSH:
9991		case DIF_OP_RLDSW:
9992		case DIF_OP_RLDUB:
9993		case DIF_OP_RLDUH:
9994		case DIF_OP_RLDUW:
9995		case DIF_OP_RLDX:
9996		case DIF_OP_ULDSB:
9997		case DIF_OP_ULDSH:
9998		case DIF_OP_ULDSW:
9999		case DIF_OP_ULDUB:
10000		case DIF_OP_ULDUH:
10001		case DIF_OP_ULDUW:
10002		case DIF_OP_ULDX:
10003		case DIF_OP_STB:
10004		case DIF_OP_STH:
10005		case DIF_OP_STW:
10006		case DIF_OP_STX:
10007		case DIF_OP_ALLOCS:
10008		case DIF_OP_CMP:
10009		case DIF_OP_SCMP:
10010		case DIF_OP_TST:
10011		case DIF_OP_BA:
10012		case DIF_OP_BE:
10013		case DIF_OP_BNE:
10014		case DIF_OP_BG:
10015		case DIF_OP_BGU:
10016		case DIF_OP_BGE:
10017		case DIF_OP_BGEU:
10018		case DIF_OP_BL:
10019		case DIF_OP_BLU:
10020		case DIF_OP_BLE:
10021		case DIF_OP_BLEU:
10022		case DIF_OP_RET:
10023		case DIF_OP_NOP:
10024		case DIF_OP_POPTS:
10025		case DIF_OP_FLUSHTS:
10026		case DIF_OP_SETX:
10027		case DIF_OP_SETS:
10028		case DIF_OP_LDGA:
10029		case DIF_OP_LDLS:
10030		case DIF_OP_STGS:
10031		case DIF_OP_STLS:
10032		case DIF_OP_PUSHTR:
10033		case DIF_OP_PUSHTV:
10034			break;
10035
10036		case DIF_OP_LDGS:
10037			if (v >= DIF_VAR_OTHER_UBASE)
10038				break;
10039
10040			if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10041				break;
10042
10043			if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10044			    v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10045			    v == DIF_VAR_EXECARGS ||
10046			    v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10047			    v == DIF_VAR_UID || v == DIF_VAR_GID)
10048				break;
10049
10050			err += efunc(pc, "illegal variable %u\n", v);
10051			break;
10052
10053		case DIF_OP_LDTA:
10054		case DIF_OP_LDTS:
10055		case DIF_OP_LDGAA:
10056		case DIF_OP_LDTAA:
10057			err += efunc(pc, "illegal dynamic variable load\n");
10058			break;
10059
10060		case DIF_OP_STTS:
10061		case DIF_OP_STGAA:
10062		case DIF_OP_STTAA:
10063			err += efunc(pc, "illegal dynamic variable store\n");
10064			break;
10065
10066		case DIF_OP_CALL:
10067			if (subr == DIF_SUBR_ALLOCA ||
10068			    subr == DIF_SUBR_BCOPY ||
10069			    subr == DIF_SUBR_COPYIN ||
10070			    subr == DIF_SUBR_COPYINTO ||
10071			    subr == DIF_SUBR_COPYINSTR ||
10072			    subr == DIF_SUBR_INDEX ||
10073			    subr == DIF_SUBR_INET_NTOA ||
10074			    subr == DIF_SUBR_INET_NTOA6 ||
10075			    subr == DIF_SUBR_INET_NTOP ||
10076			    subr == DIF_SUBR_JSON ||
10077			    subr == DIF_SUBR_LLTOSTR ||
10078			    subr == DIF_SUBR_STRTOLL ||
10079			    subr == DIF_SUBR_RINDEX ||
10080			    subr == DIF_SUBR_STRCHR ||
10081			    subr == DIF_SUBR_STRJOIN ||
10082			    subr == DIF_SUBR_STRRCHR ||
10083			    subr == DIF_SUBR_STRSTR ||
10084			    subr == DIF_SUBR_HTONS ||
10085			    subr == DIF_SUBR_HTONL ||
10086			    subr == DIF_SUBR_HTONLL ||
10087			    subr == DIF_SUBR_NTOHS ||
10088			    subr == DIF_SUBR_NTOHL ||
10089			    subr == DIF_SUBR_NTOHLL ||
10090			    subr == DIF_SUBR_MEMREF ||
10091#if !defined(sun)
10092			    subr == DIF_SUBR_MEMSTR ||
10093#endif
10094			    subr == DIF_SUBR_TYPEREF)
10095				break;
10096
10097			err += efunc(pc, "invalid subr %u\n", subr);
10098			break;
10099
10100		default:
10101			err += efunc(pc, "invalid opcode %u\n",
10102			    DIF_INSTR_OP(instr));
10103		}
10104	}
10105
10106	return (err);
10107}
10108
10109/*
10110 * Returns 1 if the expression in the DIF object can be cached on a per-thread
10111 * basis; 0 if not.
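 * A cacheable expression depends only on values that cannot change over
 * the lifetime of a thread (e.g. pid or execname), which allows a
 * predicate's result to be reused via the predicate cache ID.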
10112 */
10113static int
10114dtrace_difo_cacheable(dtrace_difo_t *dp)
10115{
10116	int i;
10117
10118	if (dp == NULL)
10119		return (0);
10120
10121	for (i = 0; i < dp->dtdo_varlen; i++) {
10122		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10123
10124		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10125			continue;
10126
10127		switch (v->dtdv_id) {
10128		case DIF_VAR_CURTHREAD:
10129		case DIF_VAR_PID:
10130		case DIF_VAR_TID:
10131		case DIF_VAR_EXECARGS:
10132		case DIF_VAR_EXECNAME:
10133		case DIF_VAR_ZONENAME:
10134			break;
10135
10136		default:
10137			return (0);
10138		}
10139	}
10140
10141	/*
10142	 * This DIF object may be cacheable.  Now we need to look for any
10143	 * array loading instructions, any memory loading instructions, or
10144	 * any stores to thread-local variables.
10145	 */
10146	for (i = 0; i < dp->dtdo_len; i++) {
10147		uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10148
10149		if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10150		    (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10151		    (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10152		    op == DIF_OP_LDGA || op == DIF_OP_STTS)
10153			return (0);
10154	}
10155
10156	return (1);
10157}
10158
10159static void
10160dtrace_difo_hold(dtrace_difo_t *dp)
10161{
10162	int i;
10163
10164	ASSERT(MUTEX_HELD(&dtrace_lock));
10165
10166	dp->dtdo_refcnt++;
10167	ASSERT(dp->dtdo_refcnt != 0);
10168
10169	/*
10170	 * We need to check this DIF object for references to the variable
10171	 * DIF_VAR_VTIMESTAMP.
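 * The first such reference enables virtual time accounting; the
 * matching release in dtrace_difo_release() disables it again when the
 * last reference goes away.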
10172	 */
10173	for (i = 0; i < dp->dtdo_varlen; i++) {
10174		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10175
10176		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10177			continue;
10178
10179		if (dtrace_vtime_references++ == 0)
10180			dtrace_vtime_enable();
10181	}
10182}
10183
10184/*
10185 * This routine calculates the dynamic variable chunksize for a given DIF
10186 * object.  The calculation is not fool-proof, and can probably be tricked by
10187 * malicious DIF -- but it works for all compiler-generated DIF.  Because this
10188 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10189 * if a dynamic variable size exceeds the chunksize.
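 * The approach is a symbolic walk of the DIF text: we track tuple
 * pushes (pushtr/pushtv) and the most recent setx to bound the key and
 * value sizes of each dynamic variable store.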
10190 */
10191static void
10192dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10193{
10194	uint64_t sval = 0;
10195	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10196	const dif_instr_t *text = dp->dtdo_buf;
10197	uint_t pc, srd = 0;
10198	uint_t ttop = 0;
10199	size_t size, ksize;
10200	uint_t id, i;
10201
10202	for (pc = 0; pc < dp->dtdo_len; pc++) {
10203		dif_instr_t instr = text[pc];
10204		uint_t op = DIF_INSTR_OP(instr);
10205		uint_t rd = DIF_INSTR_RD(instr);
10206		uint_t r1 = DIF_INSTR_R1(instr);
10207		uint_t nkeys = 0;
10208		uchar_t scope = 0;
10209
10210		dtrace_key_t *key = tupregs;
10211
10212		switch (op) {
10213		case DIF_OP_SETX:
10214			sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10215			srd = rd;
10216			continue;
10217
10218		case DIF_OP_STTS:
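			/*
			 * Thread-local stores are implicitly keyed by
			 * thread and variable id -- hence the two
			 * zero-sized keys reserved past DIF_DTR_NREGS.
			 */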
10219			key = &tupregs[DIF_DTR_NREGS];
10220			key[0].dttk_size = 0;
10221			key[1].dttk_size = 0;
10222			nkeys = 2;
10223			scope = DIFV_SCOPE_THREAD;
10224			break;
10225
10226		case DIF_OP_STGAA:
10227		case DIF_OP_STTAA:
10228			nkeys = ttop;
10229
10230			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
10231				key[nkeys++].dttk_size = 0;
10232
10233			key[nkeys++].dttk_size = 0;
10234
10235			if (op == DIF_OP_STTAA) {
10236				scope = DIFV_SCOPE_THREAD;
10237			} else {
10238				scope = DIFV_SCOPE_GLOBAL;
10239			}
10240
10241			break;
10242
10243		case DIF_OP_PUSHTR:
10244			if (ttop == DIF_DTR_NREGS)
10245				return;
10246
10247			if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10248				/*
10249				 * If the register for the size of the "pushtr"
10250				 * is %r0 (or the value is 0) and the type is
10251				 * a string, we'll use the system-wide default
10252				 * string size.
10253				 */
10254				tupregs[ttop++].dttk_size =
10255				    dtrace_strsize_default;
10256			} else {
10257				if (srd == 0)
10258					return;
10259
10260				tupregs[ttop++].dttk_size = sval;
10261			}
10262
10263			break;
10264
10265		case DIF_OP_PUSHTV:
10266			if (ttop == DIF_DTR_NREGS)
10267				return;
10268
10269			tupregs[ttop++].dttk_size = 0;
10270			break;
10271
10272		case DIF_OP_FLUSHTS:
10273			ttop = 0;
10274			break;
10275
10276		case DIF_OP_POPTS:
10277			if (ttop != 0)
10278				ttop--;
10279			break;
10280		}
10281
10282		sval = 0;
10283		srd = 0;
10284
10285		if (nkeys == 0)
10286			continue;
10287
10288		/*
10289		 * We have a dynamic variable allocation; calculate its size.
10290		 */
10291		for (ksize = 0, i = 0; i < nkeys; i++)
10292			ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10293
10294		size = sizeof (dtrace_dynvar_t);
10295		size += sizeof (dtrace_key_t) * (nkeys - 1);
10296		size += ksize;
10297
10298		/*
10299		 * Now we need to determine the size of the stored data.
10300		 */
10301		id = DIF_INSTR_VAR(instr);
10302
10303		for (i = 0; i < dp->dtdo_varlen; i++) {
10304			dtrace_difv_t *v = &dp->dtdo_vartab[i];
10305
10306			if (v->dtdv_id == id && v->dtdv_scope == scope) {
10307				size += v->dtdv_type.dtdt_size;
10308				break;
10309			}
10310		}
10311
10312		if (i == dp->dtdo_varlen)
10313			return;
10314
10315		/*
10316		 * We have the size.  If this is larger than the chunk size
10317		 * for our dynamic variable state, reset the chunk size.
10318		 */
10319		size = P2ROUNDUP(size, sizeof (uint64_t));
10320
10321		if (size > vstate->dtvs_dynvars.dtds_chunksize)
10322			vstate->dtvs_dynvars.dtds_chunksize = size;
10323	}
10324}
10325
10326static void
10327dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10328{
10329	int i, oldsvars, osz, nsz, otlocals, ntlocals;
10330	uint_t id;
10331
10332	ASSERT(MUTEX_HELD(&dtrace_lock));
10333	ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10334
10335	for (i = 0; i < dp->dtdo_varlen; i++) {
10336		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10337		dtrace_statvar_t *svar, ***svarp = NULL;
10338		size_t dsize = 0;
10339		uint8_t scope = v->dtdv_scope;
10340		int *np = NULL;
10341
10342		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10343			continue;
10344
10345		id -= DIF_VAR_OTHER_UBASE;
10346
10347		switch (scope) {
10348		case DIFV_SCOPE_THREAD:
10349			while (id >= (otlocals = vstate->dtvs_ntlocals)) {
10350				dtrace_difv_t *tlocals;
10351
10352				if ((ntlocals = (otlocals << 1)) == 0)
10353					ntlocals = 1;
10354
10355				osz = otlocals * sizeof (dtrace_difv_t);
10356				nsz = ntlocals * sizeof (dtrace_difv_t);
10357
10358				tlocals = kmem_zalloc(nsz, KM_SLEEP);
10359
10360				if (osz != 0) {
10361					bcopy(vstate->dtvs_tlocals,
10362					    tlocals, osz);
10363					kmem_free(vstate->dtvs_tlocals, osz);
10364				}
10365
10366				vstate->dtvs_tlocals = tlocals;
10367				vstate->dtvs_ntlocals = ntlocals;
10368			}
10369
10370			vstate->dtvs_tlocals[id] = *v;
10371			continue;
10372
10373		case DIFV_SCOPE_LOCAL:
10374			np = &vstate->dtvs_nlocals;
10375			svarp = &vstate->dtvs_locals;
10376
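			/*
			 * Clause-local storage is allocated per CPU; a
			 * by-ref variable additionally reserves a
			 * uint64_t of bookkeeping alongside its data.
			 */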
10377			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10378				dsize = NCPU * (v->dtdv_type.dtdt_size +
10379				    sizeof (uint64_t));
10380			else
10381				dsize = NCPU * sizeof (uint64_t);
10382
10383			break;
10384
10385		case DIFV_SCOPE_GLOBAL:
10386			np = &vstate->dtvs_nglobals;
10387			svarp = &vstate->dtvs_globals;
10388
10389			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10390				dsize = v->dtdv_type.dtdt_size +
10391				    sizeof (uint64_t);
10392
10393			break;
10394
10395		default:
10396			ASSERT(0);
10397		}
10398
10399		while (id >= (oldsvars = *np)) {
10400			dtrace_statvar_t **statics;
10401			int newsvars, oldsize, newsize;
10402
10403			if ((newsvars = (oldsvars << 1)) == 0)
10404				newsvars = 1;
10405
10406			oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10407			newsize = newsvars * sizeof (dtrace_statvar_t *);
10408
10409			statics = kmem_zalloc(newsize, KM_SLEEP);
10410
10411			if (oldsize != 0) {
10412				bcopy(*svarp, statics, oldsize);
10413				kmem_free(*svarp, oldsize);
10414			}
10415
10416			*svarp = statics;
10417			*np = newsvars;
10418		}
10419
10420		if ((svar = (*svarp)[id]) == NULL) {
10421			svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10422			svar->dtsv_var = *v;
10423
10424			if ((svar->dtsv_size = dsize) != 0) {
10425				svar->dtsv_data = (uint64_t)(uintptr_t)
10426				    kmem_zalloc(dsize, KM_SLEEP);
10427			}
10428
10429			(*svarp)[id] = svar;
10430		}
10431
10432		svar->dtsv_refcnt++;
10433	}
10434
10435	dtrace_difo_chunksize(dp, vstate);
10436	dtrace_difo_hold(dp);
10437}
10438
10439static dtrace_difo_t *
10440dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10441{
10442	dtrace_difo_t *new;
10443	size_t sz;
10444
10445	ASSERT(dp->dtdo_buf != NULL);
10446	ASSERT(dp->dtdo_refcnt != 0);
10447
10448	new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10449
10450	ASSERT(dp->dtdo_buf != NULL);
10451	sz = dp->dtdo_len * sizeof (dif_instr_t);
10452	new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10453	bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10454	new->dtdo_len = dp->dtdo_len;
10455
10456	if (dp->dtdo_strtab != NULL) {
10457		ASSERT(dp->dtdo_strlen != 0);
10458		new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10459		bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10460		new->dtdo_strlen = dp->dtdo_strlen;
10461	}
10462
10463	if (dp->dtdo_inttab != NULL) {
10464		ASSERT(dp->dtdo_intlen != 0);
10465		sz = dp->dtdo_intlen * sizeof (uint64_t);
10466		new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10467		bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10468		new->dtdo_intlen = dp->dtdo_intlen;
10469	}
10470
10471	if (dp->dtdo_vartab != NULL) {
10472		ASSERT(dp->dtdo_varlen != 0);
10473		sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10474		new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10475		bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10476		new->dtdo_varlen = dp->dtdo_varlen;
10477	}
10478
10479	dtrace_difo_init(new, vstate);
10480	return (new);
10481}
10482
10483static void
10484dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10485{
10486	int i;
10487
10488	ASSERT(dp->dtdo_refcnt == 0);
10489
10490	for (i = 0; i < dp->dtdo_varlen; i++) {
10491		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10492		dtrace_statvar_t *svar, **svarp = NULL;
10493		uint_t id;
10494		uint8_t scope = v->dtdv_scope;
10495		int *np = NULL;
10496
10497		switch (scope) {
10498		case DIFV_SCOPE_THREAD:
10499			continue;
10500
10501		case DIFV_SCOPE_LOCAL:
10502			np = &vstate->dtvs_nlocals;
10503			svarp = vstate->dtvs_locals;
10504			break;
10505
10506		case DIFV_SCOPE_GLOBAL:
10507			np = &vstate->dtvs_nglobals;
10508			svarp = vstate->dtvs_globals;
10509			break;
10510
10511		default:
10512			ASSERT(0);
10513		}
10514
10515		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10516			continue;
10517
10518		id -= DIF_VAR_OTHER_UBASE;
10519		ASSERT(id < *np);
10520
10521		svar = svarp[id];
10522		ASSERT(svar != NULL);
10523		ASSERT(svar->dtsv_refcnt > 0);
10524
10525		if (--svar->dtsv_refcnt > 0)
10526			continue;
10527
10528		if (svar->dtsv_size != 0) {
10529			ASSERT(svar->dtsv_data != 0);
10530			kmem_free((void *)(uintptr_t)svar->dtsv_data,
10531			    svar->dtsv_size);
10532		}
10533
10534		kmem_free(svar, sizeof (dtrace_statvar_t));
10535		svarp[id] = NULL;
10536	}
10537
10538	if (dp->dtdo_buf != NULL)
10539		kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10540	if (dp->dtdo_inttab != NULL)
10541		kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10542	if (dp->dtdo_strtab != NULL)
10543		kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10544	if (dp->dtdo_vartab != NULL)
10545		kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10546
10547	kmem_free(dp, sizeof (dtrace_difo_t));
10548}
10549
10550static void
10551dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10552{
10553	int i;
10554
10555	ASSERT(MUTEX_HELD(&dtrace_lock));
10556	ASSERT(dp->dtdo_refcnt != 0);
10557
10558	for (i = 0; i < dp->dtdo_varlen; i++) {
10559		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10560
10561		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10562			continue;
10563
10564		ASSERT(dtrace_vtime_references > 0);
10565		if (--dtrace_vtime_references == 0)
10566			dtrace_vtime_disable();
10567	}
10568
10569	if (--dp->dtdo_refcnt == 0)
10570		dtrace_difo_destroy(dp, vstate);
10571}
10572
10573/*
10574 * DTrace Format Functions
10575 */
10576static uint16_t
10577dtrace_format_add(dtrace_state_t *state, char *str)
10578{
10579	char *fmt, **new;
10580	uint16_t ndx, len = strlen(str) + 1;
10581
10582	fmt = kmem_zalloc(len, KM_SLEEP);
10583	bcopy(str, fmt, len);
10584
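	/*
	 * Format IDs are one-based: zero is reserved to mean "no format",
	 * which is why we return ndx + 1 here and index dts_formats with
	 * format - 1 elsewhere.
	 */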
10585	for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10586		if (state->dts_formats[ndx] == NULL) {
10587			state->dts_formats[ndx] = fmt;
10588			return (ndx + 1);
10589		}
10590	}
10591
10592	if (state->dts_nformats == USHRT_MAX) {
10593		/*
10594		 * This is only likely if a denial-of-service attack is being
10595		 * attempted.  As such, it's okay to fail silently here.
10596		 */
10597		kmem_free(fmt, len);
10598		return (0);
10599	}
10600
10601	/*
10602	 * For simplicity, we always resize the formats array to be exactly the
10603	 * number of formats.
10604	 */
10605	ndx = state->dts_nformats++;
10606	new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10607
10608	if (state->dts_formats != NULL) {
10609		ASSERT(ndx != 0);
10610		bcopy(state->dts_formats, new, ndx * sizeof (char *));
10611		kmem_free(state->dts_formats, ndx * sizeof (char *));
10612	}
10613
10614	state->dts_formats = new;
10615	state->dts_formats[ndx] = fmt;
10616
10617	return (ndx + 1);
10618}
10619
10620static void
10621dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10622{
10623	char *fmt;
10624
10625	ASSERT(state->dts_formats != NULL);
10626	ASSERT(format <= state->dts_nformats);
10627	ASSERT(state->dts_formats[format - 1] != NULL);
10628
10629	fmt = state->dts_formats[format - 1];
10630	kmem_free(fmt, strlen(fmt) + 1);
10631	state->dts_formats[format - 1] = NULL;
10632}
10633
10634static void
10635dtrace_format_destroy(dtrace_state_t *state)
10636{
10637	int i;
10638
10639	if (state->dts_nformats == 0) {
10640		ASSERT(state->dts_formats == NULL);
10641		return;
10642	}
10643
10644	ASSERT(state->dts_formats != NULL);
10645
10646	for (i = 0; i < state->dts_nformats; i++) {
10647		char *fmt = state->dts_formats[i];
10648
10649		if (fmt == NULL)
10650			continue;
10651
10652		kmem_free(fmt, strlen(fmt) + 1);
10653	}
10654
10655	kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10656	state->dts_nformats = 0;
10657	state->dts_formats = NULL;
10658}
10659
10660/*
10661 * DTrace Predicate Functions
10662 */
10663static dtrace_predicate_t *
10664dtrace_predicate_create(dtrace_difo_t *dp)
10665{
10666	dtrace_predicate_t *pred;
10667
10668	ASSERT(MUTEX_HELD(&dtrace_lock));
10669	ASSERT(dp->dtdo_refcnt != 0);
10670
10671	pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10672	pred->dtp_difo = dp;
10673	pred->dtp_refcnt = 1;
10674
10675	if (!dtrace_difo_cacheable(dp))
10676		return (pred);
10677
10678	if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10679		/*
10680		 * This is only theoretically possible -- we have had 2^32
10681		 * cacheable predicates on this machine.  We cannot allow any
10682		 * more predicates to become cacheable:  as unlikely as it is,
10683		 * there may be a thread caching a (now stale) predicate cache
10684		 * ID. (N.B.: the temptation is being successfully resisted to
10685		 * have this cmn_err() "Holy shit -- we executed this code!")
10686		 */
10687		return (pred);
10688	}
10689
10690	pred->dtp_cacheid = dtrace_predcache_id++;
10691
10692	return (pred);
10693}
10694
10695static void
10696dtrace_predicate_hold(dtrace_predicate_t *pred)
10697{
10698	ASSERT(MUTEX_HELD(&dtrace_lock));
10699	ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10700	ASSERT(pred->dtp_refcnt > 0);
10701
10702	pred->dtp_refcnt++;
10703}
10704
10705static void
10706dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10707{
10708	dtrace_difo_t *dp = pred->dtp_difo;
10709
10710	ASSERT(MUTEX_HELD(&dtrace_lock));
10711	ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10712	ASSERT(pred->dtp_refcnt > 0);
10713
10714	if (--pred->dtp_refcnt == 0) {
10715		dtrace_difo_release(pred->dtp_difo, vstate);
10716		kmem_free(pred, sizeof (dtrace_predicate_t));
10717	}
10718}
10719
10720/*
10721 * DTrace Action Description Functions
10722 */
10723static dtrace_actdesc_t *
10724dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10725    uint64_t uarg, uint64_t arg)
10726{
10727	dtrace_actdesc_t *act;
10728
10729#if defined(sun)
10730	ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
10731	    arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
10732#endif
10733
10734	act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10735	act->dtad_kind = kind;
10736	act->dtad_ntuple = ntuple;
10737	act->dtad_uarg = uarg;
10738	act->dtad_arg = arg;
10739	act->dtad_refcnt = 1;
10740
10741	return (act);
10742}
10743
10744static void
10745dtrace_actdesc_hold(dtrace_actdesc_t *act)
10746{
10747	ASSERT(act->dtad_refcnt >= 1);
10748	act->dtad_refcnt++;
10749}
10750
10751static void
10752dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10753{
10754	dtrace_actkind_t kind = act->dtad_kind;
10755	dtrace_difo_t *dp;
10756
10757	ASSERT(act->dtad_refcnt >= 1);
10758
10759	if (--act->dtad_refcnt != 0)
10760		return;
10761
10762	if ((dp = act->dtad_difo) != NULL)
10763		dtrace_difo_release(dp, vstate);
10764
10765	if (DTRACEACT_ISPRINTFLIKE(kind)) {
10766		char *str = (char *)(uintptr_t)act->dtad_arg;
10767
10768#if defined(sun)
10769		ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10770		    (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10771#endif
10772
10773		if (str != NULL)
10774			kmem_free(str, strlen(str) + 1);
10775	}
10776
10777	kmem_free(act, sizeof (dtrace_actdesc_t));
10778}
10779
10780/*
10781 * DTrace ECB Functions
10782 */
10783static dtrace_ecb_t *
10784dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10785{
10786	dtrace_ecb_t *ecb;
10787	dtrace_epid_t epid;
10788
10789	ASSERT(MUTEX_HELD(&dtrace_lock));
10790
10791	ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10792	ecb->dte_predicate = NULL;
10793	ecb->dte_probe = probe;
10794
10795	/*
10796	 * The default size is the size of the default action: recording
10797	 * the header.
10798	 */
10799	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10800	ecb->dte_alignment = sizeof (dtrace_epid_t);
10801
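	/*
	 * EPIDs are one-based; epid - 1 indexes the dts_ecbs array below.
	 */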
10802	epid = state->dts_epid++;
10803
10804	if (epid - 1 >= state->dts_necbs) {
10805		dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10806		int necbs = state->dts_necbs << 1;
10807
10808		ASSERT(epid == state->dts_necbs + 1);
10809
10810		if (necbs == 0) {
10811			ASSERT(oecbs == NULL);
10812			necbs = 1;
10813		}
10814
10815		ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10816
10817		if (oecbs != NULL)
10818			bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10819
10820		dtrace_membar_producer();
10821		state->dts_ecbs = ecbs;
10822
10823		if (oecbs != NULL) {
10824			/*
10825			 * If this state is active, we must dtrace_sync()
10826			 * before we can free the old dts_ecbs array:  we're
10827			 * coming in hot, and there may be active ring
10828			 * buffer processing (which indexes into the dts_ecbs
10829			 * array) on another CPU.
10830			 */
10831			if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10832				dtrace_sync();
10833
10834			kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10835		}
10836
10837		dtrace_membar_producer();
10838		state->dts_necbs = necbs;
10839	}
10840
10841	ecb->dte_state = state;
10842
10843	ASSERT(state->dts_ecbs[epid - 1] == NULL);
10844	dtrace_membar_producer();
10845	state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10846
10847	return (ecb);
10848}
10849
10850static void
10851dtrace_ecb_enable(dtrace_ecb_t *ecb)
10852{
10853	dtrace_probe_t *probe = ecb->dte_probe;
10854
10855	ASSERT(MUTEX_HELD(&cpu_lock));
10856	ASSERT(MUTEX_HELD(&dtrace_lock));
10857	ASSERT(ecb->dte_next == NULL);
10858
10859	if (probe == NULL) {
10860		/*
10861		 * This is the NULL probe -- there's nothing to do.
10862		 */
10863		return;
10864	}
10865
10866	if (probe->dtpr_ecb == NULL) {
10867		dtrace_provider_t *prov = probe->dtpr_provider;
10868
10869		/*
10870		 * We're the first ECB on this probe.
10871		 */
10872		probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10873
10874		if (ecb->dte_predicate != NULL)
10875			probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10876
10877		prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10878		    probe->dtpr_id, probe->dtpr_arg);
10879	} else {
10880		/*
10881		 * This probe is already active.  Swing the last pointer to
10882		 * point to the new ECB, and issue a dtrace_sync() to assure
10883		 * that all CPUs have seen the change.
10884		 */
10885		ASSERT(probe->dtpr_ecb_last != NULL);
10886		probe->dtpr_ecb_last->dte_next = ecb;
10887		probe->dtpr_ecb_last = ecb;
10888		probe->dtpr_predcache = 0;
10889
10890		dtrace_sync();
10891	}
10892}
10893
10894static void
10895dtrace_ecb_resize(dtrace_ecb_t *ecb)
10896{
10897	dtrace_action_t *act;
10898	uint32_t curneeded = UINT32_MAX;
10899	uint32_t aggbase = UINT32_MAX;
10900
10901	/*
10902	 * If we record anything, we always record the dtrace_rechdr_t.  (And
10903	 * we always record it first.)
10904	 */
10905	ecb->dte_size = sizeof (dtrace_rechdr_t);
10906	ecb->dte_alignment = sizeof (dtrace_epid_t);
10907
10908	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10909		dtrace_recdesc_t *rec = &act->dta_rec;
10910		ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10911
10912		ecb->dte_alignment = MAX(ecb->dte_alignment,
10913		    rec->dtrd_alignment);
10914
10915		if (DTRACEACT_ISAGG(act->dta_kind)) {
10916			dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10917
10918			ASSERT(rec->dtrd_size != 0);
10919			ASSERT(agg->dtag_first != NULL);
10920			ASSERT(act->dta_prev->dta_intuple);
10921			ASSERT(aggbase != UINT32_MAX);
10922			ASSERT(curneeded != UINT32_MAX);
10923
10924			agg->dtag_base = aggbase;
10925
10926			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10927			rec->dtrd_offset = curneeded;
10928			curneeded += rec->dtrd_size;
10929			ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
10930
10931			aggbase = UINT32_MAX;
10932			curneeded = UINT32_MAX;
10933		} else if (act->dta_intuple) {
10934			if (curneeded == UINT32_MAX) {
10935				/*
10936				 * This is the first record in a tuple.  Align
10937				 * curneeded to be at offset 4 in an 8-byte
10938				 * aligned block.
10939				 */
10940				ASSERT(act->dta_prev == NULL ||
10941				    !act->dta_prev->dta_intuple);
10942				ASSERT3U(aggbase, ==, UINT32_MAX);
10943				curneeded = P2PHASEUP(ecb->dte_size,
10944				    sizeof (uint64_t), sizeof (dtrace_aggid_t));
10945
10946				aggbase = curneeded - sizeof (dtrace_aggid_t);
10947				ASSERT(IS_P2ALIGNED(aggbase,
10948				    sizeof (uint64_t)));
10949			}
10950			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10951			rec->dtrd_offset = curneeded;
10952			curneeded += rec->dtrd_size;
10953		} else {
10954			/* tuples must be followed by an aggregation */
10955			ASSERT(act->dta_prev == NULL ||
10956			    !act->dta_prev->dta_intuple);
10957
10958			ecb->dte_size = P2ROUNDUP(ecb->dte_size,
10959			    rec->dtrd_alignment);
10960			rec->dtrd_offset = ecb->dte_size;
10961			ecb->dte_size += rec->dtrd_size;
10962			ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
10963		}
10964	}
10965
10966	if ((act = ecb->dte_action) != NULL &&
10967	    !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
10968	    ecb->dte_size == sizeof (dtrace_rechdr_t)) {
10969		/*
10970		 * If the size is still sizeof (dtrace_rechdr_t), then all
10971		 * actions store no data; set the size to 0.
10972		 */
10973		ecb->dte_size = 0;
10974	}
10975
10976	ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
10977	ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
10978	ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
10979	    ecb->dte_needed);
10980}
10981
10982static dtrace_action_t *
10983dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10984{
10985	dtrace_aggregation_t *agg;
10986	size_t size = sizeof (uint64_t);
10987	int ntuple = desc->dtad_ntuple;
10988	dtrace_action_t *act;
10989	dtrace_recdesc_t *frec;
10990	dtrace_aggid_t aggid;
10991	dtrace_state_t *state = ecb->dte_state;
10992
10993	agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10994	agg->dtag_ecb = ecb;
10995
10996	ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10997
10998	switch (desc->dtad_kind) {
10999	case DTRACEAGG_MIN:
11000		agg->dtag_initial = INT64_MAX;
11001		agg->dtag_aggregate = dtrace_aggregate_min;
11002		break;
11003
11004	case DTRACEAGG_MAX:
11005		agg->dtag_initial = INT64_MIN;
11006		agg->dtag_aggregate = dtrace_aggregate_max;
11007		break;
11008
11009	case DTRACEAGG_COUNT:
11010		agg->dtag_aggregate = dtrace_aggregate_count;
11011		break;
11012
11013	case DTRACEAGG_QUANTIZE:
11014		agg->dtag_aggregate = dtrace_aggregate_quantize;
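		/*
		 * A power-of-two quantization needs 63 buckets for each
		 * sign plus one for zero: ((64 - 1) * 2 + 1) == 127
		 * 64-bit counters.
		 */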
11015		size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
11016		    sizeof (uint64_t);
11017		break;
11018
11019	case DTRACEAGG_LQUANTIZE: {
11020		uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11021		uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11022
11023		agg->dtag_initial = desc->dtad_arg;
11024		agg->dtag_aggregate = dtrace_aggregate_lquantize;
11025
11026		if (step == 0 || levels == 0)
11027			goto err;
11028
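		/*
		 * The lquantize() layout is one word holding the encoded
		 * arguments followed by levels + 2 buckets (one underflow,
		 * one overflow).
		 */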
11029		size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
11030		break;
11031	}
11032
11033	case DTRACEAGG_LLQUANTIZE: {
11034		uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11035		uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11036		uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11037		uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11038		int64_t v;
11039
11040		agg->dtag_initial = desc->dtad_arg;
11041		agg->dtag_aggregate = dtrace_aggregate_llquantize;
11042
11043		if (factor < 2 || low >= high || nsteps < factor)
11044			goto err;
11045
11046		/*
11047		 * Now check that the number of steps evenly divides a power
11048		 * of the factor.  (This assures both integer bucket size and
11049		 * linearity within each magnitude.)
11050		 */
11051		for (v = factor; v < nsteps; v *= factor)
11052			continue;
11053
11054		if ((v % nsteps) || (nsteps % factor))
11055			goto err;
11056
11057		size = (dtrace_aggregate_llquantize_bucket(factor,
11058		    low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
11059		break;
11060	}
11061
11062	case DTRACEAGG_AVG:
11063		agg->dtag_aggregate = dtrace_aggregate_avg;
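		/* avg() stores two values: a count and a running sum. */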
11064		size = sizeof (uint64_t) * 2;
11065		break;
11066
11067	case DTRACEAGG_STDDEV:
11068		agg->dtag_aggregate = dtrace_aggregate_stddev;
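		/*
		 * stddev() stores four values: a count, a running sum,
		 * and a 128-bit running sum of squares.
		 */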
11069		size = sizeof (uint64_t) * 4;
11070		break;
11071
11072	case DTRACEAGG_SUM:
11073		agg->dtag_aggregate = dtrace_aggregate_sum;
11074		break;
11075
11076	default:
11077		goto err;
11078	}
11079
11080	agg->dtag_action.dta_rec.dtrd_size = size;
11081
11082	if (ntuple == 0)
11083		goto err;
11084
11085	/*
11086	 * We must make sure that we have enough actions for the n-tuple.
11087	 */
11088	for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11089		if (DTRACEACT_ISAGG(act->dta_kind))
11090			break;
11091
11092		if (--ntuple == 0) {
11093			/*
11094			 * This is the action with which our n-tuple begins.
11095			 */
11096			agg->dtag_first = act;
11097			goto success;
11098		}
11099	}
11100
11101	/*
11102	 * This n-tuple is short by ntuple elements.  Return failure.
11103	 */
11104	ASSERT(ntuple != 0);
11105err:
11106	kmem_free(agg, sizeof (dtrace_aggregation_t));
11107	return (NULL);
11108
11109success:
11110	/*
11111	 * If the last action in the tuple has a size of zero, it's actually
11112	 * an expression argument for the aggregating action.
11113	 */
11114	ASSERT(ecb->dte_action_last != NULL);
11115	act = ecb->dte_action_last;
11116
11117	if (act->dta_kind == DTRACEACT_DIFEXPR) {
11118		ASSERT(act->dta_difo != NULL);
11119
11120		if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11121			agg->dtag_hasarg = 1;
11122	}
11123
11124	/*
11125	 * We need to allocate an id for this aggregation.
11126	 */
11127#if defined(sun)
11128	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11129	    VM_BESTFIT | VM_SLEEP);
11130#else
11131	aggid = alloc_unr(state->dts_aggid_arena);
11132#endif
11133
11134	if (aggid - 1 >= state->dts_naggregations) {
11135		dtrace_aggregation_t **oaggs = state->dts_aggregations;
11136		dtrace_aggregation_t **aggs;
11137		int naggs = state->dts_naggregations << 1;
11138		int onaggs = state->dts_naggregations;
11139
11140		ASSERT(aggid == state->dts_naggregations + 1);
11141
11142		if (naggs == 0) {
11143			ASSERT(oaggs == NULL);
11144			naggs = 1;
11145		}
11146
11147		aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11148
11149		if (oaggs != NULL) {
11150			bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11151			kmem_free(oaggs, onaggs * sizeof (*aggs));
11152		}
11153
11154		state->dts_aggregations = aggs;
11155		state->dts_naggregations = naggs;
11156	}
11157
11158	ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11159	state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11160
11161	frec = &agg->dtag_first->dta_rec;
11162	if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11163		frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11164
11165	for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11166		ASSERT(!act->dta_intuple);
11167		act->dta_intuple = 1;
11168	}
11169
11170	return (&agg->dtag_action);
11171}
11172
11173static void
11174dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11175{
11176	dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11177	dtrace_state_t *state = ecb->dte_state;
11178	dtrace_aggid_t aggid = agg->dtag_id;
11179
11180	ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11181#if defined(sun)
11182	vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11183#else
11184	free_unr(state->dts_aggid_arena, aggid);
11185#endif
11186
11187	ASSERT(state->dts_aggregations[aggid - 1] == agg);
11188	state->dts_aggregations[aggid - 1] = NULL;
11189
11190	kmem_free(agg, sizeof (dtrace_aggregation_t));
11191}
11192
11193static int
11194dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11195{
11196	dtrace_action_t *action, *last;
11197	dtrace_difo_t *dp = desc->dtad_difo;
11198	uint32_t size = 0, align = sizeof (uint8_t), mask;
11199	uint16_t format = 0;
11200	dtrace_recdesc_t *rec;
11201	dtrace_state_t *state = ecb->dte_state;
11202	dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
11203	uint64_t arg = desc->dtad_arg;
11204
11205	ASSERT(MUTEX_HELD(&dtrace_lock));
11206	ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11207
11208	if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11209		/*
11210		 * If this is an aggregating action, there must be neither
11211		 * a speculate nor a commit on the action chain.
11212		 */
11213		dtrace_action_t *act;
11214
11215		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11216			if (act->dta_kind == DTRACEACT_COMMIT)
11217				return (EINVAL);
11218
11219			if (act->dta_kind == DTRACEACT_SPECULATE)
11220				return (EINVAL);
11221		}
11222
11223		action = dtrace_ecb_aggregation_create(ecb, desc);
11224
11225		if (action == NULL)
11226			return (EINVAL);
11227	} else {
11228		if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11229		    (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11230		    dp != NULL && dp->dtdo_destructive)) {
11231			state->dts_destructive = 1;
11232		}
11233
11234		switch (desc->dtad_kind) {
11235		case DTRACEACT_PRINTF:
11236		case DTRACEACT_PRINTA:
11237		case DTRACEACT_SYSTEM:
11238		case DTRACEACT_FREOPEN:
11239		case DTRACEACT_DIFEXPR:
11240			/*
11241			 * We know that our arg is a string -- turn it into a
11242			 * format.
11243			 */
11244			if (arg == 0) {
11245				ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11246				    desc->dtad_kind == DTRACEACT_DIFEXPR);
11247				format = 0;
11248			} else {
11249				ASSERT(arg != 0);
11250#if defined(sun)
11251				ASSERT(arg > KERNELBASE);
11252#endif
11253				format = dtrace_format_add(state,
11254				    (char *)(uintptr_t)arg);
11255			}
11256
11257			/*FALLTHROUGH*/
11258		case DTRACEACT_LIBACT:
11259		case DTRACEACT_TRACEMEM:
11260		case DTRACEACT_TRACEMEM_DYNSIZE:
11261			if (dp == NULL)
11262				return (EINVAL);
11263
11264			if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11265				break;
11266
11267			if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11268				if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11269					return (EINVAL);
11270
11271				size = opt[DTRACEOPT_STRSIZE];
11272			}
11273
11274			break;
11275
11276		case DTRACEACT_STACK:
11277			if ((nframes = arg) == 0) {
11278				nframes = opt[DTRACEOPT_STACKFRAMES];
11279				ASSERT(nframes > 0);
11280				arg = nframes;
11281			}
11282
11283			size = nframes * sizeof (pc_t);
11284			break;
11285
11286		case DTRACEACT_JSTACK:
11287			if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11288				strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11289
11290			if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11291				nframes = opt[DTRACEOPT_JSTACKFRAMES];
11292
11293			arg = DTRACE_USTACK_ARG(nframes, strsize);
11294
11295			/*FALLTHROUGH*/
11296		case DTRACEACT_USTACK:
11297			if (desc->dtad_kind != DTRACEACT_JSTACK &&
11298			    (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11299				strsize = DTRACE_USTACK_STRSIZE(arg);
11300				nframes = opt[DTRACEOPT_USTACKFRAMES];
11301				ASSERT(nframes > 0);
11302				arg = DTRACE_USTACK_ARG(nframes, strsize);
11303			}
11304
11305			/*
11306			 * Save a slot for the pid.
11307			 */
11308			size = (nframes + 1) * sizeof (uint64_t);
11309			size += DTRACE_USTACK_STRSIZE(arg);
11310			size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11311
11312			break;
11313
11314		case DTRACEACT_SYM:
11315		case DTRACEACT_MOD:
11316			if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11317			    sizeof (uint64_t)) ||
11318			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11319				return (EINVAL);
11320			break;
11321
11322		case DTRACEACT_USYM:
11323		case DTRACEACT_UMOD:
11324		case DTRACEACT_UADDR:
11325			if (dp == NULL ||
11326			    (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11327			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11328				return (EINVAL);
11329
11330			/*
11331			 * We have a slot for the pid, plus a slot for the
11332			 * argument.  To keep things simple (aligned with
11333			 * bitness-neutral sizing), we store each as a 64-bit
11334			 * quantity.
11335			 */
11336			size = 2 * sizeof (uint64_t);
11337			break;
11338
11339		case DTRACEACT_STOP:
11340		case DTRACEACT_BREAKPOINT:
11341		case DTRACEACT_PANIC:
11342			break;
11343
11344		case DTRACEACT_CHILL:
11345		case DTRACEACT_DISCARD:
11346		case DTRACEACT_RAISE:
11347			if (dp == NULL)
11348				return (EINVAL);
11349			break;
11350
11351		case DTRACEACT_EXIT:
11352			if (dp == NULL ||
11353			    (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11354			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11355				return (EINVAL);
11356			break;
11357
11358		case DTRACEACT_SPECULATE:
11359			if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11360				return (EINVAL);
11361
11362			if (dp == NULL)
11363				return (EINVAL);
11364
11365			state->dts_speculates = 1;
11366			break;
11367
11368		case DTRACEACT_PRINTM:
11369		case DTRACEACT_PRINTT:
11370			if (dp == NULL)
11371				return (EINVAL);
11372
11373			size = dp->dtdo_rtype.dtdt_size;
11374			break;
11375
11376		case DTRACEACT_COMMIT: {
11377			dtrace_action_t *act = ecb->dte_action;
11378
11379			for (; act != NULL; act = act->dta_next) {
11380				if (act->dta_kind == DTRACEACT_COMMIT)
11381					return (EINVAL);
11382			}
11383
11384			if (dp == NULL)
11385				return (EINVAL);
11386			break;
11387		}
11388
11389		default:
11390			return (EINVAL);
11391		}
11392
11393		if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11394			/*
11395			 * If this is a data-storing action or a speculate,
11396			 * we must be sure that there isn't a commit on the
11397			 * action chain.
11398			 */
11399			dtrace_action_t *act = ecb->dte_action;
11400
11401			for (; act != NULL; act = act->dta_next) {
11402				if (act->dta_kind == DTRACEACT_COMMIT)
11403					return (EINVAL);
11404			}
11405		}
11406
11407		action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11408		action->dta_rec.dtrd_size = size;
11409	}
11410
11411	action->dta_refcnt = 1;
11412	rec = &action->dta_rec;
11413	size = rec->dtrd_size;
11414
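	/*
	 * Infer the record alignment from its size: the largest power of
	 * two (up to sizeof (uint64_t)) that evenly divides the size.
	 */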
11415	for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11416		if (!(size & mask)) {
11417			align = mask + 1;
11418			break;
11419		}
11420	}
11421
11422	action->dta_kind = desc->dtad_kind;
11423
11424	if ((action->dta_difo = dp) != NULL)
11425		dtrace_difo_hold(dp);
11426
11427	rec->dtrd_action = action->dta_kind;
11428	rec->dtrd_arg = arg;
11429	rec->dtrd_uarg = desc->dtad_uarg;
11430	rec->dtrd_alignment = (uint16_t)align;
11431	rec->dtrd_format = format;
11432
11433	if ((last = ecb->dte_action_last) != NULL) {
11434		ASSERT(ecb->dte_action != NULL);
11435		action->dta_prev = last;
11436		last->dta_next = action;
11437	} else {
11438		ASSERT(ecb->dte_action == NULL);
11439		ecb->dte_action = action;
11440	}
11441
11442	ecb->dte_action_last = action;
11443
11444	return (0);
11445}
11446
11447static void
11448dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11449{
11450	dtrace_action_t *act = ecb->dte_action, *next;
11451	dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11452	dtrace_difo_t *dp;
11453	uint16_t format;
11454
11455	if (act != NULL && act->dta_refcnt > 1) {
11456		ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11457		act->dta_refcnt--;
11458	} else {
11459		for (; act != NULL; act = next) {
11460			next = act->dta_next;
11461			ASSERT(next != NULL || act == ecb->dte_action_last);
11462			ASSERT(act->dta_refcnt == 1);
11463
11464			if ((format = act->dta_rec.dtrd_format) != 0)
11465				dtrace_format_remove(ecb->dte_state, format);
11466
11467			if ((dp = act->dta_difo) != NULL)
11468				dtrace_difo_release(dp, vstate);
11469
11470			if (DTRACEACT_ISAGG(act->dta_kind)) {
11471				dtrace_ecb_aggregation_destroy(ecb, act);
11472			} else {
11473				kmem_free(act, sizeof (dtrace_action_t));
11474			}
11475		}
11476	}
11477
11478	ecb->dte_action = NULL;
11479	ecb->dte_action_last = NULL;
11480	ecb->dte_size = 0;
11481}
11482
11483static void
11484dtrace_ecb_disable(dtrace_ecb_t *ecb)
11485{
11486	/*
11487	 * We disable the ECB by removing it from its probe.
11488	 */
11489	dtrace_ecb_t *pecb, *prev = NULL;
11490	dtrace_probe_t *probe = ecb->dte_probe;
11491
11492	ASSERT(MUTEX_HELD(&dtrace_lock));
11493
11494	if (probe == NULL) {
11495		/*
11496		 * This is the NULL probe; there is nothing to disable.
11497		 */
11498		return;
11499	}
11500
11501	for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11502		if (pecb == ecb)
11503			break;
11504		prev = pecb;
11505	}
11506
11507	ASSERT(pecb != NULL);
11508
11509	if (prev == NULL) {
11510		probe->dtpr_ecb = ecb->dte_next;
11511	} else {
11512		prev->dte_next = ecb->dte_next;
11513	}
11514
11515	if (ecb == probe->dtpr_ecb_last) {
11516		ASSERT(ecb->dte_next == NULL);
11517		probe->dtpr_ecb_last = prev;
11518	}
11519
11520	/*
11521	 * The ECB has been disconnected from the probe; now sync to assure
11522	 * that all CPUs have seen the change before returning.
11523	 */
11524	dtrace_sync();
11525
11526	if (probe->dtpr_ecb == NULL) {
11527		/*
11528		 * That was the last ECB on the probe; clear the predicate
11529		 * cache ID for the probe, disable it and sync one more time
11530		 * to assure that we'll never hit it again.
11531		 */
11532		dtrace_provider_t *prov = probe->dtpr_provider;
11533
11534		ASSERT(ecb->dte_next == NULL);
11535		ASSERT(probe->dtpr_ecb_last == NULL);
11536		probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11537		prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11538		    probe->dtpr_id, probe->dtpr_arg);
11539		dtrace_sync();
11540	} else {
11541		/*
11542		 * There is at least one ECB remaining on the probe.  If there
11543		 * is _exactly_ one, set the probe's predicate cache ID to be
11544		 * the predicate cache ID of the remaining ECB.
11545		 */
11546		ASSERT(probe->dtpr_ecb_last != NULL);
11547		ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11548
11549		if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11550			dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11551
11552			ASSERT(probe->dtpr_ecb->dte_next == NULL);
11553
11554			if (p != NULL)
11555				probe->dtpr_predcache = p->dtp_cacheid;
11556		}
11557
11558		ecb->dte_next = NULL;
11559	}
11560}
11561
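/*
 * Destroy an ECB that has already been disconnected from its probe:
 * release its predicate and actions, clear its slot in the state's ECB
 * array, and free it.
 */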
11562static void
11563dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11564{
11565	dtrace_state_t *state = ecb->dte_state;
11566	dtrace_vstate_t *vstate = &state->dts_vstate;
11567	dtrace_predicate_t *pred;
11568	dtrace_epid_t epid = ecb->dte_epid;
11569
11570	ASSERT(MUTEX_HELD(&dtrace_lock));
11571	ASSERT(ecb->dte_next == NULL);
11572	ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11573
11574	if ((pred = ecb->dte_predicate) != NULL)
11575		dtrace_predicate_release(pred, vstate);
11576
11577	dtrace_ecb_action_remove(ecb);
11578
11579	ASSERT(state->dts_ecbs[epid - 1] == ecb);
11580	state->dts_ecbs[epid - 1] = NULL;
11581
11582	kmem_free(ecb, sizeof (dtrace_ecb_t));
11583}
11584
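/*
 * Create an ECB on the given probe (which may be NULL when priming an
 * enabling) from the current ECB description in the enabling.  Implicit
 * predicate conditions are set according to the provider's privilege
 * flags, and the action list is shared with the ECB creation cache when
 * possible.
 */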
11585static dtrace_ecb_t *
11586dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11587    dtrace_enabling_t *enab)
11588{
11589	dtrace_ecb_t *ecb;
11590	dtrace_predicate_t *pred;
11591	dtrace_actdesc_t *act;
11592	dtrace_provider_t *prov;
11593	dtrace_ecbdesc_t *desc = enab->dten_current;
11594
11595	ASSERT(MUTEX_HELD(&dtrace_lock));
11596	ASSERT(state != NULL);
11597
11598	ecb = dtrace_ecb_add(state, probe);
11599	ecb->dte_uarg = desc->dted_uarg;
11600
11601	if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11602		dtrace_predicate_hold(pred);
11603		ecb->dte_predicate = pred;
11604	}
11605
11606	if (probe != NULL) {
11607		/*
11608		 * If the provider shows more leg than the consumer is old
11609		 * enough to see, we need to enable the appropriate implicit
11610		 * predicate bits to prevent the ecb from activating at
11611		 * revealing times.
11612		 *
11613		 * Providers specifying DTRACE_PRIV_USER at register time
11614		 * are stating that they need the /proc-style privilege
11615		 * model to be enforced, and this is what DTRACE_COND_OWNER
11616		 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11617		 */
11618		prov = probe->dtpr_provider;
11619		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11620		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11621			ecb->dte_cond |= DTRACE_COND_OWNER;
11622
11623		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11624		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11625			ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11626
11627		/*
11628		 * If the provider shows us kernel innards and the user
11629		 * is lacking sufficient privilege, enable the
11630		 * DTRACE_COND_USERMODE implicit predicate.
11631		 */
11632		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11633		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11634			ecb->dte_cond |= DTRACE_COND_USERMODE;
11635	}
11636
11637	if (dtrace_ecb_create_cache != NULL) {
11638		/*
11639		 * If we have a cached ecb, we'll use its action list instead
11640		 * of creating our own (saving both time and space).
11641		 */
11642		dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11643		dtrace_action_t *act = cached->dte_action;
11644
11645		if (act != NULL) {
11646			ASSERT(act->dta_refcnt > 0);
11647			act->dta_refcnt++;
11648			ecb->dte_action = act;
11649			ecb->dte_action_last = cached->dte_action_last;
11650			ecb->dte_needed = cached->dte_needed;
11651			ecb->dte_size = cached->dte_size;
11652			ecb->dte_alignment = cached->dte_alignment;
11653		}
11654
11655		return (ecb);
11656	}
11657
11658	for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11659		if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11660			dtrace_ecb_destroy(ecb);
11661			return (NULL);
11662		}
11663	}
11664
11665	dtrace_ecb_resize(ecb);
11666
11667	return (dtrace_ecb_create_cache = ecb);
11668}
11669
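/*
 * Probe-matching callback used when enabling:  create and enable an ECB
 * on each matched probe, skipping probes created in a generation that
 * this enabling has already processed.
 */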
11670static int
11671dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
11672{
11673	dtrace_ecb_t *ecb;
11674	dtrace_enabling_t *enab = arg;
11675	dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11676
11677	ASSERT(state != NULL);
11678
11679	if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
11680		/*
11681		 * This probe was created in a generation for which this
11682		 * enabling has previously created ECBs; we don't want to
11683		 * enable it again, so just kick out.
11684		 */
11685		return (DTRACE_MATCH_NEXT);
11686	}
11687
11688	if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11689		return (DTRACE_MATCH_DONE);
11690
11691	dtrace_ecb_enable(ecb);
11692	return (DTRACE_MATCH_NEXT);
11693}
11694
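/*
 * Look up the ECB corresponding to an enabled probe ID, or return NULL if
 * the EPID is out of range or no longer has an ECB.
 */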
11695static dtrace_ecb_t *
11696dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11697{
11698	dtrace_ecb_t *ecb;
11699
11700	ASSERT(MUTEX_HELD(&dtrace_lock));
11701
11702	if (id == 0 || id > state->dts_necbs)
11703		return (NULL);
11704
11705	ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11706	ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11707
11708	return (state->dts_ecbs[id - 1]);
11709}
11710
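/*
 * Look up the aggregation corresponding to an aggregation ID, or return
 * NULL if the ID is invalid.
 */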
11711static dtrace_aggregation_t *
11712dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11713{
11714	dtrace_aggregation_t *agg;
11715
11716	ASSERT(MUTEX_HELD(&dtrace_lock));
11717
11718	if (id == 0 || id > state->dts_naggregations)
11719		return (NULL);
11720
11721	ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11722	ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11723	    agg->dtag_id == id);
11724
11725	return (state->dts_aggregations[id - 1]);
11726}
11727
11728/*
11729 * DTrace Buffer Functions
11730 *
11731 * The following functions manipulate DTrace buffers.  Most of these functions
11732 * are called in the context of establishing or processing consumer state;
11733 * exceptions are explicitly noted.
11734 */
11735
11736/*
11737 * Note:  called from cross call context.  This function switches the two
11738 * buffers on a given CPU.  The atomicity of this operation is assured by
11739 * disabling interrupts while the actual switch takes place; the disabling of
11740 * interrupts serializes the execution with any execution of dtrace_probe() on
11741 * the same CPU.
11742 */
11743static void
11744dtrace_buffer_switch(dtrace_buffer_t *buf)
11745{
11746	caddr_t tomax = buf->dtb_tomax;
11747	caddr_t xamot = buf->dtb_xamot;
11748	dtrace_icookie_t cookie;
11749	hrtime_t now;
11750
11751	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11752	ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11753
11754	cookie = dtrace_interrupt_disable();
11755	now = dtrace_gethrtime();
11756	buf->dtb_tomax = xamot;
11757	buf->dtb_xamot = tomax;
11758	buf->dtb_xamot_drops = buf->dtb_drops;
11759	buf->dtb_xamot_offset = buf->dtb_offset;
11760	buf->dtb_xamot_errors = buf->dtb_errors;
11761	buf->dtb_xamot_flags = buf->dtb_flags;
11762	buf->dtb_offset = 0;
11763	buf->dtb_drops = 0;
11764	buf->dtb_errors = 0;
11765	buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11766	buf->dtb_interval = now - buf->dtb_switched;
11767	buf->dtb_switched = now;
11768	dtrace_interrupt_enable(cookie);
11769}
11770
11771/*
11772 * Note:  called from cross call context.  This function activates a buffer
11773 * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
11774 * is guaranteed by the disabling of interrupts.
11775 */
11776static void
11777dtrace_buffer_activate(dtrace_state_t *state)
11778{
11779	dtrace_buffer_t *buf;
11780	dtrace_icookie_t cookie = dtrace_interrupt_disable();
11781
11782	buf = &state->dts_buffer[curcpu];
11783
11784	if (buf->dtb_tomax != NULL) {
11785		/*
11786		 * We might like to assert that the buffer is marked inactive,
11787		 * but this isn't necessarily true:  the buffer for the CPU
11788		 * that processes the BEGIN probe is activated manually.  In
11789		 * this case, we take the (harmless) action of re-clearing the
11790		 * INACTIVE bit.
11791		 */
11792		buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11793	}
11794
11795	dtrace_interrupt_enable(cookie);
11796}
11797
11798static int
11799dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
11800    processorid_t cpu, int *factor)
11801{
11802#if defined(sun)
11803	cpu_t *cp;
11804#endif
11805	dtrace_buffer_t *buf;
11806	int allocated = 0, desired = 0;
11807
11808#if defined(sun)
11809	ASSERT(MUTEX_HELD(&cpu_lock));
11810	ASSERT(MUTEX_HELD(&dtrace_lock));
11811
11812	*factor = 1;
11813
11814	if (size > dtrace_nonroot_maxsize &&
11815	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11816		return (EFBIG);
11817
11818	cp = cpu_list;
11819
11820	do {
11821		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11822			continue;
11823
11824		buf = &bufs[cp->cpu_id];
11825
11826		/*
11827		 * If there is already a buffer allocated for this CPU, it is
11828		 * only possible that this is a DR event; the sizes must match.
11829		 */
11830		if (buf->dtb_tomax != NULL) {
11831			ASSERT(buf->dtb_size == size);
11832			continue;
11833		}
11834
11835		ASSERT(buf->dtb_xamot == NULL);
11836
11837		if ((buf->dtb_tomax = kmem_zalloc(size,
11838		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11839			goto err;
11840
11841		buf->dtb_size = size;
11842		buf->dtb_flags = flags;
11843		buf->dtb_offset = 0;
11844		buf->dtb_drops = 0;
11845
11846		if (flags & DTRACEBUF_NOSWITCH)
11847			continue;
11848
11849		if ((buf->dtb_xamot = kmem_zalloc(size,
11850		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11851			goto err;
11852	} while ((cp = cp->cpu_next) != cpu_list);
11853
11854	return (0);
11855
11856err:
11857	cp = cpu_list;
11858
11859	do {
11860		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11861			continue;
11862
11863		buf = &bufs[cp->cpu_id];
11864		desired += 2;
11865
11866		if (buf->dtb_xamot != NULL) {
11867			ASSERT(buf->dtb_tomax != NULL);
11868			ASSERT(buf->dtb_size == size);
11869			kmem_free(buf->dtb_xamot, size);
11870			allocated++;
11871		}
11872
11873		if (buf->dtb_tomax != NULL) {
11874			ASSERT(buf->dtb_size == size);
11875			kmem_free(buf->dtb_tomax, size);
11876			allocated++;
11877		}
11878
11879		buf->dtb_tomax = NULL;
11880		buf->dtb_xamot = NULL;
11881		buf->dtb_size = 0;
11882	} while ((cp = cp->cpu_next) != cpu_list);
11883#else
11884	int i;
11885
11886	*factor = 1;
11887#if defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
11888	/*
11889	 * FreeBSD isn't good at limiting the amount of memory we
11890	 * ask to malloc, so let's place a limit here before trying
11891	 * to do something that might well end in tears at bedtime.
11892	 */
11893	if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
11894		return (ENOMEM);
11895#endif
11896
11897	ASSERT(MUTEX_HELD(&dtrace_lock));
11898	CPU_FOREACH(i) {
11899		if (cpu != DTRACE_CPUALL && cpu != i)
11900			continue;
11901
11902		buf = &bufs[i];
11903
11904		/*
11905		 * If there is already a buffer allocated for this CPU, it
11906		 * is only possible that this is a DR event.  In this case,
11907		 * the buffer size must match our specified size.
11908		 */
11909		if (buf->dtb_tomax != NULL) {
11910			ASSERT(buf->dtb_size == size);
11911			continue;
11912		}
11913
11914		ASSERT(buf->dtb_xamot == NULL);
11915
11916		if ((buf->dtb_tomax = kmem_zalloc(size,
11917		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11918			goto err;
11919
11920		buf->dtb_size = size;
11921		buf->dtb_flags = flags;
11922		buf->dtb_offset = 0;
11923		buf->dtb_drops = 0;
11924
11925		if (flags & DTRACEBUF_NOSWITCH)
11926			continue;
11927
11928		if ((buf->dtb_xamot = kmem_zalloc(size,
11929		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11930			goto err;
11931	}
11932
11933	return (0);
11934
11935err:
11936	/*
11937	 * Error allocating memory, so free the buffers that were
11938	 * allocated before the failed allocation.
11939	 */
11940	CPU_FOREACH(i) {
11941		if (cpu != DTRACE_CPUALL && cpu != i)
11942			continue;
11943
11944		buf = &bufs[i];
11945		desired += 2;
11946
11947		if (buf->dtb_xamot != NULL) {
11948			ASSERT(buf->dtb_tomax != NULL);
11949			ASSERT(buf->dtb_size == size);
11950			kmem_free(buf->dtb_xamot, size);
11951			allocated++;
11952		}
11953
11954		if (buf->dtb_tomax != NULL) {
11955			ASSERT(buf->dtb_size == size);
11956			kmem_free(buf->dtb_tomax, size);
11957			allocated++;
11958		}
11959
11960		buf->dtb_tomax = NULL;
11961		buf->dtb_xamot = NULL;
11962		buf->dtb_size = 0;
11964	}
11965#endif
11966	*factor = desired / (allocated > 0 ? allocated : 1);
11967
11968	return (ENOMEM);
11969}
11970
11971/*
11972 * Note:  called from probe context.  This function just increments the drop
11973 * count on a buffer.  It has been made a function to allow for the
11974 * possibility of understanding the source of mysterious drop counts.  (A
11975 * problem for which one may be particularly disappointed that DTrace cannot
11976 * be used to understand DTrace.)
11977 */
11978static void
11979dtrace_buffer_drop(dtrace_buffer_t *buf)
11980{
11981	buf->dtb_drops++;
11982}
11983
11984/*
11985 * Note:  called from probe context.  This function is called to reserve space
11986 * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
11987 * mstate.  Returns the new offset in the buffer, or a negative value if an
11988 * error has occurred.
11989 */
11990static intptr_t
11991dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11992    dtrace_state_t *state, dtrace_mstate_t *mstate)
11993{
11994	intptr_t offs = buf->dtb_offset, soffs;
11995	intptr_t woffs;
11996	caddr_t tomax;
11997	size_t total;
11998
11999	if (buf->dtb_flags & DTRACEBUF_INACTIVE)
12000		return (-1);
12001
12002	if ((tomax = buf->dtb_tomax) == NULL) {
12003		dtrace_buffer_drop(buf);
12004		return (-1);
12005	}
12006
12007	if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12008		while (offs & (align - 1)) {
12009			/*
12010			 * Assert that our alignment is off by a number which
12011			 * is itself sizeof (uint32_t) aligned.
12012			 */
12013			ASSERT(!((align - (offs & (align - 1))) &
12014			    (sizeof (uint32_t) - 1)));
12015			DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12016			offs += sizeof (uint32_t);
12017		}
12018
12019		if ((soffs = offs + needed) > buf->dtb_size) {
12020			dtrace_buffer_drop(buf);
12021			return (-1);
12022		}
12023
12024		if (mstate == NULL)
12025			return (offs);
12026
12027		mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12028		mstate->dtms_scratch_size = buf->dtb_size - soffs;
12029		mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12030
12031		return (offs);
12032	}
12033
12034	if (buf->dtb_flags & DTRACEBUF_FILL) {
12035		if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12036		    (buf->dtb_flags & DTRACEBUF_FULL))
12037			return (-1);
12038		goto out;
12039	}
12040
12041	total = needed + (offs & (align - 1));
12042
12043	/*
12044	 * For a ring buffer, life is quite a bit more complicated.  Before
12045	 * we can store any padding, we need to adjust our wrapping offset.
12046	 * (If we've never before wrapped or we're not about to, no adjustment
12047	 * is required.)
12048	 */
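	/*
	 * As a sketch of the common wrap case below:  with a 64-byte ring
	 * buffer, offs = 56 and total = 16, the record cannot fit at the
	 * end, so we zero bytes 56..63, reset offs to 0, set
	 * DTRACEBUF_WRAPPED and then advance the wrapped offset past however
	 * many old records the new 16 bytes will overwrite.
	 */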
12049	if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12050	    offs + total > buf->dtb_size) {
12051		woffs = buf->dtb_xamot_offset;
12052
12053		if (offs + total > buf->dtb_size) {
12054			/*
12055			 * We can't fit in the end of the buffer.  First, a
12056			 * sanity check that we can fit in the buffer at all.
12057			 */
12058			if (total > buf->dtb_size) {
12059				dtrace_buffer_drop(buf);
12060				return (-1);
12061			}
12062
12063			/*
12064			 * We're going to be storing at the top of the buffer,
12065			 * so now we need to deal with the wrapped offset.  We
12066			 * only reset our wrapped offset to 0 if it is
12067			 * currently greater than the current offset.  If it
12068			 * is less than the current offset, it is because a
12069			 * previous allocation induced a wrap -- but the
12070			 * allocation didn't subsequently take the space due
12071			 * to an error or false predicate evaluation.  In this
12072			 * case, we'll just leave the wrapped offset alone: if
12073			 * the wrapped offset hasn't been advanced far enough
12074			 * for this allocation, it will be adjusted in the
12075			 * lower loop.
12076			 */
12077			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12078				if (woffs >= offs)
12079					woffs = 0;
12080			} else {
12081				woffs = 0;
12082			}
12083
12084			/*
12085			 * Now we know that we're going to be storing to the
12086			 * top of the buffer and that there is room for us
12087			 * there.  We need to clear the buffer from the current
12088			 * offset to the end (there may be old gunk there).
12089			 */
12090			while (offs < buf->dtb_size)
12091				tomax[offs++] = 0;
12092
12093			/*
12094			 * We need to set our offset to zero.  And because we
12095			 * are wrapping, we need to set the bit indicating as
12096			 * much.  We can also adjust our needed space back
12097			 * down to the space required by the ECB -- we know
12098			 * that the top of the buffer is aligned.
12099			 */
12100			offs = 0;
12101			total = needed;
12102			buf->dtb_flags |= DTRACEBUF_WRAPPED;
12103		} else {
12104			/*
12105			 * There is room for us in the buffer, so we simply
12106			 * need to check the wrapped offset.
12107			 */
12108			if (woffs < offs) {
12109				/*
12110				 * The wrapped offset is less than the offset.
12111				 * This can happen if we allocated buffer space
12112				 * that induced a wrap, but then we didn't
12113				 * subsequently take the space due to an error
12114				 * or false predicate evaluation.  This is
12115				 * okay; we know that _this_ allocation isn't
12116				 * going to induce a wrap.  We still can't
12117				 * reset the wrapped offset to be zero,
12118				 * however: the space may have been trashed in
12119				 * the previous failed probe attempt.  But at
12120				 * least the wrapped offset doesn't need to
12121				 * be adjusted at all...
12122				 */
12123				goto out;
12124			}
12125		}
12126
12127		while (offs + total > woffs) {
12128			dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12129			size_t size;
12130
12131			if (epid == DTRACE_EPIDNONE) {
12132				size = sizeof (uint32_t);
12133			} else {
12134				ASSERT3U(epid, <=, state->dts_necbs);
12135				ASSERT(state->dts_ecbs[epid - 1] != NULL);
12136
12137				size = state->dts_ecbs[epid - 1]->dte_size;
12138			}
12139
12140			ASSERT(woffs + size <= buf->dtb_size);
12141			ASSERT(size != 0);
12142
12143			if (woffs + size == buf->dtb_size) {
12144				/*
12145				 * We've reached the end of the buffer; we want
12146				 * to set the wrapped offset to 0 and break
12147				 * out.  However, if the offs is 0, then we're
12148				 * in a strange edge-condition:  the amount of
12149				 * space that we want to reserve plus the size
12150				 * of the record that we're overwriting is
12151				 * greater than the size of the buffer.  This
12152				 * is problematic because if we reserve the
12153				 * space but subsequently don't consume it (due
12154				 * to a failed predicate or error) the wrapped
12155				 * offset will be 0 -- yet the EPID at offset 0
12156				 * will not be committed.  This situation is
12157				 * relatively easy to deal with:  if we're in
12158				 * this case, the buffer is indistinguishable
12159				 * from one that hasn't wrapped; we need only
12160				 * finish the job by clearing the wrapped bit,
12161				 * explicitly setting the offset to be 0, and
12162				 * zero'ing out the old data in the buffer.
12163				 */
12164				if (offs == 0) {
12165					buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12166					buf->dtb_offset = 0;
12167					woffs = total;
12168
12169					while (woffs < buf->dtb_size)
12170						tomax[woffs++] = 0;
12171				}
12172
12173				woffs = 0;
12174				break;
12175			}
12176
12177			woffs += size;
12178		}
12179
12180		/*
12181		 * We have a wrapped offset.  It may be that the wrapped offset
12182		 * has become zero -- that's okay.
12183		 */
12184		buf->dtb_xamot_offset = woffs;
12185	}
12186
12187out:
12188	/*
12189	 * Now we can plow the buffer with any necessary padding.
12190	 */
12191	while (offs & (align - 1)) {
12192		/*
12193		 * Assert that our alignment is off by a number which
12194		 * is itself sizeof (uint32_t) aligned.
12195		 */
12196		ASSERT(!((align - (offs & (align - 1))) &
12197		    (sizeof (uint32_t) - 1)));
12198		DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12199		offs += sizeof (uint32_t);
12200	}
12201
12202	if (buf->dtb_flags & DTRACEBUF_FILL) {
12203		if (offs + needed > buf->dtb_size - state->dts_reserve) {
12204			buf->dtb_flags |= DTRACEBUF_FULL;
12205			return (-1);
12206		}
12207	}
12208
12209	if (mstate == NULL)
12210		return (offs);
12211
12212	/*
12213	 * For ring buffers and fill buffers, the scratch space is always
12214	 * the inactive buffer.
12215	 */
12216	mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12217	mstate->dtms_scratch_size = buf->dtb_size;
12218	mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12219
12220	return (offs);
12221}
12222
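/*
 * "Polish" a wrapped ring buffer before it is consumed:  zero the dead
 * space between the buffer offset and the wrapped offset -- which, when
 * the buffer offset is the larger of the two, spans the tail of the
 * buffer and wraps to the top -- so that only valid records remain.
 */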
12223static void
12224dtrace_buffer_polish(dtrace_buffer_t *buf)
12225{
12226	ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12227	ASSERT(MUTEX_HELD(&dtrace_lock));
12228
12229	if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12230		return;
12231
12232	/*
12233	 * We need to polish the ring buffer.  There are three cases:
12234	 *
12235	 * - The first (and presumably most common) is that there is no gap
12236	 *   between the buffer offset and the wrapped offset.  In this case,
12237	 *   there is nothing in the buffer that isn't valid data; we can
12238	 *   mark the buffer as polished and return.
12239	 *
12240	 * - The second (less common than the first but still more common
12241	 *   than the third) is that there is a gap between the buffer offset
12242	 *   and the wrapped offset, and the wrapped offset is larger than the
12243	 *   buffer offset.  This can happen because of an alignment issue, or
12244	 *   can happen because of a call to dtrace_buffer_reserve() that
12245	 *   didn't subsequently consume the buffer space.  In this case,
12246	 *   we need to zero the data from the buffer offset to the wrapped
12247	 *   offset.
12248	 *
12249	 * - The third (and least common) is that there is a gap between the
12250	 *   buffer offset and the wrapped offset, but the wrapped offset is
12251	 *   _less_ than the buffer offset.  This can only happen because a
12252	 *   call to dtrace_buffer_reserve() induced a wrap, but the space
12253	 *   was not subsequently consumed.  In this case, we need to zero the
12254	 *   space from the offset to the end of the buffer _and_ from the
12255	 *   top of the buffer to the wrapped offset.
12256	 */
12257	if (buf->dtb_offset < buf->dtb_xamot_offset) {
12258		bzero(buf->dtb_tomax + buf->dtb_offset,
12259		    buf->dtb_xamot_offset - buf->dtb_offset);
12260	}
12261
12262	if (buf->dtb_offset > buf->dtb_xamot_offset) {
12263		bzero(buf->dtb_tomax + buf->dtb_offset,
12264		    buf->dtb_size - buf->dtb_offset);
12265		bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12266	}
12267}
12268
12269/*
12270 * This routine determines if data generated at the specified time has likely
12271 * been entirely consumed at user-level.  This routine is called to determine
12272 * if an ECB on a defunct probe (but for an active enabling) can be safely
12273 * disabled and destroyed.
12274 */
12275static int
12276dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
12277{
12278	int i;
12279
12280	for (i = 0; i < NCPU; i++) {
12281		dtrace_buffer_t *buf = &bufs[i];
12282
12283		if (buf->dtb_size == 0)
12284			continue;
12285
12286		if (buf->dtb_flags & DTRACEBUF_RING)
12287			return (0);
12288
12289		if (!buf->dtb_switched && buf->dtb_offset != 0)
12290			return (0);
12291
12292		if (buf->dtb_switched - buf->dtb_interval < when)
12293			return (0);
12294	}
12295
12296	return (1);
12297}
12298
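/*
 * Free the per-CPU buffer pair (active and inactive) for every CPU in the
 * given buffer array.
 */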
12299static void
12300dtrace_buffer_free(dtrace_buffer_t *bufs)
12301{
12302	int i;
12303
12304	for (i = 0; i < NCPU; i++) {
12305		dtrace_buffer_t *buf = &bufs[i];
12306
12307		if (buf->dtb_tomax == NULL) {
12308			ASSERT(buf->dtb_xamot == NULL);
12309			ASSERT(buf->dtb_size == 0);
12310			continue;
12311		}
12312
12313		if (buf->dtb_xamot != NULL) {
12314			ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12315			kmem_free(buf->dtb_xamot, buf->dtb_size);
12316		}
12317
12318		kmem_free(buf->dtb_tomax, buf->dtb_size);
12319		buf->dtb_size = 0;
12320		buf->dtb_tomax = NULL;
12321		buf->dtb_xamot = NULL;
12322	}
12323}
12324
12325/*
12326 * DTrace Enabling Functions
12327 */
12328static dtrace_enabling_t *
12329dtrace_enabling_create(dtrace_vstate_t *vstate)
12330{
12331	dtrace_enabling_t *enab;
12332
12333	enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12334	enab->dten_vstate = vstate;
12335
12336	return (enab);
12337}
12338
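/*
 * Append an ECB description to an enabling, doubling the capacity of the
 * description array whenever it fills.
 */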
12339static void
12340dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12341{
12342	dtrace_ecbdesc_t **ndesc;
12343	size_t osize, nsize;
12344
12345	/*
12346	 * We can't add to enablings after we've enabled them, or after we've
12347	 * retained them.
12348	 */
12349	ASSERT(enab->dten_probegen == 0);
12350	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12351
12352	if (enab->dten_ndesc < enab->dten_maxdesc) {
12353		enab->dten_desc[enab->dten_ndesc++] = ecb;
12354		return;
12355	}
12356
12357	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12358
12359	if (enab->dten_maxdesc == 0) {
12360		enab->dten_maxdesc = 1;
12361	} else {
12362		enab->dten_maxdesc <<= 1;
12363	}
12364
12365	ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12366
12367	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12368	ndesc = kmem_zalloc(nsize, KM_SLEEP);
12369	bcopy(enab->dten_desc, ndesc, osize);
12370	if (enab->dten_desc != NULL)
12371		kmem_free(enab->dten_desc, osize);
12372
12373	enab->dten_desc = ndesc;
12374	enab->dten_desc[enab->dten_ndesc++] = ecb;
12375}
12376
12377static void
12378dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12379    dtrace_probedesc_t *pd)
12380{
12381	dtrace_ecbdesc_t *new;
12382	dtrace_predicate_t *pred;
12383	dtrace_actdesc_t *act;
12384
12385	/*
12386	 * We're going to create a new ECB description that matches the
12387	 * specified ECB in every way, but has the specified probe description.
12388	 */
12389	new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12390
12391	if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12392		dtrace_predicate_hold(pred);
12393
12394	for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12395		dtrace_actdesc_hold(act);
12396
12397	new->dted_action = ecb->dted_action;
12398	new->dted_pred = ecb->dted_pred;
12399	new->dted_probe = *pd;
12400	new->dted_uarg = ecb->dted_uarg;
12401
12402	dtrace_enabling_add(enab, new);
12403}
12404
12405static void
12406dtrace_enabling_dump(dtrace_enabling_t *enab)
12407{
12408	int i;
12409
12410	for (i = 0; i < enab->dten_ndesc; i++) {
12411		dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12412
12413		cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12414		    desc->dtpd_provider, desc->dtpd_mod,
12415		    desc->dtpd_func, desc->dtpd_name);
12416	}
12417}
12418
12419static void
12420dtrace_enabling_destroy(dtrace_enabling_t *enab)
12421{
12422	int i;
12423	dtrace_ecbdesc_t *ep;
12424	dtrace_vstate_t *vstate = enab->dten_vstate;
12425
12426	ASSERT(MUTEX_HELD(&dtrace_lock));
12427
12428	for (i = 0; i < enab->dten_ndesc; i++) {
12429		dtrace_actdesc_t *act, *next;
12430		dtrace_predicate_t *pred;
12431
12432		ep = enab->dten_desc[i];
12433
12434		if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12435			dtrace_predicate_release(pred, vstate);
12436
12437		for (act = ep->dted_action; act != NULL; act = next) {
12438			next = act->dtad_next;
12439			dtrace_actdesc_release(act, vstate);
12440		}
12441
12442		kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12443	}
12444
12445	if (enab->dten_desc != NULL)
12446		kmem_free(enab->dten_desc,
12447		    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
12448
12449	/*
12450	 * If this was a retained enabling, decrement the dts_nretained count
12451	 * and take it off of the dtrace_retained list.
12452	 */
12453	if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12454	    dtrace_retained == enab) {
12455		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12456		ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12457		enab->dten_vstate->dtvs_state->dts_nretained--;
12458		dtrace_retained_gen++;
12459	}
12460
12461	if (enab->dten_prev == NULL) {
12462		if (dtrace_retained == enab) {
12463			dtrace_retained = enab->dten_next;
12464
12465			if (dtrace_retained != NULL)
12466				dtrace_retained->dten_prev = NULL;
12467		}
12468	} else {
12469		ASSERT(enab != dtrace_retained);
12470		ASSERT(dtrace_retained != NULL);
12471		enab->dten_prev->dten_next = enab->dten_next;
12472	}
12473
12474	if (enab->dten_next != NULL) {
12475		ASSERT(dtrace_retained != NULL);
12476		enab->dten_next->dten_prev = enab->dten_prev;
12477	}
12478
12479	kmem_free(enab, sizeof (dtrace_enabling_t));
12480}
12481
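/*
 * Place an enabling on the global dtrace_retained list so that it will be
 * re-matched as new probes are created.  Each state may retain at most
 * dtrace_retain_max enablings.
 */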
12482static int
12483dtrace_enabling_retain(dtrace_enabling_t *enab)
12484{
12485	dtrace_state_t *state;
12486
12487	ASSERT(MUTEX_HELD(&dtrace_lock));
12488	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12489	ASSERT(enab->dten_vstate != NULL);
12490
12491	state = enab->dten_vstate->dtvs_state;
12492	ASSERT(state != NULL);
12493
12494	/*
12495	 * We only allow each state to retain dtrace_retain_max enablings.
12496	 */
12497	if (state->dts_nretained >= dtrace_retain_max)
12498		return (ENOSPC);
12499
12500	state->dts_nretained++;
12501	dtrace_retained_gen++;
12502
12503	if (dtrace_retained == NULL) {
12504		dtrace_retained = enab;
12505		return (0);
12506	}
12507
12508	enab->dten_next = dtrace_retained;
12509	dtrace_retained->dten_prev = enab;
12510	dtrace_retained = enab;
12511
12512	return (0);
12513}
12514
12515static int
12516dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12517    dtrace_probedesc_t *create)
12518{
12519	dtrace_enabling_t *new, *enab;
12520	int found = 0, err = ENOENT;
12521
12522	ASSERT(MUTEX_HELD(&dtrace_lock));
12523	ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12524	ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12525	ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12526	ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12527
12528	new = dtrace_enabling_create(&state->dts_vstate);
12529
12530	/*
12531	 * Iterate over all retained enablings, looking for enablings that
12532	 * match the specified state.
12533	 */
12534	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12535		int i;
12536
12537		/*
12538		 * dtvs_state can only be NULL for helper enablings -- and
12539		 * helper enablings can't be retained.
12540		 */
12541		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12542
12543		if (enab->dten_vstate->dtvs_state != state)
12544			continue;
12545
12546		/*
12547		 * Now iterate over each probe description; we're looking for
12548		 * an exact match to the specified probe description.
12549		 */
12550		for (i = 0; i < enab->dten_ndesc; i++) {
12551			dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12552			dtrace_probedesc_t *pd = &ep->dted_probe;
12553
12554			if (strcmp(pd->dtpd_provider, match->dtpd_provider))
12555				continue;
12556
12557			if (strcmp(pd->dtpd_mod, match->dtpd_mod))
12558				continue;
12559
12560			if (strcmp(pd->dtpd_func, match->dtpd_func))
12561				continue;
12562
12563			if (strcmp(pd->dtpd_name, match->dtpd_name))
12564				continue;
12565
12566			/*
12567			 * We have a winning probe!  Add it to our growing
12568			 * enabling.
12569			 */
12570			found = 1;
12571			dtrace_enabling_addlike(new, ep, create);
12572		}
12573	}
12574
12575	if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12576		dtrace_enabling_destroy(new);
12577		return (err);
12578	}
12579
12580	return (0);
12581}
12582
12583static void
12584dtrace_enabling_retract(dtrace_state_t *state)
12585{
12586	dtrace_enabling_t *enab, *next;
12587
12588	ASSERT(MUTEX_HELD(&dtrace_lock));
12589
12590	/*
12591	 * Iterate over all retained enablings, destroying those retained
12592	 * for the specified state.
12593	 */
12594	for (enab = dtrace_retained; enab != NULL; enab = next) {
12595		next = enab->dten_next;
12596
12597		/*
12598		 * dtvs_state can only be NULL for helper enablings -- and
12599		 * helper enablings can't be retained.
12600		 */
12601		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12602
12603		if (enab->dten_vstate->dtvs_state == state) {
12604			ASSERT(state->dts_nretained > 0);
12605			dtrace_enabling_destroy(enab);
12606		}
12607	}
12608
12609	ASSERT(state->dts_nretained == 0);
12610}
12611
12612static int
12613dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
12614{
12615	int i = 0;
12616	int matched = 0;
12617
12618	ASSERT(MUTEX_HELD(&cpu_lock));
12619	ASSERT(MUTEX_HELD(&dtrace_lock));
12620
12621	for (i = 0; i < enab->dten_ndesc; i++) {
12622		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12623
12624		enab->dten_current = ep;
12625		enab->dten_error = 0;
12626
12627		matched += dtrace_probe_enable(&ep->dted_probe, enab);
12628
12629		if (enab->dten_error != 0) {
12630			/*
12631			 * If we get an error half-way through enabling the
12632			 * probes, we kick out -- perhaps with some number of
12633			 * them enabled.  Leaving enabled probes enabled may
12634			 * be slightly confusing for user-level, but we expect
12635			 * that no one will attempt to actually drive on in
12636			 * the face of such errors.  If this is an anonymous
12637			 * enabling (indicated with a NULL nmatched pointer),
12638			 * we cmn_err() a message.  We aren't expecting to
12639			 * get such an error -- such as it can exist at all,
12640			 * get such an error -- insofar as it can exist at all,
12641			 * it would be the result of corrupted DOF in the driver
12642			 */
12643			if (nmatched == NULL) {
12644				cmn_err(CE_WARN, "dtrace_enabling_match() "
12645				    "error on %p: %d", (void *)ep,
12646				    enab->dten_error);
12647			}
12648
12649			return (enab->dten_error);
12650		}
12651	}
12652
12653	enab->dten_probegen = dtrace_probegen;
12654	if (nmatched != NULL)
12655		*nmatched = matched;
12656
12657	return (0);
12658}
12659
12660static void
12661dtrace_enabling_matchall(void)
12662{
12663	dtrace_enabling_t *enab;
12664
12665	mutex_enter(&cpu_lock);
12666	mutex_enter(&dtrace_lock);
12667
12668	/*
12669	 * Iterate over all retained enablings to see if any probes match
12670	 * against them.  We only perform this operation on enablings for which
12671	 * we have sufficient permissions by virtue of being in the global zone
12672	 * or in the same zone as the DTrace client.  Because we can be called
12673	 * after dtrace_detach() has been called, we cannot assert that there
12674	 * are retained enablings.  We can safely load from dtrace_retained,
12675	 * however:  the taskq_destroy() at the end of dtrace_detach() will
12676	 * block pending our completion.
12677	 */
12678	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12679#if defined(sun)
12680		cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
12681
12682		if (INGLOBALZONE(curproc) ||
12683		    (cr != NULL && getzoneid() == crgetzoneid(cr)))
12684#endif
12685			(void) dtrace_enabling_match(enab, NULL);
12686	}
12687
12688	mutex_exit(&dtrace_lock);
12689	mutex_exit(&cpu_lock);
12690}
12691
12692/*
12693 * If an enabling is to be enabled without having matched probes (that is, if
12694 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12695 * enabling must be _primed_ by creating an ECB for every ECB description.
12696 * This must be done to assure that we know the number of speculations, the
12697 * number of aggregations, the minimum buffer size needed, etc. before we
12698 * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
12699 * enabling any probes, we create ECBs for every ECB description, but with a
12700 * NULL probe -- which is exactly what this function does.
12701 */
12702static void
12703dtrace_enabling_prime(dtrace_state_t *state)
12704{
12705	dtrace_enabling_t *enab;
12706	int i;
12707
12708	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12709		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12710
12711		if (enab->dten_vstate->dtvs_state != state)
12712			continue;
12713
12714		/*
12715		 * We don't want to prime an enabling more than once, lest
12716		 * we allow a malicious user to induce resource exhaustion.
12717		 * (The ECBs that result from priming an enabling aren't
12718		 * leaked -- but they also aren't deallocated until the
12719		 * consumer state is destroyed.)
12720		 */
12721		if (enab->dten_primed)
12722			continue;
12723
12724		for (i = 0; i < enab->dten_ndesc; i++) {
12725			enab->dten_current = enab->dten_desc[i];
12726			(void) dtrace_probe_enable(NULL, enab);
12727		}
12728
12729		enab->dten_primed = 1;
12730	}
12731}
12732
12733/*
12734 * Called to indicate that probes should be provided due to retained
12735 * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
12736 * must take an initial lap through the enabling, calling the dtps_provide()
12737 * entry point explicitly to allow for autocreated probes.
12738 */
12739static void
12740dtrace_enabling_provide(dtrace_provider_t *prv)
12741{
12742	int i, all = 0;
12743	dtrace_probedesc_t desc;
12744	dtrace_genid_t gen;
12745
12746	ASSERT(MUTEX_HELD(&dtrace_lock));
12747	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
12748
12749	if (prv == NULL) {
12750		all = 1;
12751		prv = dtrace_provider;
12752	}
12753
12754	do {
12755		dtrace_enabling_t *enab;
12756		void *parg = prv->dtpv_arg;
12757
12758retry:
12759		gen = dtrace_retained_gen;
12760		for (enab = dtrace_retained; enab != NULL;
12761		    enab = enab->dten_next) {
12762			for (i = 0; i < enab->dten_ndesc; i++) {
12763				desc = enab->dten_desc[i]->dted_probe;
12764				mutex_exit(&dtrace_lock);
12765				prv->dtpv_pops.dtps_provide(parg, &desc);
12766				mutex_enter(&dtrace_lock);
12767				/*
12768				 * Process the retained enablings again if
12769				 * they have changed while we weren't holding
12770				 * dtrace_lock.
12771				 */
12772				if (gen != dtrace_retained_gen)
12773					goto retry;
12774			}
12775		}
12776	} while (all && (prv = prv->dtpv_next) != NULL);
12777
12778	mutex_exit(&dtrace_lock);
12779	dtrace_probe_provide(NULL, all ? NULL : prv);
12780	mutex_enter(&dtrace_lock);
12781}
12782
12783/*
12784 * Called to reap ECBs that are attached to probes from defunct providers.
12785 */
12786static void
12787dtrace_enabling_reap(void)
12788{
12789	dtrace_provider_t *prov;
12790	dtrace_probe_t *probe;
12791	dtrace_ecb_t *ecb;
12792	hrtime_t when;
12793	int i;
12794
12795	mutex_enter(&cpu_lock);
12796	mutex_enter(&dtrace_lock);
12797
12798	for (i = 0; i < dtrace_nprobes; i++) {
12799		if ((probe = dtrace_probes[i]) == NULL)
12800			continue;
12801
12802		if (probe->dtpr_ecb == NULL)
12803			continue;
12804
12805		prov = probe->dtpr_provider;
12806
12807		if ((when = prov->dtpv_defunct) == 0)
12808			continue;
12809
12810		/*
12811		 * We have ECBs on a defunct provider:  we want to reap these
12812		 * ECBs to allow the provider to unregister.  The destruction
12813		 * of these ECBs must be done carefully:  if we destroy the ECB
12814		 * and the consumer later wishes to consume an EPID that
12815		 * corresponds to the destroyed ECB (and if the EPID metadata
12816		 * has not been previously consumed), the consumer will abort
12817		 * processing on the unknown EPID.  To reduce (but not, sadly,
12818		 * eliminate) the possibility of this, we will only destroy an
12819		 * ECB for a defunct provider if, for the state that
12820		 * corresponds to the ECB:
12821		 *
12822		 *  (a)	There is no speculative tracing (which can effectively
12823		 *	cache an EPID for an arbitrary amount of time).
12824		 *
12825		 *  (b)	The principal buffers have been switched twice since the
12826		 *	provider became defunct.
12827		 *
12828		 *  (c)	The aggregation buffers are of zero size or have been
12829		 *	switched twice since the provider became defunct.
12830		 *
12831		 * We use dts_speculates to determine (a) and call a function
12832		 * (dtrace_buffer_consumed()) to determine (b) and (c).  Note
12833		 * that as soon as we've been unable to destroy one of the ECBs
12834		 * associated with the probe, we quit trying -- reaping is only
12835		 * fruitful inasmuch as we can destroy all ECBs associated
12836		 * with the defunct provider's probes.
12837		 */
12838		while ((ecb = probe->dtpr_ecb) != NULL) {
12839			dtrace_state_t *state = ecb->dte_state;
12840			dtrace_buffer_t *buf = state->dts_buffer;
12841			dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
12842
12843			if (state->dts_speculates)
12844				break;
12845
12846			if (!dtrace_buffer_consumed(buf, when))
12847				break;
12848
12849			if (!dtrace_buffer_consumed(aggbuf, when))
12850				break;
12851
12852			dtrace_ecb_disable(ecb);
12853			ASSERT(probe->dtpr_ecb != ecb);
12854			dtrace_ecb_destroy(ecb);
12855		}
12856	}
12857
12858	mutex_exit(&dtrace_lock);
12859	mutex_exit(&cpu_lock);
12860}
12861
12862/*
12863 * DTrace DOF Functions
12864 */
12865/*ARGSUSED*/
12866static void
12867dtrace_dof_error(dof_hdr_t *dof, const char *str)
12868{
12869	if (dtrace_err_verbose)
12870		cmn_err(CE_WARN, "failed to process DOF: %s", str);
12871
12872#ifdef DTRACE_ERRDEBUG
12873	dtrace_errdebug(str);
12874#endif
12875}
12876
12877/*
12878 * Create DOF out of a currently enabled state.  Right now, we only create
12879 * DOF containing the run-time options -- but this could be expanded to create
12880 * complete DOF representing the enabled state.
12881 */
12882static dof_hdr_t *
12883dtrace_dof_create(dtrace_state_t *state)
12884{
12885	dof_hdr_t *dof;
12886	dof_sec_t *sec;
12887	dof_optdesc_t *opt;
12888	int i, len = sizeof (dof_hdr_t) +
12889	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12890	    sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12891
12892	ASSERT(MUTEX_HELD(&dtrace_lock));
12893
12894	dof = kmem_zalloc(len, KM_SLEEP);
12895	dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12896	dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12897	dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12898	dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12899
12900	dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12901	dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12902	dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12903	dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12904	dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12905	dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12906
12907	dof->dofh_flags = 0;
12908	dof->dofh_hdrsize = sizeof (dof_hdr_t);
12909	dof->dofh_secsize = sizeof (dof_sec_t);
12910	dof->dofh_secnum = 1;	/* only DOF_SECT_OPTDESC */
12911	dof->dofh_secoff = sizeof (dof_hdr_t);
12912	dof->dofh_loadsz = len;
12913	dof->dofh_filesz = len;
12914	dof->dofh_pad = 0;
12915
12916	/*
12917	 * Fill in the option section header...
12918	 */
12919	sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12920	sec->dofs_type = DOF_SECT_OPTDESC;
12921	sec->dofs_align = sizeof (uint64_t);
12922	sec->dofs_flags = DOF_SECF_LOAD;
12923	sec->dofs_entsize = sizeof (dof_optdesc_t);
12924
12925	opt = (dof_optdesc_t *)((uintptr_t)sec +
12926	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12927
12928	sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12929	sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12930
12931	for (i = 0; i < DTRACEOPT_MAX; i++) {
12932		opt[i].dofo_option = i;
12933		opt[i].dofo_strtab = DOF_SECIDX_NONE;
12934		opt[i].dofo_value = state->dts_options[i];
12935	}
12936
12937	return (dof);
12938}
12939
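/*
 * Copy DOF in from the user address in uarg.  On success, the returned
 * buffer must be freed with dtrace_dof_destroy(); on failure, NULL is
 * returned and *errp is set.
 */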
12940static dof_hdr_t *
12941dtrace_dof_copyin(uintptr_t uarg, int *errp)
12942{
12943	dof_hdr_t hdr, *dof;
12944
12945	ASSERT(!MUTEX_HELD(&dtrace_lock));
12946
12947	/*
12948	 * First, we're going to copyin() the sizeof (dof_hdr_t).
12949	 */
12950	if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
12951		dtrace_dof_error(NULL, "failed to copyin DOF header");
12952		*errp = EFAULT;
12953		return (NULL);
12954	}
12955
12956	/*
12957	 * Now we'll allocate the entire DOF and copy it in -- provided
12958	 * that the length isn't outrageous.
12959	 */
12960	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
12961		dtrace_dof_error(&hdr, "load size exceeds maximum");
12962		*errp = E2BIG;
12963		return (NULL);
12964	}
12965
12966	if (hdr.dofh_loadsz < sizeof (hdr)) {
12967		dtrace_dof_error(&hdr, "invalid load size");
12968		*errp = EINVAL;
12969		return (NULL);
12970	}
12971
12972	dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
12973
12974	if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
12975	    dof->dofh_loadsz != hdr.dofh_loadsz) {
12976		kmem_free(dof, hdr.dofh_loadsz);
12977		*errp = EFAULT;
12978		return (NULL);
12979	}
12980
12981	return (dof);
12982}
12983
12984#if !defined(sun)
12985static __inline uchar_t
12986	dtrace_dof_char(char c)
{
12987	switch (c) {
12988	case '0':
12989	case '1':
12990	case '2':
12991	case '3':
12992	case '4':
12993	case '5':
12994	case '6':
12995	case '7':
12996	case '8':
12997	case '9':
12998		return (c - '0');
12999	case 'A':
13000	case 'B':
13001	case 'C':
13002	case 'D':
13003	case 'E':
13004	case 'F':
13005		return (c - 'A' + 10);
13006	case 'a':
13007	case 'b':
13008	case 'c':
13009	case 'd':
13010	case 'e':
13011	case 'f':
13012		return (c - 'a' + 10);
13013	}
13014	/* Not reached for well-formed hex input; bad digits map to 0. */
13015	return (0);
13016}
13017#endif
13018
13019static dof_hdr_t *
13020dtrace_dof_property(const char *name)
13021{
13022	uchar_t *buf;
13023	uint64_t loadsz;
13024	unsigned int len, i;
13025	dof_hdr_t *dof;
13026
13027#if defined(sun)
13028	/*
13029	 * Unfortunately, arrays of values in .conf files are always (and
13030	 * only) interpreted to be integer arrays.  We must read our DOF
13031	 * as an integer array, and then squeeze it into a byte array.
13032	 */
13033	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
13034	    (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
13035		return (NULL);
13036
13037	for (i = 0; i < len; i++)
13038		buf[i] = (uchar_t)(((int *)buf)[i]);
13039
13040	if (len < sizeof (dof_hdr_t)) {
13041		ddi_prop_free(buf);
13042		dtrace_dof_error(NULL, "truncated header");
13043		return (NULL);
13044	}
13045
13046	if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
13047		ddi_prop_free(buf);
13048		dtrace_dof_error(NULL, "truncated DOF");
13049		return (NULL);
13050	}
13051
13052	if (loadsz >= dtrace_dof_maxsize) {
13053		ddi_prop_free(buf);
13054		dtrace_dof_error(NULL, "oversized DOF");
13055		return (NULL);
13056	}
13057
13058	dof = kmem_alloc(loadsz, KM_SLEEP);
13059	bcopy(buf, dof, loadsz);
13060	ddi_prop_free(buf);
13061#else
13062	char *p;
13063	char *p_env;
13064
13065	if ((p_env = getenv(name)) == NULL)
13066		return (NULL);
13067
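	/*
	 * The environment variable is expected to hold an even-length
	 * string of hex digits; each pair of characters is decoded into
	 * one byte of DOF.  (dtrace_dof_char() assumes well-formed input.)
	 */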
13068	len = strlen(p_env) / 2;
13069
13070	buf = kmem_alloc(len, KM_SLEEP);
13071
13072	dof = (dof_hdr_t *) buf;
13073
13074	p = p_env;
13075
13076	for (i = 0; i < len; i++) {
13077		buf[i] = (dtrace_dof_char(p[0]) << 4) |
13078		     dtrace_dof_char(p[1]);
13079		p += 2;
13080	}
13081
13082	freeenv(p_env);
13083
13084	if (len < sizeof (dof_hdr_t)) {
13085		kmem_free(buf, 0);
13086		dtrace_dof_error(NULL, "truncated header");
13087		return (NULL);
13088	}
13089
13090	if (len < (loadsz = dof->dofh_loadsz)) {
13091		kmem_free(buf, 0);
13092		dtrace_dof_error(NULL, "truncated DOF");
13093		return (NULL);
13094	}
13095
13096	if (loadsz >= dtrace_dof_maxsize) {
13097		kmem_free(buf, 0);
13098		dtrace_dof_error(NULL, "oversized DOF");
13099		return (NULL);
13100	}
13101#endif
13102
13103	return (dof);
13104}
13105
13106static void
13107dtrace_dof_destroy(dof_hdr_t *dof)
13108{
13109	kmem_free(dof, dof->dofh_loadsz);
13110}
13111
13112/*
13113 * Return the dof_sec_t pointer corresponding to a given section index.  If the
13114 * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
13115 * a type other than DOF_SECT_NONE is specified, the header is checked against
13116 * this type and NULL is returned if the types do not match.
13117 */
13118static dof_sec_t *
13119dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13120{
13121	dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13122	    ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13123
13124	if (i >= dof->dofh_secnum) {
13125		dtrace_dof_error(dof, "referenced section index is invalid");
13126		return (NULL);
13127	}
13128
13129	if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13130		dtrace_dof_error(dof, "referenced section is not loadable");
13131		return (NULL);
13132	}
13133
13134	if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13135		dtrace_dof_error(dof, "referenced section is the wrong type");
13136		return (NULL);
13137	}
13138
13139	return (sec);
13140}
13141
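/*
 * Decode a DOF probe-description section into the supplied probe
 * description, validating each string table offset along the way.
 */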
13142static dtrace_probedesc_t *
13143dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13144{
13145	dof_probedesc_t *probe;
13146	dof_sec_t *strtab;
13147	uintptr_t daddr = (uintptr_t)dof;
13148	uintptr_t str;
13149	size_t size;
13150
13151	if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13152		dtrace_dof_error(dof, "invalid probe section");
13153		return (NULL);
13154	}
13155
13156	if (sec->dofs_align != sizeof (dof_secidx_t)) {
13157		dtrace_dof_error(dof, "bad alignment in probe description");
13158		return (NULL);
13159	}
13160
13161	if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13162		dtrace_dof_error(dof, "truncated probe description");
13163		return (NULL);
13164	}
13165
13166	probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13167	strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13168
13169	if (strtab == NULL)
13170		return (NULL);
13171
13172	str = daddr + strtab->dofs_offset;
13173	size = strtab->dofs_size;
13174
13175	if (probe->dofp_provider >= strtab->dofs_size) {
13176		dtrace_dof_error(dof, "corrupt probe provider");
13177		return (NULL);
13178	}
13179
13180	(void) strncpy(desc->dtpd_provider,
13181	    (char *)(str + probe->dofp_provider),
13182	    MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13183
13184	if (probe->dofp_mod >= strtab->dofs_size) {
13185		dtrace_dof_error(dof, "corrupt probe module");
13186		return (NULL);
13187	}
13188
13189	(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13190	    MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13191
13192	if (probe->dofp_func >= strtab->dofs_size) {
13193		dtrace_dof_error(dof, "corrupt probe function");
13194		return (NULL);
13195	}
13196
13197	(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13198	    MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13199
13200	if (probe->dofp_name >= strtab->dofs_size) {
13201		dtrace_dof_error(dof, "corrupt probe name");
13202		return (NULL);
13203	}
13204
13205	(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13206	    MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13207
13208	return (desc);
13209}
13210
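/*
 * Construct a DIF object from a DOF_SECT_DIFOHDR section, loading each
 * linked subsection (DIF text and the integer, string and variable
 * tables) and validating the result before it is initialized.
 */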
13211static dtrace_difo_t *
13212dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13213    cred_t *cr)
13214{
13215	dtrace_difo_t *dp;
13216	size_t ttl = 0;
13217	dof_difohdr_t *dofd;
13218	uintptr_t daddr = (uintptr_t)dof;
13219	size_t max = dtrace_difo_maxsize;
13220	int i, l, n;
13221
13222	static const struct {
13223		int section;
13224		int bufoffs;
13225		int lenoffs;
13226		int entsize;
13227		int align;
13228		const char *msg;
13229	} difo[] = {
13230		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
13231		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
13232		sizeof (dif_instr_t), "multiple DIF sections" },
13233
13234		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
13235		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
13236		sizeof (uint64_t), "multiple integer tables" },
13237
13238		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
13239		offsetof(dtrace_difo_t, dtdo_strlen), 0,
13240		sizeof (char), "multiple string tables" },
13241
13242		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
13243		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
13244		sizeof (uint_t), "multiple variable tables" },
13245
13246		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
13247	};
13248
13249	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
13250		dtrace_dof_error(dof, "invalid DIFO header section");
13251		return (NULL);
13252	}
13253
13254	if (sec->dofs_align != sizeof (dof_secidx_t)) {
13255		dtrace_dof_error(dof, "bad alignment in DIFO header");
13256		return (NULL);
13257	}
13258
13259	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
13260	    sec->dofs_size % sizeof (dof_secidx_t)) {
13261		dtrace_dof_error(dof, "bad size in DIFO header");
13262		return (NULL);
13263	}
13264
13265	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13266	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
13267
13268	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
13269	dp->dtdo_rtype = dofd->dofd_rtype;
13270
13271	for (l = 0; l < n; l++) {
13272		dof_sec_t *subsec;
13273		void **bufp;
13274		uint32_t *lenp;
13275
13276		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
13277		    dofd->dofd_links[l])) == NULL)
13278			goto err; /* invalid section link */
13279
13280		if (ttl + subsec->dofs_size > max) {
13281			dtrace_dof_error(dof, "exceeds maximum size");
13282			goto err;
13283		}
13284
13285		ttl += subsec->dofs_size;
13286
13287		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
13288			if (subsec->dofs_type != difo[i].section)
13289				continue;
13290
13291			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
13292				dtrace_dof_error(dof, "section not loaded");
13293				goto err;
13294			}
13295
13296			if (subsec->dofs_align != difo[i].align) {
13297				dtrace_dof_error(dof, "bad alignment");
13298				goto err;
13299			}
13300
13301			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
13302			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
13303
13304			if (*bufp != NULL) {
13305				dtrace_dof_error(dof, difo[i].msg);
13306				goto err;
13307			}
13308
13309			if (difo[i].entsize != subsec->dofs_entsize) {
13310				dtrace_dof_error(dof, "entry size mismatch");
13311				goto err;
13312			}
13313
13314			if (subsec->dofs_entsize != 0 &&
13315			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
13316				dtrace_dof_error(dof, "corrupt entry size");
13317				goto err;
13318			}
13319
13320			*lenp = subsec->dofs_size;
13321			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
13322			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
13323			    *bufp, subsec->dofs_size);
13324
13325			if (subsec->dofs_entsize != 0)
13326				*lenp /= subsec->dofs_entsize;
13327
13328			break;
13329		}
13330
13331		/*
13332		 * If we encounter a loadable DIFO sub-section that is not
13333		 * known to us, assume this is a broken program and fail.
13334		 */
13335		if (difo[i].section == DOF_SECT_NONE &&
13336		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
13337			dtrace_dof_error(dof, "unrecognized DIFO subsection");
13338			goto err;
13339		}
13340	}
13341
13342	if (dp->dtdo_buf == NULL) {
13343		/*
13344		 * We can't have a DIF object without DIF text.
13345		 */
13346		dtrace_dof_error(dof, "missing DIF text");
13347		goto err;
13348	}
13349
13350	/*
13351	 * Before we validate the DIF object, run through the variable table
13352	 * looking for the strings -- if any of their sizes are unset (zero),
13353	 * we'll set them to the system-wide default string size.  Note that
13354	 * this should _not_ happen if the "strsize" option has been set --
13355	 * in this case, the compiler should have set the size to reflect the
13356	 * setting of the option.
13357	 */
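	/*
	 * Illustrative example (not part of the original source): assuming
	 * the stock "strsize" tunable of 256 bytes, a user-defined string
	 * variable (id >= DIF_VAR_OTHER_UBASE) compiled with a zero
	 * dtdt_size leaves the loop below with
	 * t->dtdt_size == dtrace_strsize_default.
	 */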
13358	for (i = 0; i < dp->dtdo_varlen; i++) {
13359		dtrace_difv_t *v = &dp->dtdo_vartab[i];
13360		dtrace_diftype_t *t = &v->dtdv_type;
13361
13362		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
13363			continue;
13364
13365		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
13366			t->dtdt_size = dtrace_strsize_default;
13367	}
13368
13369	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
13370		goto err;
13371
13372	dtrace_difo_init(dp, vstate);
13373	return (dp);
13374
13375err:
13376	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
13377	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
13378	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
13379	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
13380
13381	kmem_free(dp, sizeof (dtrace_difo_t));
13382	return (NULL);
13383}
13384
13385static dtrace_predicate_t *
13386dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13387    cred_t *cr)
13388{
13389	dtrace_difo_t *dp;
13390
13391	if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13392		return (NULL);
13393
13394	return (dtrace_predicate_create(dp));
13395}
13396
13397static dtrace_actdesc_t *
13398dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13399    cred_t *cr)
13400{
13401	dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13402	dof_actdesc_t *desc;
13403	dof_sec_t *difosec;
13404	size_t offs;
13405	uintptr_t daddr = (uintptr_t)dof;
13406	uint64_t arg;
13407	dtrace_actkind_t kind;
13408
13409	if (sec->dofs_type != DOF_SECT_ACTDESC) {
13410		dtrace_dof_error(dof, "invalid action section");
13411		return (NULL);
13412	}
13413
13414	if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13415		dtrace_dof_error(dof, "truncated action description");
13416		return (NULL);
13417	}
13418
13419	if (sec->dofs_align != sizeof (uint64_t)) {
13420		dtrace_dof_error(dof, "bad alignment in action description");
13421		return (NULL);
13422	}
13423
13424	if (sec->dofs_size < sec->dofs_entsize) {
13425		dtrace_dof_error(dof, "section entry size exceeds total size");
13426		return (NULL);
13427	}
13428
13429	if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13430		dtrace_dof_error(dof, "bad entry size in action description");
13431		return (NULL);
13432	}
13433
13434	if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13435		dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13436		return (NULL);
13437	}
13438
13439	for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13440		desc = (dof_actdesc_t *)(daddr +
13441		    (uintptr_t)sec->dofs_offset + offs);
13442		kind = (dtrace_actkind_t)desc->dofa_kind;
13443
13444		if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13445		    (kind != DTRACEACT_PRINTA ||
13446		    desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13447		    (kind == DTRACEACT_DIFEXPR &&
13448		    desc->dofa_strtab != DOF_SECIDX_NONE)) {
13449			dof_sec_t *strtab;
13450			char *str, *fmt;
13451			uint64_t i;
13452
13453			/*
13454			 * The argument to these actions is an index into the
13455			 * DOF string table.  For printf()-like actions, this
13456			 * is the format string.  For print(), this is the
13457			 * CTF type of the expression result.
13458			 */
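			/*
			 * Hypothetical example: for an action compiled from
			 * printf("%d\n", x), dofa_arg is the offset of the
			 * NUL-terminated string "%d\n" within the string
			 * table section; the code below bounds-checks and
			 * copies that string.
			 */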
13459			if ((strtab = dtrace_dof_sect(dof,
13460			    DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13461				goto err;
13462
13463			str = (char *)((uintptr_t)dof +
13464			    (uintptr_t)strtab->dofs_offset);
13465
13466			for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13467				if (str[i] == '\0')
13468					break;
13469			}
13470
13471			if (i >= strtab->dofs_size) {
13472				dtrace_dof_error(dof, "bogus format string");
13473				goto err;
13474			}
13475
13476			if (i == desc->dofa_arg) {
13477				dtrace_dof_error(dof, "empty format string");
13478				goto err;
13479			}
13480
13481			i -= desc->dofa_arg;
13482			fmt = kmem_alloc(i + 1, KM_SLEEP);
13483			bcopy(&str[desc->dofa_arg], fmt, i + 1);
13484			arg = (uint64_t)(uintptr_t)fmt;
13485		} else {
13486			if (kind == DTRACEACT_PRINTA) {
13487				ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13488				arg = 0;
13489			} else {
13490				arg = desc->dofa_arg;
13491			}
13492		}
13493
13494		act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13495		    desc->dofa_uarg, arg);
13496
13497		if (last != NULL) {
13498			last->dtad_next = act;
13499		} else {
13500			first = act;
13501		}
13502
13503		last = act;
13504
13505		if (desc->dofa_difo == DOF_SECIDX_NONE)
13506			continue;
13507
13508		if ((difosec = dtrace_dof_sect(dof,
13509		    DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13510			goto err;
13511
13512		act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13513
13514		if (act->dtad_difo == NULL)
13515			goto err;
13516	}
13517
13518	ASSERT(first != NULL);
13519	return (first);
13520
13521err:
13522	for (act = first; act != NULL; act = next) {
13523		next = act->dtad_next;
13524		dtrace_actdesc_release(act, vstate);
13525	}
13526
13527	return (NULL);
13528}
13529
13530static dtrace_ecbdesc_t *
13531dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13532    cred_t *cr)
13533{
13534	dtrace_ecbdesc_t *ep;
13535	dof_ecbdesc_t *ecb;
13536	dtrace_probedesc_t *desc;
13537	dtrace_predicate_t *pred = NULL;
13538
13539	if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13540		dtrace_dof_error(dof, "truncated ECB description");
13541		return (NULL);
13542	}
13543
13544	if (sec->dofs_align != sizeof (uint64_t)) {
13545		dtrace_dof_error(dof, "bad alignment in ECB description");
13546		return (NULL);
13547	}
13548
13549	ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13550	sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13551
13552	if (sec == NULL)
13553		return (NULL);
13554
13555	ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13556	ep->dted_uarg = ecb->dofe_uarg;
13557	desc = &ep->dted_probe;
13558
13559	if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13560		goto err;
13561
13562	if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13563		if ((sec = dtrace_dof_sect(dof,
13564		    DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13565			goto err;
13566
13567		if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13568			goto err;
13569
13570		ep->dted_pred.dtpdd_predicate = pred;
13571	}
13572
13573	if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13574		if ((sec = dtrace_dof_sect(dof,
13575		    DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13576			goto err;
13577
13578		ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13579
13580		if (ep->dted_action == NULL)
13581			goto err;
13582	}
13583
13584	return (ep);
13585
13586err:
13587	if (pred != NULL)
13588		dtrace_predicate_release(pred, vstate);
13589	kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13590	return (NULL);
13591}
13592
13593/*
13594 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
13595 * specified DOF.  At present, this amounts to simply adding 'ubase' to the
13596 * site of any user SETX relocations to account for load object base address.
13597 * In the future, if we need other relocations, this function can be extended.
13598 */
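/*
 * A sketch of the SETX effect (illustrative, mirroring the loop below): for
 * each dof_relodesc_t 'r' in relocation section 'rs' applied to target
 * section 'ts',
 *
 *	*(uint64_t *)((uintptr_t)dof + ts->dofs_offset + r->dofr_offset)
 *	    += ubase;
 *
 * i.e. each 64-bit relocation site is biased by the load object's base
 * address.
 */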
13599static int
13600dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
13601{
13602	uintptr_t daddr = (uintptr_t)dof;
13603	dof_relohdr_t *dofr =
13604	    (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13605	dof_sec_t *ss, *rs, *ts;
13606	dof_relodesc_t *r;
13607	uint_t i, n;
13608
13609	if (sec->dofs_size < sizeof (dof_relohdr_t) ||
13610	    sec->dofs_align != sizeof (dof_secidx_t)) {
13611		dtrace_dof_error(dof, "invalid relocation header");
13612		return (-1);
13613	}
13614
13615	ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
13616	rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
13617	ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
13618
13619	if (ss == NULL || rs == NULL || ts == NULL)
13620		return (-1); /* dtrace_dof_error() has been called already */
13621
13622	if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
13623	    rs->dofs_align != sizeof (uint64_t)) {
13624		dtrace_dof_error(dof, "invalid relocation section");
13625		return (-1);
13626	}
13627
13628	r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
13629	n = rs->dofs_size / rs->dofs_entsize;
13630
13631	for (i = 0; i < n; i++) {
13632		uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
13633
13634		switch (r->dofr_type) {
13635		case DOF_RELO_NONE:
13636			break;
13637		case DOF_RELO_SETX:
13638			if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
13639			    sizeof (uint64_t) > ts->dofs_size) {
13640				dtrace_dof_error(dof, "bad relocation offset");
13641				return (-1);
13642			}
13643
13644			if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
13645				dtrace_dof_error(dof, "misaligned setx relo");
13646				return (-1);
13647			}
13648
13649			*(uint64_t *)taddr += ubase;
13650			break;
13651		default:
13652			dtrace_dof_error(dof, "invalid relocation type");
13653			return (-1);
13654		}
13655
13656		r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
13657	}
13658
13659	return (0);
13660}
13661
13662/*
13663 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13664 * header:  it should be at the front of a memory region that is at least
13665 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13666 * size.  It need not be validated in any other way.
13667 */
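/*
 * A hedged sketch of that caller-side contract (illustrative; 'avail' is a
 * hypothetical name for the size of the region holding the header):
 *
 *	if (avail < sizeof (dof_hdr_t) || avail < dof->dofh_loadsz)
 *		reject the DOF before calling dtrace_dof_slurp();
 */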
13668static int
13669dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13670    dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13671{
13672	uint64_t len = dof->dofh_loadsz, seclen;
13673	uintptr_t daddr = (uintptr_t)dof;
13674	dtrace_ecbdesc_t *ep;
13675	dtrace_enabling_t *enab;
13676	uint_t i;
13677
13678	ASSERT(MUTEX_HELD(&dtrace_lock));
13679	ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13680
13681	/*
13682	 * Check the DOF header identification bytes.  In addition to checking
13683	 * valid settings, we also verify that unused bits/bytes are zeroed so
13684	 * we can use them later without fear of regressing existing binaries.
13685	 */
13686	if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13687	    DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13688		dtrace_dof_error(dof, "DOF magic string mismatch");
13689		return (-1);
13690	}
13691
13692	if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13693	    dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13694		dtrace_dof_error(dof, "DOF has invalid data model");
13695		return (-1);
13696	}
13697
13698	if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13699		dtrace_dof_error(dof, "DOF encoding mismatch");
13700		return (-1);
13701	}
13702
13703	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13704	    dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
13705		dtrace_dof_error(dof, "DOF version mismatch");
13706		return (-1);
13707	}
13708
13709	if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13710		dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13711		return (-1);
13712	}
13713
13714	if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13715		dtrace_dof_error(dof, "DOF uses too many integer registers");
13716		return (-1);
13717	}
13718
13719	if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13720		dtrace_dof_error(dof, "DOF uses too many tuple registers");
13721		return (-1);
13722	}
13723
13724	for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13725		if (dof->dofh_ident[i] != 0) {
13726			dtrace_dof_error(dof, "DOF has invalid ident byte set");
13727			return (-1);
13728		}
13729	}
13730
13731	if (dof->dofh_flags & ~DOF_FL_VALID) {
13732		dtrace_dof_error(dof, "DOF has invalid flag bits set");
13733		return (-1);
13734	}
13735
13736	if (dof->dofh_secsize == 0) {
13737		dtrace_dof_error(dof, "zero section header size");
13738		return (-1);
13739	}
13740
13741	/*
13742	 * Check that the section headers don't exceed the amount of DOF
13743	 * data.  Note that we cast the section size and number of sections
13744	 * to uint64_t's to prevent possible overflow in the multiplication.
13745	 */
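	/*
	 * Illustrative overflow case (hypothetical values): with
	 * dofh_secnum = 0x2000000 and dofh_secsize = 0x200, a 32-bit
	 * product would wrap to 0 and slip past the check below, whereas
	 * the 64-bit product (0x400000000) is correctly rejected by
	 * 'seclen > len'.
	 */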
13746	seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13747
13748	if (dof->dofh_secoff > len || seclen > len ||
13749	    dof->dofh_secoff + seclen > len) {
13750		dtrace_dof_error(dof, "truncated section headers");
13751		return (-1);
13752	}
13753
13754	if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13755		dtrace_dof_error(dof, "misaligned section headers");
13756		return (-1);
13757	}
13758
13759	if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13760		dtrace_dof_error(dof, "misaligned section size");
13761		return (-1);
13762	}
13763
13764	/*
13765	 * Take an initial pass through the section headers to be sure that
13766	 * the headers don't have stray offsets.  If the 'noprobes' flag is
13767	 * set, do not permit sections relating to providers, probes, or args.
13768	 */
13769	for (i = 0; i < dof->dofh_secnum; i++) {
13770		dof_sec_t *sec = (dof_sec_t *)(daddr +
13771		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13772
13773		if (noprobes) {
13774			switch (sec->dofs_type) {
13775			case DOF_SECT_PROVIDER:
13776			case DOF_SECT_PROBES:
13777			case DOF_SECT_PRARGS:
13778			case DOF_SECT_PROFFS:
13779				dtrace_dof_error(dof, "illegal sections "
13780				    "for enabling");
13781				return (-1);
13782			}
13783		}
13784
13785		if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
13786		    !(sec->dofs_flags & DOF_SECF_LOAD)) {
13787			dtrace_dof_error(dof, "loadable section with load "
13788			    "flag unset");
13789			return (-1);
13790		}
13791
13792		if (!(sec->dofs_flags & DOF_SECF_LOAD))
13793			continue; /* just ignore non-loadable sections */
13794
13795		if (!ISP2(sec->dofs_align)) {
13796			dtrace_dof_error(dof, "bad section alignment");
13797			return (-1);
13798		}
13799
13800		if (sec->dofs_offset & (sec->dofs_align - 1)) {
13801			dtrace_dof_error(dof, "misaligned section");
13802			return (-1);
13803		}
13804
13805		if (sec->dofs_offset > len || sec->dofs_size > len ||
13806		    sec->dofs_offset + sec->dofs_size > len) {
13807			dtrace_dof_error(dof, "corrupt section header");
13808			return (-1);
13809		}
13810
13811		if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13812		    sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13813			dtrace_dof_error(dof, "non-terminating string table");
13814			return (-1);
13815		}
13816	}
13817
13818	/*
13819	 * Take a second pass through the sections and locate and perform any
13820	 * relocations that are present.  We do this after the first pass to
13821	 * be sure that all sections have had their headers validated.
13822	 */
13823	for (i = 0; i < dof->dofh_secnum; i++) {
13824		dof_sec_t *sec = (dof_sec_t *)(daddr +
13825		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13826
13827		if (!(sec->dofs_flags & DOF_SECF_LOAD))
13828			continue; /* skip sections that are not loadable */
13829
13830		switch (sec->dofs_type) {
13831		case DOF_SECT_URELHDR:
13832			if (dtrace_dof_relocate(dof, sec, ubase) != 0)
13833				return (-1);
13834			break;
13835		}
13836	}
13837
13838	if ((enab = *enabp) == NULL)
13839		enab = *enabp = dtrace_enabling_create(vstate);
13840
13841	for (i = 0; i < dof->dofh_secnum; i++) {
13842		dof_sec_t *sec = (dof_sec_t *)(daddr +
13843		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13844
13845		if (sec->dofs_type != DOF_SECT_ECBDESC)
13846			continue;
13847
13848		if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
13849			dtrace_enabling_destroy(enab);
13850			*enabp = NULL;
13851			return (-1);
13852		}
13853
13854		dtrace_enabling_add(enab, ep);
13855	}
13856
13857	return (0);
13858}
13859
13860/*
13861 * Process DOF for any options.  This routine assumes that the DOF has been
13862 * at least processed by dtrace_dof_slurp().
13863 */
13864static int
13865dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13866{
13867	int i, rval;
13868	uint32_t entsize;
13869	size_t offs;
13870	dof_optdesc_t *desc;
13871
13872	for (i = 0; i < dof->dofh_secnum; i++) {
13873		dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13874		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13875
13876		if (sec->dofs_type != DOF_SECT_OPTDESC)
13877			continue;
13878
13879		if (sec->dofs_align != sizeof (uint64_t)) {
13880			dtrace_dof_error(dof, "bad alignment in "
13881			    "option description");
13882			return (EINVAL);
13883		}
13884
13885		if ((entsize = sec->dofs_entsize) == 0) {
13886			dtrace_dof_error(dof, "zeroed option entry size");
13887			return (EINVAL);
13888		}
13889
13890		if (entsize < sizeof (dof_optdesc_t)) {
13891			dtrace_dof_error(dof, "bad option entry size");
13892			return (EINVAL);
13893		}
13894
13895		for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13896			desc = (dof_optdesc_t *)((uintptr_t)dof +
13897			    (uintptr_t)sec->dofs_offset + offs);
13898
13899			if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13900				dtrace_dof_error(dof, "non-zero option string");
13901				return (EINVAL);
13902			}
13903
13904			if (desc->dofo_value == DTRACEOPT_UNSET) {
13905				dtrace_dof_error(dof, "unset option");
13906				return (EINVAL);
13907			}
13908
13909			if ((rval = dtrace_state_option(state,
13910			    desc->dofo_option, desc->dofo_value)) != 0) {
13911				dtrace_dof_error(dof, "rejected option");
13912				return (rval);
13913			}
13914		}
13915	}
13916
13917	return (0);
13918}
13919
13920/*
13921 * DTrace Consumer State Functions
13922 */
13923static int
13924dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13925{
13926	size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
13927	void *base;
13928	uintptr_t limit;
13929	dtrace_dynvar_t *dvar, *next, *start;
13930	int i;
13931
13932	ASSERT(MUTEX_HELD(&dtrace_lock));
13933	ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13934
13935	bzero(dstate, sizeof (dtrace_dstate_t));
13936
13937	if ((dstate->dtds_chunksize = chunksize) == 0)
13938		dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13939
13940	if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13941		size = min;
13942
13943	if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
13944		return (ENOMEM);
13945
13946	dstate->dtds_size = size;
13947	dstate->dtds_base = base;
13948	dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13949	bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
13950
13951	hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13952
13953	if (hashsize != 1 && (hashsize & 1))
13954		hashsize--;
13955
13956	dstate->dtds_hashsize = hashsize;
13957	dstate->dtds_hash = dstate->dtds_base;
13958
13959	/*
13960	 * Set all of our hash buckets to point to the single sink, and (if
13961	 * it hasn't already been set) set the sink's hash value to be the
13962	 * sink sentinel value.  The sink is needed for dynamic variable
13963	 * lookups to know that they have iterated over an entire, valid hash
13964	 * chain.
13965	 */
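	/*
	 * Lookup-side sketch (illustrative): a dynamic variable lookup
	 * terminates its chain walk on the sentinel, roughly:
	 *
	 *	for (dvar = bucket->dtdh_chain;
	 *	    dvar->dtdv_hashval != DTRACE_DYNHASH_SINK;
	 *	    dvar = dvar->dtdv_next)
	 *		...
	 */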
13966	for (i = 0; i < hashsize; i++)
13967		dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13968
13969	if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13970		dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13971
13972	/*
13973	 * Determine number of active CPUs.  Divide free list evenly among
13974	 * active CPUs.
13975	 */
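	/*
	 * Worked example (hypothetical numbers): with 1000000 bytes left
	 * after the hash table, NCPU == 4 and a 256-byte chunk size,
	 * maxper is 250000 rounded down to a chunk multiple:
	 *
	 *	(250000 / 256) * 256 == 249856 bytes per CPU
	 */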
13976	start = (dtrace_dynvar_t *)
13977	    ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13978	limit = (uintptr_t)base + size;
13979
13980	maxper = (limit - (uintptr_t)start) / NCPU;
13981	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13982
13983#if !defined(sun)
13984	CPU_FOREACH(i) {
13985#else
13986	for (i = 0; i < NCPU; i++) {
13987#endif
13988		dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13989
13990		/*
13991		 * If we don't even have enough chunks to make it once through
13992		 * NCPUs, we're just going to allocate everything to the first
13993		 * CPU.  And if we're on the last CPU, we're going to allocate
13994		 * whatever is left over.  In either case, we set the limit to
13995		 * be the limit of the dynamic variable space.
13996		 */
13997		if (maxper == 0 || i == NCPU - 1) {
13998			limit = (uintptr_t)base + size;
13999			start = NULL;
14000		} else {
14001			limit = (uintptr_t)start + maxper;
14002			start = (dtrace_dynvar_t *)limit;
14003		}
14004
14005		ASSERT(limit <= (uintptr_t)base + size);
14006
14007		for (;;) {
14008			next = (dtrace_dynvar_t *)((uintptr_t)dvar +
14009			    dstate->dtds_chunksize);
14010
14011			if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
14012				break;
14013
14014			dvar->dtdv_next = next;
14015			dvar = next;
14016		}
14017
14018		if (maxper == 0)
14019			break;
14020	}
14021
14022	return (0);
14023}
14024
14025static void
14026dtrace_dstate_fini(dtrace_dstate_t *dstate)
14027{
14028	ASSERT(MUTEX_HELD(&cpu_lock));
14029
14030	if (dstate->dtds_base == NULL)
14031		return;
14032
14033	kmem_free(dstate->dtds_base, dstate->dtds_size);
14034	kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
14035}
14036
14037static void
14038dtrace_vstate_fini(dtrace_vstate_t *vstate)
14039{
14040	/*
14041	 * Logical XOR, where are you?
14042	 */
14043	ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
14044
14045	if (vstate->dtvs_nglobals > 0) {
14046		kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
14047		    sizeof (dtrace_statvar_t *));
14048	}
14049
14050	if (vstate->dtvs_ntlocals > 0) {
14051		kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
14052		    sizeof (dtrace_difv_t));
14053	}
14054
14055	ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
14056
14057	if (vstate->dtvs_nlocals > 0) {
14058		kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
14059		    sizeof (dtrace_statvar_t *));
14060	}
14061}
14062
14063#if defined(sun)
14064static void
14065dtrace_state_clean(dtrace_state_t *state)
14066{
14067	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14068		return;
14069
14070	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14071	dtrace_speculation_clean(state);
14072}
14073
14074static void
14075dtrace_state_deadman(dtrace_state_t *state)
14076{
14077	hrtime_t now;
14078
14079	dtrace_sync();
14080
14081	now = dtrace_gethrtime();
14082
14083	if (state != dtrace_anon.dta_state &&
14084	    now - state->dts_laststatus >= dtrace_deadman_user)
14085		return;
14086
14087	/*
14088	 * We must be sure that dts_alive never appears to be less than the
14089	 * value upon entry to dtrace_state_deadman(), and because we lack a
14090	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
14091	 * store INT64_MAX to it, followed by a memory barrier, followed by
14092	 * the new value.  This assures that dts_alive never appears to be
14093	 * less than its true value, regardless of the order in which the
14094	 * stores to the underlying storage are issued.
14095	 */
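	/*
	 * Illustrative hazard (an assumption about why the barrier
	 * matters): on a platform where 64-bit stores can tear, a reader
	 * might otherwise pair the low half of the new value with the
	 * high half of the old one, making dts_alive appear to move
	 * backwards and provoking a spurious deadman firing.
	 */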
14096	state->dts_alive = INT64_MAX;
14097	dtrace_membar_producer();
14098	state->dts_alive = now;
14099}
14100#else
14101static void
14102dtrace_state_clean(void *arg)
14103{
14104	dtrace_state_t *state = arg;
14105	dtrace_optval_t *opt = state->dts_options;
14106
14107	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14108		return;
14109
14110	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14111	dtrace_speculation_clean(state);
14112
14113	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14114	    dtrace_state_clean, state);
14115}
14116
14117static void
14118dtrace_state_deadman(void *arg)
14119{
14120	dtrace_state_t *state = arg;
14121	hrtime_t now;
14122
14123	dtrace_sync();
14124
14125	dtrace_debug_output();
14126
14127	now = dtrace_gethrtime();
14128
14129	if (state != dtrace_anon.dta_state &&
14130	    now - state->dts_laststatus >= dtrace_deadman_user)
14131		return;
14132
14133	/*
14134	 * We must be sure that dts_alive never appears to be less than the
14135	 * value upon entry to dtrace_state_deadman(), and because we lack a
14136	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
14137	 * store INT64_MAX to it, followed by a memory barrier, followed by
14138	 * the new value.  This assures that dts_alive never appears to be
14139	 * less than its true value, regardless of the order in which the
14140	 * stores to the underlying storage are issued.
14141	 */
14142	state->dts_alive = INT64_MAX;
14143	dtrace_membar_producer();
14144	state->dts_alive = now;
14145
14146	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14147	    dtrace_state_deadman, state);
14148}
14149#endif
14150
14151static dtrace_state_t *
14152#if defined(sun)
14153dtrace_state_create(dev_t *devp, cred_t *cr)
14154#else
14155dtrace_state_create(struct cdev *dev)
14156#endif
14157{
14158#if defined(sun)
14159	minor_t minor;
14160	major_t major;
14161#else
14162	cred_t *cr = NULL;
14163	int m = 0;
14164#endif
14165	char c[30];
14166	dtrace_state_t *state;
14167	dtrace_optval_t *opt;
14168	int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
14169
14170	ASSERT(MUTEX_HELD(&dtrace_lock));
14171	ASSERT(MUTEX_HELD(&cpu_lock));
14172
14173#if defined(sun)
14174	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
14175	    VM_BESTFIT | VM_SLEEP);
14176
14177	if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
14178		vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14179		return (NULL);
14180	}
14181
14182	state = ddi_get_soft_state(dtrace_softstate, minor);
14183#else
14184	if (dev != NULL) {
14185		cr = dev->si_cred;
14186		m = dev2unit(dev);
14187	}
14188
14189	/* Allocate memory for the state. */
14190	state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
14191#endif
14192
14193	state->dts_epid = DTRACE_EPIDNONE + 1;
14194
14195	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
14196#if defined(sun)
14197	state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
14198	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14199
14200	if (devp != NULL) {
14201		major = getemajor(*devp);
14202	} else {
14203		major = ddi_driver_major(dtrace_devi);
14204	}
14205
14206	state->dts_dev = makedevice(major, minor);
14207
14208	if (devp != NULL)
14209		*devp = state->dts_dev;
14210#else
14211	state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
14212	state->dts_dev = dev;
14213#endif
14214
14215	/*
14216	 * We allocate NCPU buffers.  On the one hand, this can be quite
14217	 * a bit of memory per instance (nearly 36K on a Starcat).  On the
14218	 * other hand, it saves an additional memory reference in the probe
14219	 * path.
14220	 */
14221	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14222	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14223
14224#if defined(sun)
14225	state->dts_cleaner = CYCLIC_NONE;
14226	state->dts_deadman = CYCLIC_NONE;
14227#else
14228	callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
14229	callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
14230#endif
14231	state->dts_vstate.dtvs_state = state;
14232
14233	for (i = 0; i < DTRACEOPT_MAX; i++)
14234		state->dts_options[i] = DTRACEOPT_UNSET;
14235
14236	/*
14237	 * Set the default options.
14238	 */
14239	opt = state->dts_options;
14240	opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14241	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14242	opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14243	opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14244	opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14245	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14246	opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14247	opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14248	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14249	opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14250	opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14251	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14252	opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14253	opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14254
14255	state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
14256
14257	/*
14258	 * Depending on the user credentials, we set flag bits which alter probe
14259	 * visibility or the amount of destructiveness allowed.  In the case of
14260	 * actual anonymous tracing, or the possession of all privileges, all of
14261	 * the normal checks are bypassed.
14262	 */
14263	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14264		state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14265		state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14266	} else {
14267		/*
14268		 * Set up the credentials for this instantiation.  We take a
14269		 * hold on the credential to prevent it from disappearing on
14270		 * us; this in turn prevents the zone_t referenced by this
14271		 * credential from disappearing.  This means that we can
14272		 * examine the credential and the zone from probe context.
14273		 */
14274		crhold(cr);
14275		state->dts_cred.dcr_cred = cr;
14276
14277		/*
14278		 * CRA_PROC means "we have *some* privilege for dtrace" and
14279		 * unlocks the use of variables like pid, zonename, etc.
14280		 */
14281		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14282		    PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14283			state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14284		}
14285
14286		/*
14287		 * dtrace_user allows use of syscall and profile providers.
14288		 * If the user also has proc_owner and/or proc_zone, we
14289		 * extend the scope to include additional visibility and
14290		 * destructive power.
14291		 */
14292		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14293			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14294				state->dts_cred.dcr_visible |=
14295				    DTRACE_CRV_ALLPROC;
14296
14297				state->dts_cred.dcr_action |=
14298				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14299			}
14300
14301			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14302				state->dts_cred.dcr_visible |=
14303				    DTRACE_CRV_ALLZONE;
14304
14305				state->dts_cred.dcr_action |=
14306				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14307			}
14308
14309			/*
14310			 * If we have all privs in whatever zone this is,
14311			 * we can do destructive things to processes which
14312			 * have altered credentials.
14313			 */
14314#if defined(sun)
14315			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14316			    cr->cr_zone->zone_privset)) {
14317				state->dts_cred.dcr_action |=
14318				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14319			}
14320#endif
14321		}
14322
14323		/*
14324		 * Holding the dtrace_kernel privilege also implies that
14325		 * the user has the dtrace_user privilege from a visibility
14326		 * perspective.  But without further privileges, some
14327		 * destructive actions are not available.
14328		 */
14329		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14330			/*
14331			 * Make all probes in all zones visible.  However,
14332			 * this doesn't mean that all actions become available
14333			 * to all zones.
14334			 */
14335			state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14336			    DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14337
14338			state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14339			    DTRACE_CRA_PROC;
14340			/*
14341			 * Holding proc_owner means that destructive actions
14342			 * for *this* zone are allowed.
14343			 */
14344			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14345				state->dts_cred.dcr_action |=
14346				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14347
14348			/*
14349			 * Holding proc_zone means that destructive actions
14350			 * for this user/group ID in all zones are allowed.
14351			 */
14352			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14353				state->dts_cred.dcr_action |=
14354				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14355
14356#if defined(sun)
14357			/*
14358			 * If we have all privs in whatever zone this is,
14359			 * we can do destructive things to processes which
14360			 * have altered credentials.
14361			 */
14362			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14363			    cr->cr_zone->zone_privset)) {
14364				state->dts_cred.dcr_action |=
14365				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14366			}
14367#endif
14368		}
14369
14370		/*
14371		 * Holding the dtrace_proc privilege gives control over fasttrap
14372		 * and pid providers.  We need to grant wider destructive
14373		 * privileges in the event that the user has proc_owner and/or
14374		 * proc_zone.
14375		 */
14376		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14377			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14378				state->dts_cred.dcr_action |=
14379				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14380
14381			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14382				state->dts_cred.dcr_action |=
14383				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14384		}
14385	}
14386
14387	return (state);
14388}
14389
14390static int
14391dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14392{
14393	dtrace_optval_t *opt = state->dts_options, size;
14394	processorid_t cpu = 0;
14395	int flags = 0, rval, factor, divisor = 1;
14396
14397	ASSERT(MUTEX_HELD(&dtrace_lock));
14398	ASSERT(MUTEX_HELD(&cpu_lock));
14399	ASSERT(which < DTRACEOPT_MAX);
14400	ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14401	    (state == dtrace_anon.dta_state &&
14402	    state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14403
14404	if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14405		return (0);
14406
14407	if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14408		cpu = opt[DTRACEOPT_CPU];
14409
14410	if (which == DTRACEOPT_SPECSIZE)
14411		flags |= DTRACEBUF_NOSWITCH;
14412
14413	if (which == DTRACEOPT_BUFSIZE) {
14414		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14415			flags |= DTRACEBUF_RING;
14416
14417		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14418			flags |= DTRACEBUF_FILL;
14419
14420		if (state != dtrace_anon.dta_state ||
14421		    state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14422			flags |= DTRACEBUF_INACTIVE;
14423	}
14424
14425	for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
14426		/*
14427		 * The size must be 8-byte aligned.  If the size is not 8-byte
14428		 * aligned, drop it down by the difference.
14429		 */
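		/*
		 * For example (illustrative): a requested size of 4099 has
		 * (4099 & 7) == 3, so it is reduced to 4096.
		 */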
14430		if (size & (sizeof (uint64_t) - 1))
14431			size -= size & (sizeof (uint64_t) - 1);
14432
14433		if (size < state->dts_reserve) {
14434			/*
14435			 * Buffers must always be large enough to accommodate
14436			 * their prereserved space.  We return E2BIG instead
14437			 * of ENOMEM in this case to allow for user-level
14438			 * software to differentiate the cases.
14439			 */
14440			return (E2BIG);
14441		}
14442
14443		rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
14444
14445		if (rval != ENOMEM) {
14446			opt[which] = size;
14447			return (rval);
14448		}
14449
14450		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14451			return (rval);
14452
14453		for (divisor = 2; divisor < factor; divisor <<= 1)
14454			continue;
14455	}
14456
14457	return (ENOMEM);
14458}
14459
14460static int
14461dtrace_state_buffers(dtrace_state_t *state)
14462{
14463	dtrace_speculation_t *spec = state->dts_speculations;
14464	int rval, i;
14465
14466	if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14467	    DTRACEOPT_BUFSIZE)) != 0)
14468		return (rval);
14469
14470	if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14471	    DTRACEOPT_AGGSIZE)) != 0)
14472		return (rval);
14473
14474	for (i = 0; i < state->dts_nspeculations; i++) {
14475		if ((rval = dtrace_state_buffer(state,
14476		    spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14477			return (rval);
14478	}
14479
14480	return (0);
14481}
14482
14483static void
14484dtrace_state_prereserve(dtrace_state_t *state)
14485{
14486	dtrace_ecb_t *ecb;
14487	dtrace_probe_t *probe;
14488
14489	state->dts_reserve = 0;
14490
14491	if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14492		return;
14493
14494	/*
14495	 * If our buffer policy is a "fill" buffer policy, we need to set the
14496	 * prereserved space to be the space required by the END probes.
14497	 */
14498	probe = dtrace_probes[dtrace_probeid_end - 1];
14499	ASSERT(probe != NULL);
14500
14501	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14502		if (ecb->dte_state != state)
14503			continue;
14504
14505		state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14506	}
14507}
14508
14509static int
14510dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14511{
14512	dtrace_optval_t *opt = state->dts_options, sz, nspec;
14513	dtrace_speculation_t *spec;
14514	dtrace_buffer_t *buf;
14515#if defined(sun)
14516	cyc_handler_t hdlr;
14517	cyc_time_t when;
14518#endif
14519	int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
14520	dtrace_icookie_t cookie;
14521
14522	mutex_enter(&cpu_lock);
14523	mutex_enter(&dtrace_lock);
14524
14525	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14526		rval = EBUSY;
14527		goto out;
14528	}
14529
14530	/*
14531	 * Before we can perform any checks, we must prime all of the
14532	 * retained enablings that correspond to this state.
14533	 */
14534	dtrace_enabling_prime(state);
14535
14536	if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14537		rval = EACCES;
14538		goto out;
14539	}
14540
14541	dtrace_state_prereserve(state);
14542
14543	/*
14544	 * Now we want to try to allocate our speculations.
14545	 * We do not automatically resize the number of speculations; if
14546	 * this fails, we will fail the operation.
14547	 */
14548	nspec = opt[DTRACEOPT_NSPEC];
14549	ASSERT(nspec != DTRACEOPT_UNSET);
14550
14551	if (nspec > INT_MAX) {
14552		rval = ENOMEM;
14553		goto out;
14554	}
14555
14556	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
14557	    KM_NOSLEEP | KM_NORMALPRI);
14558
14559	if (spec == NULL) {
14560		rval = ENOMEM;
14561		goto out;
14562	}
14563
14564	state->dts_speculations = spec;
14565	state->dts_nspeculations = (int)nspec;
14566
14567	for (i = 0; i < nspec; i++) {
14568		if ((buf = kmem_zalloc(bufsize,
14569		    KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
14570			rval = ENOMEM;
14571			goto err;
14572		}
14573
14574		spec[i].dtsp_buffer = buf;
14575	}
14576
14577	if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14578		if (dtrace_anon.dta_state == NULL) {
14579			rval = ENOENT;
14580			goto out;
14581		}
14582
14583		if (state->dts_necbs != 0) {
14584			rval = EALREADY;
14585			goto out;
14586		}
14587
14588		state->dts_anon = dtrace_anon_grab();
14589		ASSERT(state->dts_anon != NULL);
14590		state = state->dts_anon;
14591
14592		/*
14593		 * We want "grabanon" to be set in the grabbed state, so we'll
14594		 * copy that option value from the grabbing state into the
14595		 * grabbed state.
14596		 */
14597		state->dts_options[DTRACEOPT_GRABANON] =
14598		    opt[DTRACEOPT_GRABANON];
14599
14600		*cpu = dtrace_anon.dta_beganon;
14601
14602		/*
14603		 * If the anonymous state is active (as it almost certainly
14604		 * is if the anonymous enabling ultimately matched anything),
14605		 * we don't allow any further option processing -- but we
14606		 * don't return failure.
14607		 */
14608		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14609			goto out;
14610	}
14611
14612	if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14613	    opt[DTRACEOPT_AGGSIZE] != 0) {
14614		if (state->dts_aggregations == NULL) {
14615			/*
14616			 * We're not going to create an aggregation buffer
14617			 * because we don't have any ECBs that contain
14618			 * aggregations -- set this option to 0.
14619			 */
14620			opt[DTRACEOPT_AGGSIZE] = 0;
14621		} else {
14622			/*
14623			 * If we have an aggregation buffer, we must also have
14624			 * a buffer to use as scratch.
14625			 */
14626			if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14627			    opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14628				opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14629			}
14630		}
14631	}
14632
14633	if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14634	    opt[DTRACEOPT_SPECSIZE] != 0) {
14635		if (!state->dts_speculates) {
14636			/*
14637			 * We're not going to create speculation buffers
14638			 * because we don't have any ECBs that actually
14639			 * speculate -- set the speculation size to 0.
14640			 */
14641			opt[DTRACEOPT_SPECSIZE] = 0;
14642		}
14643	}
14644
14645	/*
14646	 * The bare minimum size for any buffer that we're actually going to
14647	 * do anything to is sizeof (uint64_t).
14648	 */
14649	sz = sizeof (uint64_t);
14650
14651	if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
14652	    (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
14653	    (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14654		/*
14655		 * A buffer size has been explicitly set to 0 (or to a size
14656		 * that will be adjusted to 0) and we need the space -- we
14657		 * need to return failure.  We return ENOSPC to differentiate
14658		 * it from failing to allocate a buffer due to failure to meet
14659		 * the reserve (for which we return E2BIG).
14660		 */
14661		rval = ENOSPC;
14662		goto out;
14663	}
14664
14665	if ((rval = dtrace_state_buffers(state)) != 0)
14666		goto err;
14667
14668	if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14669		sz = dtrace_dstate_defsize;
14670
14671	do {
14672		rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
14673
14674		if (rval == 0)
14675			break;
14676
14677		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14678			goto err;
14679	} while (sz >>= 1);
14680
14681	opt[DTRACEOPT_DYNVARSIZE] = sz;
14682
14683	if (rval != 0)
14684		goto err;
14685
14686	if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14687		opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14688
14689	if (opt[DTRACEOPT_CLEANRATE] == 0)
14690		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14691
14692	if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14693		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14694
14695	if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14696		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14697
14698	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14699#if defined(sun)
14700	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14701	hdlr.cyh_arg = state;
14702	hdlr.cyh_level = CY_LOW_LEVEL;
14703
14704	when.cyt_when = 0;
14705	when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14706
14707	state->dts_cleaner = cyclic_add(&hdlr, &when);
14708
14709	hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14710	hdlr.cyh_arg = state;
14711	hdlr.cyh_level = CY_LOW_LEVEL;
14712
14713	when.cyt_when = 0;
14714	when.cyt_interval = dtrace_deadman_interval;
14715
14716	state->dts_deadman = cyclic_add(&hdlr, &when);
14717#else
14718	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14719	    dtrace_state_clean, state);
14720	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14721	    dtrace_state_deadman, state);
14722#endif
14723
14724	state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14725
14726#if defined(sun)
14727	if (state->dts_getf != 0 &&
14728	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
14729		/*
14730		 * We don't have kernel privs but we have at least one call
14731		 * to getf(); we need to bump our zone's count, and (if
14732		 * this is the first enabling to have an unprivileged call
14733		 * to getf()) we need to hook into closef().
14734		 */
14735		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
14736
14737		if (dtrace_getf++ == 0) {
14738			ASSERT(dtrace_closef == NULL);
14739			dtrace_closef = dtrace_getf_barrier;
14740		}
14741	}
14742#endif
14743
14744	/*
14745	 * Now it's time to actually fire the BEGIN probe.  We need to disable
14746	 * interrupts here both to record the CPU on which we fired the BEGIN
14747	 * probe (the data from this CPU will be processed first at user
14748	 * level) and to manually activate the buffer for this CPU.
14749	 */
14750	cookie = dtrace_interrupt_disable();
14751	*cpu = curcpu;
14752	ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14753	state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14754
14755	dtrace_probe(dtrace_probeid_begin,
14756	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14757	dtrace_interrupt_enable(cookie);
14758	/*
14759	 * We may have had an exit action from a BEGIN probe; only change our
14760	 * state to ACTIVE if we're still in WARMUP.
14761	 */
14762	ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
14763	    state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14764
14765	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14766		state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14767
14768	/*
14769	 * Regardless of whether we're now in ACTIVE or DRAINING, we
14770	 * want each CPU to transition its principal buffer out of the
14771	 * INACTIVE state.  Doing this assures that no CPU will suddenly begin
14772	 * processing an ECB halfway down a probe's ECB chain; all CPUs will
14773	 * atomically transition from processing none of a state's ECBs to
14774	 * processing all of them.
14775	 */
14776	dtrace_xcall(DTRACE_CPUALL,
14777	    (dtrace_xcall_t)dtrace_buffer_activate, state);
14778	goto out;
14779
14780err:
14781	dtrace_buffer_free(state->dts_buffer);
14782	dtrace_buffer_free(state->dts_aggbuffer);
14783
14784	if ((nspec = state->dts_nspeculations) == 0) {
14785		ASSERT(state->dts_speculations == NULL);
14786		goto out;
14787	}
14788
14789	spec = state->dts_speculations;
14790	ASSERT(spec != NULL);
14791
14792	for (i = 0; i < state->dts_nspeculations; i++) {
14793		if ((buf = spec[i].dtsp_buffer) == NULL)
14794			break;
14795
14796		dtrace_buffer_free(buf);
14797		kmem_free(buf, bufsize);
14798	}
14799
14800	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14801	state->dts_nspeculations = 0;
14802	state->dts_speculations = NULL;
14803
14804out:
14805	mutex_exit(&dtrace_lock);
14806	mutex_exit(&cpu_lock);
14807
14808	return (rval);
14809}
14810
14811static int
14812dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14813{
14814	dtrace_icookie_t cookie;
14815
14816	ASSERT(MUTEX_HELD(&dtrace_lock));
14817
14818	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14819	    state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14820		return (EINVAL);
14821
14822	/*
14823	 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14824	 * to be sure that every CPU has seen it.  See below for the details
14825	 * on why this is done.
14826	 */
14827	state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14828	dtrace_sync();
14829
14830	/*
14831	 * By this point, it is impossible for any CPU to be still processing
14832	 * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
14833	 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14834	 * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
14835	 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14836	 * iff we're in the END probe.
14837	 */
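	/*
	 * Overall ordering of this routine (a sketch): ACTIVE -> DRAINING
	 * -> dtrace_sync() -> COOLDOWN -> dtrace_sync() -> END probe ->
	 * STOPPED -> dtrace_sync().
	 */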
14838	state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14839	dtrace_sync();
14840	ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14841
14842	/*
14843	 * Finally, we can release the reserve and call the END probe.  We
14844	 * disable interrupts across calling the END probe to allow us to
14845	 * return the CPU on which we actually called the END probe.  This
14846	 * allows user-land to be sure that this CPU's principal buffer is
14847	 * processed last.
14848	 */
14849	state->dts_reserve = 0;
14850
14851	cookie = dtrace_interrupt_disable();
14852	*cpu = curcpu;
14853	dtrace_probe(dtrace_probeid_end,
14854	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14855	dtrace_interrupt_enable(cookie);
14856
14857	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14858	dtrace_sync();
14859
14860#if defined(sun)
14861	if (state->dts_getf != 0 &&
14862	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
14863		/*
14864		 * We don't have kernel privs but we have at least one call
14865		 * to getf(); we need to lower our zone's count, and (if
14866		 * this is the last enabling to have an unprivileged call
14867		 * to getf()) we need to clear the closef() hook.
14868		 */
14869		ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
14870		ASSERT(dtrace_closef == dtrace_getf_barrier);
14871		ASSERT(dtrace_getf > 0);
14872
14873		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
14874
14875		if (--dtrace_getf == 0)
14876			dtrace_closef = NULL;
14877	}
14878#endif
14879
14880	return (0);
14881}
14882
14883static int
14884dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14885    dtrace_optval_t val)
14886{
14887	ASSERT(MUTEX_HELD(&dtrace_lock));
14888
14889	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14890		return (EBUSY);
14891
14892	if (option >= DTRACEOPT_MAX)
14893		return (EINVAL);
14894
14895	if (option != DTRACEOPT_CPU && val < 0)
14896		return (EINVAL);
14897
14898	switch (option) {
14899	case DTRACEOPT_DESTRUCTIVE:
14900		if (dtrace_destructive_disallow)
14901			return (EACCES);
14902
14903		state->dts_cred.dcr_destructive = 1;
14904		break;
14905
14906	case DTRACEOPT_BUFSIZE:
14907	case DTRACEOPT_DYNVARSIZE:
14908	case DTRACEOPT_AGGSIZE:
14909	case DTRACEOPT_SPECSIZE:
14910	case DTRACEOPT_STRSIZE:
14911		if (val < 0)
14912			return (EINVAL);
14913
14914		if (val >= LONG_MAX) {
14915			/*
14916			 * If this is an otherwise negative value, set it to
14917			 * the highest multiple of 128m less than LONG_MAX.
14918			 * Technically, we're adjusting the size without
14919			 * regard to the buffer resizing policy, but in fact,
14920			 * this has no effect -- if we set the buffer size to
14921			 * ~LONG_MAX and the buffer policy is ultimately set to
14922			 * be "manual", the buffer allocation is guaranteed to
14923			 * fail, if only because the allocation requires two
14924			 * buffers.  (We set the size to the highest
14925			 * multiple of 128m because it ensures that the size
14926			 * will remain a multiple of a megabyte when
14927			 * repeatedly halved -- all the way down to 15m.)
14928			 */
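			/*
			 * Worked arithmetic (assuming LP64, where LONG_MAX
			 * is 2^63 - 1): val becomes 2^63 - 2^27, i.e.
			 * 0x7ffffffff8000000 -- a multiple of 128m (2^27).
			 */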
14929			val = LONG_MAX - (1 << 27) + 1;
14930		}
14931	}
14932
14933	state->dts_options[option] = val;
14934
14935	return (0);
14936}
14937
14938static void
14939dtrace_state_destroy(dtrace_state_t *state)
14940{
14941	dtrace_ecb_t *ecb;
14942	dtrace_vstate_t *vstate = &state->dts_vstate;
14943#if defined(sun)
14944	minor_t minor = getminor(state->dts_dev);
14945#endif
14946	int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
14947	dtrace_speculation_t *spec = state->dts_speculations;
14948	int nspec = state->dts_nspeculations;
14949	uint32_t match;
14950
14951	ASSERT(MUTEX_HELD(&dtrace_lock));
14952	ASSERT(MUTEX_HELD(&cpu_lock));
14953
14954	/*
14955	 * First, retract any retained enablings for this state.
14956	 */
14957	dtrace_enabling_retract(state);
14958	ASSERT(state->dts_nretained == 0);
14959
14960	if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14961	    state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14962		/*
14963		 * We have managed to come into dtrace_state_destroy() on a
14964		 * hot enabling -- almost certainly because of a disorderly
14965		 * shutdown of a consumer.  (That is, a consumer that is
14966		 * exiting without having called dtrace_stop().) In this case,
14967		 * we're going to set our activity to be KILLED, and then
14968		 * issue a sync to be sure that everyone is out of probe
14969		 * context before we start blowing away ECBs.
14970		 */
14971		state->dts_activity = DTRACE_ACTIVITY_KILLED;
14972		dtrace_sync();
14973	}
14974
14975	/*
14976	 * Release the credential hold we took in dtrace_state_create().
14977	 */
14978	if (state->dts_cred.dcr_cred != NULL)
14979		crfree(state->dts_cred.dcr_cred);
14980
14981	/*
14982	 * Now we can safely disable and destroy any enabled probes.  Because
14983	 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14984	 * (especially if they're all enabled), we take two passes through the
14985	 * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14986	 * in the second we disable whatever is left over.
14987	 */
14988	for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14989		for (i = 0; i < state->dts_necbs; i++) {
14990			if ((ecb = state->dts_ecbs[i]) == NULL)
14991				continue;
14992
14993			if (match && ecb->dte_probe != NULL) {
14994				dtrace_probe_t *probe = ecb->dte_probe;
14995				dtrace_provider_t *prov = probe->dtpr_provider;
14996
14997				if (!(prov->dtpv_priv.dtpp_flags & match))
14998					continue;
14999			}
15000
15001			dtrace_ecb_disable(ecb);
15002			dtrace_ecb_destroy(ecb);
15003		}
15004
15005		if (!match)
15006			break;
15007	}
15008
15009	/*
15010	 * Before we free the buffers, perform one more sync to assure that
15011	 * every CPU is out of probe context.
15012	 */
15013	dtrace_sync();
15014
15015	dtrace_buffer_free(state->dts_buffer);
15016	dtrace_buffer_free(state->dts_aggbuffer);
15017
15018	for (i = 0; i < nspec; i++)
15019		dtrace_buffer_free(spec[i].dtsp_buffer);
15020
15021#if defined(sun)
15022	if (state->dts_cleaner != CYCLIC_NONE)
15023		cyclic_remove(state->dts_cleaner);
15024
15025	if (state->dts_deadman != CYCLIC_NONE)
15026		cyclic_remove(state->dts_deadman);
15027#else
15028	callout_stop(&state->dts_cleaner);
15029	callout_drain(&state->dts_cleaner);
15030	callout_stop(&state->dts_deadman);
15031	callout_drain(&state->dts_deadman);
15032#endif
15033
15034	dtrace_dstate_fini(&vstate->dtvs_dynvars);
15035	dtrace_vstate_fini(vstate);
15036	if (state->dts_ecbs != NULL)
15037		kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
15038
15039	if (state->dts_aggregations != NULL) {
15040#ifdef DEBUG
15041		for (i = 0; i < state->dts_naggregations; i++)
15042			ASSERT(state->dts_aggregations[i] == NULL);
15043#endif
15044		ASSERT(state->dts_naggregations > 0);
15045		kmem_free(state->dts_aggregations,
15046		    state->dts_naggregations * sizeof (dtrace_aggregation_t *));
15047	}
15048
15049	kmem_free(state->dts_buffer, bufsize);
15050	kmem_free(state->dts_aggbuffer, bufsize);
15051
15052	for (i = 0; i < nspec; i++)
15053		kmem_free(spec[i].dtsp_buffer, bufsize);
15054
15055	if (spec != NULL)
15056		kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
15057
15058	dtrace_format_destroy(state);
15059
15060	if (state->dts_aggid_arena != NULL) {
15061#if defined(sun)
15062		vmem_destroy(state->dts_aggid_arena);
15063#else
15064		delete_unrhdr(state->dts_aggid_arena);
15065#endif
15066		state->dts_aggid_arena = NULL;
15067	}
15068#if defined(sun)
15069	ddi_soft_state_free(dtrace_softstate, minor);
15070	vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
15071#endif
15072}
15073
15074/*
15075 * DTrace Anonymous Enabling Functions
15076 */
15077static dtrace_state_t *
15078dtrace_anon_grab(void)
15079{
15080	dtrace_state_t *state;
15081
15082	ASSERT(MUTEX_HELD(&dtrace_lock));
15083
15084	if ((state = dtrace_anon.dta_state) == NULL) {
15085		ASSERT(dtrace_anon.dta_enabling == NULL);
15086		return (NULL);
15087	}
15088
15089	ASSERT(dtrace_anon.dta_enabling != NULL);
15090	ASSERT(dtrace_retained != NULL);
15091
15092	dtrace_enabling_destroy(dtrace_anon.dta_enabling);
15093	dtrace_anon.dta_enabling = NULL;
15094	dtrace_anon.dta_state = NULL;
15095
15096	return (state);
15097}
15098
15099static void
15100dtrace_anon_property(void)
15101{
15102	int i, rv;
15103	dtrace_state_t *state;
15104	dof_hdr_t *dof;
15105	char c[32];		/* enough for "dof-data-" + digits */
15106
15107	ASSERT(MUTEX_HELD(&dtrace_lock));
15108	ASSERT(MUTEX_HELD(&cpu_lock));
15109
15110	for (i = 0; ; i++) {
15111		(void) snprintf(c, sizeof (c), "dof-data-%d", i);
15112
15113		dtrace_err_verbose = 1;
15114
15115		if ((dof = dtrace_dof_property(c)) == NULL) {
15116			dtrace_err_verbose = 0;
15117			break;
15118		}
15119
15120#if defined(sun)
15121		/*
15122		 * We want to create anonymous state, so we need to transition
15123		 * the kernel debugger to indicate that DTrace is active.  If
15124		 * this fails (e.g. because the debugger has modified text in
15125		 * some way), we won't continue with the processing.
15126		 */
15127		if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15128			cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15129			    "enabling ignored.");
15130			dtrace_dof_destroy(dof);
15131			break;
15132		}
15133#endif
15134
15135		/*
15136		 * If we haven't allocated an anonymous state, we'll do so now.
15137		 */
15138		if ((state = dtrace_anon.dta_state) == NULL) {
15139#if defined(sun)
15140			state = dtrace_state_create(NULL, NULL);
15141#else
15142			state = dtrace_state_create(NULL);
15143#endif
15144			dtrace_anon.dta_state = state;
15145
15146			if (state == NULL) {
15147				/*
15148				 * This basically shouldn't happen:  the only
15149				 * failure mode from dtrace_state_create() is a
15150				 * failure of ddi_soft_state_zalloc() that
15151				 * itself should never happen.  Still, the
15152				 * interface allows for a failure mode, and
15153				 * we want to fail as gracefully as possible:
15154				 * we'll emit an error message and cease
15155				 * processing anonymous state in this case.
15156				 */
15157				cmn_err(CE_WARN, "failed to create "
15158				    "anonymous state");
15159				dtrace_dof_destroy(dof);
15160				break;
15161			}
15162		}
15163
15164		rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
15165		    &dtrace_anon.dta_enabling, 0, B_TRUE);
15166
15167		if (rv == 0)
15168			rv = dtrace_dof_options(dof, state);
15169
15170		dtrace_err_verbose = 0;
15171		dtrace_dof_destroy(dof);
15172
15173		if (rv != 0) {
15174			/*
15175			 * This is malformed DOF; chuck any anonymous state
15176			 * that we created.
15177			 */
15178			ASSERT(dtrace_anon.dta_enabling == NULL);
15179			dtrace_state_destroy(state);
15180			dtrace_anon.dta_state = NULL;
15181			break;
15182		}
15183
15184		ASSERT(dtrace_anon.dta_enabling != NULL);
15185	}
15186
15187	if (dtrace_anon.dta_enabling != NULL) {
15188		int rval;
15189
15190		/*
15191		 * dtrace_enabling_retain() can only fail because we are
15192		 * trying to retain more enablings than are allowed -- but
15193		 * we only have one anonymous enabling, and we are guaranteed
15194		 * to be allowed at least one retained enabling; we assert
15195		 * that dtrace_enabling_retain() returns success.
15196		 */
15197		rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
15198		ASSERT(rval == 0);
15199
15200		dtrace_enabling_dump(dtrace_anon.dta_enabling);
15201	}
15202}
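
/*
 * [Editor's note -- hedged illustration; the exact on-disk encoding is
 * not shown in this file.]  The "dof-data-%d" properties consumed above
 * are what "dtrace -A" emits into the dtrace driver's configuration on
 * illumos, conceptually:
 *
 *	# /kernel/drv/dtrace.conf (sketch)
 *	dof-data-0=<DOF image produced by libdtrace>;
 *	dof-data-1=<DOF image produced by libdtrace>;
 *
 * dtrace_dof_property() materializes each property in turn; the loop
 * terminates at the first index for which no property exists.
 */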
15203
15204/*
15205 * DTrace Helper Functions
15206 */
15207static void
15208dtrace_helper_trace(dtrace_helper_action_t *helper,
15209    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15210{
15211	uint32_t size, next, nnext, i;
15212	dtrace_helptrace_t *ent, *buffer;
15213	uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
15214
15215	if ((buffer = dtrace_helptrace_buffer) == NULL)
15216		return;
15217
15218	ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15219
15220	/*
15221	 * What would a tracing framework be without its own tracing
15222	 * framework?  (Well, a hell of a lot simpler, for starters...)
15223	 */
15224	size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15225	    sizeof (uint64_t) - sizeof (uint64_t);
15226
15227	/*
15228	 * Iterate until we can allocate a slot in the trace buffer.
15229	 */
15230	do {
15231		next = dtrace_helptrace_next;
15232
15233		if (next + size < dtrace_helptrace_bufsize) {
15234			nnext = next + size;
15235		} else {
15236			nnext = size;
15237		}
15238	} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
15239
15240	/*
15241	 * We have our slot; fill it in.
15242	 */
15243	if (nnext == size) {
15244		dtrace_helptrace_wrapped++;
15245		next = 0;
15246	}
15247
15248	ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next);
15249	ent->dtht_helper = helper;
15250	ent->dtht_where = where;
15251	ent->dtht_nlocals = vstate->dtvs_nlocals;
15252
15253	ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15254	    mstate->dtms_fltoffs : -1;
15255	ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15256	ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
15257
15258	for (i = 0; i < vstate->dtvs_nlocals; i++) {
15259		dtrace_statvar_t *svar;
15260
15261		if ((svar = vstate->dtvs_locals[i]) == NULL)
15262			continue;
15263
15264		ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
15265		ent->dtht_locals[i] =
15266		    ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
15267	}
15268}
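
/*
 * [Editor's aside -- generic sketch with hypothetical names.]  Two idioms
 * above merit a note.  First, the entry size subtracts one uint64_t
 * because dtrace_helptrace_t ends in a one-element dtht_locals[] array
 * that the nlocals values overlay.  Second, the dtrace_cas32() loop is a
 * lock-free ring-cursor reservation, reducible to:
 *
 *	do {
 *		old = cursor;
 *		new = (old + size < bufsize) ? old + size : size;
 *	} while (dtrace_cas32(&cursor, old, new) != old);
 *
 * A post-loop value of new == size means the reservation wrapped, so the
 * entry is written at offset 0 -- precisely the nnext == size case
 * handled above.
 */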
15269
15270static uint64_t
15271dtrace_helper(int which, dtrace_mstate_t *mstate,
15272    dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15273{
15274	uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
15275	uint64_t sarg0 = mstate->dtms_arg[0];
15276	uint64_t sarg1 = mstate->dtms_arg[1];
15277	uint64_t rval = 0;
15278	dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15279	dtrace_helper_action_t *helper;
15280	dtrace_vstate_t *vstate;
15281	dtrace_difo_t *pred;
15282	int i, trace = dtrace_helptrace_buffer != NULL;
15283
15284	ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15285
15286	if (helpers == NULL)
15287		return (0);
15288
15289	if ((helper = helpers->dthps_actions[which]) == NULL)
15290		return (0);
15291
15292	vstate = &helpers->dthps_vstate;
15293	mstate->dtms_arg[0] = arg0;
15294	mstate->dtms_arg[1] = arg1;
15295
15296	/*
15297	 * Now iterate over each helper.  If its predicate evaluates to 'true',
15298	 * we'll call the corresponding actions.  Note that the below calls
15299	 * to dtrace_dif_emulate() may set faults in machine state.  This is
15300	 * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
15301	 * the stored DIF offset with its own (which is the desired behavior).
15302	 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15303	 * from machine state; this is okay, too.
15304	 */
15305	for (; helper != NULL; helper = helper->dtha_next) {
15306		if ((pred = helper->dtha_predicate) != NULL) {
15307			if (trace)
15308				dtrace_helper_trace(helper, mstate, vstate, 0);
15309
15310			if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15311				goto next;
15312
15313			if (*flags & CPU_DTRACE_FAULT)
15314				goto err;
15315		}
15316
15317		for (i = 0; i < helper->dtha_nactions; i++) {
15318			if (trace)
15319				dtrace_helper_trace(helper,
15320				    mstate, vstate, i + 1);
15321
15322			rval = dtrace_dif_emulate(helper->dtha_actions[i],
15323			    mstate, vstate, state);
15324
15325			if (*flags & CPU_DTRACE_FAULT)
15326				goto err;
15327		}
15328
15329next:
15330		if (trace)
15331			dtrace_helper_trace(helper, mstate, vstate,
15332			    DTRACE_HELPTRACE_NEXT);
15333	}
15334
15335	if (trace)
15336		dtrace_helper_trace(helper, mstate, vstate,
15337		    DTRACE_HELPTRACE_DONE);
15338
15339	/*
15340	 * Restore the arg0 that we saved upon entry.
15341	 */
15342	mstate->dtms_arg[0] = sarg0;
15343	mstate->dtms_arg[1] = sarg1;
15344
15345	return (rval);
15346
15347err:
15348	if (trace)
15349		dtrace_helper_trace(helper, mstate, vstate,
15350		    DTRACE_HELPTRACE_ERR);
15351
15352	/*
15353	 * Restore the arg0 that we saved upon entry.
15354	 */
15355	mstate->dtms_arg[0] = sarg0;
15356	mstate->dtms_arg[1] = sarg1;
15357
15358	return (0);
15359}
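
/*
 * [Editor's note -- illustrative D source, not taken from this file.]
 * The helper actions evaluated above originate in user processes as D
 * clauses on the helper pseudo-probe, roughly:
 *
 *	dtrace:helper:ustack:
 *	/this->frame != 0/
 *	{
 *		... DIF expression actions ...
 *	}
 *
 * The predicate and each action arrive as DIFOs via DOF; dtrace_helper()
 * evaluates the predicate first and skips the clause's actions when it
 * yields zero, mirroring ordinary probe/predicate semantics.
 */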
15360
15361static void
15362dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15363    dtrace_vstate_t *vstate)
15364{
15365	int i;
15366
15367	if (helper->dtha_predicate != NULL)
15368		dtrace_difo_release(helper->dtha_predicate, vstate);
15369
15370	for (i = 0; i < helper->dtha_nactions; i++) {
15371		ASSERT(helper->dtha_actions[i] != NULL);
15372		dtrace_difo_release(helper->dtha_actions[i], vstate);
15373	}
15374
15375	kmem_free(helper->dtha_actions,
15376	    helper->dtha_nactions * sizeof (dtrace_difo_t *));
15377	kmem_free(helper, sizeof (dtrace_helper_action_t));
15378}
15379
15380static int
15381dtrace_helper_destroygen(int gen)
15382{
15383	proc_t *p = curproc;
15384	dtrace_helpers_t *help = p->p_dtrace_helpers;
15385	dtrace_vstate_t *vstate;
15386	int i;
15387
15388	ASSERT(MUTEX_HELD(&dtrace_lock));
15389
15390	if (help == NULL || gen > help->dthps_generation)
15391		return (EINVAL);
15392
15393	vstate = &help->dthps_vstate;
15394
15395	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15396		dtrace_helper_action_t *last = NULL, *h, *next;
15397
15398		for (h = help->dthps_actions[i]; h != NULL; h = next) {
15399			next = h->dtha_next;
15400
15401			if (h->dtha_generation == gen) {
15402				if (last != NULL) {
15403					last->dtha_next = next;
15404				} else {
15405					help->dthps_actions[i] = next;
15406				}
15407
15408				dtrace_helper_action_destroy(h, vstate);
15409			} else {
15410				last = h;
15411			}
15412		}
15413	}
15414
15415	/*
15416	 * Iterate until we've cleared out all helper providers with the
15417	 * given generation number.
15418	 */
15419	for (;;) {
15420		dtrace_helper_provider_t *prov;
15421
15422		/*
15423		 * Look for a helper provider with the right generation. We
15424		 * have to start back at the beginning of the list each time
15425		 * because we drop dtrace_lock. It's unlikely that we'll make
15426		 * more than two passes.
15427		 */
15428		for (i = 0; i < help->dthps_nprovs; i++) {
15429			prov = help->dthps_provs[i];
15430
15431			if (prov->dthp_generation == gen)
15432				break;
15433		}
15434
15435		/*
15436		 * If there were no matches, we're done.
15437		 */
15438		if (i == help->dthps_nprovs)
15439			break;
15440
15441		/*
15442		 * Move the last helper provider into this slot.
15443		 */
15444		help->dthps_nprovs--;
15445		help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15446		help->dthps_provs[help->dthps_nprovs] = NULL;
15447
15448		mutex_exit(&dtrace_lock);
15449
15450		/*
15451		 * If we have a meta provider, remove this helper provider.
15452		 */
15453		mutex_enter(&dtrace_meta_lock);
15454		if (dtrace_meta_pid != NULL) {
15455			ASSERT(dtrace_deferred_pid == NULL);
15456			dtrace_helper_provider_remove(&prov->dthp_prov,
15457			    p->p_pid);
15458		}
15459		mutex_exit(&dtrace_meta_lock);
15460
15461		dtrace_helper_provider_destroy(prov);
15462
15463		mutex_enter(&dtrace_lock);
15464	}
15465
15466	return (0);
15467}
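
/*
 * [Editor's aside -- generic sketch, hypothetical names.]  The provider
 * removal above is the usual O(1) unordered-array deletion:  the final
 * element is moved into the vacated slot instead of shifting the tail
 * down:
 *
 *	array[i] = array[--count];
 *	array[count] = NULL;
 *
 * Ordering is not preserved, which is harmless here because helper
 * providers are located by generation number, never by position.
 */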
15468
15469static int
15470dtrace_helper_validate(dtrace_helper_action_t *helper)
15471{
15472	int err = 0, i;
15473	dtrace_difo_t *dp;
15474
15475	if ((dp = helper->dtha_predicate) != NULL)
15476		err += dtrace_difo_validate_helper(dp);
15477
15478	for (i = 0; i < helper->dtha_nactions; i++)
15479		err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15480
15481	return (err == 0);
15482}
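
/*
 * [Editor's note.]  Helper DIFOs receive this extra validation pass
 * (dtrace_difo_validate_helper()) beyond the validation that ordinary
 * enablings get:  because helpers execute in probe context on behalf of
 * arbitrary user processes, only a restricted subset of DIF operations
 * and variables is permitted in them.
 */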
15483
15484static int
15485dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
15486{
15487	dtrace_helpers_t *help;
15488	dtrace_helper_action_t *helper, *last;
15489	dtrace_actdesc_t *act;
15490	dtrace_vstate_t *vstate;
15491	dtrace_predicate_t *pred;
15492	int count = 0, nactions = 0, i;
15493
15494	if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15495		return (EINVAL);
15496
15497	help = curproc->p_dtrace_helpers;
15498	last = help->dthps_actions[which];
15499	vstate = &help->dthps_vstate;
15500
15501	for (count = 0; last != NULL; last = last->dtha_next) {
15502		count++;
15503		if (last->dtha_next == NULL)
15504			break;
15505	}
15506
15507	/*
15508	 * If we already have dtrace_helper_actions_max helper actions for this
15509	 * helper action type, we'll refuse to add a new one.
15510	 */
15511	if (count >= dtrace_helper_actions_max)
15512		return (ENOSPC);
15513
15514	helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15515	helper->dtha_generation = help->dthps_generation;
15516
15517	if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15518		ASSERT(pred->dtp_difo != NULL);
15519		dtrace_difo_hold(pred->dtp_difo);
15520		helper->dtha_predicate = pred->dtp_difo;
15521	}
15522
15523	for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15524		if (act->dtad_kind != DTRACEACT_DIFEXPR)
15525			goto err;
15526
15527		if (act->dtad_difo == NULL)
15528			goto err;
15529
15530		nactions++;
15531	}
15532
15533	helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15534	    (helper->dtha_nactions = nactions), KM_SLEEP);
15535
15536	for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15537		dtrace_difo_hold(act->dtad_difo);
15538		helper->dtha_actions[i++] = act->dtad_difo;
15539	}
15540
15541	if (!dtrace_helper_validate(helper))
15542		goto err;
15543
15544	if (last == NULL) {
15545		help->dthps_actions[which] = helper;
15546	} else {
15547		last->dtha_next = helper;
15548	}
15549
15550	if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15551		dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15552		dtrace_helptrace_next = 0;
15553	}
15554
15555	return (0);
15556err:
15557	dtrace_helper_action_destroy(helper, vstate);
15558	return (EINVAL);
15559}
15560
15561static void
15562dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15563    dof_helper_t *dofhp)
15564{
15565	ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
15566
15567	mutex_enter(&dtrace_meta_lock);
15568	mutex_enter(&dtrace_lock);
15569
15570	if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15571		/*
15572		 * If the dtrace module is loaded but not attached, or if
15573		 * there isn't a meta provider registered to deal with
15574		 * these provider descriptions, we need to postpone creating
15575		 * the actual providers until later.
15576		 */
15577
15578		if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15579		    dtrace_deferred_pid != help) {
15580			help->dthps_deferred = 1;
15581			help->dthps_pid = p->p_pid;
15582			help->dthps_next = dtrace_deferred_pid;
15583			help->dthps_prev = NULL;
15584			if (dtrace_deferred_pid != NULL)
15585				dtrace_deferred_pid->dthps_prev = help;
15586			dtrace_deferred_pid = help;
15587		}
15588
15589		mutex_exit(&dtrace_lock);
15590
15591	} else if (dofhp != NULL) {
15592		/*
15593		 * If the dtrace module is loaded and we have a particular
15594		 * helper provider description, pass that off to the
15595		 * meta provider.
15596		 */
15597
15598		mutex_exit(&dtrace_lock);
15599
15600		dtrace_helper_provide(dofhp, p->p_pid);
15601
15602	} else {
15603		/*
15604		 * Otherwise, just pass all the helper provider descriptions
15605		 * off to the meta provider.
15606		 */
15607
15608		int i;
15609		mutex_exit(&dtrace_lock);
15610
15611		for (i = 0; i < help->dthps_nprovs; i++) {
15612			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15613			    p->p_pid);
15614		}
15615	}
15616
15617	mutex_exit(&dtrace_meta_lock);
15618}
15619
15620static int
15621dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
15622{
15623	dtrace_helpers_t *help;
15624	dtrace_helper_provider_t *hprov, **tmp_provs;
15625	uint_t tmp_maxprovs, i;
15626
15627	ASSERT(MUTEX_HELD(&dtrace_lock));
15628
15629	help = curproc->p_dtrace_helpers;
15630	ASSERT(help != NULL);
15631
15632	/*
15633	 * If we already have dtrace_helper_providers_max helper providers,
15634	 * we'll refuse to add a new one.
15635	 */
15636	if (help->dthps_nprovs >= dtrace_helper_providers_max)
15637		return (ENOSPC);
15638
15639	/*
15640	 * Check to make sure this isn't a duplicate.
15641	 */
15642	for (i = 0; i < help->dthps_nprovs; i++) {
15643		if (dofhp->dofhp_dof ==
15644		    help->dthps_provs[i]->dthp_prov.dofhp_dof)
15645			return (EALREADY);
15646	}
15647
15648	hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15649	hprov->dthp_prov = *dofhp;
15650	hprov->dthp_ref = 1;
15651	hprov->dthp_generation = gen;
15652
15653	/*
15654	 * Allocate a bigger table for helper providers if it's already full.
15655	 */
15656	if (help->dthps_maxprovs == help->dthps_nprovs) {
15657		tmp_maxprovs = help->dthps_maxprovs;
15658		tmp_provs = help->dthps_provs;
15659
15660		if (help->dthps_maxprovs == 0)
15661			help->dthps_maxprovs = 2;
15662		else
15663			help->dthps_maxprovs *= 2;
15664		if (help->dthps_maxprovs > dtrace_helper_providers_max)
15665			help->dthps_maxprovs = dtrace_helper_providers_max;
15666
15667		ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15668
15669		help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15670		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15671
15672		if (tmp_provs != NULL) {
15673			bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15674			    sizeof (dtrace_helper_provider_t *));
15675			kmem_free(tmp_provs, tmp_maxprovs *
15676			    sizeof (dtrace_helper_provider_t *));
15677		}
15678	}
15679
15680	help->dthps_provs[help->dthps_nprovs] = hprov;
15681	help->dthps_nprovs++;
15682
15683	return (0);
15684}
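
/*
 * [Editor's note -- condensed restatement of the logic above.]  The table
 * growth is geometric doubling with a clamp:  start at 2, double on each
 * overflow, never exceed dtrace_helper_providers_max:
 *
 *	new_max = (max == 0) ? 2 : max * 2;
 *	if (new_max > dtrace_helper_providers_max)
 *		new_max = dtrace_helper_providers_max;
 *
 * The ENOSPC check at entry guarantees dthps_nprovs is below the clamp,
 * so the grown table always has room for the slot filled just below.
 */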
15685
15686static void
15687dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15688{
15689	mutex_enter(&dtrace_lock);
15690
15691	if (--hprov->dthp_ref == 0) {
15692		dof_hdr_t *dof;
15693		mutex_exit(&dtrace_lock);
15694		dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15695		dtrace_dof_destroy(dof);
15696		kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15697	} else {
15698		mutex_exit(&dtrace_lock);
15699	}
15700}
15701
15702static int
15703dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15704{
15705	uintptr_t daddr = (uintptr_t)dof;
15706	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15707	dof_provider_t *provider;
15708	dof_probe_t *probe;
15709	uint8_t *arg;
15710	char *strtab, *typestr;
15711	dof_stridx_t typeidx;
15712	size_t typesz;
15713	uint_t nprobes, j, k;
15714
15715	ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15716
15717	if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15718		dtrace_dof_error(dof, "misaligned section offset");
15719		return (-1);
15720	}
15721
15722	/*
15723	 * The section needs to be large enough to contain the DOF provider
15724	 * structure appropriate for the given version.
15725	 */
15726	if (sec->dofs_size <
15727	    ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15728	    offsetof(dof_provider_t, dofpv_prenoffs) :
15729	    sizeof (dof_provider_t))) {
15730		dtrace_dof_error(dof, "provider section too small");
15731		return (-1);
15732	}
15733
15734	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15735	str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15736	prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15737	arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15738	off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15739
15740	if (str_sec == NULL || prb_sec == NULL ||
15741	    arg_sec == NULL || off_sec == NULL)
15742		return (-1);
15743
15744	enoff_sec = NULL;
15745
15746	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15747	    provider->dofpv_prenoffs != DOF_SECT_NONE &&
15748	    (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15749	    provider->dofpv_prenoffs)) == NULL)
15750		return (-1);
15751
15752	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15753
15754	if (provider->dofpv_name >= str_sec->dofs_size ||
15755	    strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15756		dtrace_dof_error(dof, "invalid provider name");
15757		return (-1);
15758	}
15759
15760	if (prb_sec->dofs_entsize == 0 ||
15761	    prb_sec->dofs_entsize > prb_sec->dofs_size) {
15762		dtrace_dof_error(dof, "invalid entry size");
15763		return (-1);
15764	}
15765
15766	if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15767		dtrace_dof_error(dof, "misaligned entry size");
15768		return (-1);
15769	}
15770
15771	if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15772		dtrace_dof_error(dof, "invalid entry size");
15773		return (-1);
15774	}
15775
15776	if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15777		dtrace_dof_error(dof, "misaligned section offset");
15778		return (-1);
15779	}
15780
15781	if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15782		dtrace_dof_error(dof, "invalid entry size");
15783		return (-1);
15784	}
15785
15786	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15787
15788	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15789
15790	/*
15791	 * Take a pass through the probes to check for errors.
15792	 */
15793	for (j = 0; j < nprobes; j++) {
15794		probe = (dof_probe_t *)(uintptr_t)(daddr +
15795		    prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15796
15797		if (probe->dofpr_func >= str_sec->dofs_size) {
15798			dtrace_dof_error(dof, "invalid function name");
15799			return (-1);
15800		}
15801
15802		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15803			dtrace_dof_error(dof, "function name too long");
15804			return (-1);
15805		}
15806
15807		if (probe->dofpr_name >= str_sec->dofs_size ||
15808		    strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15809			dtrace_dof_error(dof, "invalid probe name");
15810			return (-1);
15811		}
15812
15813		/*
15814		 * The offset count must not wrap the index, and the offsets
15815		 * must also not overflow the section's data.
15816		 */
15817		if (probe->dofpr_offidx + probe->dofpr_noffs <
15818		    probe->dofpr_offidx ||
15819		    (probe->dofpr_offidx + probe->dofpr_noffs) *
15820		    off_sec->dofs_entsize > off_sec->dofs_size) {
15821			dtrace_dof_error(dof, "invalid probe offset");
15822			return (-1);
15823		}
15824
15825		if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15826			/*
15827			 * If there's no is-enabled offset section, make sure
15828			 * there aren't any is-enabled offsets. Otherwise
15829			 * perform the same checks as for probe offsets
15830			 * (immediately above).
15831			 */
15832			if (enoff_sec == NULL) {
15833				if (probe->dofpr_enoffidx != 0 ||
15834				    probe->dofpr_nenoffs != 0) {
15835					dtrace_dof_error(dof, "is-enabled "
15836					    "offsets with null section");
15837					return (-1);
15838				}
15839			} else if (probe->dofpr_enoffidx +
15840			    probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15841			    (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15842			    enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15843				dtrace_dof_error(dof, "invalid is-enabled "
15844				    "offset");
15845				return (-1);
15846			}
15847
15848			if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15849				dtrace_dof_error(dof, "zero probe and "
15850				    "is-enabled offsets");
15851				return (-1);
15852			}
15853		} else if (probe->dofpr_noffs == 0) {
15854			dtrace_dof_error(dof, "zero probe offsets");
15855			return (-1);
15856		}
15857
15858		if (probe->dofpr_argidx + probe->dofpr_xargc <
15859		    probe->dofpr_argidx ||
15860		    (probe->dofpr_argidx + probe->dofpr_xargc) *
15861		    arg_sec->dofs_entsize > arg_sec->dofs_size) {
15862			dtrace_dof_error(dof, "invalid args");
15863			return (-1);
15864		}
15865
15866		typeidx = probe->dofpr_nargv;
15867		typestr = strtab + probe->dofpr_nargv;
15868		for (k = 0; k < probe->dofpr_nargc; k++) {
15869			if (typeidx >= str_sec->dofs_size) {
15870				dtrace_dof_error(dof, "bad "
15871				    "native argument type");
15872				return (-1);
15873			}
15874
15875			typesz = strlen(typestr) + 1;
15876			if (typesz > DTRACE_ARGTYPELEN) {
15877				dtrace_dof_error(dof, "native "
15878				    "argument type too long");
15879				return (-1);
15880			}
15881			typeidx += typesz;
15882			typestr += typesz;
15883		}
15884
15885		typeidx = probe->dofpr_xargv;
15886		typestr = strtab + probe->dofpr_xargv;
15887		for (k = 0; k < probe->dofpr_xargc; k++) {
15888			if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15889				dtrace_dof_error(dof, "bad "
15890				    "native argument index");
15891				return (-1);
15892			}
15893
15894			if (typeidx >= str_sec->dofs_size) {
15895				dtrace_dof_error(dof, "bad "
15896				    "translated argument type");
15897				return (-1);
15898			}
15899
15900			typesz = strlen(typestr) + 1;
15901			if (typesz > DTRACE_ARGTYPELEN) {
15902				dtrace_dof_error(dof, "translated argument "
15903				    "type too long");
15904				return (-1);
15905			}
15906
15907			typeidx += typesz;
15908			typestr += typesz;
15909		}
15910	}
15911
15912	return (0);
15913}
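
/*
 * [Editor's aside -- idiom sketch.]  The repeated "idx + n < idx" tests
 * above are the standard unsigned-overflow check:  for unsigned operands,
 * the sum is less than idx exactly when it wrapped.  Paired with the
 * bounds test, a (start, count) pair is accepted only when
 *
 *	idx + n >= idx				(no wrap), and
 *	(idx + n) * entsize <= sec->dofs_size	(within the section)
 *
 * so neither a wrapped count nor an out-of-range run of entries reaches
 * the probe-creation code.
 */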
15914
15915static int
15916dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
15917{
15918	dtrace_helpers_t *help;
15919	dtrace_vstate_t *vstate;
15920	dtrace_enabling_t *enab = NULL;
15921	int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15922	uintptr_t daddr = (uintptr_t)dof;
15923
15924	ASSERT(MUTEX_HELD(&dtrace_lock));
15925
15926	if ((help = curproc->p_dtrace_helpers) == NULL)
15927		help = dtrace_helpers_create(curproc);
15928
15929	vstate = &help->dthps_vstate;
15930
15931	if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15932	    dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15933		dtrace_dof_destroy(dof);
15934		return (rv);
15935	}
15936
15937	/*
15938	 * Look for helper providers and validate their descriptions.
15939	 */
15940	if (dhp != NULL) {
15941		for (i = 0; i < dof->dofh_secnum; i++) {
15942			dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15943			    dof->dofh_secoff + i * dof->dofh_secsize);
15944
15945			if (sec->dofs_type != DOF_SECT_PROVIDER)
15946				continue;
15947
15948			if (dtrace_helper_provider_validate(dof, sec) != 0) {
15949				dtrace_enabling_destroy(enab);
15950				dtrace_dof_destroy(dof);
15951				return (-1);
15952			}
15953
15954			nprovs++;
15955		}
15956	}
15957
15958	/*
15959	 * Now we need to walk through the ECB descriptions in the enabling.
15960	 */
15961	for (i = 0; i < enab->dten_ndesc; i++) {
15962		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15963		dtrace_probedesc_t *desc = &ep->dted_probe;
15964
15965		if (strcmp(desc->dtpd_provider, "dtrace") != 0)
15966			continue;
15967
15968		if (strcmp(desc->dtpd_mod, "helper") != 0)
15969			continue;
15970
15971		if (strcmp(desc->dtpd_func, "ustack") != 0)
15972			continue;
15973
15974		if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
15975		    ep)) != 0) {
15976			/*
15977			 * Adding this helper action failed -- we are now going
15978			 * to rip out the entire generation and return failure.
15979			 */
15980			(void) dtrace_helper_destroygen(help->dthps_generation);
15981			dtrace_enabling_destroy(enab);
15982			dtrace_dof_destroy(dof);
15983			return (-1);
15984		}
15985
15986		nhelpers++;
15987	}
15988
15989	if (nhelpers < enab->dten_ndesc)
15990		dtrace_dof_error(dof, "unmatched helpers");
15991
15992	gen = help->dthps_generation++;
15993	dtrace_enabling_destroy(enab);
15994
15995	if (dhp != NULL && nprovs > 0) {
15996		dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15997		if (dtrace_helper_provider_add(dhp, gen) == 0) {
15998			mutex_exit(&dtrace_lock);
15999			dtrace_helper_provider_register(curproc, help, dhp);
16000			mutex_enter(&dtrace_lock);
16001
16002			destroy = 0;
16003		}
16004	}
16005
16006	if (destroy)
16007		dtrace_dof_destroy(dof);
16008
16009	return (gen);
16010}
16011
16012static dtrace_helpers_t *
16013dtrace_helpers_create(proc_t *p)
16014{
16015	dtrace_helpers_t *help;
16016
16017	ASSERT(MUTEX_HELD(&dtrace_lock));
16018	ASSERT(p->p_dtrace_helpers == NULL);
16019
16020	help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16021	help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16022	    DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16023
16024	p->p_dtrace_helpers = help;
16025	dtrace_helpers++;
16026
16027	return (help);
16028}
16029
16030#if defined(sun)
16031static
16032#endif
16033void
16034dtrace_helpers_destroy(proc_t *p)
16035{
16036	dtrace_helpers_t *help;
16037	dtrace_vstate_t *vstate;
16038#if defined(sun)
16039	proc_t *p = curproc;
16040#endif
16041	int i;
16042
16043	mutex_enter(&dtrace_lock);
16044
16045	ASSERT(p->p_dtrace_helpers != NULL);
16046	ASSERT(dtrace_helpers > 0);
16047
16048	help = p->p_dtrace_helpers;
16049	vstate = &help->dthps_vstate;
16050
16051	/*
16052	 * We're now going to lose the help from this process.
16053	 */
16054	p->p_dtrace_helpers = NULL;
16055	dtrace_sync();
16056
16057	/*
16058	 * Destroy the helper actions.
16059	 */
16060	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16061		dtrace_helper_action_t *h, *next;
16062
16063		for (h = help->dthps_actions[i]; h != NULL; h = next) {
16064			next = h->dtha_next;
16065			dtrace_helper_action_destroy(h, vstate);
16067		}
16068	}
16069
16070	mutex_exit(&dtrace_lock);
16071
16072	/*
16073	 * Destroy the helper providers.
16074	 */
16075	if (help->dthps_maxprovs > 0) {
16076		mutex_enter(&dtrace_meta_lock);
16077		if (dtrace_meta_pid != NULL) {
16078			ASSERT(dtrace_deferred_pid == NULL);
16079
16080			for (i = 0; i < help->dthps_nprovs; i++) {
16081				dtrace_helper_provider_remove(
16082				    &help->dthps_provs[i]->dthp_prov, p->p_pid);
16083			}
16084		} else {
16085			mutex_enter(&dtrace_lock);
16086			ASSERT(help->dthps_deferred == 0 ||
16087			    help->dthps_next != NULL ||
16088			    help->dthps_prev != NULL ||
16089			    help == dtrace_deferred_pid);
16090
16091			/*
16092			 * Remove the helper from the deferred list.
16093			 */
16094			if (help->dthps_next != NULL)
16095				help->dthps_next->dthps_prev = help->dthps_prev;
16096			if (help->dthps_prev != NULL)
16097				help->dthps_prev->dthps_next = help->dthps_next;
16098			if (dtrace_deferred_pid == help) {
16099				dtrace_deferred_pid = help->dthps_next;
16100				ASSERT(help->dthps_prev == NULL);
16101			}
16102
16103			mutex_exit(&dtrace_lock);
16104		}
16105
16106		mutex_exit(&dtrace_meta_lock);
16107
16108		for (i = 0; i < help->dthps_nprovs; i++) {
16109			dtrace_helper_provider_destroy(help->dthps_provs[i]);
16110		}
16111
16112		kmem_free(help->dthps_provs, help->dthps_maxprovs *
16113		    sizeof (dtrace_helper_provider_t *));
16114	}
16115
16116	mutex_enter(&dtrace_lock);
16117
16118	dtrace_vstate_fini(&help->dthps_vstate);
16119	kmem_free(help->dthps_actions,
16120	    sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16121	kmem_free(help, sizeof (dtrace_helpers_t));
16122
16123	--dtrace_helpers;
16124	mutex_exit(&dtrace_lock);
16125}
16126
16127#if defined(sun)
16128static
16129#endif
16130void
16131dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16132{
16133	dtrace_helpers_t *help, *newhelp;
16134	dtrace_helper_action_t *helper, *new, *last;
16135	dtrace_difo_t *dp;
16136	dtrace_vstate_t *vstate;
16137	int i, j, sz, hasprovs = 0;
16138
16139	mutex_enter(&dtrace_lock);
16140	ASSERT(from->p_dtrace_helpers != NULL);
16141	ASSERT(dtrace_helpers > 0);
16142
16143	help = from->p_dtrace_helpers;
16144	newhelp = dtrace_helpers_create(to);
16145	ASSERT(to->p_dtrace_helpers != NULL);
16146
16147	newhelp->dthps_generation = help->dthps_generation;
16148	vstate = &newhelp->dthps_vstate;
16149
16150	/*
16151	 * Duplicate the helper actions.
16152	 */
16153	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16154		if ((helper = help->dthps_actions[i]) == NULL)
16155			continue;
16156
16157		for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16158			new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16159			    KM_SLEEP);
16160			new->dtha_generation = helper->dtha_generation;
16161
16162			if ((dp = helper->dtha_predicate) != NULL) {
16163				dp = dtrace_difo_duplicate(dp, vstate);
16164				new->dtha_predicate = dp;
16165			}
16166
16167			new->dtha_nactions = helper->dtha_nactions;
16168			sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16169			new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16170
16171			for (j = 0; j < new->dtha_nactions; j++) {
16172				dtrace_difo_t *dp = helper->dtha_actions[j];
16173
16174				ASSERT(dp != NULL);
16175				dp = dtrace_difo_duplicate(dp, vstate);
16176				new->dtha_actions[j] = dp;
16177			}
16178
16179			if (last != NULL) {
16180				last->dtha_next = new;
16181			} else {
16182				newhelp->dthps_actions[i] = new;
16183			}
16184
16185			last = new;
16186		}
16187	}
16188
16189	/*
16190	 * Duplicate the helper providers and register them with the
16191	 * DTrace framework.
16192	 */
16193	if (help->dthps_nprovs > 0) {
16194		newhelp->dthps_nprovs = help->dthps_nprovs;
16195		newhelp->dthps_maxprovs = help->dthps_nprovs;
16196		newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16197		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16198		for (i = 0; i < newhelp->dthps_nprovs; i++) {
16199			newhelp->dthps_provs[i] = help->dthps_provs[i];
16200			newhelp->dthps_provs[i]->dthp_ref++;
16201		}
16202
16203		hasprovs = 1;
16204	}
16205
16206	mutex_exit(&dtrace_lock);
16207
16208	if (hasprovs)
16209		dtrace_helper_provider_register(to, newhelp, NULL);
16210}
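
/*
 * [Editor's note.]  Observe the asymmetry in the fork path above:  helper
 * actions are deep-copied via dtrace_difo_duplicate() so that parent and
 * child can be torn down independently, while helper-provider DOF is
 * shared and merely reference-counted (dthp_ref++), since it is immutable
 * once validated.
 */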
16211
16212/*
16213 * DTrace Hook Functions
16214 */
16215static void
16216dtrace_module_loaded(modctl_t *ctl)
16217{
16218	dtrace_provider_t *prv;
16219
16220	mutex_enter(&dtrace_provider_lock);
16221#if defined(sun)
16222	mutex_enter(&mod_lock);
16223#endif
16224
16225#if defined(sun)
16226	ASSERT(ctl->mod_busy);
16227#endif
16228
16229	/*
16230	 * We're going to call each provider's per-module provide operation
16231	 * specifying only this module.
16232	 */
16233	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16234		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16235
16236#if defined(sun)
16237	mutex_exit(&mod_lock);
16238#endif
16239	mutex_exit(&dtrace_provider_lock);
16240
16241	/*
16242	 * If we have any retained enablings, we need to match against them.
16243	 * Enabling probes requires that cpu_lock be held, and we cannot hold
16244	 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16245	 * module.  (In particular, this happens when loading scheduling
16246	 * classes.)  So if we have any retained enablings, we need to dispatch
16247	 * our task queue to do the match for us.
16248	 */
16249	mutex_enter(&dtrace_lock);
16250
16251	if (dtrace_retained == NULL) {
16252		mutex_exit(&dtrace_lock);
16253		return;
16254	}
16255
16256	(void) taskq_dispatch(dtrace_taskq,
16257	    (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
16258
16259	mutex_exit(&dtrace_lock);
16260
16261	/*
16262	 * And now, for a little heuristic sleaze:  in general, we want to
16263	 * match modules as soon as they load.  However, we cannot guarantee
16264	 * this, because it would lead us to the lock ordering violation
16265	 * outlined above.  The common case, of course, is that cpu_lock is
16266	 * _not_ held -- so we delay here for a clock tick, hoping that that's
16267	 * long enough for the task queue to do its work.  If it's not, it's
16268	 * not a serious problem -- it just means that the module that we
16269	 * just loaded may not be immediately instrumentable.
16270	 */
16271	delay(1);
16272}
16273
16274static void
16275#if defined(sun)
16276dtrace_module_unloaded(modctl_t *ctl)
16277#else
16278dtrace_module_unloaded(modctl_t *ctl, int *error)
16279#endif
16280{
16281	dtrace_probe_t template, *probe, *first, *next;
16282	dtrace_provider_t *prov;
16283#if !defined(sun)
16284	char modname[DTRACE_MODNAMELEN];
16285	size_t len;
16286#endif
16287
16288#if defined(sun)
16289	template.dtpr_mod = ctl->mod_modname;
16290#else
16291	/* Handle the fact that ctl->filename may end in ".ko". */
16292	strlcpy(modname, ctl->filename, sizeof(modname));
16293	len = strlen(ctl->filename);
16294	if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
16295		modname[len - 3] = '\0';
16296	template.dtpr_mod = modname;
16297#endif
16298
16299	mutex_enter(&dtrace_provider_lock);
16300#if defined(sun)
16301	mutex_enter(&mod_lock);
16302#endif
16303	mutex_enter(&dtrace_lock);
16304
16305#if !defined(sun)
16306	if (ctl->nenabled > 0) {
16307		/* Don't allow unloads if a probe is enabled. */
16308		mutex_exit(&dtrace_provider_lock);
16309		mutex_exit(&dtrace_lock);
16310		*error = -1;
16311		printf(
16312	"kldunload: attempt to unload module that has DTrace probes enabled\n");
16313		return;
16314	}
16315#endif
16316
16317	if (dtrace_bymod == NULL) {
16318		/*
16319		 * The DTrace module is loaded (obviously) but not attached;
16320		 * we don't have any work to do.
16321		 */
16322		mutex_exit(&dtrace_provider_lock);
16323#if defined(sun)
16324		mutex_exit(&mod_lock);
16325#endif
16326		mutex_exit(&dtrace_lock);
16327		return;
16328	}
16329
16330	for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16331	    probe != NULL; probe = probe->dtpr_nextmod) {
16332		if (probe->dtpr_ecb != NULL) {
16333			mutex_exit(&dtrace_provider_lock);
16334#if defined(sun)
16335			mutex_exit(&mod_lock);
16336#endif
16337			mutex_exit(&dtrace_lock);
16338
16339			/*
16340			 * This shouldn't _actually_ be possible -- we're
16341			 * unloading a module that has an enabled probe in it.
16342			 * (It's normally up to the provider to make sure that
16343			 * this can't happen.)  However, because dtps_enable()
16344			 * doesn't have a failure mode, there can be an
16345			 * enable/unload race.  Upshot:  we don't want to
16346			 * assert, but we're not going to disable the
16347			 * probe, either.
16348			 */
16349			if (dtrace_err_verbose) {
16350#if defined(sun)
16351				cmn_err(CE_WARN, "unloaded module '%s' had "
16352				    "enabled probes", ctl->mod_modname);
16353#else
16354				cmn_err(CE_WARN, "unloaded module '%s' had "
16355				    "enabled probes", modname);
16356#endif
16357			}
16358
16359			return;
16360		}
16361	}
16362
16363	probe = first;
16364
16365	for (first = NULL; probe != NULL; probe = next) {
16366		ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16367
16368		dtrace_probes[probe->dtpr_id - 1] = NULL;
16369
16370		next = probe->dtpr_nextmod;
16371		dtrace_hash_remove(dtrace_bymod, probe);
16372		dtrace_hash_remove(dtrace_byfunc, probe);
16373		dtrace_hash_remove(dtrace_byname, probe);
16374
16375		if (first == NULL) {
16376			first = probe;
16377			probe->dtpr_nextmod = NULL;
16378		} else {
16379			probe->dtpr_nextmod = first;
16380			first = probe;
16381		}
16382	}
16383
16384	/*
16385	 * We've removed all of the module's probes from the hash chains and
16386	 * from the probe array.  Now issue a dtrace_sync() to be sure that
16387	 * everyone has cleared out from any probe array processing.
16388	 */
16389	dtrace_sync();
16390
16391	for (probe = first; probe != NULL; probe = first) {
16392		first = probe->dtpr_nextmod;
16393		prov = probe->dtpr_provider;
16394		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16395		    probe->dtpr_arg);
16396		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16397		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16398		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16399#if defined(sun)
16400		vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16401#else
16402		free_unr(dtrace_arena, probe->dtpr_id);
16403#endif
16404		kmem_free(probe, sizeof (dtrace_probe_t));
16405	}
16406
16407	mutex_exit(&dtrace_lock);
16408#if defined(sun)
16409	mutex_exit(&mod_lock);
16410#endif
16411	mutex_exit(&dtrace_provider_lock);
16412}
16413
16414#if !defined(sun)
16415static void
16416dtrace_kld_load(void *arg __unused, linker_file_t lf)
16417{
16418
16419	dtrace_module_loaded(lf);
16420}
16421
16422static void
16423dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
16424{
16425
16426	if (*error != 0)
16427		/* We already have an error, so don't do anything. */
16428		return;
16429	dtrace_module_unloaded(lf, error);
16430}
16431#endif
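
/*
 * [Editor's note -- assumed wiring; the registration itself is not shown
 * in this section.]  These callbacks are presumably attached to the
 * kernel linker's events through the standard EVENTHANDLER(9) interface,
 * on the order of:
 *
 *	EVENTHANDLER_REGISTER(kld_load, dtrace_kld_load, NULL,
 *	    EVENTHANDLER_PRI_ANY);
 *	EVENTHANDLER_REGISTER(kld_unload_try, dtrace_kld_unload_try, NULL,
 *	    EVENTHANDLER_PRI_ANY);
 *
 * The kld_unload_try handler can veto an unload by setting *error, which
 * is how the enabled-probe check in dtrace_module_unloaded() takes
 * effect.
 */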
16432
16433#if defined(sun)
16434static void
16435dtrace_suspend(void)
16436{
16437	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16438}
16439
16440static void
16441dtrace_resume(void)
16442{
16443	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16444}
16445#endif
16446
16447static int
16448dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16449{
16450	ASSERT(MUTEX_HELD(&cpu_lock));
16451	mutex_enter(&dtrace_lock);
16452
16453	switch (what) {
16454	case CPU_CONFIG: {
16455		dtrace_state_t *state;
16456		dtrace_optval_t *opt, rs, c;
16457
16458		/*
16459		 * For now, we only allocate a new buffer for anonymous state.
16460		 */
16461		if ((state = dtrace_anon.dta_state) == NULL)
16462			break;
16463
16464		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16465			break;
16466
16467		opt = state->dts_options;
16468		c = opt[DTRACEOPT_CPU];
16469
16470		if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16471			break;
16472
16473		/*
16474		 * Regardless of what the actual policy is, we're going to
16475		 * temporarily set our resize policy to be manual.  We're
16476		 * also going to temporarily set our CPU option to denote
16477		 * the newly configured CPU.
16478		 */
16479		rs = opt[DTRACEOPT_BUFRESIZE];
16480		opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16481		opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16482
16483		(void) dtrace_state_buffers(state);
16484
16485		opt[DTRACEOPT_BUFRESIZE] = rs;
16486		opt[DTRACEOPT_CPU] = c;
16487
16488		break;
16489	}
16490
16491	case CPU_UNCONFIG:
16492		/*
16493		 * We don't free the buffer in the CPU_UNCONFIG case.  (The
16494		 * buffer will be freed when the consumer exits.)
16495		 */
16496		break;
16497
16498	default:
16499		break;
16500	}
16501
16502	mutex_exit(&dtrace_lock);
16503	return (0);
16504}
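
/*
 * [Editor's note.]  The save/override/restore of DTRACEOPT_BUFRESIZE and
 * DTRACEOPT_CPU above lets dtrace_state_buffers() be reused unchanged for
 * CPU hotplug:  pinning the CPU option confines the allocation to the
 * newly configured CPU, and forcing the resize policy to manual means a
 * failed allocation is reported rather than answered by shrinking the
 * buffers globally.
 */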
16505
16506#if defined(sun)
16507static void
16508dtrace_cpu_setup_initial(processorid_t cpu)
16509{
16510	(void) dtrace_cpu_setup(CPU_CONFIG, cpu);
16511}
16512#endif
16513
16514static void
16515dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16516{
16517	if (dtrace_toxranges >= dtrace_toxranges_max) {
16518		int osize, nsize;
16519		dtrace_toxrange_t *range;
16520
16521		osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16522
16523		if (osize == 0) {
16524			ASSERT(dtrace_toxrange == NULL);
16525			ASSERT(dtrace_toxranges_max == 0);
16526			dtrace_toxranges_max = 1;
16527		} else {
16528			dtrace_toxranges_max <<= 1;
16529		}
16530
16531		nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16532		range = kmem_zalloc(nsize, KM_SLEEP);
16533
16534		if (dtrace_toxrange != NULL) {
16535			ASSERT(osize != 0);
16536			bcopy(dtrace_toxrange, range, osize);
16537			kmem_free(dtrace_toxrange, osize);
16538		}
16539
16540		dtrace_toxrange = range;
16541	}
16542
16543	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
16544	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
16545
16546	dtrace_toxrange[dtrace_toxranges].dtt_base = base;
16547	dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
16548	dtrace_toxranges++;
16549}
16550
16551static void
16552dtrace_getf_barrier()
16553{
16554#if defined(sun)
16555	/*
16556	 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
16557	 * that contain calls to getf(), this routine will be called on every
16558	 * closef() before either the underlying vnode is released or the
16559	 * file_t itself is freed.  By the time we are here, it is essential
16560	 * that the file_t can no longer be accessed from a call to getf()
16561	 * in probe context -- that assures that a dtrace_sync() can be used
16562	 * to clear out any enablings referring to the old structures.
16563	 */
16564	if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
16565	    kcred->cr_zone->zone_dtrace_getf != 0)
16566		dtrace_sync();
16567#endif
16568}
16569
16570/*
16571 * DTrace Driver Cookbook Functions
16572 */
16573#if defined(sun)
16574/*ARGSUSED*/
16575static int
16576dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
16577{
16578	dtrace_provider_id_t id;
16579	dtrace_state_t *state = NULL;
16580	dtrace_enabling_t *enab;
16581
16582	mutex_enter(&cpu_lock);
16583	mutex_enter(&dtrace_provider_lock);
16584	mutex_enter(&dtrace_lock);
16585
16586	if (ddi_soft_state_init(&dtrace_softstate,
16587	    sizeof (dtrace_state_t), 0) != 0) {
16588		cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
16589		mutex_exit(&cpu_lock);
16590		mutex_exit(&dtrace_provider_lock);
16591		mutex_exit(&dtrace_lock);
16592		return (DDI_FAILURE);
16593	}
16594
16595	if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
16596	    DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
16597	    ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
16598	    DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
16599		cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
16600		ddi_remove_minor_node(devi, NULL);
16601		ddi_soft_state_fini(&dtrace_softstate);
16602		mutex_exit(&cpu_lock);
16603		mutex_exit(&dtrace_provider_lock);
16604		mutex_exit(&dtrace_lock);
16605		return (DDI_FAILURE);
16606	}
16607
16608	ddi_report_dev(devi);
16609	dtrace_devi = devi;
16610
16611	dtrace_modload = dtrace_module_loaded;
16612	dtrace_modunload = dtrace_module_unloaded;
16613	dtrace_cpu_init = dtrace_cpu_setup_initial;
16614	dtrace_helpers_cleanup = dtrace_helpers_destroy;
16615	dtrace_helpers_fork = dtrace_helpers_duplicate;
16616	dtrace_cpustart_init = dtrace_suspend;
16617	dtrace_cpustart_fini = dtrace_resume;
16618	dtrace_debugger_init = dtrace_suspend;
16619	dtrace_debugger_fini = dtrace_resume;
16620
16621	register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16622
16623	ASSERT(MUTEX_HELD(&cpu_lock));
16624
16625	dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
16626	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
16627	dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
16628	    UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
16629	    VM_SLEEP | VMC_IDENTIFIER);
16630	dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
16631	    1, INT_MAX, 0);
16632
16633	dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
16634	    sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
16635	    NULL, NULL, NULL, NULL, NULL, 0);
16636
16637	ASSERT(MUTEX_HELD(&cpu_lock));
16638	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
16639	    offsetof(dtrace_probe_t, dtpr_nextmod),
16640	    offsetof(dtrace_probe_t, dtpr_prevmod));
16641
16642	dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
16643	    offsetof(dtrace_probe_t, dtpr_nextfunc),
16644	    offsetof(dtrace_probe_t, dtpr_prevfunc));
16645
16646	dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
16647	    offsetof(dtrace_probe_t, dtpr_nextname),
16648	    offsetof(dtrace_probe_t, dtpr_prevname));
16649
16650	if (dtrace_retain_max < 1) {
16651		cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
16652		    "setting to 1", dtrace_retain_max);
16653		dtrace_retain_max = 1;
16654	}
16655
16656	/*
16657	 * Now discover our toxic ranges.
16658	 */
16659	dtrace_toxic_ranges(dtrace_toxrange_add);
16660
16661	/*
16662	 * Before we register ourselves as a provider to our own framework,
16663	 * we would like to assert that dtrace_provider is NULL -- but that's
16664	 * not true if we were loaded as a dependency of a DTrace provider.
16665	 * Once we've registered, we can assert that dtrace_provider is our
16666	 * pseudo provider.
16667	 */
16668	(void) dtrace_register("dtrace", &dtrace_provider_attr,
16669	    DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
16670
16671	ASSERT(dtrace_provider != NULL);
16672	ASSERT((dtrace_provider_id_t)dtrace_provider == id);
16673
16674	dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16675	    dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
16676	dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16677	    dtrace_provider, NULL, NULL, "END", 0, NULL);
16678	dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16679	    dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
16680
16681	dtrace_anon_property();
16682	mutex_exit(&cpu_lock);
16683
16684	/*
16685	 * If there are already providers, we must ask them to provide their
16686	 * probes, and then match any anonymous enabling against them.  Note
16687	 * that there should be no other retained enablings at this time:
16688	 * the only retained enabling at this point should be the anonymous
16689	 * enabling.
16690	 */
16691	if (dtrace_anon.dta_enabling != NULL) {
16692		ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
16693
16694		dtrace_enabling_provide(NULL);
16695		state = dtrace_anon.dta_state;
16696
16697		/*
16698		 * We couldn't hold cpu_lock across the above call to
16699		 * dtrace_enabling_provide(), but we must hold it to actually
16700		 * enable the probes.  We have to drop all of our locks, pick
16701		 * up cpu_lock, and regain our locks before matching the
16702		 * retained anonymous enabling.
16703		 */
16704		mutex_exit(&dtrace_lock);
16705		mutex_exit(&dtrace_provider_lock);
16706
16707		mutex_enter(&cpu_lock);
16708		mutex_enter(&dtrace_provider_lock);
16709		mutex_enter(&dtrace_lock);
16710
16711		if ((enab = dtrace_anon.dta_enabling) != NULL)
16712			(void) dtrace_enabling_match(enab, NULL);
16713
16714		mutex_exit(&cpu_lock);
16715	}
16716
16717	mutex_exit(&dtrace_lock);
16718	mutex_exit(&dtrace_provider_lock);
16719
16720	if (state != NULL) {
16721		/*
16722		 * If we created any anonymous state, set it going now.
16723		 */
16724		(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
16725	}
16726
16727	return (DDI_SUCCESS);
16728}
16729#endif
16730
16731#if !defined(sun)
16732static void dtrace_dtr(void *);
16733#endif
16734
16735/*ARGSUSED*/
16736static int
16737#if defined(sun)
16738dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
16739#else
16740dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
16741#endif
16742{
16743	dtrace_state_t *state;
16744	uint32_t priv;
16745	uid_t uid;
16746	zoneid_t zoneid;
16747
16748#if defined(sun)
16749	if (getminor(*devp) == DTRACEMNRN_HELPER)
16750		return (0);
16751
16752	/*
16753	 * If this wasn't an open with the "helper" minor, then it must be
16754	 * the "dtrace" minor.
16755	 */
16756	if (getminor(*devp) != DTRACEMNRN_DTRACE)
16757		return (ENXIO);
16758#else
16759	cred_t *cred_p = NULL;
16760	cred_p = dev->si_cred;
16761
16762	/*
16763	 * If no DTRACE_PRIV_* bits are set in the credential, then the
16764	 * caller lacks sufficient permission to do anything with DTrace.
16765	 */
16766	dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
16767	if (priv == DTRACE_PRIV_NONE) {
16768#endif
16769
16770		return (EACCES);
16771	}
16772
16773	/*
16774	 * Ask all providers to provide all their probes.
16775	 */
16776	mutex_enter(&dtrace_provider_lock);
16777	dtrace_probe_provide(NULL, NULL);
16778	mutex_exit(&dtrace_provider_lock);
16779
16780	mutex_enter(&cpu_lock);
16781	mutex_enter(&dtrace_lock);
16782	dtrace_opens++;
16783	dtrace_membar_producer();
16784
16785#if defined(sun)
16786	/*
16787	 * If the kernel debugger is active (that is, if the kernel debugger
16788	 * modified text in some way), we won't allow the open.
16789	 */
16790	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
16791		dtrace_opens--;
16792		mutex_exit(&cpu_lock);
16793		mutex_exit(&dtrace_lock);
16794		return (EBUSY);
16795	}
16796
16797	if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) {
16798		/*
16799		 * If DTrace helper tracing is enabled, we need to allocate the
16800		 * trace buffer and initialize the values.
16801		 */
16802		dtrace_helptrace_buffer =
16803		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
16804		dtrace_helptrace_next = 0;
16805		dtrace_helptrace_wrapped = 0;
16806		dtrace_helptrace_enable = 0;
16807	}
16808
16809	state = dtrace_state_create(devp, cred_p);
16810#else
16811	state = dtrace_state_create(dev);
16812	devfs_set_cdevpriv(state, dtrace_dtr);
16813#endif
16814
16815	mutex_exit(&cpu_lock);
16816
16817	if (state == NULL) {
16818#if defined(sun)
16819		if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
16820			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16821#else
16822		--dtrace_opens;
16823#endif
16824		mutex_exit(&dtrace_lock);
16825		return (EAGAIN);
16826	}
16827
16828	mutex_exit(&dtrace_lock);
16829
16830	return (0);
16831}
16832
16833/*ARGSUSED*/
16834#if defined(sun)
16835static int
16836dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
16837#else
16838static void
16839dtrace_dtr(void *data)
16840#endif
16841{
16842#if defined(sun)
16843	minor_t minor = getminor(dev);
16844	dtrace_state_t *state;
16845#endif
16846	dtrace_helptrace_t *buf = NULL;
16847
16848#ifdef illumos
16849	if (minor == DTRACEMNRN_HELPER)
16850		return (0);
16851
16852	state = ddi_get_soft_state(dtrace_softstate, minor);
16853#else
16854	dtrace_state_t *state = data;
16855#endif
16856
16857	mutex_enter(&cpu_lock);
16858	mutex_enter(&dtrace_lock);
16859
16860#ifdef illumos
16861	if (state->dts_anon)
16862#else
16863	if (state != NULL && state->dts_anon)
16864#endif
16865	{
16866		/*
16867		 * There is anonymous state. Destroy that first.
16868		 */
16869		ASSERT(dtrace_anon.dta_state == NULL);
16870		dtrace_state_destroy(state->dts_anon);
16871	}
16872
16873	if (dtrace_helptrace_disable) {
16874		/*
16875		 * If we have been told to disable helper tracing, set the
16876		 * buffer to NULL before calling into dtrace_state_destroy();
16877		 * we take advantage of its dtrace_sync() to know that no
16878		 * CPU is in probe context with enabled helper tracing
16879		 * after it returns.
16880		 */
16881		buf = dtrace_helptrace_buffer;
16882		dtrace_helptrace_buffer = NULL;
16883	}
16884
16885#ifdef illumos
16886	dtrace_state_destroy(state);
16887#else
16888	if (state != NULL) {
16889		dtrace_state_destroy(state);
16890		kmem_free(state, 0);
16891	}
16892#endif
16893	ASSERT(dtrace_opens > 0);
16894
16895#if defined(sun)
16896	/*
16897	 * Only relinquish control of the kernel debugger interface when there
16898	 * are no consumers and no anonymous enablings.
16899	 */
16900	if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
16901		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16902#else
16903	--dtrace_opens;
16904#endif
16905
16906	if (buf != NULL) {
16907		kmem_free(buf, dtrace_helptrace_bufsize);
16908		dtrace_helptrace_disable = 0;
16909	}
16910
16911	mutex_exit(&dtrace_lock);
16912	mutex_exit(&cpu_lock);
16913
16914#if defined(sun)
16915	return (0);
16916#endif
16917}
16918
16919#if defined(sun)
16920/*ARGSUSED*/
16921static int
16922dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
16923{
16924	int rval;
16925	dof_helper_t help, *dhp = NULL;
16926
16927	switch (cmd) {
16928	case DTRACEHIOC_ADDDOF:
16929		if (copyin((void *)arg, &help, sizeof (help)) != 0) {
16930			dtrace_dof_error(NULL, "failed to copyin DOF helper");
16931			return (EFAULT);
16932		}
16933
16934		dhp = &help;
16935		arg = (intptr_t)help.dofhp_dof;
16936		/*FALLTHROUGH*/
16937
16938	case DTRACEHIOC_ADD: {
16939		dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
16940
16941		if (dof == NULL)
16942			return (rval);
16943
16944		mutex_enter(&dtrace_lock);
16945
16946		/*
16947		 * dtrace_helper_slurp() takes responsibility for the dof --
16948		 * it may free it now or it may save it and free it later.
16949		 */
16950		if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
16951			*rv = rval;
16952			rval = 0;
16953		} else {
16954			rval = EINVAL;
16955		}
16956
16957		mutex_exit(&dtrace_lock);
16958		return (rval);
16959	}
16960
16961	case DTRACEHIOC_REMOVE: {
16962		mutex_enter(&dtrace_lock);
16963		rval = dtrace_helper_destroygen(arg);
16964		mutex_exit(&dtrace_lock);
16965
16966		return (rval);
16967	}
16968
16969	default:
16970		break;
16971	}
16972
16973	return (ENOTTY);
16974}
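
/*
 * [Editor's note -- hedged userland sketch; fd and dof are hypothetical.]
 * The consumer side of these ioctls is libdtrace loading helper DOF into
 * /dev/dtrace/helper on behalf of a process, approximately:
 *
 *	dof_helper_t dh;
 *
 *	dh.dofhp_dof = (uint64_t)(uintptr_t)dof;
 *	dh.dofhp_addr = (uint64_t)(uintptr_t)dof;
 *	gen = ioctl(fd, DTRACEHIOC_ADDDOF, &dh);
 *	...
 *	(void) ioctl(fd, DTRACEHIOC_REMOVE, gen);
 *
 * The returned generation number is the handle that DTRACEHIOC_REMOVE
 * (and dtrace_helper_destroygen() above) key on.
 */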
16975
16976/*ARGSUSED*/
16977static int
16978dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
16979{
16980	minor_t minor = getminor(dev);
16981	dtrace_state_t *state;
16982	int rval;
16983
16984	if (minor == DTRACEMNRN_HELPER)
16985		return (dtrace_ioctl_helper(cmd, arg, rv));
16986
16987	state = ddi_get_soft_state(dtrace_softstate, minor);
16988
16989	if (state->dts_anon) {
16990		ASSERT(dtrace_anon.dta_state == NULL);
16991		state = state->dts_anon;
16992	}
16993
16994	switch (cmd) {
16995	case DTRACEIOC_PROVIDER: {
16996		dtrace_providerdesc_t pvd;
16997		dtrace_provider_t *pvp;
16998
16999		if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
17000			return (EFAULT);
17001
17002		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17003		mutex_enter(&dtrace_provider_lock);
17004
		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
			if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
				break;
		}

		mutex_exit(&dtrace_provider_lock);

		if (pvp == NULL)
			return (ESRCH);

		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));

		if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_EPROBE: {
		dtrace_eprobedesc_t epdesc;
		dtrace_ecb_t *ecb;
		dtrace_action_t *act;
		void *buf;
		size_t size;
		uintptr_t dest;
		int nrecs;

		if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		if (ecb->dte_probe == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
		epdesc.dtepd_uarg = ecb->dte_uarg;
		epdesc.dtepd_size = ecb->dte_size;

		nrecs = epdesc.dtepd_nrecs;
		epdesc.dtepd_nrecs = 0;
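		/*
		 * Count the records that will be copied out; aggregating
		 * actions and tuple members are deliberately skipped, as
		 * they are described via DTRACEIOC_AGGDESC instead.
		 */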
		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
				continue;

			epdesc.dtepd_nrecs++;
		}

		/*
		 * Now that we have the size, we need to allocate a temporary
		 * buffer in which to store the complete description.  We need
		 * the temporary buffer to be able to drop dtrace_lock()
		 * across the copyout(), below.
		 */
		size = sizeof (dtrace_eprobedesc_t) +
		    (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));

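		/*
		 * The buffer mirrors the user-visible layout: a
		 * dtrace_eprobedesc_t header immediately followed by
		 * dtepd_nrecs record descriptions in its trailing
		 * dtepd_rec[] array.
		 */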
		buf = kmem_alloc(size, KM_SLEEP);
		dest = (uintptr_t)buf;

		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);

		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
				continue;

			if (nrecs-- == 0)
				break;

			bcopy(&act->dta_rec, (void *)dest,
			    sizeof (dtrace_recdesc_t));
			dest += sizeof (dtrace_recdesc_t);
		}

		mutex_exit(&dtrace_lock);

		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
			kmem_free(buf, size);
			return (EFAULT);
		}

		kmem_free(buf, size);
		return (0);
	}

	case DTRACEIOC_AGGDESC: {
		dtrace_aggdesc_t aggdesc;
		dtrace_action_t *act;
		dtrace_aggregation_t *agg;
		int nrecs;
		uint32_t offs;
		dtrace_recdesc_t *lrec;
		void *buf;
		size_t size;
		uintptr_t dest;

		if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;

		nrecs = aggdesc.dtagd_nrecs;
		aggdesc.dtagd_nrecs = 0;

		offs = agg->dtag_base;
		lrec = &agg->dtag_action.dta_rec;
		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;

		for (act = agg->dtag_first; ; act = act->dta_next) {
			ASSERT(act->dta_intuple ||
			    DTRACEACT_ISAGG(act->dta_kind));

			/*
			 * If this action has a record size of zero, it
			 * denotes an argument to the aggregating action.
			 * Because the presence of this record doesn't (or
			 * shouldn't) affect the way the data is interpreted,
			 * we don't copy it out, to spare user-level the
			 * confusion of dealing with a zero-length record.
			 */
			if (act->dta_rec.dtrd_size == 0) {
				ASSERT(agg->dtag_hasarg);
				continue;
			}

			aggdesc.dtagd_nrecs++;

			if (act == &agg->dtag_action)
				break;
		}

		/*
		 * Now that we have the size, we need to allocate a temporary
		 * buffer in which to store the complete description.  We need
		 * the temporary buffer to be able to drop dtrace_lock()
		 * across the copyout(), below.
		 */
		size = sizeof (dtrace_aggdesc_t) +
		    (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));

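		/*
		 * As each record is copied out below, its dtrd_offset is
		 * rebased to be relative to the start of the aggregation
		 * (dtag_base), matching the dtagd_size computed above.
		 */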
		buf = kmem_alloc(size, KM_SLEEP);
		dest = (uintptr_t)buf;

		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);

		for (act = agg->dtag_first; ; act = act->dta_next) {
			dtrace_recdesc_t rec = act->dta_rec;

			/*
			 * See the comment in the above loop for why we pass
			 * over zero-length records.
			 */
			if (rec.dtrd_size == 0) {
				ASSERT(agg->dtag_hasarg);
				continue;
			}

			if (nrecs-- == 0)
				break;

			rec.dtrd_offset -= offs;
			bcopy(&rec, (void *)dest, sizeof (rec));
			dest += sizeof (dtrace_recdesc_t);

			if (act == &agg->dtag_action)
				break;
		}

		mutex_exit(&dtrace_lock);

		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
			kmem_free(buf, size);
			return (EFAULT);
		}

		kmem_free(buf, size);
		return (0);
	}

	case DTRACEIOC_ENABLE: {
		dof_hdr_t *dof;
		dtrace_enabling_t *enab = NULL;
		dtrace_vstate_t *vstate;
		int err = 0;

		*rv = 0;

		/*
		 * If a NULL argument has been passed, we take this as our
		 * cue to reevaluate our enablings.
		 */
		if (arg == NULL) {
			dtrace_enabling_matchall();

			return (0);
		}

		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
			return (rval);

		mutex_enter(&cpu_lock);
		mutex_enter(&dtrace_lock);
		vstate = &state->dts_vstate;

		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (EBUSY);
		}

		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (EINVAL);
		}

		if ((rval = dtrace_dof_options(dof, state)) != 0) {
			dtrace_enabling_destroy(enab);
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (rval);
		}

		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
			err = dtrace_enabling_retain(enab);
		} else {
			dtrace_enabling_destroy(enab);
		}

		mutex_exit(&cpu_lock);
		mutex_exit(&dtrace_lock);
		dtrace_dof_destroy(dof);

		return (err);
	}

	case DTRACEIOC_REPLICATE: {
		dtrace_repldesc_t desc;
		dtrace_probedesc_t *match = &desc.dtrpd_match;
		dtrace_probedesc_t *create = &desc.dtrpd_create;
		int err;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

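		/* Force NUL-termination of the user-supplied probe names. */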
		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		mutex_enter(&dtrace_lock);
		err = dtrace_enabling_replicate(state, match, create);
		mutex_exit(&dtrace_lock);

		return (err);
	}

	case DTRACEIOC_PROBEMATCH:
	case DTRACEIOC_PROBES: {
		dtrace_probe_t *probe = NULL;
		dtrace_probedesc_t desc;
		dtrace_probekey_t pkey;
		dtrace_id_t i;
		int m = 0;
		uint32_t priv;
		uid_t uid;
		zoneid_t zoneid;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		/*
		 * Before we attempt to match this probe, we want to give
		 * all providers the opportunity to provide it.
		 */
		if (desc.dtpd_id == DTRACE_IDNONE) {
			mutex_enter(&dtrace_provider_lock);
			dtrace_probe_provide(&desc, NULL);
			mutex_exit(&dtrace_provider_lock);
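			/*
			 * Probe IDs are 1-based, so bumping the ID from
			 * DTRACE_IDNONE (0) starts the search below at the
			 * first probe.
			 */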
			desc.dtpd_id++;
		}

		if (cmd == DTRACEIOC_PROBEMATCH) {
			dtrace_probekey(&desc, &pkey);
			pkey.dtpk_id = DTRACE_IDNONE;
		}

		dtrace_cred2priv(cr, &priv, &uid, &zoneid);

		mutex_enter(&dtrace_lock);

		if (cmd == DTRACEIOC_PROBEMATCH) {
			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
				if ((probe = dtrace_probes[i - 1]) != NULL &&
				    (m = dtrace_match_probe(probe, &pkey,
				    priv, uid, zoneid)) != 0)
					break;
			}

			if (m < 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}

		} else {
			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
				if ((probe = dtrace_probes[i - 1]) != NULL &&
				    dtrace_match_priv(probe, priv, uid, zoneid))
					break;
			}
		}

		if (probe == NULL) {
			mutex_exit(&dtrace_lock);
			return (ESRCH);
		}

		dtrace_probe_description(probe, &desc);
		mutex_exit(&dtrace_lock);

		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_PROBEARG: {
		dtrace_argdesc_t desc;
		dtrace_probe_t *probe;
		dtrace_provider_t *prov;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		if (desc.dtargd_id == DTRACE_IDNONE)
			return (EINVAL);

		if (desc.dtargd_ndx == DTRACE_ARGNONE)
			return (EINVAL);

		mutex_enter(&dtrace_provider_lock);
		mutex_enter(&mod_lock);
		mutex_enter(&dtrace_lock);

		if (desc.dtargd_id > dtrace_nprobes) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&mod_lock);
			mutex_exit(&dtrace_provider_lock);
			return (EINVAL);
		}

		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&mod_lock);
			mutex_exit(&dtrace_provider_lock);
			return (EINVAL);
		}

		mutex_exit(&dtrace_lock);

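		/*
		 * dtrace_lock is dropped before calling into the provider,
		 * but dtrace_provider_lock and mod_lock remain held so that
		 * neither the provider nor its module can disappear while
		 * dtps_getargdesc() runs.
		 */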
		prov = probe->dtpr_provider;

		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
			/*
			 * There isn't any typed information for this probe.
			 * Set the argument number to DTRACE_ARGNONE.
			 */
			desc.dtargd_ndx = DTRACE_ARGNONE;
		} else {
			desc.dtargd_native[0] = '\0';
			desc.dtargd_xlate[0] = '\0';
			desc.dtargd_mapping = desc.dtargd_ndx;

			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
			    probe->dtpr_id, probe->dtpr_arg, &desc);
		}

		mutex_exit(&mod_lock);
		mutex_exit(&dtrace_provider_lock);

		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_GO: {
		processorid_t cpuid;

		rval = dtrace_state_go(state, &cpuid);

		if (rval != 0)
			return (rval);

		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_STOP: {
		processorid_t cpuid;

		mutex_enter(&dtrace_lock);
		rval = dtrace_state_stop(state, &cpuid);
		mutex_exit(&dtrace_lock);

		if (rval != 0)
			return (rval);

		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_DOFGET: {
		dof_hdr_t hdr, *dof;
		uint64_t len;

		if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);
		dof = dtrace_dof_create(state);
		mutex_exit(&dtrace_lock);

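		/* Copy out no more than the caller's buffer can hold. */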
		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
		rval = copyout(dof, (void *)arg, len);
		dtrace_dof_destroy(dof);

		return (rval == 0 ? 0 : EFAULT);
	}

	case DTRACEIOC_AGGSNAP:
	case DTRACEIOC_BUFSNAP: {
		dtrace_bufdesc_t desc;
		caddr_t cached;
		dtrace_buffer_t *buf;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
			return (EINVAL);

		mutex_enter(&dtrace_lock);

		if (cmd == DTRACEIOC_BUFSNAP) {
			buf = &state->dts_buffer[desc.dtbd_cpu];
		} else {
			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
		}

		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
			size_t sz = buf->dtb_offset;

			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
				mutex_exit(&dtrace_lock);
				return (EBUSY);
			}

			/*
			 * If this buffer has already been consumed, we're
			 * going to indicate that there's nothing left here
			 * to consume.
			 */
			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
				mutex_exit(&dtrace_lock);

				desc.dtbd_size = 0;
				desc.dtbd_drops = 0;
				desc.dtbd_errors = 0;
				desc.dtbd_oldest = 0;
				sz = sizeof (desc);

				if (copyout(&desc, (void *)arg, sz) != 0)
					return (EFAULT);

				return (0);
			}

			/*
			 * If this is a ring buffer that has wrapped, we want
			 * to copy the whole thing out.
			 */
			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
				dtrace_buffer_polish(buf);
				sz = buf->dtb_size;
			}

			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
				mutex_exit(&dtrace_lock);
				return (EFAULT);
			}

			desc.dtbd_size = sz;
			desc.dtbd_drops = buf->dtb_drops;
			desc.dtbd_errors = buf->dtb_errors;
			desc.dtbd_oldest = buf->dtb_xamot_offset;
			desc.dtbd_timestamp = dtrace_gethrtime();

			mutex_exit(&dtrace_lock);

			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
				return (EFAULT);

			buf->dtb_flags |= DTRACEBUF_CONSUMED;

			return (0);
		}

		if (buf->dtb_tomax == NULL) {
			ASSERT(buf->dtb_xamot == NULL);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		cached = buf->dtb_tomax;
		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));

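		/*
		 * Cross call to the target CPU to switch its active
		 * (dtb_tomax) and inactive (dtb_xamot) buffers; the
		 * now-inactive snapshot can then be copied out without
		 * racing against probe context.
		 */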
		dtrace_xcall(desc.dtbd_cpu,
		    (dtrace_xcall_t)dtrace_buffer_switch, buf);

		state->dts_errors += buf->dtb_xamot_errors;

		/*
		 * If the buffers did not actually switch, then the cross call
		 * did not take place -- presumably because the given CPU is
		 * not in the ready set.  If this is the case, we'll return
		 * ENOENT.
		 */
		if (buf->dtb_tomax == cached) {
			ASSERT(buf->dtb_xamot != cached);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		ASSERT(cached == buf->dtb_xamot);

		/*
		 * We have our snapshot; now copy it out.
		 */
		if (copyout(buf->dtb_xamot, desc.dtbd_data,
		    buf->dtb_xamot_offset) != 0) {
			mutex_exit(&dtrace_lock);
			return (EFAULT);
		}

		desc.dtbd_size = buf->dtb_xamot_offset;
		desc.dtbd_drops = buf->dtb_xamot_drops;
		desc.dtbd_errors = buf->dtb_xamot_errors;
		desc.dtbd_oldest = 0;
		desc.dtbd_timestamp = buf->dtb_switched;

		mutex_exit(&dtrace_lock);

		/*
		 * Finally, copy out the buffer description.
		 */
		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_CONF: {
		dtrace_conf_t conf;

		bzero(&conf, sizeof (conf));
		conf.dtc_difversion = DIF_VERSION;
		conf.dtc_difintregs = DIF_DIR_NREGS;
		conf.dtc_diftupregs = DIF_DTR_NREGS;
		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;

		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_STATUS: {
		dtrace_status_t stat;
		dtrace_dstate_t *dstate;
		int i, j;
		uint64_t nerrs;

		/*
		 * See the comment in dtrace_state_deadman() for the reason
		 * for setting dts_laststatus to INT64_MAX before setting
		 * it to the correct value.
		 */
		state->dts_laststatus = INT64_MAX;
		dtrace_membar_producer();
		state->dts_laststatus = dtrace_gethrtime();

		bzero(&stat, sizeof (stat));

		mutex_enter(&dtrace_lock);

		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
			stat.dtst_exiting = 1;

		nerrs = state->dts_errors;
		dstate = &state->dts_vstate.dtvs_dynvars;

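		/*
		 * Sum the per-CPU dynamic-variable drop and error counts
		 * into the status snapshot.
		 */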
		for (i = 0; i < NCPU; i++) {
			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];

			stat.dtst_dyndrops += dcpu->dtdsc_drops;
			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;

			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
				stat.dtst_filled++;

			nerrs += state->dts_buffer[i].dtb_errors;

			for (j = 0; j < state->dts_nspeculations; j++) {
				dtrace_speculation_t *spec;
				dtrace_buffer_t *buf;

				spec = &state->dts_speculations[j];
				buf = &spec->dtsp_buffer[i];
				stat.dtst_specdrops += buf->dtb_xamot_drops;
			}
		}

		stat.dtst_specdrops_busy = state->dts_speculations_busy;
		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
		stat.dtst_dblerrors = state->dts_dblerrors;
		stat.dtst_killed =
		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
		stat.dtst_errors = nerrs;

		mutex_exit(&dtrace_lock);

		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
			return (EFAULT);

		return (0);
	}

	case DTRACEIOC_FORMAT: {
		dtrace_fmtdesc_t fmt;
		char *str;
		int len;

		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if (fmt.dtfd_format == 0 ||
		    fmt.dtfd_format > state->dts_nformats) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		/*
		 * Format strings are allocated contiguously and they are
		 * never freed; if a format index is less than the number
		 * of formats, we can assert that the format map is non-NULL
		 * and that the format for the specified index is non-NULL.
		 */
		ASSERT(state->dts_formats != NULL);
		str = state->dts_formats[fmt.dtfd_format - 1];
		ASSERT(str != NULL);

		len = strlen(str) + 1;

		if (len > fmt.dtfd_length) {
			fmt.dtfd_length = len;

			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}
		} else {
			if (copyout(str, fmt.dtfd_string, len) != 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}
		}

		mutex_exit(&dtrace_lock);
		return (0);
	}

	default:
		break;
	}

	return (ENOTTY);
}

/*ARGSUSED*/
static int
dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dtrace_state_t *state;

	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	ASSERT(dtrace_opens == 0);

	if (dtrace_helpers > 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		mutex_exit(&cpu_lock);
		return (DDI_FAILURE);
	}

	dtrace_provider = NULL;

	if ((state = dtrace_anon_grab()) != NULL) {
		/*
		 * If there were ECBs on this state, the provider should not
		 * have been allowed to detach; assert that there are none.
		 */
		ASSERT(state->dts_necbs == 0);
		dtrace_state_destroy(state);

		/*
		 * If we're being detached with anonymous state, we need to
		 * indicate to the kernel debugger that DTrace is now inactive.
		 */
		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
	}

	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
	dtrace_cpu_init = NULL;
	dtrace_helpers_cleanup = NULL;
	dtrace_helpers_fork = NULL;
	dtrace_cpustart_init = NULL;
	dtrace_cpustart_fini = NULL;
	dtrace_debugger_init = NULL;
	dtrace_debugger_fini = NULL;
	dtrace_modload = NULL;
	dtrace_modunload = NULL;

	ASSERT(dtrace_getf == 0);
	ASSERT(dtrace_closef == NULL);

	mutex_exit(&cpu_lock);

	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
	dtrace_probes = NULL;
	dtrace_nprobes = 0;

	dtrace_hash_destroy(dtrace_bymod);
	dtrace_hash_destroy(dtrace_byfunc);
	dtrace_hash_destroy(dtrace_byname);
	dtrace_bymod = NULL;
	dtrace_byfunc = NULL;
	dtrace_byname = NULL;

	kmem_cache_destroy(dtrace_state_cache);
	vmem_destroy(dtrace_minor);
	vmem_destroy(dtrace_arena);

	if (dtrace_toxrange != NULL) {
		kmem_free(dtrace_toxrange,
		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
		dtrace_toxrange = NULL;
		dtrace_toxranges = 0;
		dtrace_toxranges_max = 0;
	}

	ddi_remove_minor_node(dtrace_devi, NULL);
	dtrace_devi = NULL;

	ddi_soft_state_fini(&dtrace_softstate);

	ASSERT(dtrace_vtime_references == 0);
	ASSERT(dtrace_opens == 0);
	ASSERT(dtrace_retained == NULL);

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);

	/*
	 * We don't destroy the task queue until after we have dropped our
	 * locks (taskq_destroy() may block on running tasks).  To prevent
	 * attempting to do work after we have effectively detached but before
	 * the task queue has been destroyed, all tasks dispatched via the
	 * task queue must check that DTrace is still attached before
	 * performing any operation.
	 */
	taskq_destroy(dtrace_taskq);
	dtrace_taskq = NULL;

	return (DDI_SUCCESS);
}
#endif

#if defined(sun)
/*ARGSUSED*/
static int
dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)dtrace_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}
#endif

#if defined(sun)
static struct cb_ops dtrace_cb_ops = {
	dtrace_open,		/* open */
	dtrace_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	dtrace_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_NEW | D_MP		/* Driver compatibility flag */
};

static struct dev_ops dtrace_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	dtrace_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	dtrace_attach,		/* attach */
	dtrace_detach,		/* detach */
	nodev,			/* reset */
	&dtrace_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev			/* dev power */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"Dynamic Tracing",	/* name of module */
	&dtrace_ops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}
#else

static d_ioctl_t	dtrace_ioctl;
static d_ioctl_t	dtrace_ioctl_helper;
static void		dtrace_load(void *);
static int		dtrace_unload(void);
static struct cdev	*dtrace_dev;
static struct cdev	*helper_dev;

void dtrace_invop_init(void);
void dtrace_invop_uninit(void);

static struct cdevsw dtrace_cdevsw = {
	.d_version	= D_VERSION,
	.d_ioctl	= dtrace_ioctl,
	.d_open		= dtrace_open,
	.d_name		= "dtrace",
};

static struct cdevsw helper_cdevsw = {
	.d_version	= D_VERSION,
	.d_ioctl	= dtrace_ioctl_helper,
	.d_name		= "helper",
};

#include <dtrace_anon.c>
#include <dtrace_ioctl.c>
#include <dtrace_load.c>
#include <dtrace_modevent.c>
#include <dtrace_sysctl.c>
#include <dtrace_unload.c>
#include <dtrace_vtime.c>
#include <dtrace_hacks.c>
#include <dtrace_isa.c>
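
/*
 * The FreeBSD-specific driver glue -- anonymous enablings, ioctl
 * handling, load/unload, module events and sysctls -- lives in the
 * source fragments included textually above rather than in this file
 * proper.
 */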

SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);

DEV_MODULE(dtrace, dtrace_modevent, NULL);
MODULE_VERSION(dtrace, 1);
MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
#endif
