/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 284136 2015-06-07 20:45:13Z pfg $
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace).  The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file.  The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#include <sys/errno.h>
#if !defined(sun)
#include <sys/time.h>
#endif
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#if defined(sun)
#include <sys/ddi.h>
#include <sys/sunddi.h>
#endif
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#if defined(sun)
#include <sys/strsubr.h>
#endif
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#if defined(sun)
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#endif
#include <sys/ctf_api.h>
#if defined(sun)
#include <sys/panic.h>
#include <sys/priv_impl.h>
#endif
#include <sys/policy.h>
#if defined(sun)
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#endif
#include <sys/taskq.h>
#if defined(sun)
#include <sys/mkdev.h>
#include <sys/kdi.h>
#endif
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include "strtolctype.h"

/* FreeBSD includes: */
#if !defined(sun)
#include <sys/callout.h>
#include <sys/ctype.h>
#include <sys/eventhandler.h>
#include <sys/limits.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/dtrace_bsd.h>
#include <netinet/in.h>
#include "dtrace_cddl.h"
#include "dtrace_debug.c"
#endif

/*
 * DTrace Tunable Variables
 *
 * The following variables may be tuned by adding a line to /etc/system that
 * includes both the name of the DTrace module ("dtrace") and the name of the
 * variable.  For example:
 *
 *   set dtrace:dtrace_destructive_disallow = 1
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable.  Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively.  Further, these variables should not be tuned
 * dynamically via "mdb -kw" or other means; they should only be tuned via
 * /etc/system.
 */
int		dtrace_destructive_disallow = 0;
dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t		dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t	dtrace_dof_maxsize = (8 * 1024 * 1024);
size_t		dtrace_global_maxsize = (16 * 1024);
size_t		dtrace_actions_max = (16 * 1024);
size_t		dtrace_retain_max = 1024;
dtrace_optval_t	dtrace_helper_actions_max = 128;
dtrace_optval_t	dtrace_helper_providers_max = 32;
dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t		dtrace_strsize_default = 256;
dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_nspec_default = 1;
dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int		dtrace_msgdsize_max = 128;
hrtime_t	dtrace_chill_max = MSEC2NSEC(500);		/* 500 ms */
hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
int		dtrace_devdepth_max = 32;
int		dtrace_err_verbose;
hrtime_t	dtrace_deadman_interval = NANOSEC;
hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
hrtime_t	dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
#if !defined(sun)
int		dtrace_memstr_max = 4096;
#endif

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax.  One of these,
 * dtrace_zero, is made deliberately so:  it is provided as a source of
 * well-known, zero-filled memory.  While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */

/*
 * DTrace Internal Variables
 */
#if defined(sun)
static dev_info_t	*dtrace_devi;		/* device info */
#endif
#if defined(sun)
static vmem_t		*dtrace_arena;		/* probe ID arena */
static vmem_t		*dtrace_minor;		/* minor number arena */
#else
static taskq_t		*dtrace_taskq;		/* task queue */
static struct unrhdr	*dtrace_arena;		/* Probe ID number.     */
#endif
static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
static int		dtrace_nprobes;		/* number of probes */
static dtrace_provider_t *dtrace_provider;	/* provider list */
static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
static int		dtrace_opens;		/* number of opens */
static int		dtrace_helpers;		/* number of helpers */
static int		dtrace_getf;		/* number of unpriv getf()s */
#if defined(sun)
static void		*dtrace_softstate;	/* softstate pointer */
#endif
static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
static int		dtrace_toxranges;	/* number of toxic ranges */
static int		dtrace_toxranges_max;	/* size of toxic range array */
static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t	*dtrace_panicked;	/* panicking thread */
static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
static int		dtrace_dynvar_failclean; /* dynvars failed to clean */
#if !defined(sun)
static struct mtx	dtrace_unr_mtx;
MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
int		dtrace_in_probe;	/* non-zero if executing a probe */
#if defined(__i386__) || defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
uintptr_t	dtrace_in_probe_addr;	/* Address of invop when already in probe */
#endif
static eventhandler_tag	dtrace_kld_load_tag;
static eventhandler_tag	dtrace_kld_unload_try_tag;
#endif

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
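/*
 * Taken together, these rules imply a single overall acquisition order (a
 * summary derived from the rules above, not an additional constraint):
 *
 *	dtrace_meta_lock -> cpu_lock -> dtrace_provider_lock -> mod_lock ->
 *	dtrace_lock
 */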
static kmutex_t		dtrace_lock;		/* probe state lock */
static kmutex_t		dtrace_provider_lock;	/* provider state lock */
static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */

#if !defined(sun)
/* XXX FreeBSD hacks. */
#define cr_suid		cr_svuid
#define cr_sgid		cr_svgid
#define	ipaddr_t	in_addr_t
#define mod_modname	pathname
#define vuprintf	vprintf
#define ttoproc(_a)	((_a)->td_proc)
#define crgetzoneid(_a)	0
#define	NCPU		MAXCPU
#define SNOCD		0
#define CPU_ON_INTR(_a)	0

#define PRIV_EFFECTIVE		(1 << 0)
#define PRIV_DTRACE_KERNEL	(1 << 1)
#define PRIV_DTRACE_PROC	(1 << 2)
#define PRIV_DTRACE_USER	(1 << 3)
#define PRIV_PROC_OWNER		(1 << 4)
#define PRIV_PROC_ZONE		(1 << 5)
#define PRIV_ALL		~0

SYSCTL_DECL(_debug_dtrace);
SYSCTL_DECL(_kern_dtrace);
#endif

#if defined(sun)
#define curcpu	CPU->cpu_id
#endif


/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t	dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static dtrace_pops_t	dtrace_provider_ops = {
	(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
	(void (*)(void *, modctl_t *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	NULL,
	NULL,
	NULL,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};

static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 *
 * These variables should be set dynamically to enable helper tracing.  The
 * only variables that should be set are dtrace_helptrace_enable (which should
 * be set to a non-zero value to allocate helper tracing buffers on the next
 * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
 * non-zero value to deallocate helper tracing buffers on the next close of
 * /dev/dtrace).  When (and only when) helper tracing is disabled, the
 * buffer size may also be set via dtrace_helptrace_bufsize.
 */
int			dtrace_helptrace_enable = 0;
int			dtrace_helptrace_disable = 0;
int			dtrace_helptrace_bufsize = 16 * 1024 * 1024;
uint32_t		dtrace_helptrace_nlocals;
static dtrace_helptrace_t *dtrace_helptrace_buffer;
static uint32_t		dtrace_helptrace_next = 0;
static int		dtrace_helptrace_wrapped = 0;

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table.  This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation.  The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#ifdef DEBUG
static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation.  There is no real structure to this cpp
 * mishmash -- but is there ever?
 */
#define	DTRACE_HASHSTR(hash, probe)	\
	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))

#define	DTRACE_HASHNEXT(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)

#define	DTRACE_HASHPREV(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)

#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)

#define	DTRACE_AGGHASHSIZE_SLEW		17

#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier.  This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables.  To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables.  That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#if defined(sun)
#define	DTRACE_TLS_THRKEY(where) { \
	uint_t intr = 0; \
	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#define	DTRACE_TLS_THRKEY(where) { \
	solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
	uint_t intr = 0; \
	uint_t actv = _c->cpu_intr_actv; \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#endif
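/*
 * For example, with no interrupts active, the key computed for a thread
 * with identifier 42 is simply (42 + DIF_VARIABLE_MAX):  the bias by
 * DIF_VARIABLE_MAX keeps thread keys from ever equaling a variable
 * identifier, and the top three bits become non-zero only when the macro
 * is expanded in interrupt context.
 */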

#define	DT_BSWAP_8(x)	((x) & 0xff)
#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
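/*
 * For example, DT_BSWAP_16(0x1234) evaluates to 0x3412 and
 * DT_BSWAP_32(0x12345678) to 0x78563412:  each macro swaps the two halves
 * produced by the next-smaller byte swap.
 */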

#define	DT_MASK_LO 0x00000000FFFFFFFFULL

#define	DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#ifndef __x86
#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
	if (addr & (size - 1)) {					\
		*flags |= CPU_DTRACE_BADALIGN;				\
		cpu_core[curcpu].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}
#else
#define	DTRACE_ALIGNCHECK(addr, size, flags)
#endif

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by addr, sz.  We take care to avoid
 * problems with overflow and underflow of the unsigned quantities, and
 * disallow all negative sizes.  Ranges of size 0 are allowed.
 */
#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
	((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
	(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
	(testaddr) + (testsz) >= (testaddr))
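/*
 * For example, with a base of 0x1000 and a size of 0x100:  a test address
 * of 0xfff fails the first clause (the unsigned subtraction wraps to a
 * huge value), a test range of (0x10f0, 0x20) fails the second clause
 * (its end lies past the region), and the third clause rejects any test
 * range whose end wraps around the address space.
 */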

/*
 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it.  This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range.  Allocations of size zero are allowed.
 */
#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
	(mstate)->dtms_scratch_ptr >= (alloc_sz))
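/*
 * For example, with a scratch base of 0x1000, a scratch size of 0x100 and
 * a scratch pointer of 0x1080, allocations of up to 0x80 bytes fit.
 * Because alloc_sz stays isolated on the right-hand side, a pathological
 * allocation size cannot cause the arithmetic itself to wrap.
 */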
492
493#define	DTRACE_LOADFUNC(bits)						\
494/*CSTYLED*/								\
495uint##bits##_t								\
496dtrace_load##bits(uintptr_t addr)					\
497{									\
498	size_t size = bits / NBBY;					\
499	/*CSTYLED*/							\
500	uint##bits##_t rval;						\
501	int i;								\
502	volatile uint16_t *flags = (volatile uint16_t *)		\
503	    &cpu_core[curcpu].cpuc_dtrace_flags;			\
504									\
505	DTRACE_ALIGNCHECK(addr, size, flags);				\
506									\
507	for (i = 0; i < dtrace_toxranges; i++) {			\
508		if (addr >= dtrace_toxrange[i].dtt_limit)		\
509			continue;					\
510									\
511		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
512			continue;					\
513									\
514		/*							\
515		 * This address falls within a toxic region; return 0.	\
516		 */							\
517		*flags |= CPU_DTRACE_BADADDR;				\
518		cpu_core[curcpu].cpuc_dtrace_illval = addr;		\
519		return (0);						\
520	}								\
521									\
522	*flags |= CPU_DTRACE_NOFAULT;					\
523	/*CSTYLED*/							\
524	rval = *((volatile uint##bits##_t *)addr);			\
525	*flags &= ~CPU_DTRACE_NOFAULT;					\
526									\
527	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
528}
529
530#ifdef _LP64
531#define	dtrace_loadptr	dtrace_load64
532#else
533#define	dtrace_loadptr	dtrace_load32
534#endif
535
536#define	DTRACE_DYNHASH_FREE	0
537#define	DTRACE_DYNHASH_SINK	1
538#define	DTRACE_DYNHASH_VALID	2
539
540#define	DTRACE_MATCH_NEXT	0
541#define	DTRACE_MATCH_DONE	1
542#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
543#define	DTRACE_STATE_ALIGN	64
544
545#define	DTRACE_FLAGS2FLT(flags)						\
546	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
547	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
548	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
549	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
550	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
551	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
552	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
553	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
554	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
555	DTRACEFLT_UNKNOWN)
556
557#define	DTRACEACT_ISSTRING(act)						\
558	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
559	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
560
561/* Function prototype definitions: */
562static size_t dtrace_strlen(const char *, size_t);
563static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
564static void dtrace_enabling_provide(dtrace_provider_t *);
565static int dtrace_enabling_match(dtrace_enabling_t *, int *);
566static void dtrace_enabling_matchall(void);
567static void dtrace_enabling_reap(void);
568static dtrace_state_t *dtrace_anon_grab(void);
569static uint64_t dtrace_helper(int, dtrace_mstate_t *,
570    dtrace_state_t *, uint64_t, uint64_t);
571static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
572static void dtrace_buffer_drop(dtrace_buffer_t *);
573static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
574static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
575    dtrace_state_t *, dtrace_mstate_t *);
576static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
577    dtrace_optval_t);
578static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
579static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
580uint16_t dtrace_load16(uintptr_t);
581uint32_t dtrace_load32(uintptr_t);
582uint64_t dtrace_load64(uintptr_t);
583uint8_t dtrace_load8(uintptr_t);
584void dtrace_dynvar_clean(dtrace_dstate_t *);
585dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
586    size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
587uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
588static int dtrace_priv_proc(dtrace_state_t *);
589static void dtrace_getf_barrier(void);
590
591/*
592 * DTrace Probe Context Functions
593 *
594 * These functions are called from probe context.  Because probe context is
595 * any context in which C may be called, arbitrarily locks may be held,
596 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
597 * As a result, functions called from probe context may only call other DTrace
598 * support functions -- they may not interact at all with the system at large.
599 * (Note that the ASSERT macro is made probe-context safe by redefining it in
600 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
601 * loads are to be performed from probe context, they _must_ be in terms of
602 * the safe dtrace_load*() variants.
603 *
604 * Some functions in this block are not actually called from probe context;
605 * for these functions, there will be a comment above the function reading
606 * "Note:  not called from probe context."
607 */
608void
609dtrace_panic(const char *format, ...)
610{
611	va_list alist;
612
613	va_start(alist, format);
614#ifdef __FreeBSD__
615	vpanic(format, alist);
616#else
617	dtrace_vpanic(format, alist);
618#endif
619	va_end(alist);
620}
621
622int
623dtrace_assfail(const char *a, const char *f, int l)
624{
625	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
626
627	/*
628	 * We just need something here that even the most clever compiler
629	 * cannot optimize away.
630	 */
631	return (a[(uintptr_t)f]);
632}
633
634/*
635 * Atomically increment a specified error counter from probe context.
636 */
637static void
638dtrace_error(uint32_t *counter)
639{
640	/*
641	 * Most counters stored to in probe context are per-CPU counters.
642	 * However, there are some error conditions that are sufficiently
643	 * arcane that they don't merit per-CPU storage.  If these counters
644	 * are incremented concurrently on different CPUs, scalability will be
645	 * adversely affected -- but we don't expect them to be white-hot in a
646	 * correctly constructed enabling...
647	 */
648	uint32_t oval, nval;
649
650	do {
651		oval = *counter;
652
653		if ((nval = oval + 1) == 0) {
654			/*
655			 * If the counter would wrap, set it to 1 -- assuring
656			 * that the counter is never zero when we have seen
657			 * errors.  (The counter must be 32-bits because we
658			 * aren't guaranteed a 64-bit compare&swap operation.)
659			 * To save this code both the infamy of being fingered
660			 * by a priggish news story and the indignity of being
661			 * the target of a neo-puritan witch trial, we're
662			 * carefully avoiding any colorful description of the
663			 * likelihood of this condition -- but suffice it to
664			 * say that it is only slightly more likely than the
665			 * overflow of predicate cache IDs, as discussed in
666			 * dtrace_predicate_create().
667			 */
668			nval = 1;
669		}
670	} while (dtrace_cas32(counter, oval, nval) != oval);
671}
672
673/*
674 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
675 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
676 */
677DTRACE_LOADFUNC(8)
678DTRACE_LOADFUNC(16)
679DTRACE_LOADFUNC(32)
680DTRACE_LOADFUNC(64)
681
682static int
683dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
684{
685	if (dest < mstate->dtms_scratch_base)
686		return (0);
687
688	if (dest + size < dest)
689		return (0);
690
691	if (dest + size > mstate->dtms_scratch_ptr)
692		return (0);
693
694	return (1);
695}
696
697static int
698dtrace_canstore_statvar(uint64_t addr, size_t sz,
699    dtrace_statvar_t **svars, int nsvars)
700{
701	int i;
702
703	for (i = 0; i < nsvars; i++) {
704		dtrace_statvar_t *svar = svars[i];
705
706		if (svar == NULL || svar->dtsv_size == 0)
707			continue;
708
709		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
710			return (1);
711	}
712
713	return (0);
714}
715
716/*
717 * Check to see if the address is within a memory region to which a store may
718 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
719 * region.  The caller of dtrace_canstore() is responsible for performing any
720 * alignment checks that are needed before stores are actually executed.
721 */
722static int
723dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
724    dtrace_vstate_t *vstate)
725{
726	/*
727	 * First, check to see if the address is in scratch space...
728	 */
729	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
730	    mstate->dtms_scratch_size))
731		return (1);
732
733	/*
734	 * Now check to see if it's a dynamic variable.  This check will pick
735	 * up both thread-local variables and any global dynamically-allocated
736	 * variables.
737	 */
738	if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
739	    vstate->dtvs_dynvars.dtds_size)) {
740		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
741		uintptr_t base = (uintptr_t)dstate->dtds_base +
742		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
743		uintptr_t chunkoffs;
744
745		/*
746		 * Before we assume that we can store here, we need to make
747		 * sure that it isn't in our metadata -- storing to our
748		 * dynamic variable metadata would corrupt our state.  For
749		 * the range to not include any dynamic variable metadata,
750		 * it must:
751		 *
752		 *	(1) Start above the hash table that is at the base of
753		 *	the dynamic variable space
754		 *
755		 *	(2) Have a starting chunk offset that is beyond the
756		 *	dtrace_dynvar_t that is at the base of every chunk
757		 *
758		 *	(3) Not span a chunk boundary
759		 *
760		 */
761		if (addr < base)
762			return (0);
763
764		chunkoffs = (addr - base) % dstate->dtds_chunksize;
765
766		if (chunkoffs < sizeof (dtrace_dynvar_t))
767			return (0);
768
769		if (chunkoffs + sz > dstate->dtds_chunksize)
770			return (0);
771
772		return (1);
773	}
774
775	/*
776	 * Finally, check the static local and global variables.  These checks
777	 * take the longest, so we perform them last.
778	 */
779	if (dtrace_canstore_statvar(addr, sz,
780	    vstate->dtvs_locals, vstate->dtvs_nlocals))
781		return (1);
782
783	if (dtrace_canstore_statvar(addr, sz,
784	    vstate->dtvs_globals, vstate->dtvs_nglobals))
785		return (1);
786
787	return (0);
788}
789
790
791/*
792 * Convenience routine to check to see if the address is within a memory
793 * region in which a load may be issued given the user's privilege level;
794 * if not, it sets the appropriate error flags and loads 'addr' into the
795 * illegal value slot.
796 *
797 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
798 * appropriate memory access protection.
799 */
800static int
801dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
802    dtrace_vstate_t *vstate)
803{
804	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
805	file_t *fp;
806
807	/*
808	 * If we hold the privilege to read from kernel memory, then
809	 * everything is readable.
810	 */
811	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
812		return (1);
813
814	/*
815	 * You can obviously read that which you can store.
816	 */
817	if (dtrace_canstore(addr, sz, mstate, vstate))
818		return (1);
819
820	/*
821	 * We're allowed to read from our own string table.
822	 */
823	if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
824	    mstate->dtms_difo->dtdo_strlen))
825		return (1);
826
827	if (vstate->dtvs_state != NULL &&
828	    dtrace_priv_proc(vstate->dtvs_state)) {
829		proc_t *p;
830
831		/*
832		 * When we have privileges to the current process, there are
833		 * several context-related kernel structures that are safe to
834		 * read, even absent the privilege to read from kernel memory.
835		 * These reads are safe because these structures contain only
836		 * state that (1) we're permitted to read, (2) is harmless or
837		 * (3) contains pointers to additional kernel state that we're
838		 * not permitted to read (and as such, do not present an
839		 * opportunity for privilege escalation).  Finally (and
840		 * critically), because of the nature of their relation with
841		 * the current thread context, the memory associated with these
842		 * structures cannot change over the duration of probe context,
843		 * and it is therefore impossible for this memory to be
844		 * deallocated and reallocated as something else while it's
845		 * being operated upon.
846		 */
847		if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
848			return (1);
849
850		if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
851		    sz, curthread->t_procp, sizeof (proc_t))) {
852			return (1);
853		}
854
855		if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
856		    curthread->t_cred, sizeof (cred_t))) {
857			return (1);
858		}
859
860#if defined(sun)
861		if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
862		    &(p->p_pidp->pid_id), sizeof (pid_t))) {
863			return (1);
864		}
865
866		if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
867		    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
868			return (1);
869		}
870#endif
871	}
872
873	if ((fp = mstate->dtms_getf) != NULL) {
874		uintptr_t psz = sizeof (void *);
875		vnode_t *vp;
876		vnodeops_t *op;
877
878		/*
879		 * When getf() returns a file_t, the enabling is implicitly
880		 * granted the (transient) right to read the returned file_t
881		 * as well as the v_path and v_op->vnop_name of the underlying
882		 * vnode.  These accesses are allowed after a successful
883		 * getf() because the members that they refer to cannot change
884		 * once set -- and the barrier logic in the kernel's closef()
885		 * path assures that the file_t and its referenced vode_t
886		 * cannot themselves be stale (that is, it impossible for
887		 * either dtms_getf itself or its f_vnode member to reference
888		 * freed memory).
889		 */
890		if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
891			return (1);
892
893		if ((vp = fp->f_vnode) != NULL) {
894#if defined(sun)
895			if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
896				return (1);
897			if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
898			    vp->v_path, strlen(vp->v_path) + 1)) {
899				return (1);
900			}
901#endif
902
903			if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
904				return (1);
905
906#if defined(sun)
907			if ((op = vp->v_op) != NULL &&
908			    DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
909				return (1);
910			}
911
912			if (op != NULL && op->vnop_name != NULL &&
913			    DTRACE_INRANGE(addr, sz, op->vnop_name,
914			    strlen(op->vnop_name) + 1)) {
915				return (1);
916			}
917#endif
918		}
919	}
920
921	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
922	*illval = addr;
923	return (0);
924}
925
926/*
927 * Convenience routine to check to see if a given string is within a memory
928 * region in which a load may be issued given the user's privilege level;
929 * this exists so that we don't need to issue unnecessary dtrace_strlen()
930 * calls in the event that the user has all privileges.
931 */
932static int
933dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
934    dtrace_vstate_t *vstate)
935{
936	size_t strsz;
937
938	/*
939	 * If we hold the privilege to read from kernel memory, then
940	 * everything is readable.
941	 */
942	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
943		return (1);
944
945	strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
946	if (dtrace_canload(addr, strsz, mstate, vstate))
947		return (1);
948
949	return (0);
950}
951
952/*
953 * Convenience routine to check to see if a given variable is within a memory
954 * region in which a load may be issued given the user's privilege level.
955 */
956static int
957dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
958    dtrace_vstate_t *vstate)
959{
960	size_t sz;
961	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
962
963	/*
964	 * If we hold the privilege to read from kernel memory, then
965	 * everything is readable.
966	 */
967	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
968		return (1);
969
970	if (type->dtdt_kind == DIF_TYPE_STRING)
971		sz = dtrace_strlen(src,
972		    vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
973	else
974		sz = type->dtdt_size;
975
976	return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
977}
978
979/*
980 * Convert a string to a signed integer using safe loads.
981 *
982 * NOTE: This function uses various macros from strtolctype.h to manipulate
983 * digit values, etc -- these have all been checked to ensure they make
984 * no additional function calls.
985 */
986static int64_t
987dtrace_strtoll(char *input, int base, size_t limit)
988{
989	uintptr_t pos = (uintptr_t)input;
990	int64_t val = 0;
991	int x;
992	boolean_t neg = B_FALSE;
993	char c, cc, ccc;
994	uintptr_t end = pos + limit;
995
996	/*
997	 * Consume any whitespace preceding digits.
998	 */
999	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
1000		pos++;
1001
1002	/*
1003	 * Handle an explicit sign if one is present.
1004	 */
1005	if (c == '-' || c == '+') {
1006		if (c == '-')
1007			neg = B_TRUE;
1008		c = dtrace_load8(++pos);
1009	}
1010
1011	/*
1012	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1013	 * if present.
1014	 */
1015	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
1016	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
1017		pos += 2;
1018		c = ccc;
1019	}
1020
1021	/*
1022	 * Read in contiguous digits until the first non-digit character.
1023	 */
1024	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
1025	    c = dtrace_load8(++pos))
1026		val = val * base + x;
1027
1028	return (neg ? -val : val);
1029}
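/*
 * For example, given a sufficient limit, dtrace_strtoll("  -0x1f", 16,
 * limit) consumes the leading whitespace, the sign and the "0x" prefix and
 * returns -31; parsing stops at the first character that is not a valid
 * digit in the specified base.
 */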

/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
	uint8_t c1, c2;
	volatile uint16_t *flags;

	if (s1 == s2 || limit == 0)
		return (0);

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	do {
		if (s1 == NULL) {
			c1 = '\0';
		} else {
			c1 = dtrace_load8((uintptr_t)s1++);
		}

		if (s2 == NULL) {
			c2 = '\0';
		} else {
			c2 = dtrace_load8((uintptr_t)s2++);
		}

		if (c1 != c2)
			return (c1 - c2);
	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

	return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * lim parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	uint_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
	uintptr_t taddr, tsize;
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		taddr = dtrace_toxrange[i].dtt_base;
		tsize = dtrace_toxrange[i].dtt_limit - taddr;

		if (kaddr - taddr < tsize) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
			return (1);
		}

		if (taddr - kaddr < size) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = taddr;
			return (1);
		}
	}

	return (0);
}
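/*
 * Note that both comparisons above rely on unsigned wraparound:
 * (kaddr - taddr < tsize) holds only when kaddr lies within the toxic
 * range (a kaddr below taddr wraps to a huge value), while
 * (taddr - kaddr < size) catches ranges that begin below the toxic
 * region but extend into it.
 */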

/*
 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 * memory specified by the DIF program.  The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace.  As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst;
		const uint8_t *s2 = src;

		if (s1 <= s2) {
			do {
				*s1++ = dtrace_load8((uintptr_t)s2++);
			} while (--len != 0);
		} else {
			s2 += len;
			s1 += len;

			do {
				*--s1 = dtrace_load8((uintptr_t)--s2);
			} while (--len != 0);
		}
	}
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered.  The src is assumed to
 * be unsafe memory specified by the DIF program.  The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst, c;
		const uint8_t *s2 = src;

		do {
			*s1++ = c = dtrace_load8((uintptr_t)s2++);
		} while (--len != 0 && c != '\0');
	}
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type.  The src is assumed to be unsafe memory specified by the DIF
 * program.  The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
{
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_strcpy(src, dst, type->dtdt_size);
	} else {
		dtrace_bcopy(src, dst, type->dtdt_size);
	}
}

/*
 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
	volatile uint16_t *flags;

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	if (s1 == s2)
		return (0);

	if (s1 == NULL || s2 == NULL)
		return (1);

	if (s1 != s2 && len != 0) {
		const uint8_t *ps1 = s1;
		const uint8_t *ps2 = s2;

		do {
			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
				return (1);
		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
	}
	return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop.  Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uchar_t *cp;

	for (cp = dst; len != 0; len--)
		*cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t result[2];

	result[0] = addend1[0] + addend2[0];
	result[1] = addend1[1] + addend2[1] +
	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

	sum[0] = result[0];
	sum[1] = result[1];
}
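/*
 * For example, adding { UINT64_MAX, 0 } to { 1, 0 } yields { 0, 1 }:  the
 * low-order sum wraps to zero, and the (result[0] < addend1[0]) test above
 * detects the wrap and carries a one into the high-order word.
 */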

/*
 * Shift the 128-bit value in a by b. If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	uint64_t mask;

	if (b == 0)
		return;

	if (b < 0) {
		b = -b;
		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			a[0] >>= b;
			mask = 1LL << (64 - b);
			mask -= 1;
			a[0] |= ((a[1] & mask) << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			a[1] <<= b;
			mask = a[0] >> (64 - b);
			a[1] |= mask;
			a[0] <<= b;
		}
	}
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
	uint64_t hi1, hi2, lo1, lo2;
	uint64_t tmp[2];

	hi1 = factor1 >> 32;
	hi2 = factor2 >> 32;

	lo1 = factor1 & DT_MASK_LO;
	lo2 = factor2 & DT_MASK_LO;

	product[0] = lo1 * lo2;
	product[1] = hi1 * hi2;

	tmp[0] = hi1 * lo2;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);

	tmp[0] = hi2 * lo1;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);
}
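/*
 * For example, multiplying 0x100000000 (2^32) by 2 decomposes into hi1 = 1,
 * lo1 = 0, hi2 = 0 and lo2 = 2; the only non-zero partial product is
 * hi1 * lo2 = 2, which the 32-bit shift promotes to 2^33, yielding the
 * 128-bit product { 0x200000000, 0 }.
 */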
1306
1307/*
1308 * This privilege check should be used by actions and subroutines to
1309 * verify that the user credentials of the process that enabled the
1310 * invoking ECB match the target credentials
1311 */
1312static int
1313dtrace_priv_proc_common_user(dtrace_state_t *state)
1314{
1315	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1316
1317	/*
1318	 * We should always have a non-NULL state cred here, since if cred
1319	 * is null (anonymous tracing), we fast-path bypass this routine.
1320	 */
1321	ASSERT(s_cr != NULL);
1322
1323	if ((cr = CRED()) != NULL &&
1324	    s_cr->cr_uid == cr->cr_uid &&
1325	    s_cr->cr_uid == cr->cr_ruid &&
1326	    s_cr->cr_uid == cr->cr_suid &&
1327	    s_cr->cr_gid == cr->cr_gid &&
1328	    s_cr->cr_gid == cr->cr_rgid &&
1329	    s_cr->cr_gid == cr->cr_sgid)
1330		return (1);
1331
1332	return (0);
1333}
1334
1335/*
1336 * This privilege check should be used by actions and subroutines to
1337 * verify that the zone of the process that enabled the invoking ECB
1338 * matches the target credentials
1339 */
1340static int
1341dtrace_priv_proc_common_zone(dtrace_state_t *state)
1342{
1343#if defined(sun)
1344	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1345
1346	/*
1347	 * We should always have a non-NULL state cred here, since if cred
1348	 * is null (anonymous tracing), we fast-path bypass this routine.
1349	 */
1350	ASSERT(s_cr != NULL);
1351
1352	if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
1353		return (1);
1354
1355	return (0);
1356#else
1357	return (1);
1358#endif
1359}
1360
1361/*
1362 * This privilege check should be used by actions and subroutines to
1363 * verify that the process has not setuid or changed credentials.
1364 */
1365static int
1366dtrace_priv_proc_common_nocd(void)
1367{
1368	proc_t *proc;
1369
1370	if ((proc = ttoproc(curthread)) != NULL &&
1371	    !(proc->p_flag & SNOCD))
1372		return (1);
1373
1374	return (0);
1375}
1376
1377static int
1378dtrace_priv_proc_destructive(dtrace_state_t *state)
1379{
1380	int action = state->dts_cred.dcr_action;
1381
1382	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1383	    dtrace_priv_proc_common_zone(state) == 0)
1384		goto bad;
1385
1386	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1387	    dtrace_priv_proc_common_user(state) == 0)
1388		goto bad;
1389
1390	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1391	    dtrace_priv_proc_common_nocd() == 0)
1392		goto bad;
1393
1394	return (1);
1395
1396bad:
1397	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1398
1399	return (0);
1400}
1401
1402static int
1403dtrace_priv_proc_control(dtrace_state_t *state)
1404{
1405	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1406		return (1);
1407
1408	if (dtrace_priv_proc_common_zone(state) &&
1409	    dtrace_priv_proc_common_user(state) &&
1410	    dtrace_priv_proc_common_nocd())
1411		return (1);
1412
1413	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1414
1415	return (0);
1416}
1417
1418static int
1419dtrace_priv_proc(dtrace_state_t *state)
1420{
1421	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1422		return (1);
1423
1424	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1425
1426	return (0);
1427}
1428
1429static int
1430dtrace_priv_kernel(dtrace_state_t *state)
1431{
1432	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1433		return (1);
1434
1435	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1436
1437	return (0);
1438}
1439
1440static int
1441dtrace_priv_kernel_destructive(dtrace_state_t *state)
1442{
1443	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1444		return (1);
1445
1446	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1447
1448	return (0);
1449}
1450
1451/*
1452 * Determine if the dte_cond of the specified ECB allows for processing of
1453 * the current probe to continue.  Note that this routine may allow continued
1454 * processing, but with access(es) stripped from the mstate's dtms_access
1455 * field.
1456 */
1457static int
1458dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
1459    dtrace_ecb_t *ecb)
1460{
1461	dtrace_probe_t *probe = ecb->dte_probe;
1462	dtrace_provider_t *prov = probe->dtpr_provider;
1463	dtrace_pops_t *pops = &prov->dtpv_pops;
1464	int mode = DTRACE_MODE_NOPRIV_DROP;
1465
1466	ASSERT(ecb->dte_cond);
1467
1468#if defined(sun)
1469	if (pops->dtps_mode != NULL) {
1470		mode = pops->dtps_mode(prov->dtpv_arg,
1471		    probe->dtpr_id, probe->dtpr_arg);
1472
1473		ASSERT((mode & DTRACE_MODE_USER) ||
1474		    (mode & DTRACE_MODE_KERNEL));
1475		ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
1476		    (mode & DTRACE_MODE_NOPRIV_DROP));
1477	}
1478
1479	/*
1480	 * If the dte_cond bits indicate that this consumer is only allowed to
1481	 * see user-mode firings of this probe, call the provider's dtps_mode()
1482	 * entry point to check that the probe was fired while in a user
1483	 * context.  If that's not the case, use the policy specified by the
1484	 * provider to determine if we drop the probe or merely restrict
1485	 * operation.
1486	 */
1487	if (ecb->dte_cond & DTRACE_COND_USERMODE) {
1488		ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
1489
1490		if (!(mode & DTRACE_MODE_USER)) {
1491			if (mode & DTRACE_MODE_NOPRIV_DROP)
1492				return (0);
1493
1494			mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1495		}
1496	}
1497#endif
1498
1499	/*
1500	 * This is more subtle than it looks. We have to be absolutely certain
1501	 * that CRED() isn't going to change out from under us so it's only
1502	 * legit to examine that structure if we're in constrained situations.
1503	 * Currently, the only times we'll this check is if a non-super-user
1504	 * has enabled the profile or syscall providers -- providers that
1505	 * allow visibility of all processes. For the profile case, the check
1506	 * above will ensure that we're examining a user context.
1507	 */
1508	if (ecb->dte_cond & DTRACE_COND_OWNER) {
1509		cred_t *cr;
1510		cred_t *s_cr = state->dts_cred.dcr_cred;
1511		proc_t *proc;
1512
1513		ASSERT(s_cr != NULL);
1514
1515		if ((cr = CRED()) == NULL ||
1516		    s_cr->cr_uid != cr->cr_uid ||
1517		    s_cr->cr_uid != cr->cr_ruid ||
1518		    s_cr->cr_uid != cr->cr_suid ||
1519		    s_cr->cr_gid != cr->cr_gid ||
1520		    s_cr->cr_gid != cr->cr_rgid ||
1521		    s_cr->cr_gid != cr->cr_sgid ||
1522		    (proc = ttoproc(curthread)) == NULL ||
1523		    (proc->p_flag & SNOCD)) {
1524			if (mode & DTRACE_MODE_NOPRIV_DROP)
1525				return (0);
1526
1527#if defined(sun)
1528			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1529#endif
1530		}
1531	}
1532
1533#if defined(sun)
1534	/*
1535	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1536	 * in our zone, check to see if our mode policy is to restrict rather
1537	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1538	 * and DTRACE_ACCESS_ARGS
1539	 */
1540	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1541		cred_t *cr;
1542		cred_t *s_cr = state->dts_cred.dcr_cred;
1543
1544		ASSERT(s_cr != NULL);
1545
1546		if ((cr = CRED()) == NULL ||
1547		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1548			if (mode & DTRACE_MODE_NOPRIV_DROP)
1549				return (0);
1550
1551			mstate->dtms_access &=
1552			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1553		}
1554	}
1555#endif
1556
1557	return (1);
1558}
1559
1560/*
1561 * Note:  not called from probe context.  This function is called
1562 * asynchronously (and at a regular interval) from outside of probe context to
1563 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1564 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1565 */
1566void
1567dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1568{
1569	dtrace_dynvar_t *dirty;
1570	dtrace_dstate_percpu_t *dcpu;
1571	dtrace_dynvar_t **rinsep;
1572	int i, j, work = 0;
1573
1574	for (i = 0; i < NCPU; i++) {
1575		dcpu = &dstate->dtds_percpu[i];
1576		rinsep = &dcpu->dtdsc_rinsing;
1577
1578		/*
1579		 * If the dirty list is NULL, there is no dirty work to do.
1580		 */
1581		if (dcpu->dtdsc_dirty == NULL)
1582			continue;
1583
1584		if (dcpu->dtdsc_rinsing != NULL) {
1585			/*
1586			 * If the rinsing list is non-NULL, then it is because
1587			 * this CPU was selected to accept another CPU's
1588			 * dirty list -- and since that time, dirty buffers
1589			 * have accumulated.  This is a highly unlikely
1590			 * condition, but we choose to ignore the dirty
1591			 * buffers -- they'll be picked up a future cleanse.
1592			 */
1593			continue;
1594		}
1595
1596		if (dcpu->dtdsc_clean != NULL) {
1597			/*
1598			 * If the clean list is non-NULL, then we're in a
1599			 * situation where a CPU has done deallocations (we
1600			 * have a non-NULL dirty list) but no allocations (we
1601			 * also have a non-NULL clean list).  We can't simply
1602			 * move the dirty list into the clean list on this
1603			 * CPU, yet we also don't want to allow this condition
1604			 * to persist, lest a short clean list prevent a
1605			 * massive dirty list from being cleaned (which in
1606			 * turn could lead to otherwise avoidable dynamic
1607			 * drops).  To deal with this, we look for some CPU
1608			 * with a NULL clean list, NULL dirty list, and NULL
1609			 * rinsing list -- and then we borrow this CPU to
1610			 * rinse our dirty list.
1611			 */
1612			for (j = 0; j < NCPU; j++) {
1613				dtrace_dstate_percpu_t *rinser;
1614
1615				rinser = &dstate->dtds_percpu[j];
1616
1617				if (rinser->dtdsc_rinsing != NULL)
1618					continue;
1619
1620				if (rinser->dtdsc_dirty != NULL)
1621					continue;
1622
1623				if (rinser->dtdsc_clean != NULL)
1624					continue;
1625
1626				rinsep = &rinser->dtdsc_rinsing;
1627				break;
1628			}
1629
1630			if (j == NCPU) {
1631				/*
1632				 * We were unable to find another CPU that
1633				 * could accept this dirty list -- we are
1634				 * therefore unable to clean it now.
1635				 */
1636				dtrace_dynvar_failclean++;
1637				continue;
1638			}
1639		}
1640
1641		work = 1;
1642
1643		/*
1644		 * Atomically move the dirty list aside.
1645		 */
1646		do {
1647			dirty = dcpu->dtdsc_dirty;
1648
1649			/*
1650			 * Before we zap the dirty list, set the rinsing list.
1651			 * (This allows for a potential assertion in
1652			 * dtrace_dynvar():  if a free dynamic variable appears
1653			 * on a hash chain, either the dirty list or the
1654			 * rinsing list for some CPU must be non-NULL.)
1655			 */
1656			*rinsep = dirty;
1657			dtrace_membar_producer();
1658		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
1659		    dirty, NULL) != dirty);
1660	}
1661
1662	if (!work) {
1663		/*
1664		 * We have no work to do; we can simply return.
1665		 */
1666		return;
1667	}
1668
1669	dtrace_sync();
1670
1671	for (i = 0; i < NCPU; i++) {
1672		dcpu = &dstate->dtds_percpu[i];
1673
1674		if (dcpu->dtdsc_rinsing == NULL)
1675			continue;
1676
1677		/*
1678		 * We are now guaranteed that no hash chain contains a pointer
1679		 * into this dirty list; we can make it clean.
1680		 */
1681		ASSERT(dcpu->dtdsc_clean == NULL);
1682		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1683		dcpu->dtdsc_rinsing = NULL;
1684	}
1685
1686	/*
1687	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1688	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1689	 * This prevents a race whereby a CPU incorrectly decides that
1690	 * the state should be something other than DTRACE_DSTATE_CLEAN
1691	 * after dtrace_dynvar_clean() has completed.
1692	 */
1693	dtrace_sync();
1694
1695	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1696}
1697
1698/*
1699 * Depending on the value of the op parameter, this function looks-up,
1700 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1701 * allocation is requested, this function will return a pointer to a
1702 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1703 * variable can be allocated.  If NULL is returned, the appropriate counter
1704 * will be incremented.
1705 */
1706dtrace_dynvar_t *
1707dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1708    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1709    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1710{
1711	uint64_t hashval = DTRACE_DYNHASH_VALID;
1712	dtrace_dynhash_t *hash = dstate->dtds_hash;
1713	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1714	processorid_t me = curcpu, cpu = me;
1715	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1716	size_t bucket, ksize;
1717	size_t chunksize = dstate->dtds_chunksize;
1718	uintptr_t kdata, lock, nstate;
1719	uint_t i;
1720
1721	ASSERT(nkeys != 0);
1722
1723	/*
1724	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1725	 * algorithm.  For the by-value portions, we perform the algorithm in
1726	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1727	 * bit, and seems to have only a minute effect on distribution.  For
1728	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1729	 * over each referenced byte.  It's painful to do this, but it's much
1730	 * better than pathological hash distribution.  The efficacy of the
1731	 * hashing algorithm (and a comparison with other algorithms) may be
1732	 * found by running the ::dtrace_dynstat MDB dcmd.
1733	 */
1734	for (i = 0; i < nkeys; i++) {
1735		if (key[i].dttk_size == 0) {
1736			uint64_t val = key[i].dttk_value;
1737
1738			hashval += (val >> 48) & 0xffff;
1739			hashval += (hashval << 10);
1740			hashval ^= (hashval >> 6);
1741
1742			hashval += (val >> 32) & 0xffff;
1743			hashval += (hashval << 10);
1744			hashval ^= (hashval >> 6);
1745
1746			hashval += (val >> 16) & 0xffff;
1747			hashval += (hashval << 10);
1748			hashval ^= (hashval >> 6);
1749
1750			hashval += val & 0xffff;
1751			hashval += (hashval << 10);
1752			hashval ^= (hashval >> 6);
1753		} else {
1754			/*
1755			 * This is incredibly painful, but it beats the hell
1756			 * out of the alternative.
1757			 */
1758			uint64_t j, size = key[i].dttk_size;
1759			uintptr_t base = (uintptr_t)key[i].dttk_value;
1760
1761			if (!dtrace_canload(base, size, mstate, vstate))
1762				break;
1763
1764			for (j = 0; j < size; j++) {
1765				hashval += dtrace_load8(base + j);
1766				hashval += (hashval << 10);
1767				hashval ^= (hashval >> 6);
1768			}
1769		}
1770	}
1771
1772	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1773		return (NULL);
1774
1775	hashval += (hashval << 3);
1776	hashval ^= (hashval >> 11);
1777	hashval += (hashval << 15);
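
	/*
	 * For reference, the hash just computed -- the mixing loop above
	 * plus this finalization -- corresponds to the canonical 32-bit
	 * byte-at-a-time form of Jenkins' "One-at-a-time" hash, sketched
	 * below (illustrative only; the name is hypothetical, and the
	 * by-value loop above mixes in 16-bit chunks as noted):
	 *
	 *	uint32_t
	 *	jenkins_oaat(const uint8_t *data, size_t len)
	 *	{
	 *		uint32_t h = 0;
	 *		size_t i;
	 *
	 *		for (i = 0; i < len; i++) {
	 *			h += data[i];
	 *			h += (h << 10);
	 *			h ^= (h >> 6);
	 *		}
	 *
	 *		h += (h << 3);
	 *		h ^= (h >> 11);
	 *		h += (h << 15);
	 *
	 *		return (h);
	 *	}
	 */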
1778
1779	/*
1780	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1781	 * comes out to be one of our two sentinel hash values.  If this
1782	 * actually happens, we set the hashval to be a value known to be a
1783	 * non-sentinel value.
1784	 */
1785	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1786		hashval = DTRACE_DYNHASH_VALID;
1787
1788	/*
1789	 * Yes, it's painful to do a divide here.  If the cycle count becomes
1790	 * important here, tricks can be pulled to reduce it.  (However, it's
1791	 * critical that hash collisions be kept to an absolute minimum;
1792	 * they're much more painful than a divide.)  It's better to have a
1793	 * solution that generates few collisions and still keeps things
1794	 * relatively simple.
1795	 */
1796	bucket = hashval % dstate->dtds_hashsize;
1797
1798	if (op == DTRACE_DYNVAR_DEALLOC) {
1799		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1800
1801		for (;;) {
1802			while ((lock = *lockp) & 1)
1803				continue;
1804
1805			if (dtrace_casptr((volatile void *)lockp,
1806			    (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
1807				break;
1808		}
1809
1810		dtrace_membar_producer();
1811	}
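
	/*
	 * The bucket lock protocol, in outline:  the low bit of dtdh_lock
	 * is the lock itself.  A deallocator spins until the word is even
	 * and then attempts to CAS in (lock + 1); on success it owns the
	 * bucket, and it releases it by incrementing the word again (below).
	 * Lock-free lookups instead snapshot the lock word before walking
	 * the chain and re-check it afterwards; if the word has changed, a
	 * deallocation may have raced with the walk, and the lookup retries
	 * from "top".
	 */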
1812
1813top:
1814	prev = NULL;
1815	lock = hash[bucket].dtdh_lock;
1816
1817	dtrace_membar_consumer();
1818
1819	start = hash[bucket].dtdh_chain;
1820	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1821	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1822	    op != DTRACE_DYNVAR_DEALLOC));
1823
1824	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1825		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1826		dtrace_key_t *dkey = &dtuple->dtt_key[0];
1827
1828		if (dvar->dtdv_hashval != hashval) {
1829			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1830				/*
1831				 * We've reached the sink, and therefore the
1832				 * end of the hash chain; we can kick out of
1833				 * the loop knowing that we have seen a valid
1834				 * snapshot of state.
1835				 */
1836				ASSERT(dvar->dtdv_next == NULL);
1837				ASSERT(dvar == &dtrace_dynhash_sink);
1838				break;
1839			}
1840
1841			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1842				/*
1843				 * We've gone off the rails:  somewhere along
1844				 * the line, one of the members of this hash
1845				 * chain was deleted.  Note that we could also
1846				 * detect this by simply letting this loop run
1847				 * to completion, as we would eventually hit
1848				 * the end of the dirty list.  However, we
1849				 * want to avoid running the length of the
1850				 * dirty list unnecessarily (it might be quite
1851				 * long), so we catch this as early as
1852				 * possible by detecting the hash marker.  In
1853				 * this case, we simply set dvar to NULL and
1854				 * break; the conditional after the loop will
1855				 * send us back to top.
1856				 */
1857				dvar = NULL;
1858				break;
1859			}
1860
1861			goto next;
1862		}
1863
1864		if (dtuple->dtt_nkeys != nkeys)
1865			goto next;
1866
1867		for (i = 0; i < nkeys; i++, dkey++) {
1868			if (dkey->dttk_size != key[i].dttk_size)
1869				goto next; /* size or type mismatch */
1870
1871			if (dkey->dttk_size != 0) {
1872				if (dtrace_bcmp(
1873				    (void *)(uintptr_t)key[i].dttk_value,
1874				    (void *)(uintptr_t)dkey->dttk_value,
1875				    dkey->dttk_size))
1876					goto next;
1877			} else {
1878				if (dkey->dttk_value != key[i].dttk_value)
1879					goto next;
1880			}
1881		}
1882
1883		if (op != DTRACE_DYNVAR_DEALLOC)
1884			return (dvar);
1885
1886		ASSERT(dvar->dtdv_next == NULL ||
1887		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1888
1889		if (prev != NULL) {
1890			ASSERT(hash[bucket].dtdh_chain != dvar);
1891			ASSERT(start != dvar);
1892			ASSERT(prev->dtdv_next == dvar);
1893			prev->dtdv_next = dvar->dtdv_next;
1894		} else {
1895			if (dtrace_casptr(&hash[bucket].dtdh_chain,
1896			    start, dvar->dtdv_next) != start) {
1897				/*
1898				 * We have failed to atomically swing the
1899				 * hash table head pointer, presumably because
1900				 * of a conflicting allocation on another CPU.
1901				 * We need to reread the hash chain and try
1902				 * again.
1903				 */
1904				goto top;
1905			}
1906		}
1907
1908		dtrace_membar_producer();
1909
1910		/*
1911		 * Now set the hash value to indicate that it's free.
1912		 */
1913		ASSERT(hash[bucket].dtdh_chain != dvar);
1914		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1915
1916		dtrace_membar_producer();
1917
1918		/*
1919		 * Set the next pointer to point at the dirty list, and
1920		 * atomically swing the dirty pointer to the newly freed dvar.
1921		 */
1922		do {
1923			next = dcpu->dtdsc_dirty;
1924			dvar->dtdv_next = next;
1925		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1926
1927		/*
1928		 * Finally, unlock this hash bucket.
1929		 */
1930		ASSERT(hash[bucket].dtdh_lock == lock);
1931		ASSERT(lock & 1);
1932		hash[bucket].dtdh_lock++;
1933
1934		return (NULL);
1935next:
1936		prev = dvar;
1937		continue;
1938	}
1939
1940	if (dvar == NULL) {
1941		/*
1942		 * If dvar is NULL, it is because we went off the rails:
1943		 * one of the elements that we traversed in the hash chain
1944		 * was deleted while we were traversing it.  In this case,
1945		 * we assert that we aren't doing a dealloc (deallocs lock
1946		 * the hash bucket to prevent themselves from racing with
1947		 * one another), and retry the hash chain traversal.
1948		 */
1949		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1950		goto top;
1951	}
1952
1953	if (op != DTRACE_DYNVAR_ALLOC) {
1954		/*
1955		 * If we are not to allocate a new variable, we want to
1956		 * return NULL now.  Before we return, check that the value
1957		 * of the lock word hasn't changed.  If it has, we may have
1958		 * seen an inconsistent snapshot.
1959		 */
1960		if (op == DTRACE_DYNVAR_NOALLOC) {
1961			if (hash[bucket].dtdh_lock != lock)
1962				goto top;
1963		} else {
1964			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1965			ASSERT(hash[bucket].dtdh_lock == lock);
1966			ASSERT(lock & 1);
1967			hash[bucket].dtdh_lock++;
1968		}
1969
1970		return (NULL);
1971	}
1972
1973	/*
1974	 * We need to allocate a new dynamic variable.  The size we need is the
1975	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1976	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1977	 * the size of any referred-to data (dsize).  We then round the final
1978	 * size up to the chunksize for allocation.
1979	 */
1980	for (ksize = 0, i = 0; i < nkeys; i++)
1981		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1982
1983	/*
1984	 * This should be pretty much impossible, but could happen if, say,
1985	 * strange DIF specified the tuple.  Ideally, this should be an
1986	 * assertion and not an error condition -- but that requires that the
1987	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1988	 * bullet-proof.  (That is, it must not be able to be fooled by
1989	 * malicious DIF.)  Given the lack of backwards branches in DIF,
1990	 * solving this would presumably not amount to solving the Halting
1991	 * Problem -- but it still seems awfully hard.
1992	 */
1993	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1994	    ksize + dsize > chunksize) {
1995		dcpu->dtdsc_drops++;
1996		return (NULL);
1997	}
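
	/*
	 * The chunk layout implied by the size computation above -- key
	 * data and the variable's own storage follow the tuple array:
	 *
	 *	+-----------------+-------------------+----------+---------+
	 *	| dtrace_dynvar_t | (nkeys - 1) extra | key data | data    |
	 *	| (one key is     | dtrace_key_t's    | (ksize)  | (dsize) |
	 *	|  built in)      |                   |          |         |
	 *	+-----------------+-------------------+----------+---------+
	 */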
1998
1999	nstate = DTRACE_DSTATE_EMPTY;
2000
2001	do {
2002retry:
2003		free = dcpu->dtdsc_free;
2004
2005		if (free == NULL) {
2006			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2007			void *rval;
2008
2009			if (clean == NULL) {
2010				/*
2011				 * We're out of dynamic variable space on
2012				 * this CPU.  Unless we have tried all CPUs,
2013				 * we'll try to allocate from a different
2014				 * CPU.
2015				 */
2016				switch (dstate->dtds_state) {
2017				case DTRACE_DSTATE_CLEAN: {
2018					void *sp = &dstate->dtds_state;
2019
2020					if (++cpu >= NCPU)
2021						cpu = 0;
2022
2023					if (dcpu->dtdsc_dirty != NULL &&
2024					    nstate == DTRACE_DSTATE_EMPTY)
2025						nstate = DTRACE_DSTATE_DIRTY;
2026
2027					if (dcpu->dtdsc_rinsing != NULL)
2028						nstate = DTRACE_DSTATE_RINSING;
2029
2030					dcpu = &dstate->dtds_percpu[cpu];
2031
2032					if (cpu != me)
2033						goto retry;
2034
2035					(void) dtrace_cas32(sp,
2036					    DTRACE_DSTATE_CLEAN, nstate);
2037
2038					/*
2039					 * To increment the correct bean
2040					 * counter, take another lap.
2041					 */
2042					goto retry;
2043				}
2044
2045				case DTRACE_DSTATE_DIRTY:
2046					dcpu->dtdsc_dirty_drops++;
2047					break;
2048
2049				case DTRACE_DSTATE_RINSING:
2050					dcpu->dtdsc_rinsing_drops++;
2051					break;
2052
2053				case DTRACE_DSTATE_EMPTY:
2054					dcpu->dtdsc_drops++;
2055					break;
2056				}
2057
2058				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2059				return (NULL);
2060			}
2061
2062			/*
2063			 * The clean list appears to be non-empty.  We want to
2064			 * move the clean list to the free list; we start by
2065			 * moving the clean pointer aside.
2066			 */
2067			if (dtrace_casptr(&dcpu->dtdsc_clean,
2068			    clean, NULL) != clean) {
2069				/*
2070				 * We are in one of two situations:
2071				 *
2072				 *  (a)	The clean list was switched to the
2073				 *	free list by another CPU.
2074				 *
2075				 *  (b)	The clean list was added to by the
2076				 *	cleansing cyclic.
2077				 *
2078				 * In either of these situations, we can
2079				 * just reattempt the free list allocation.
2080				 */
2081				goto retry;
2082			}
2083
2084			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2085
2086			/*
2087			 * Now we'll move the clean list to our free list.
2088			 * It's impossible for this to fail:  the only way
2089			 * the free list can be updated is through this
2090			 * code path, and only one CPU can own the clean list.
2091			 * Thus, it would only be possible for this to fail if
2092			 * this code were racing with dtrace_dynvar_clean().
2093			 * (That is, if dtrace_dynvar_clean() updated the clean
2094			 * list, and we ended up racing to update the free
2095			 * list.)  This race is prevented by the dtrace_sync()
2096			 * in dtrace_dynvar_clean() -- which flushes the
2097			 * owners of the clean lists out before resetting
2098			 * the clean lists.
2099			 */
2100			dcpu = &dstate->dtds_percpu[me];
2101			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2102			ASSERT(rval == NULL);
2103			goto retry;
2104		}
2105
2106		dvar = free;
2107		new_free = dvar->dtdv_next;
2108	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2109
2110	/*
2111	 * We have now allocated a new chunk.  We copy the tuple keys into the
2112	 * tuple array and copy any referenced key data into the data space
2113	 * following the tuple array.  As we do this, we relocate dttk_value
2114	 * in the final tuple to point to the key data address in the chunk.
2115	 */
2116	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2117	dvar->dtdv_data = (void *)(kdata + ksize);
2118	dvar->dtdv_tuple.dtt_nkeys = nkeys;
2119
2120	for (i = 0; i < nkeys; i++) {
2121		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2122		size_t kesize = key[i].dttk_size;
2123
2124		if (kesize != 0) {
2125			dtrace_bcopy(
2126			    (const void *)(uintptr_t)key[i].dttk_value,
2127			    (void *)kdata, kesize);
2128			dkey->dttk_value = kdata;
2129			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2130		} else {
2131			dkey->dttk_value = key[i].dttk_value;
2132		}
2133
2134		dkey->dttk_size = kesize;
2135	}
2136
2137	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2138	dvar->dtdv_hashval = hashval;
2139	dvar->dtdv_next = start;
2140
2141	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2142		return (dvar);
2143
2144	/*
2145	 * The cas has failed.  Either another CPU is adding an element to
2146	 * this hash chain, or another CPU is deleting an element from this
2147	 * hash chain.  The simplest way to deal with both of these cases
2148	 * (though not necessarily the most efficient) is to free our
2149	 * allocated block and tail-call ourselves.  Note that the free is
2150	 * to the dirty list and _not_ to the free list.  This is to prevent
2151	 * races with allocators, above.
2152	 */
2153	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2154
2155	dtrace_membar_producer();
2156
2157	do {
2158		free = dcpu->dtdsc_dirty;
2159		dvar->dtdv_next = free;
2160	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2161
2162	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2163}
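
/*
 * A minimal sketch of a dtrace_dynvar() caller in the style of the DIF
 * emulator (hypothetical, for illustration only):
 *
 *	dtrace_key_t key[1];
 *	dtrace_dynvar_t *dvar;
 *
 *	key[0].dttk_value = (uint64_t)id;	// by-value key ...
 *	key[0].dttk_size = 0;			// ... denoted by zero size
 *
 *	dvar = dtrace_dynvar(dstate, 1, key, sizeof (uint64_t),
 *	    DTRACE_DYNVAR_ALLOC, mstate, vstate);
 *
 *	if (dvar == NULL)
 *		return;		// dynamic drop; a counter was incremented
 *
 *	// dvar->dtdv_data now points to dsize bytes of storage
 */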
2164
2165/*ARGSUSED*/
2166static void
2167dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2168{
2169	if ((int64_t)nval < (int64_t)*oval)
2170		*oval = nval;
2171}
2172
2173/*ARGSUSED*/
2174static void
2175dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2176{
2177	if ((int64_t)nval > (int64_t)*oval)
2178		*oval = nval;
2179}
2180
2181static void
2182dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2183{
2184	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2185	int64_t val = (int64_t)nval;
2186
2187	if (val < 0) {
2188		for (i = 0; i < zero; i++) {
2189			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2190				quanta[i] += incr;
2191				return;
2192			}
2193		}
2194	} else {
2195		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2196			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2197				quanta[i - 1] += incr;
2198				return;
2199			}
2200		}
2201
2202		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2203		return;
2204	}
2205
2206	ASSERT(0);
2207}
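
/*
 * Worked example for the power-of-two buckets above:  a non-negative nval
 * of 5 is compared upward from the zero bucket until a bucket value first
 * exceeds it -- the bucket for 8 -- and the preceding bucket, representing
 * [4, 8), is incremented.  An nval of -5 is compared from the most negative
 * bucket until it no longer exceeds the bucket value, landing in the bucket
 * for -4, which represents (-8, -4].
 */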
2208
2209static void
2210dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2211{
2212	uint64_t arg = *lquanta++;
2213	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2214	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2215	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2216	int32_t val = (int32_t)nval, level;
2217
2218	ASSERT(step != 0);
2219	ASSERT(levels != 0);
2220
2221	if (val < base) {
2222		/*
2223		 * This is an underflow.
2224		 */
2225		lquanta[0] += incr;
2226		return;
2227	}
2228
2229	level = (val - base) / step;
2230
2231	if (level < levels) {
2232		lquanta[level + 1] += incr;
2233		return;
2234	}
2235
2236	/*
2237	 * This is an overflow.
2238	 */
2239	lquanta[levels + 1] += incr;
2240}
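
/*
 * Worked example:  for lquantize(x, 0, 100, 10), the encoded argument
 * decodes to base = 0, step = 10 and levels = 10.  An nval of 37 yields
 * level = (37 - 0) / 10 = 3 and increments lquanta[4] -- the bucket for
 * [30, 40).  Negative values land in the underflow bucket lquanta[0], and
 * values of 100 or more in the overflow bucket lquanta[levels + 1].
 */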
2241
2242static int
2243dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
2244    uint16_t high, uint16_t nsteps, int64_t value)
2245{
2246	int64_t this = 1, last, next;
2247	int base = 1, order;
2248
2249	ASSERT(factor <= nsteps);
2250	ASSERT(nsteps % factor == 0);
2251
2252	for (order = 0; order < low; order++)
2253		this *= factor;
2254
2255	/*
2256	 * If our value is less than our factor taken to the power of the
2257	 * low order of magnitude, it goes into the zeroth bucket.
2258	 */
2259	if (value < (last = this))
2260		return (0);
2261
2262	for (this *= factor; order <= high; order++) {
2263		int nbuckets = this > nsteps ? nsteps : this;
2264
2265		if ((next = this * factor) < this) {
2266			/*
2267			 * We should not generally get log/linear quantizations
2268			 * with a high magnitude that allows 64-bits to
2269			 * overflow, but we nonetheless protect against this
2270			 * by explicitly checking for overflow, and clamping
2271			 * our value accordingly.
2272			 */
2273			value = this - 1;
2274		}
2275
2276		if (value < this) {
2277			/*
2278			 * If our value lies within this order of magnitude,
2279			 * determine its position by taking the offset within
2280			 * the order of magnitude, dividing by the bucket
2281			 * width, and adding to our (accumulated) base.
2282			 */
2283			return (base + (value - last) / (this / nbuckets));
2284		}
2285
2286		base += nbuckets - (nbuckets / factor);
2287		last = this;
2288		this = next;
2289	}
2290
2291	/*
2292	 * Our value is greater than or equal to our factor taken to the
2293	 * power of one plus the high magnitude -- return the top bucket.
2294	 */
2295	return (base);
2296}
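
/*
 * Worked example:  with factor = 10, low = 0, high = 2 and nsteps = 10,
 * bucket 0 holds values below 10^0 = 1, and each order of magnitude
 * [1, 10), [10, 100) and [100, 1000) contributes nsteps - nsteps / factor
 * = 9 linear buckets.  A value of 420 falls within the [100, 1000) order,
 * whose buckets are 100 wide, and so maps to bucket 19 + (420 - 100) / 100
 * = 22, representing [400, 500); values of 1000 and above land in the top
 * bucket.
 */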
2297
2298static void
2299dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2300{
2301	uint64_t arg = *llquanta++;
2302	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2303	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2304	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2305	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2306
2307	llquanta[dtrace_aggregate_llquantize_bucket(factor,
2308	    low, high, nsteps, nval)] += incr;
2309}
2310
2311/*ARGSUSED*/
2312static void
2313dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2314{
2315	data[0]++;
2316	data[1] += nval;
2317}
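
/*
 * Note that avg() accumulates only a count (data[0]) and a running sum
 * (data[1]) in probe context; the consumer computes the average itself
 * as data[1] / data[0] when the aggregation is snapshotted.
 */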
2318
2319/*ARGSUSED*/
2320static void
2321dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2322{
2323	int64_t snval = (int64_t)nval;
2324	uint64_t tmp[2];
2325
2326	data[0]++;
2327	data[1] += nval;
2328
2329	/*
2330	 * What we want to say here is:
2331	 *
2332	 * data[2] += nval * nval;
2333	 *
2334	 * But given that nval is 64-bit, we could easily overflow, so
2335	 * we do this as 128-bit arithmetic.
2336	 */
2337	if (snval < 0)
2338		snval = -snval;
2339
2340	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2341	dtrace_add_128(data + 2, tmp, data + 2);
2342}
2343
2344/*ARGSUSED*/
2345static void
2346dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2347{
2348	*oval = *oval + 1;
2349}
2350
2351/*ARGSUSED*/
2352static void
2353dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2354{
2355	*oval += nval;
2356}
2357
2358/*
2359 * Aggregate given the tuple in the principal data buffer, and the aggregating
2360 * action denoted by the specified dtrace_aggregation_t.  The aggregation
2361 * buffer is specified as the buf parameter.  This routine does not return
2362 * failure; if there is no space in the aggregation buffer, the data will be
2363 * dropped, and a corresponding counter incremented.
2364 */
2365static void
2366dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2367    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2368{
2369	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2370	uint32_t i, ndx, size, fsize;
2371	uint32_t align = sizeof (uint64_t) - 1;
2372	dtrace_aggbuffer_t *agb;
2373	dtrace_aggkey_t *key;
2374	uint32_t hashval = 0, limit, isstr;
2375	caddr_t tomax, data, kdata;
2376	dtrace_actkind_t action;
2377	dtrace_action_t *act;
2378	uintptr_t offs;
2379
2380	if (buf == NULL)
2381		return;
2382
2383	if (!agg->dtag_hasarg) {
2384		/*
2385		 * Currently, only quantize(), lquantize() and llquantize() take
2386		 * additional arguments, and they have the same semantics:  an increment
2387		 * value that defaults to 1 when not present.  If additional
2388		 * aggregating actions take arguments, the setting of the
2389		 * default argument value will presumably have to become more
2390		 * sophisticated...
2391		 */
2392		arg = 1;
2393	}
2394
2395	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2396	size = rec->dtrd_offset - agg->dtag_base;
2397	fsize = size + rec->dtrd_size;
2398
2399	ASSERT(dbuf->dtb_tomax != NULL);
2400	data = dbuf->dtb_tomax + offset + agg->dtag_base;
2401
2402	if ((tomax = buf->dtb_tomax) == NULL) {
2403		dtrace_buffer_drop(buf);
2404		return;
2405	}
2406
2407	/*
2408	 * The metastructure is always at the bottom of the buffer.
2409	 */
2410	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2411	    sizeof (dtrace_aggbuffer_t));
2412
2413	if (buf->dtb_offset == 0) {
2414		/*
2415		 * We just kludge up approximately 1/8th of the size to be
2416		 * buckets.  If this guess ends up being routinely
2417		 * off-the-mark, we may need to dynamically readjust this
2418		 * based on past performance.
2419		 */
2420		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2421
2422		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2423		    (uintptr_t)tomax || hashsize == 0) {
2424			/*
2425			 * We've been given a ludicrously small buffer;
2426			 * increment our drop count and leave.
2427			 */
2428			dtrace_buffer_drop(buf);
2429			return;
2430		}
2431
2432		/*
2433		 * And now, a pathetic attempt to try to get an odd (or
2434		 * perchance, a prime) hash size for better hash distribution.
2435		 */
2436		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2437			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2438
2439		agb->dtagb_hashsize = hashsize;
2440		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2441		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2442		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2443
2444		for (i = 0; i < agb->dtagb_hashsize; i++)
2445			agb->dtagb_hash[i] = NULL;
2446	}
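
	/*
	 * The resulting layout, with record data allocated upward from the
	 * bottom of the buffer and dtrace_aggkey_t's allocated downward
	 * from the hash table (dtagb_free marks the moving boundary):
	 *
	 *	tomax                                     tomax + dtb_size
	 *	+-------------------//-----------+------+------------------+
	 *	| key/value data -> //  <- keys  | hash | dtrace_aggbuffer |
	 *	+-------------------//-----------+------+------------------+
	 *	 ^ dtb_offset grows up   ^ dtagb_free grows down
	 */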
2447
2448	ASSERT(agg->dtag_first != NULL);
2449	ASSERT(agg->dtag_first->dta_intuple);
2450
2451	/*
2452	 * Calculate the hash value based on the key.  Note that we _don't_
2453	 * include the aggid in the hashing (but we will store it as part of
2454	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2455	 * algorithm: a simple, quick algorithm that has no known funnels, and
2456	 * gets good distribution in practice.  The efficacy of the hashing
2457	 * algorithm (and a comparison with other algorithms) may be found by
2458	 * running the ::dtrace_aggstat MDB dcmd.
2459	 */
2460	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2461		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2462		limit = i + act->dta_rec.dtrd_size;
2463		ASSERT(limit <= size);
2464		isstr = DTRACEACT_ISSTRING(act);
2465
2466		for (; i < limit; i++) {
2467			hashval += data[i];
2468			hashval += (hashval << 10);
2469			hashval ^= (hashval >> 6);
2470
2471			if (isstr && data[i] == '\0')
2472				break;
2473		}
2474	}
2475
2476	hashval += (hashval << 3);
2477	hashval ^= (hashval >> 11);
2478	hashval += (hashval << 15);
2479
2480	/*
2481	 * Yes, the divide here is expensive -- but it's generally the least
2482	 * of the performance issues given the amount of data that we iterate
2483	 * over to compute hash values, compare data, etc.
2484	 */
2485	ndx = hashval % agb->dtagb_hashsize;
2486
2487	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2488		ASSERT((caddr_t)key >= tomax);
2489		ASSERT((caddr_t)key < tomax + buf->dtb_size);
2490
2491		if (hashval != key->dtak_hashval || key->dtak_size != size)
2492			continue;
2493
2494		kdata = key->dtak_data;
2495		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2496
2497		for (act = agg->dtag_first; act->dta_intuple;
2498		    act = act->dta_next) {
2499			i = act->dta_rec.dtrd_offset - agg->dtag_base;
2500			limit = i + act->dta_rec.dtrd_size;
2501			ASSERT(limit <= size);
2502			isstr = DTRACEACT_ISSTRING(act);
2503
2504			for (; i < limit; i++) {
2505				if (kdata[i] != data[i])
2506					goto next;
2507
2508				if (isstr && data[i] == '\0')
2509					break;
2510			}
2511		}
2512
2513		if (action != key->dtak_action) {
2514			/*
2515			 * We are aggregating on the same value in the same
2516			 * aggregation with two different aggregating actions.
2517			 * (This should have been picked up in the compiler,
2518			 * so we may be dealing with errant or devious DIF.)
2519			 * This is an error condition; we indicate as much,
2520			 * and return.
2521			 */
2522			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2523			return;
2524		}
2525
2526		/*
2527		 * This is a hit:  we need to apply the aggregator to
2528		 * the value at this key.
2529		 */
2530		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2531		return;
2532next:
2533		continue;
2534	}
2535
2536	/*
2537	 * We didn't find it.  We need to allocate some zero-filled space,
2538	 * link it into the hash table appropriately, and apply the aggregator
2539	 * to the (zero-filled) value.
2540	 */
2541	offs = buf->dtb_offset;
2542	while (offs & (align - 1))
2543		offs += sizeof (uint32_t);
2544
2545	/*
2546	 * If we don't have enough room to both allocate a new key _and_
2547	 * its associated data, increment the drop count and return.
2548	 */
2549	if ((uintptr_t)tomax + offs + fsize >
2550	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2551		dtrace_buffer_drop(buf);
2552		return;
2553	}
2554
2555	/*CONSTCOND*/
2556	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2557	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2558	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2559
2560	key->dtak_data = kdata = tomax + offs;
2561	buf->dtb_offset = offs + fsize;
2562
2563	/*
2564	 * Now copy the data across.
2565	 */
2566	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
2567
2568	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2569		kdata[i] = data[i];
2570
2571	/*
2572	 * Because strings are not zeroed out by default, we need to iterate
2573	 * looking for actions that store strings, and we need to explicitly
2574	 * pad these strings out with zeroes.
2575	 */
2576	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2577		int nul;
2578
2579		if (!DTRACEACT_ISSTRING(act))
2580			continue;
2581
2582		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2583		limit = i + act->dta_rec.dtrd_size;
2584		ASSERT(limit <= size);
2585
2586		for (nul = 0; i < limit; i++) {
2587			if (nul) {
2588				kdata[i] = '\0';
2589				continue;
2590			}
2591
2592			if (data[i] != '\0')
2593				continue;
2594
2595			nul = 1;
2596		}
2597	}
2598
2599	for (i = size; i < fsize; i++)
2600		kdata[i] = 0;
2601
2602	key->dtak_hashval = hashval;
2603	key->dtak_size = size;
2604	key->dtak_action = action;
2605	key->dtak_next = agb->dtagb_hash[ndx];
2606	agb->dtagb_hash[ndx] = key;
2607
2608	/*
2609	 * Finally, apply the aggregator.
2610	 */
2611	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2612	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2613}
2614
2615/*
2616 * Given consumer state, this routine finds a speculation in the INACTIVE
2617 * state and transitions it into the ACTIVE state.  If there is no speculation
2618 * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2619 * in the INACTIVE state, 0 is returned after the "busy" or "unavail" counter
2620 * has been incremented -- it is up to the caller to take appropriate action.
2621static int
2622dtrace_speculation(dtrace_state_t *state)
2623{
2624	int i = 0;
2625	dtrace_speculation_state_t current;
2626	uint32_t *stat = &state->dts_speculations_unavail, count;
2627
2628	while (i < state->dts_nspeculations) {
2629		dtrace_speculation_t *spec = &state->dts_speculations[i];
2630
2631		current = spec->dtsp_state;
2632
2633		if (current != DTRACESPEC_INACTIVE) {
2634			if (current == DTRACESPEC_COMMITTINGMANY ||
2635			    current == DTRACESPEC_COMMITTING ||
2636			    current == DTRACESPEC_DISCARDING)
2637				stat = &state->dts_speculations_busy;
2638			i++;
2639			continue;
2640		}
2641
2642		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2643		    current, DTRACESPEC_ACTIVE) == current)
2644			return (i + 1);
2645	}
2646
2647	/*
2648	 * We couldn't find a speculation.  If we found as much as a single
2649	 * busy speculation buffer, we'll attribute this failure as "busy"
2650	 * instead of "unavail".
2651	 */
2652	do {
2653		count = *stat;
2654	} while (dtrace_cas32(stat, count, count + 1) != count);
2655
2656	return (0);
2657}
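
/*
 * For orientation, the speculation state transitions implemented by this
 * routine and those below (the authoritative diagram is in
 * <sys/dtrace_impl.h>):
 *
 *	INACTIVE --- dtrace_speculation() -------------------> ACTIVE
 *	ACTIVE --- first speculate() ------------------------> ACTIVEONE
 *	ACTIVEONE --- speculate() on another CPU ------------> ACTIVEMANY
 *	ACTIVE/ACTIVEONE --- commit() on the owning CPU -----> COMMITTING
 *	ACTIVEONE/ACTIVEMANY --- commit() elsewhere ---------> COMMITTINGMANY
 *	ACTIVE/ACTIVEMANY --- discard() ---------------------> DISCARDING
 *	ACTIVEONE --- discard() on the owning CPU -----------> INACTIVE
 *	COMMITTING --- copy to principal buffer done --------> INACTIVE
 *	COMMITTINGMANY/DISCARDING --- cleaned on all CPUs ---> INACTIVE
 */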
2658
2659/*
2660 * This routine commits an active speculation.  If the specified speculation
2661 * is not in a valid state to perform a commit(), this routine will silently do
2662 * nothing.  The state of the specified speculation is transitioned according
2663 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2664 */
2665static void
2666dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2667    dtrace_specid_t which)
2668{
2669	dtrace_speculation_t *spec;
2670	dtrace_buffer_t *src, *dest;
2671	uintptr_t daddr, saddr, dlimit, slimit;
2672	dtrace_speculation_state_t current, new = 0;
2673	intptr_t offs;
2674	uint64_t timestamp;
2675
2676	if (which == 0)
2677		return;
2678
2679	if (which > state->dts_nspeculations) {
2680		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2681		return;
2682	}
2683
2684	spec = &state->dts_speculations[which - 1];
2685	src = &spec->dtsp_buffer[cpu];
2686	dest = &state->dts_buffer[cpu];
2687
2688	do {
2689		current = spec->dtsp_state;
2690
2691		if (current == DTRACESPEC_COMMITTINGMANY)
2692			break;
2693
2694		switch (current) {
2695		case DTRACESPEC_INACTIVE:
2696		case DTRACESPEC_DISCARDING:
2697			return;
2698
2699		case DTRACESPEC_COMMITTING:
2700			/*
2701			 * This is only possible if we are (a) commit()'ing
2702			 * without having done a prior speculate() on this CPU
2703			 * and (b) racing with another commit() on a different
2704			 * CPU.  There's nothing to do -- we just assert that
2705			 * our offset is 0.
2706			 */
2707			ASSERT(src->dtb_offset == 0);
2708			return;
2709
2710		case DTRACESPEC_ACTIVE:
2711			new = DTRACESPEC_COMMITTING;
2712			break;
2713
2714		case DTRACESPEC_ACTIVEONE:
2715			/*
2716			 * This speculation is active on one CPU.  If our
2717			 * buffer offset is non-zero, we know that the one CPU
2718			 * must be us.  Otherwise, we are committing on a
2719			 * different CPU from the speculate(), and we must
2720			 * rely on being asynchronously cleaned.
2721			 */
2722			if (src->dtb_offset != 0) {
2723				new = DTRACESPEC_COMMITTING;
2724				break;
2725			}
2726			/*FALLTHROUGH*/
2727
2728		case DTRACESPEC_ACTIVEMANY:
2729			new = DTRACESPEC_COMMITTINGMANY;
2730			break;
2731
2732		default:
2733			ASSERT(0);
2734		}
2735	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2736	    current, new) != current);
2737
2738	/*
2739	 * We have set the state to indicate that we are committing this
2740	 * speculation.  Now reserve the necessary space in the destination
2741	 * buffer.
2742	 */
2743	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2744	    sizeof (uint64_t), state, NULL)) < 0) {
2745		dtrace_buffer_drop(dest);
2746		goto out;
2747	}
2748
2749	/*
2750	 * We have sufficient space to copy the speculative buffer into the
2751	 * primary buffer.  First, modify the speculative buffer, filling
2752	 * in the timestamp of all entries with the current time.  The data
2753	 * must have the commit() time rather than the time it was traced,
2754	 * so that all entries in the primary buffer are in timestamp order.
2755	 */
2756	timestamp = dtrace_gethrtime();
2757	saddr = (uintptr_t)src->dtb_tomax;
2758	slimit = saddr + src->dtb_offset;
2759	while (saddr < slimit) {
2760		size_t size;
2761		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2762
2763		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2764			saddr += sizeof (dtrace_epid_t);
2765			continue;
2766		}
2767		ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2768		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2769
2770		ASSERT3U(saddr + size, <=, slimit);
2771		ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2772		ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2773
2774		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2775
2776		saddr += size;
2777	}
2778
2779	/*
2780	 * Copy the buffer across.  (Note that this is a
2781 * highly suboptimal bcopy(); in the unlikely event that this becomes
2782	 * a serious performance issue, a high-performance DTrace-specific
2783	 * bcopy() should obviously be invented.)
2784	 */
2785	daddr = (uintptr_t)dest->dtb_tomax + offs;
2786	dlimit = daddr + src->dtb_offset;
2787	saddr = (uintptr_t)src->dtb_tomax;
2788
2789	/*
2790	 * First, the aligned portion.
2791	 */
2792	while (dlimit - daddr >= sizeof (uint64_t)) {
2793		*((uint64_t *)daddr) = *((uint64_t *)saddr);
2794
2795		daddr += sizeof (uint64_t);
2796		saddr += sizeof (uint64_t);
2797	}
2798
2799	/*
2800	 * Now any left-over bit...
2801	 */
2802	while (dlimit - daddr)
2803		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2804
2805	/*
2806	 * Finally, commit the reserved space in the destination buffer.
2807	 */
2808	dest->dtb_offset = offs + src->dtb_offset;
2809
2810out:
2811	/*
2812	 * If we're lucky enough to be the only active CPU on this speculation
2813	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2814	 */
2815	if (current == DTRACESPEC_ACTIVE ||
2816	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2817		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2818		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2819
2820		ASSERT(rval == DTRACESPEC_COMMITTING);
2821	}
2822
2823	src->dtb_offset = 0;
2824	src->dtb_xamot_drops += src->dtb_drops;
2825	src->dtb_drops = 0;
2826}
2827
2828/*
2829 * This routine discards an active speculation.  If the specified speculation
2830 * is not in a valid state to perform a discard(), this routine will silently
2831 * do nothing.  The state of the specified speculation is transitioned
2832 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2833 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
2834static void
2835dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2836    dtrace_specid_t which)
2837{
2838	dtrace_speculation_t *spec;
2839	dtrace_speculation_state_t current, new = 0;
2840	dtrace_buffer_t *buf;
2841
2842	if (which == 0)
2843		return;
2844
2845	if (which > state->dts_nspeculations) {
2846		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2847		return;
2848	}
2849
2850	spec = &state->dts_speculations[which - 1];
2851	buf = &spec->dtsp_buffer[cpu];
2852
2853	do {
2854		current = spec->dtsp_state;
2855
2856		switch (current) {
2857		case DTRACESPEC_INACTIVE:
2858		case DTRACESPEC_COMMITTINGMANY:
2859		case DTRACESPEC_COMMITTING:
2860		case DTRACESPEC_DISCARDING:
2861			return;
2862
2863		case DTRACESPEC_ACTIVE:
2864		case DTRACESPEC_ACTIVEMANY:
2865			new = DTRACESPEC_DISCARDING;
2866			break;
2867
2868		case DTRACESPEC_ACTIVEONE:
2869			if (buf->dtb_offset != 0) {
2870				new = DTRACESPEC_INACTIVE;
2871			} else {
2872				new = DTRACESPEC_DISCARDING;
2873			}
2874			break;
2875
2876		default:
2877			ASSERT(0);
2878		}
2879	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2880	    current, new) != current);
2881
2882	buf->dtb_offset = 0;
2883	buf->dtb_drops = 0;
2884}
2885
2886/*
2887 * Note:  not called from probe context.  This function is called
2888 * asynchronously from cross call context to clean any speculations that are
2889 * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2890 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2891 * speculation.
2892 */
2893static void
2894dtrace_speculation_clean_here(dtrace_state_t *state)
2895{
2896	dtrace_icookie_t cookie;
2897	processorid_t cpu = curcpu;
2898	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2899	dtrace_specid_t i;
2900
2901	cookie = dtrace_interrupt_disable();
2902
2903	if (dest->dtb_tomax == NULL) {
2904		dtrace_interrupt_enable(cookie);
2905		return;
2906	}
2907
2908	for (i = 0; i < state->dts_nspeculations; i++) {
2909		dtrace_speculation_t *spec = &state->dts_speculations[i];
2910		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2911
2912		if (src->dtb_tomax == NULL)
2913			continue;
2914
2915		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2916			src->dtb_offset = 0;
2917			continue;
2918		}
2919
2920		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2921			continue;
2922
2923		if (src->dtb_offset == 0)
2924			continue;
2925
2926		dtrace_speculation_commit(state, cpu, i + 1);
2927	}
2928
2929	dtrace_interrupt_enable(cookie);
2930}
2931
2932/*
2933 * Note:  not called from probe context.  This function is called
2934 * asynchronously (and at a regular interval) to clean any speculations that
2935 * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2936 * is work to be done, it cross calls all CPUs to perform that work;
2937 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to
2938 * INACTIVE state until they have been cleaned by all CPUs.
2939 */
2940static void
2941dtrace_speculation_clean(dtrace_state_t *state)
2942{
2943	int work = 0, rv;
2944	dtrace_specid_t i;
2945
2946	for (i = 0; i < state->dts_nspeculations; i++) {
2947		dtrace_speculation_t *spec = &state->dts_speculations[i];
2948
2949		ASSERT(!spec->dtsp_cleaning);
2950
2951		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2952		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2953			continue;
2954
2955		work++;
2956		spec->dtsp_cleaning = 1;
2957	}
2958
2959	if (!work)
2960		return;
2961
2962	dtrace_xcall(DTRACE_CPUALL,
2963	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2964
2965	/*
2966	 * We now know that all CPUs have committed or discarded their
2967	 * speculation buffers, as appropriate.  We can now set the state
2968	 * to inactive.
2969	 */
2970	for (i = 0; i < state->dts_nspeculations; i++) {
2971		dtrace_speculation_t *spec = &state->dts_speculations[i];
2972		dtrace_speculation_state_t current, new;
2973
2974		if (!spec->dtsp_cleaning)
2975			continue;
2976
2977		current = spec->dtsp_state;
2978		ASSERT(current == DTRACESPEC_DISCARDING ||
2979		    current == DTRACESPEC_COMMITTINGMANY);
2980
2981		new = DTRACESPEC_INACTIVE;
2982
2983		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2984		ASSERT(rv == current);
2985		spec->dtsp_cleaning = 0;
2986	}
2987}
2988
2989/*
2990 * Called as part of a speculate() to get the speculative buffer associated
2991 * with a given speculation.  Returns NULL if the specified speculation is not
2992 * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2993 * the active CPU is not the specified CPU -- the speculation will be
2994 * atomically transitioned into the ACTIVEMANY state.
2995 */
2996static dtrace_buffer_t *
2997dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2998    dtrace_specid_t which)
2999{
3000	dtrace_speculation_t *spec;
3001	dtrace_speculation_state_t current, new = 0;
3002	dtrace_buffer_t *buf;
3003
3004	if (which == 0)
3005		return (NULL);
3006
3007	if (which > state->dts_nspeculations) {
3008		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3009		return (NULL);
3010	}
3011
3012	spec = &state->dts_speculations[which - 1];
3013	buf = &spec->dtsp_buffer[cpuid];
3014
3015	do {
3016		current = spec->dtsp_state;
3017
3018		switch (current) {
3019		case DTRACESPEC_INACTIVE:
3020		case DTRACESPEC_COMMITTINGMANY:
3021		case DTRACESPEC_DISCARDING:
3022			return (NULL);
3023
3024		case DTRACESPEC_COMMITTING:
3025			ASSERT(buf->dtb_offset == 0);
3026			return (NULL);
3027
3028		case DTRACESPEC_ACTIVEONE:
3029			/*
3030			 * This speculation is currently active on one CPU.
3031			 * Check the offset in the buffer; if it's non-zero,
3032			 * that CPU must be us (and we leave the state alone).
3033			 * If it's zero, assume that we're starting on a new
3034			 * CPU -- and change the state to indicate that the
3035			 * speculation is active on more than one CPU.
3036			 */
3037			if (buf->dtb_offset != 0)
3038				return (buf);
3039
3040			new = DTRACESPEC_ACTIVEMANY;
3041			break;
3042
3043		case DTRACESPEC_ACTIVEMANY:
3044			return (buf);
3045
3046		case DTRACESPEC_ACTIVE:
3047			new = DTRACESPEC_ACTIVEONE;
3048			break;
3049
3050		default:
3051			ASSERT(0);
3052		}
3053	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3054	    current, new) != current);
3055
3056	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3057	return (buf);
3058}
3059
3060/*
3061 * Return a string.  In the event that the user lacks the privilege to access
3062 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3063 * don't fail access checking.
3064 *
3065 * dtrace_dif_variable() uses this routine as a helper for various
3066 * builtin values such as 'execname' and 'probefunc.'
3067 */
3068uintptr_t
3069dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3070    dtrace_mstate_t *mstate)
3071{
3072	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3073	uintptr_t ret;
3074	size_t strsz;
3075
3076	/*
3077	 * The easy case: this probe is allowed to read all of memory, so
3078	 * we can just return this as a vanilla pointer.
3079	 */
3080	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3081		return (addr);
3082
3083	/*
3084	 * This is the tougher case: we copy the string in question from
3085	 * kernel memory into scratch memory and return it that way: this
3086	 * ensures that we won't trip up when access checking tests the
3087	 * BYREF return value.
3088	 */
3089	strsz = dtrace_strlen((char *)addr, size) + 1;
3090
3091	if (mstate->dtms_scratch_ptr + strsz >
3092	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3093		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3094		return (0);
3095	}
3096
3097	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3098	    strsz);
3099	ret = mstate->dtms_scratch_ptr;
3100	mstate->dtms_scratch_ptr += strsz;
3101	return (ret);
3102}
3103
3104/*
3105 * Return a string from a memory address that is known to hold one or
3106 * more concatenated, individually NUL-terminated sub-strings.
3107 * In the event that the user lacks the privilege to access
3108 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3109 * don't fail access checking.
3110 *
3111 * dtrace_dif_variable() uses this routine as a helper for various
3112 * builtin values such as 'execargs'.
3113 */
3114static uintptr_t
3115dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
3116    dtrace_mstate_t *mstate)
3117{
3118	char *p;
3119	size_t i;
3120	uintptr_t ret;
3121
3122	if (mstate->dtms_scratch_ptr + strsz >
3123	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3124		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3125		return (0);
3126	}
3127
3128	dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3129	    strsz);
3130
3131	/* Replace sub-string termination characters with a space. */
3132	for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
3133	    p++, i++)
3134		if (*p == '\0')
3135			*p = ' ';
3136
3137	ret = mstate->dtms_scratch_ptr;
3138	mstate->dtms_scratch_ptr += strsz;
3139	return (ret);
3140}
3141
3142/*
3143 * This function implements the DIF emulator's variable lookups.  The emulator
3144 * passes a reserved variable identifier and optional built-in array index.
3145 */
3146static uint64_t
3147dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3148    uint64_t ndx)
3149{
3150	/*
3151	 * If we're accessing one of the uncached arguments, we'll turn this
3152	 * into a reference in the args array.
3153	 */
3154	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3155		ndx = v - DIF_VAR_ARG0;
3156		v = DIF_VAR_ARGS;
3157	}
3158
3159	switch (v) {
3160	case DIF_VAR_ARGS:
3161		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3162		if (ndx >= sizeof (mstate->dtms_arg) /
3163		    sizeof (mstate->dtms_arg[0])) {
3164			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3165			dtrace_provider_t *pv;
3166			uint64_t val;
3167
3168			pv = mstate->dtms_probe->dtpr_provider;
3169			if (pv->dtpv_pops.dtps_getargval != NULL)
3170				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3171				    mstate->dtms_probe->dtpr_id,
3172				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
3173			else
3174				val = dtrace_getarg(ndx, aframes);
3175
3176			/*
3177			 * This is regrettably required to keep the compiler
3178			 * from tail-optimizing the call to dtrace_getarg().
3179			 * The condition always evaluates to true, but the
3180			 * compiler has no way of figuring that out a priori.
3181			 * (None of this would be necessary if the compiler
3182			 * could be relied upon to _always_ tail-optimize
3183			 * the call to dtrace_getarg() -- but it can't.)
3184			 */
3185			if (mstate->dtms_probe != NULL)
3186				return (val);
3187
3188			ASSERT(0);
3189		}
3190
3191		return (mstate->dtms_arg[ndx]);
3192
3193#if defined(sun)
3194	case DIF_VAR_UREGS: {
3195		klwp_t *lwp;
3196
3197		if (!dtrace_priv_proc(state))
3198			return (0);
3199
3200		if ((lwp = curthread->t_lwp) == NULL) {
3201			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3202			cpu_core[curcpu].cpuc_dtrace_illval = NULL;
3203			return (0);
3204		}
3205
3206		return (dtrace_getreg(lwp->lwp_regs, ndx));
3208	}
3209#else
3210	case DIF_VAR_UREGS: {
3211		struct trapframe *tframe;
3212
3213		if (!dtrace_priv_proc(state))
3214			return (0);
3215
3216		if ((tframe = curthread->td_frame) == NULL) {
3217			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3218			cpu_core[curcpu].cpuc_dtrace_illval = 0;
3219			return (0);
3220		}
3221
3222		return (dtrace_getreg(tframe, ndx));
3223	}
3224#endif
3225
3226	case DIF_VAR_CURTHREAD:
3227		if (!dtrace_priv_proc(state))
3228			return (0);
3229		return ((uint64_t)(uintptr_t)curthread);
3230
3231	case DIF_VAR_TIMESTAMP:
3232		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3233			mstate->dtms_timestamp = dtrace_gethrtime();
3234			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3235		}
3236		return (mstate->dtms_timestamp);
3237
3238	case DIF_VAR_VTIMESTAMP:
3239		ASSERT(dtrace_vtime_references != 0);
3240		return (curthread->t_dtrace_vtime);
3241
3242	case DIF_VAR_WALLTIMESTAMP:
3243		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3244			mstate->dtms_walltimestamp = dtrace_gethrestime();
3245			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3246		}
3247		return (mstate->dtms_walltimestamp);
3248
3249#if defined(sun)
3250	case DIF_VAR_IPL:
3251		if (!dtrace_priv_kernel(state))
3252			return (0);
3253		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3254			mstate->dtms_ipl = dtrace_getipl();
3255			mstate->dtms_present |= DTRACE_MSTATE_IPL;
3256		}
3257		return (mstate->dtms_ipl);
3258#endif
3259
3260	case DIF_VAR_EPID:
3261		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3262		return (mstate->dtms_epid);
3263
3264	case DIF_VAR_ID:
3265		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3266		return (mstate->dtms_probe->dtpr_id);
3267
3268	case DIF_VAR_STACKDEPTH:
3269		if (!dtrace_priv_kernel(state))
3270			return (0);
3271		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3272			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3273
3274			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3275			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3276		}
3277		return (mstate->dtms_stackdepth);
3278
3279	case DIF_VAR_USTACKDEPTH:
3280		if (!dtrace_priv_proc(state))
3281			return (0);
3282		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3283			/*
3284			 * See comment in DIF_VAR_PID.
3285			 */
3286			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3287			    CPU_ON_INTR(CPU)) {
3288				mstate->dtms_ustackdepth = 0;
3289			} else {
3290				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3291				mstate->dtms_ustackdepth =
3292				    dtrace_getustackdepth();
3293				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3294			}
3295			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3296		}
3297		return (mstate->dtms_ustackdepth);
3298
3299	case DIF_VAR_CALLER:
3300		if (!dtrace_priv_kernel(state))
3301			return (0);
3302		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3303			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3304
3305			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3306				/*
3307				 * If this is an unanchored probe, we are
3308				 * required to go through the slow path:
3309				 * dtrace_caller() only guarantees correct
3310				 * results for anchored probes.
3311				 */
3312				pc_t caller[2] = {0, 0};
3313
3314				dtrace_getpcstack(caller, 2, aframes,
3315				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3316				mstate->dtms_caller = caller[1];
3317			} else if ((mstate->dtms_caller =
3318			    dtrace_caller(aframes)) == -1) {
3319				/*
3320				 * We have failed to do this the quick way;
3321				 * we must resort to the slower approach of
3322				 * calling dtrace_getpcstack().
3323				 */
3324				pc_t caller = 0;
3325
3326				dtrace_getpcstack(&caller, 1, aframes, NULL);
3327				mstate->dtms_caller = caller;
3328			}
3329
3330			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3331		}
3332		return (mstate->dtms_caller);
3333
3334	case DIF_VAR_UCALLER:
3335		if (!dtrace_priv_proc(state))
3336			return (0);
3337
3338		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3339			uint64_t ustack[3];
3340
3341			/*
3342			 * dtrace_getupcstack() fills in the first uint64_t
3343			 * with the current PID.  The second uint64_t will
3344			 * be the program counter at user-level.  The third
3345			 * uint64_t will contain the caller, which is what
3346			 * we're after.
3347			 */
3348			ustack[2] = 0;
3349			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3350			dtrace_getupcstack(ustack, 3);
3351			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3352			mstate->dtms_ucaller = ustack[2];
3353			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3354		}
3355
3356		return (mstate->dtms_ucaller);
3357
3358	case DIF_VAR_PROBEPROV:
3359		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3360		return (dtrace_dif_varstr(
3361		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3362		    state, mstate));
3363
3364	case DIF_VAR_PROBEMOD:
3365		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3366		return (dtrace_dif_varstr(
3367		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
3368		    state, mstate));
3369
3370	case DIF_VAR_PROBEFUNC:
3371		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3372		return (dtrace_dif_varstr(
3373		    (uintptr_t)mstate->dtms_probe->dtpr_func,
3374		    state, mstate));
3375
3376	case DIF_VAR_PROBENAME:
3377		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3378		return (dtrace_dif_varstr(
3379		    (uintptr_t)mstate->dtms_probe->dtpr_name,
3380		    state, mstate));
3381
3382	case DIF_VAR_PID:
3383		if (!dtrace_priv_proc(state))
3384			return (0);
3385
3386#if defined(sun)
3387		/*
3388		 * Note that we are assuming that an unanchored probe is
3389		 * always due to a high-level interrupt.  (And we're assuming
3390		 * that there is only a single high level interrupt.)
3391		 */
3392		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3393			return (pid0.pid_id);
3394
3395		/*
3396		 * It is always safe to dereference one's own t_procp pointer:
3397		 * it always points to a valid, allocated proc structure.
3398		 * Further, it is always safe to dereference the p_pidp member
3399		 * of one's own proc structure.  (These are truisms because
3400		 * threads and processes don't clean up their own state --
3401		 * they leave that task to whomever reaps them.)
3402		 */
3403		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3404#else
3405		return ((uint64_t)curproc->p_pid);
3406#endif
3407
3408	case DIF_VAR_PPID:
3409		if (!dtrace_priv_proc(state))
3410			return (0);
3411
3412#if defined(sun)
3413		/*
3414		 * See comment in DIF_VAR_PID.
3415		 */
3416		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3417			return (pid0.pid_id);
3418
3419		/*
3420		 * It is always safe to dereference one's own t_procp pointer:
3421		 * it always points to a valid, allocated proc structure.
3422		 * (This is true because threads don't clean up their own
3423		 * state -- they leave that task to whomever reaps them.)
3424		 */
3425		return ((uint64_t)curthread->t_procp->p_ppid);
3426#else
3427		if (curproc->p_pid == proc0.p_pid)
3428			return (curproc->p_pid);
3429		else
3430			return (curproc->p_pptr->p_pid);
3431#endif
3432
3433	case DIF_VAR_TID:
3434#if defined(sun)
3435		/*
3436		 * See comment in DIF_VAR_PID.
3437		 */
3438		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3439			return (0);
3440#endif
3441
3442		return ((uint64_t)curthread->t_tid);
3443
3444	case DIF_VAR_EXECARGS: {
3445		struct pargs *p_args = curthread->td_proc->p_args;
3446
3447		if (p_args == NULL)
3448			return (0);
3449
3450		return (dtrace_dif_varstrz(
3451		    (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
3452	}
3453
3454	case DIF_VAR_EXECNAME:
3455#if defined(sun)
3456		if (!dtrace_priv_proc(state))
3457			return (0);
3458
3459		/*
3460		 * See comment in DIF_VAR_PID.
3461		 */
3462		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3463			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3464
3465		/*
3466		 * It is always safe to dereference one's own t_procp pointer:
3467		 * it always points to a valid, allocated proc structure.
3468		 * (This is true because threads don't clean up their own
3469		 * state -- they leave that task to whomever reaps them.)
3470		 */
3471		return (dtrace_dif_varstr(
3472		    (uintptr_t)curthread->t_procp->p_user.u_comm,
3473		    state, mstate));
3474#else
3475		return (dtrace_dif_varstr(
3476		    (uintptr_t) curthread->td_proc->p_comm, state, mstate));
3477#endif
3478
3479	case DIF_VAR_ZONENAME:
3480#if defined(sun)
3481		if (!dtrace_priv_proc(state))
3482			return (0);
3483
3484		/*
3485		 * See comment in DIF_VAR_PID.
3486		 */
3487		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3488			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3489
3490		/*
3491		 * It is always safe to dereference one's own t_procp pointer:
3492		 * it always points to a valid, allocated proc structure.
3493		 * (This is true because threads don't clean up their own
3494		 * state -- they leave that task to whomever reaps them.)
3495		 */
3496		return (dtrace_dif_varstr(
3497		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
3498		    state, mstate));
3499#else
3500		return (0);
3501#endif
3502
3503	case DIF_VAR_UID:
3504		if (!dtrace_priv_proc(state))
3505			return (0);
3506
3507#if defined(sun)
3508		/*
3509		 * See comment in DIF_VAR_PID.
3510		 */
3511		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3512			return ((uint64_t)p0.p_cred->cr_uid);
3513#endif
3514
3515		/*
3516		 * It is always safe to dereference one's own t_procp pointer:
3517		 * it always points to a valid, allocated proc structure.
3518		 * (This is true because threads don't clean up their own
3519		 * state -- they leave that task to whomever reaps them.)
3520		 *
3521		 * Additionally, it is safe to dereference one's own process
3522		 * credential, since this is never NULL after process birth.
3523		 */
3524		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3525
3526	case DIF_VAR_GID:
3527		if (!dtrace_priv_proc(state))
3528			return (0);
3529
3530#if defined(sun)
3531		/*
3532		 * See comment in DIF_VAR_PID.
3533		 */
3534		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3535			return ((uint64_t)p0.p_cred->cr_gid);
3536#endif
3537
3538		/*
3539		 * It is always safe to dereference one's own t_procp pointer:
3540		 * it always points to a valid, allocated proc structure.
3541		 * (This is true because threads don't clean up their own
3542		 * state -- they leave that task to whoever reaps them.)
3543		 *
3544		 * Additionally, it is safe to dereference one's own process
3545		 * credential, since this is never NULL after process birth.
3546		 */
3547		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3548
3549	case DIF_VAR_ERRNO: {
3550#if defined(sun)
3551		klwp_t *lwp;
3552		if (!dtrace_priv_proc(state))
3553			return (0);
3554
3555		/*
3556		 * See comment in DIF_VAR_PID.
3557		 */
3558		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3559			return (0);
3560
3561		/*
3562		 * It is always safe to dereference one's own t_lwp pointer in
3563		 * the event that this pointer is non-NULL.  (This is true
3564		 * because threads and lwps don't clean up their own state --
3565		 * they leave that task to whoever reaps them.)
3566		 */
3567		if ((lwp = curthread->t_lwp) == NULL)
3568			return (0);
3569
3570		return ((uint64_t)lwp->lwp_errno);
3571#else
3572		return (curthread->td_errno);
3573#endif
3574	}
3575#if !defined(sun)
3576	case DIF_VAR_CPU: {
3577		return (curcpu);
3578	}
3579#endif
3580	default:
3581		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3582		return (0);
3583	}
3584}
3585
3586
3587typedef enum dtrace_json_state {
3588	DTRACE_JSON_REST = 1,
3589	DTRACE_JSON_OBJECT,
3590	DTRACE_JSON_STRING,
3591	DTRACE_JSON_STRING_ESCAPE,
3592	DTRACE_JSON_STRING_ESCAPE_UNICODE,
3593	DTRACE_JSON_COLON,
3594	DTRACE_JSON_COMMA,
3595	DTRACE_JSON_VALUE,
3596	DTRACE_JSON_IDENTIFIER,
3597	DTRACE_JSON_NUMBER,
3598	DTRACE_JSON_NUMBER_FRAC,
3599	DTRACE_JSON_NUMBER_EXP,
3600	DTRACE_JSON_COLLECT_OBJECT
3601} dtrace_json_state_t;
3602
3603/*
3604 * This function possesses just enough knowledge about JSON to extract a single
3605 * value from a JSON string and store it in the scratch buffer.  It is able
3606 * to extract nested object values, and members of arrays by index.
3607 *
3608 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3609 * be looked up as we descend into the object tree.  e.g.
3610 *
3611 *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3612 *       with nelems = 5.
3613 *
3614 * The run time of this function must be bounded above by strsize to limit the
3615 * amount of work done in probe context.  As such, it is implemented as a
3616 * simple state machine, reading one character at a time using safe loads
3617 * until we find the requested element, hit a parsing error or run off the
3618 * end of the object or string.
3619 *
3620 * As there is no way for a subroutine to return an error without interrupting
3621 * clause execution, we simply return NULL in the event of a missing key or any
3622 * other error condition.  Each NULL return in this function is commented with
3623 * the error condition it represents -- parsing or otherwise.
3624 *
3625 * The set of states for the state machine closely matches the JSON
3626 * specification (http://json.org/).  Briefly:
3627 *
3628 *   DTRACE_JSON_REST:
3629 *     Skip whitespace until we find either a top-level Object, moving
3630 *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3631 *
3632 *   DTRACE_JSON_OBJECT:
3633 *     Locate the next key String in an Object.  Sets a flag to denote
3634 *     the next String as a key string and moves to DTRACE_JSON_STRING.
3635 *
3636 *   DTRACE_JSON_COLON:
3637 *     Skip whitespace until we find the colon that separates key Strings
3638 *     from their values.  Once found, move to DTRACE_JSON_VALUE.
3639 *
3640 *   DTRACE_JSON_VALUE:
3641 *     Detects the type of the next value (String, Number, Identifier, Object
3642 *     or Array) and routes to the states that process that type.  Here we also
3643 *     deal with the element selector list if we are requested to traverse down
3644 *     into the object tree.
3645 *
3646 *   DTRACE_JSON_COMMA:
3647 *     Skip whitespace until we find the comma that separates key-value pairs
3648 *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3649 *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
3650 *     states return to this state at the end of their value, unless otherwise
3651 *     noted.
3652 *
3653 *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3654 *     Processes a Number literal from the JSON, including any exponent
3655 *     component that may be present.  Numbers are returned as strings, which
3656 *     may be passed to strtoll() if an integer is required.
3657 *
3658 *   DTRACE_JSON_IDENTIFIER:
3659 *     Processes a "true", "false" or "null" literal in the JSON.
3660 *
3661 *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3662 *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
3663 *     Processes a String literal from the JSON, whether the String denotes
3664 *     a key, a value or part of a larger Object.  Handles all escape sequences
3665 *     present in the specification, including four-digit unicode characters,
3666 *     but merely includes the escape sequence without converting it to the
3667 *     actual escaped character.  If the String is flagged as a key, we
3668 *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3669 *
3670 *   DTRACE_JSON_COLLECT_OBJECT:
3671 *     This state collects an entire Object (or Array), correctly handling
3672 *     embedded strings.  If the full element selector list matches this nested
3673 *     object, we return the Object in full as a string.  If not, we use this
3674 *     state to skip to the next value at this level and continue processing.
3675 *
3676 * NOTE: This function uses various macros from strtolctype.h to manipulate
3677 * digit values, etc -- these have all been checked to ensure they make
3678 * no additional function calls.
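 *
 * As an illustrative example: given the JSON string {"foo":[{"bar":42}]}
 * and the packed element list "foo" NUL "0" NUL "bar" NUL with nelems = 3,
 * this function copies the string "42" into dest and returns dest.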
3679 */
3680static char *
3681dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3682    char *dest)
3683{
3684	dtrace_json_state_t state = DTRACE_JSON_REST;
3685	int64_t array_elem = INT64_MIN;
3686	int64_t array_pos = 0;
3687	uint8_t escape_unicount = 0;
3688	boolean_t string_is_key = B_FALSE;
3689	boolean_t collect_object = B_FALSE;
3690	boolean_t found_key = B_FALSE;
3691	boolean_t in_array = B_FALSE;
3692	uint32_t braces = 0, brackets = 0;
3693	char *elem = elemlist;
3694	char *dd = dest;
3695	uintptr_t cur;
3696
3697	for (cur = json; cur < json + size; cur++) {
3698		char cc = dtrace_load8(cur);
3699		if (cc == '\0')
3700			return (NULL);
3701
3702		switch (state) {
3703		case DTRACE_JSON_REST:
3704			if (isspace(cc))
3705				break;
3706
3707			if (cc == '{') {
3708				state = DTRACE_JSON_OBJECT;
3709				break;
3710			}
3711
3712			if (cc == '[') {
3713				in_array = B_TRUE;
3714				array_pos = 0;
3715				array_elem = dtrace_strtoll(elem, 10, size);
3716				found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3717				state = DTRACE_JSON_VALUE;
3718				break;
3719			}
3720
3721			/*
3722			 * ERROR: expected to find a top-level object or array.
3723			 */
3724			return (NULL);
3725		case DTRACE_JSON_OBJECT:
3726			if (isspace(cc))
3727				break;
3728
3729			if (cc == '"') {
3730				state = DTRACE_JSON_STRING;
3731				string_is_key = B_TRUE;
3732				break;
3733			}
3734
3735			/*
3736			 * ERROR: either the object did not start with a key
3737			 * string, or we've run off the end of the object
3738			 * without finding the requested key.
3739			 */
3740			return (NULL);
3741		case DTRACE_JSON_STRING:
3742			if (cc == '\\') {
3743				*dd++ = '\\';
3744				state = DTRACE_JSON_STRING_ESCAPE;
3745				break;
3746			}
3747
3748			if (cc == '"') {
3749				if (collect_object) {
3750					/*
3751					 * We don't reset the dest here, as
3752					 * the string is part of a larger
3753					 * object being collected.
3754					 */
3755					*dd++ = cc;
3756					collect_object = B_FALSE;
3757					state = DTRACE_JSON_COLLECT_OBJECT;
3758					break;
3759				}
3760				*dd = '\0';
3761				dd = dest; /* reset string buffer */
3762				if (string_is_key) {
3763					if (dtrace_strncmp(dest, elem,
3764					    size) == 0)
3765						found_key = B_TRUE;
3766				} else if (found_key) {
3767					if (nelems > 1) {
3768						/*
3769						 * We expected an object, not
3770						 * this string.
3771						 */
3772						return (NULL);
3773					}
3774					return (dest);
3775				}
3776				state = string_is_key ? DTRACE_JSON_COLON :
3777				    DTRACE_JSON_COMMA;
3778				string_is_key = B_FALSE;
3779				break;
3780			}
3781
3782			*dd++ = cc;
3783			break;
3784		case DTRACE_JSON_STRING_ESCAPE:
3785			*dd++ = cc;
3786			if (cc == 'u') {
3787				escape_unicount = 0;
3788				state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3789			} else {
3790				state = DTRACE_JSON_STRING;
3791			}
3792			break;
3793		case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3794			if (!isxdigit(cc)) {
3795				/*
3796				 * ERROR: invalid unicode escape, expected
3797				 * four valid hexadecimal digits.
3798				 */
3799				return (NULL);
3800			}
3801
3802			*dd++ = cc;
3803			if (++escape_unicount == 4)
3804				state = DTRACE_JSON_STRING;
3805			break;
3806		case DTRACE_JSON_COLON:
3807			if (isspace(cc))
3808				break;
3809
3810			if (cc == ':') {
3811				state = DTRACE_JSON_VALUE;
3812				break;
3813			}
3814
3815			/*
3816			 * ERROR: expected a colon.
3817			 */
3818			return (NULL);
3819		case DTRACE_JSON_COMMA:
3820			if (isspace(cc))
3821				break;
3822
3823			if (cc == ',') {
3824				if (in_array) {
3825					state = DTRACE_JSON_VALUE;
3826					if (++array_pos == array_elem)
3827						found_key = B_TRUE;
3828				} else {
3829					state = DTRACE_JSON_OBJECT;
3830				}
3831				break;
3832			}
3833
3834			/*
3835			 * ERROR: either we hit an unexpected character, or
3836			 * we reached the end of the object or array without
3837			 * finding the requested key.
3838			 */
3839			return (NULL);
3840		case DTRACE_JSON_IDENTIFIER:
3841			if (islower(cc)) {
3842				*dd++ = cc;
3843				break;
3844			}
3845
3846			*dd = '\0';
3847			dd = dest; /* reset string buffer */
3848
3849			if (dtrace_strncmp(dest, "true", 5) == 0 ||
3850			    dtrace_strncmp(dest, "false", 6) == 0 ||
3851			    dtrace_strncmp(dest, "null", 5) == 0) {
3852				if (found_key) {
3853					if (nelems > 1) {
3854						/*
3855						 * ERROR: We expected an object,
3856						 * not this identifier.
3857						 */
3858						return (NULL);
3859					}
3860					return (dest);
3861				} else {
3862					cur--;
3863					state = DTRACE_JSON_COMMA;
3864					break;
3865				}
3866			}
3867
3868			/*
3869			 * ERROR: we did not recognise the identifier as one
3870			 * of those in the JSON specification.
3871			 */
3872			return (NULL);
3873		case DTRACE_JSON_NUMBER:
3874			if (cc == '.') {
3875				*dd++ = cc;
3876				state = DTRACE_JSON_NUMBER_FRAC;
3877				break;
3878			}
3879
3880			if (cc == 'x' || cc == 'X') {
3881				/*
3882				 * ERROR: specification explicitly excludes
3883				 * hexadecimal or octal numbers.
3884				 */
3885				return (NULL);
3886			}
3887
3888			/* FALLTHRU */
3889		case DTRACE_JSON_NUMBER_FRAC:
3890			if (cc == 'e' || cc == 'E') {
3891				*dd++ = cc;
3892				state = DTRACE_JSON_NUMBER_EXP;
3893				break;
3894			}
3895
3896			if (cc == '+' || cc == '-') {
3897				/*
3898				 * ERROR: expect sign as part of exponent only.
3899				 */
3900				return (NULL);
3901			}
3902			/* FALLTHRU */
3903		case DTRACE_JSON_NUMBER_EXP:
3904			if (isdigit(cc) || cc == '+' || cc == '-') {
3905				*dd++ = cc;
3906				break;
3907			}
3908
3909			*dd = '\0';
3910			dd = dest; /* reset string buffer */
3911			if (found_key) {
3912				if (nelems > 1) {
3913					/*
3914					 * ERROR: We expected an object, not
3915					 * this number.
3916					 */
3917					return (NULL);
3918				}
3919				return (dest);
3920			}
3921
3922			cur--;
3923			state = DTRACE_JSON_COMMA;
3924			break;
3925		case DTRACE_JSON_VALUE:
3926			if (isspace(cc))
3927				break;
3928
3929			if (cc == '{' || cc == '[') {
3930				if (nelems > 1 && found_key) {
3931					in_array = cc == '[' ? B_TRUE : B_FALSE;
3932					/*
3933					 * If our element selector directs us
3934					 * to descend into this nested object,
3935					 * then move to the next selector
3936					 * element in the list and restart the
3937					 * state machine.
3938					 */
3939					while (*elem != '\0')
3940						elem++;
3941					elem++; /* skip the inter-element NUL */
3942					nelems--;
3943					dd = dest;
3944					if (in_array) {
3945						state = DTRACE_JSON_VALUE;
3946						array_pos = 0;
3947						array_elem = dtrace_strtoll(
3948						    elem, 10, size);
3949						found_key = array_elem == 0 ?
3950						    B_TRUE : B_FALSE;
3951					} else {
3952						found_key = B_FALSE;
3953						state = DTRACE_JSON_OBJECT;
3954					}
3955					break;
3956				}
3957
3958				/*
3959				 * Otherwise, we wish to either skip this
3960				 * nested object or return it in full.
3961				 */
3962				if (cc == '[')
3963					brackets = 1;
3964				else
3965					braces = 1;
3966				*dd++ = cc;
3967				state = DTRACE_JSON_COLLECT_OBJECT;
3968				break;
3969			}
3970
3971			if (cc == '"') {
3972				state = DTRACE_JSON_STRING;
3973				break;
3974			}
3975
3976			if (islower(cc)) {
3977				/*
3978				 * Here we deal with true, false and null.
3979				 */
3980				*dd++ = cc;
3981				state = DTRACE_JSON_IDENTIFIER;
3982				break;
3983			}
3984
3985			if (cc == '-' || isdigit(cc)) {
3986				*dd++ = cc;
3987				state = DTRACE_JSON_NUMBER;
3988				break;
3989			}
3990
3991			/*
3992			 * ERROR: unexpected character at start of value.
3993			 */
3994			return (NULL);
3995		case DTRACE_JSON_COLLECT_OBJECT:
3996			if (cc == '\0')
3997				/*
3998				 * ERROR: unexpected end of input.
3999				 */
4000				return (NULL);
4001
4002			*dd++ = cc;
4003			if (cc == '"') {
4004				collect_object = B_TRUE;
4005				state = DTRACE_JSON_STRING;
4006				break;
4007			}
4008
4009			if (cc == ']') {
4010				if (brackets-- == 0) {
4011					/*
4012					 * ERROR: unbalanced brackets.
4013					 */
4014					return (NULL);
4015				}
4016			} else if (cc == '}') {
4017				if (braces-- == 0) {
4018					/*
4019					 * ERROR: unbalanced braces.
4020					 */
4021					return (NULL);
4022				}
4023			} else if (cc == '{') {
4024				braces++;
4025			} else if (cc == '[') {
4026				brackets++;
4027			}
4028
4029			if (brackets == 0 && braces == 0) {
4030				if (found_key) {
4031					*dd = '\0';
4032					return (dest);
4033				}
4034				dd = dest; /* reset string buffer */
4035				state = DTRACE_JSON_COMMA;
4036			}
4037			break;
4038		}
4039	}
4040	return (NULL);
4041}
4042
4043/*
4044 * Emulate the execution of DIF subroutines invoked by the call opcode.
4045 * Notice that we don't bother validating the proper number of arguments or
4046 * their types in the tuple stack.  This isn't needed because our load safety
4047 * makes all argument interpretation safe -- the worst that can happen is
4048 * that a bogus program can obtain bogus results.
4049 */
4050static void
4051dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4052    dtrace_key_t *tupregs, int nargs,
4053    dtrace_mstate_t *mstate, dtrace_state_t *state)
4054{
4055	volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
4056	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
4057	dtrace_vstate_t *vstate = &state->dts_vstate;
4058
4059#if defined(sun)
4060	union {
4061		mutex_impl_t mi;
4062		uint64_t mx;
4063	} m;
4064
4065	union {
4066		krwlock_t ri;
4067		uintptr_t rw;
4068	} r;
4069#else
4070	struct thread *lowner;
4071	union {
4072		struct lock_object *li;
4073		uintptr_t lx;
4074	} l;
4075#endif
4076
4077	switch (subr) {
4078	case DIF_SUBR_RAND:
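		/*
		 * A quick-and-dirty linear congruential step over the
		 * current high-resolution time; adequate for probabilistic
		 * sampling, but in no way a cryptographic-quality source.
		 */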
4079		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
4080		break;
4081
4082#if defined(sun)
4083	case DIF_SUBR_MUTEX_OWNED:
4084		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4085		    mstate, vstate)) {
4086			regs[rd] = 0;
4087			break;
4088		}
4089
4090		m.mx = dtrace_load64(tupregs[0].dttk_value);
4091		if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4092			regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4093		else
4094			regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4095		break;
4096
4097	case DIF_SUBR_MUTEX_OWNER:
4098		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4099		    mstate, vstate)) {
4100			regs[rd] = 0;
4101			break;
4102		}
4103
4104		m.mx = dtrace_load64(tupregs[0].dttk_value);
4105		if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4106		    MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4107			regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4108		else
4109			regs[rd] = 0;
4110		break;
4111
4112	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4113		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4114		    mstate, vstate)) {
4115			regs[rd] = 0;
4116			break;
4117		}
4118
4119		m.mx = dtrace_load64(tupregs[0].dttk_value);
4120		regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4121		break;
4122
4123	case DIF_SUBR_MUTEX_TYPE_SPIN:
4124		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4125		    mstate, vstate)) {
4126			regs[rd] = 0;
4127			break;
4128		}
4129
4130		m.mx = dtrace_load64(tupregs[0].dttk_value);
4131		regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4132		break;
4133
4134	case DIF_SUBR_RW_READ_HELD: {
4135		uintptr_t tmp;
4136
4137		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4138		    mstate, vstate)) {
4139			regs[rd] = 0;
4140			break;
4141		}
4142
4143		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4144		regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4145		break;
4146	}
4147
4148	case DIF_SUBR_RW_WRITE_HELD:
4149		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4150		    mstate, vstate)) {
4151			regs[rd] = 0;
4152			break;
4153		}
4154
4155		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4156		regs[rd] = _RW_WRITE_HELD(&r.ri);
4157		break;
4158
4159	case DIF_SUBR_RW_ISWRITER:
4160		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4161		    mstate, vstate)) {
4162			regs[rd] = 0;
4163			break;
4164		}
4165
4166		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4167		regs[rd] = _RW_ISWRITER(&r.ri);
4168		break;
4169
4170#else
4171	case DIF_SUBR_MUTEX_OWNED:
4172		if (!dtrace_canload(tupregs[0].dttk_value,
4173			sizeof (struct lock_object), mstate, vstate)) {
4174			regs[rd] = 0;
4175			break;
4176		}
4177		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4178		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4179		break;
4180
4181	case DIF_SUBR_MUTEX_OWNER:
4182		if (!dtrace_canload(tupregs[0].dttk_value,
4183			sizeof (struct lock_object), mstate, vstate)) {
4184			regs[rd] = 0;
4185			break;
4186		}
4187		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4188		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4189		regs[rd] = (uintptr_t)lowner;
4190		break;
4191
4192	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4193		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4194		    mstate, vstate)) {
4195			regs[rd] = 0;
4196			break;
4197		}
4198		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4199		/* XXX - should be only LC_SLEEPABLE? */
4200		regs[rd] = (LOCK_CLASS(l.li)->lc_flags &
4201		    (LC_SLEEPLOCK | LC_SLEEPABLE)) != 0;
4202		break;
4203
4204	case DIF_SUBR_MUTEX_TYPE_SPIN:
4205		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4206		    mstate, vstate)) {
4207			regs[rd] = 0;
4208			break;
4209		}
4210		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4211		regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
4212		break;
4213
4214	case DIF_SUBR_RW_READ_HELD:
4215	case DIF_SUBR_SX_SHARED_HELD:
4216		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4217		    mstate, vstate)) {
4218			regs[rd] = 0;
4219			break;
4220		}
4221		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4222		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4223		    lowner == NULL;
4224		break;
4225
4226	case DIF_SUBR_RW_WRITE_HELD:
4227	case DIF_SUBR_SX_EXCLUSIVE_HELD:
4228		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4229		    mstate, vstate)) {
4230			regs[rd] = 0;
4231			break;
4232		}
4233		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4234		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4235		regs[rd] = (lowner == curthread);
4236		break;
4237
4238	case DIF_SUBR_RW_ISWRITER:
4239	case DIF_SUBR_SX_ISEXCLUSIVE:
4240		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4241		    mstate, vstate)) {
4242			regs[rd] = 0;
4243			break;
4244		}
4245		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4246		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4247		    lowner != NULL;
4248		break;
4249#endif /* ! defined(sun) */
4250
4251	case DIF_SUBR_BCOPY: {
4252		/*
4253		 * We need to be sure that the destination is in the scratch
4254		 * region -- no other region is allowed.
4255		 */
4256		uintptr_t src = tupregs[0].dttk_value;
4257		uintptr_t dest = tupregs[1].dttk_value;
4258		size_t size = tupregs[2].dttk_value;
4259
4260		if (!dtrace_inscratch(dest, size, mstate)) {
4261			*flags |= CPU_DTRACE_BADADDR;
4262			*illval = regs[rd];
4263			break;
4264		}
4265
4266		if (!dtrace_canload(src, size, mstate, vstate)) {
4267			regs[rd] = 0;
4268			break;
4269		}
4270
4271		dtrace_bcopy((void *)src, (void *)dest, size);
4272		break;
4273	}
4274
4275	case DIF_SUBR_ALLOCA:
4276	case DIF_SUBR_COPYIN: {
4277		uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4278		uint64_t size =
4279		    tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4280		size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4281
4282		/*
4283		 * This action doesn't require any credential checks since
4284		 * probes will not activate in user contexts to which the
4285		 * enabling user does not have permissions.
4286		 */
4287
4288		/*
4289		 * Rounding up the user allocation size could have overflowed
4290		 * a large, bogus allocation (like -1ULL) to 0.
4291		 */
4292		if (scratch_size < size ||
4293		    !DTRACE_INSCRATCH(mstate, scratch_size)) {
4294			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4295			regs[rd] = 0;
4296			break;
4297		}
4298
4299		if (subr == DIF_SUBR_COPYIN) {
4300			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4301			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4302			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4303		}
4304
4305		mstate->dtms_scratch_ptr += scratch_size;
4306		regs[rd] = dest;
4307		break;
4308	}
4309
4310	case DIF_SUBR_COPYINTO: {
4311		uint64_t size = tupregs[1].dttk_value;
4312		uintptr_t dest = tupregs[2].dttk_value;
4313
4314		/*
4315		 * This action doesn't require any credential checks since
4316		 * probes will not activate in user contexts to which the
4317		 * enabling user does not have permissions.
4318		 */
4319		if (!dtrace_inscratch(dest, size, mstate)) {
4320			*flags |= CPU_DTRACE_BADADDR;
4321			*illval = regs[rd];
4322			break;
4323		}
4324
4325		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4326		dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4327		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4328		break;
4329	}
4330
4331	case DIF_SUBR_COPYINSTR: {
4332		uintptr_t dest = mstate->dtms_scratch_ptr;
4333		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4334
4335		if (nargs > 1 && tupregs[1].dttk_value < size)
4336			size = tupregs[1].dttk_value + 1;
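		/*
		 * (When the optional second argument caps the copy, the
		 * + 1 above leaves room for the NUL forced at
		 * dest[size - 1] after the copyin.)
		 */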
4337
4338		/*
4339		 * This action doesn't require any credential checks since
4340		 * probes will not activate in user contexts to which the
4341		 * enabling user does not have permissions.
4342		 */
4343		if (!DTRACE_INSCRATCH(mstate, size)) {
4344			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4345			regs[rd] = 0;
4346			break;
4347		}
4348
4349		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4350		dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4351		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4352
4353		((char *)dest)[size - 1] = '\0';
4354		mstate->dtms_scratch_ptr += size;
4355		regs[rd] = dest;
4356		break;
4357	}
4358
4359#if defined(sun)
4360	case DIF_SUBR_MSGSIZE:
4361	case DIF_SUBR_MSGDSIZE: {
4362		uintptr_t baddr = tupregs[0].dttk_value, daddr;
4363		uintptr_t wptr, rptr;
4364		size_t count = 0;
4365		int cont = 0;
4366
4367		while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
4368
4369			if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
4370			    vstate)) {
4371				regs[rd] = 0;
4372				break;
4373			}
4374
4375			wptr = dtrace_loadptr(baddr +
4376			    offsetof(mblk_t, b_wptr));
4377
4378			rptr = dtrace_loadptr(baddr +
4379			    offsetof(mblk_t, b_rptr));
4380
4381			if (wptr < rptr) {
4382				*flags |= CPU_DTRACE_BADADDR;
4383				*illval = tupregs[0].dttk_value;
4384				break;
4385			}
4386
4387			daddr = dtrace_loadptr(baddr +
4388			    offsetof(mblk_t, b_datap));
4389
4390			baddr = dtrace_loadptr(baddr +
4391			    offsetof(mblk_t, b_cont));
4392
4393			/*
4394			 * We want to protect against denial-of-service here,
4395			 * so we're only going to search the list for
4396			 * dtrace_msgdsize_max mblks.
4397			 */
4398			if (cont++ > dtrace_msgdsize_max) {
4399				*flags |= CPU_DTRACE_ILLOP;
4400				break;
4401			}
4402
4403			if (subr == DIF_SUBR_MSGDSIZE) {
4404				if (dtrace_load8(daddr +
4405				    offsetof(dblk_t, db_type)) != M_DATA)
4406					continue;
4407			}
4408
4409			count += wptr - rptr;
4410		}
4411
4412		if (!(*flags & CPU_DTRACE_FAULT))
4413			regs[rd] = count;
4414
4415		break;
4416	}
4417#endif
4418
4419	case DIF_SUBR_PROGENYOF: {
4420		pid_t pid = tupregs[0].dttk_value;
4421		proc_t *p;
4422		int rval = 0;
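		/*
		 * progenyof(pid) walks the parent chain from the current
		 * process; the result is 1 if pid names the current process
		 * or one of its ancestors.
		 */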
4423
4424		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4425
4426		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
4427#if defined(sun)
4428			if (p->p_pidp->pid_id == pid) {
4429#else
4430			if (p->p_pid == pid) {
4431#endif
4432				rval = 1;
4433				break;
4434			}
4435		}
4436
4437		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4438
4439		regs[rd] = rval;
4440		break;
4441	}
4442
4443	case DIF_SUBR_SPECULATION:
4444		regs[rd] = dtrace_speculation(state);
4445		break;
4446
4447	case DIF_SUBR_COPYOUT: {
4448		uintptr_t kaddr = tupregs[0].dttk_value;
4449		uintptr_t uaddr = tupregs[1].dttk_value;
4450		uint64_t size = tupregs[2].dttk_value;
4451
4452		if (!dtrace_destructive_disallow &&
4453		    dtrace_priv_proc_control(state) &&
4454		    !dtrace_istoxic(kaddr, size)) {
4455			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4456			dtrace_copyout(kaddr, uaddr, size, flags);
4457			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4458		}
4459		break;
4460	}
4461
4462	case DIF_SUBR_COPYOUTSTR: {
4463		uintptr_t kaddr = tupregs[0].dttk_value;
4464		uintptr_t uaddr = tupregs[1].dttk_value;
4465		uint64_t size = tupregs[2].dttk_value;
4466
4467		if (!dtrace_destructive_disallow &&
4468		    dtrace_priv_proc_control(state) &&
4469		    !dtrace_istoxic(kaddr, size)) {
4470			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4471			dtrace_copyoutstr(kaddr, uaddr, size, flags);
4472			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4473		}
4474		break;
4475	}
4476
4477	case DIF_SUBR_STRLEN: {
4478		size_t sz;
4479		uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4480		sz = dtrace_strlen((char *)addr,
4481		    state->dts_options[DTRACEOPT_STRSIZE]);
4482
4483		if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
4484			regs[rd] = 0;
4485			break;
4486		}
4487
4488		regs[rd] = sz;
4489
4490		break;
4491	}
4492
4493	case DIF_SUBR_STRCHR:
4494	case DIF_SUBR_STRRCHR: {
4495		/*
4496		 * We're going to iterate over the string looking for the
4497		 * specified character.  We will iterate until we have reached
4498		 * the string length or we have found the character.  If this
4499		 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4500		 * of the specified character instead of the first.
4501		 */
4502		uintptr_t saddr = tupregs[0].dttk_value;
4503		uintptr_t addr = tupregs[0].dttk_value;
4504		uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
4505		char c, target = (char)tupregs[1].dttk_value;
4506
4507		for (regs[rd] = 0; addr < limit; addr++) {
4508			if ((c = dtrace_load8(addr)) == target) {
4509				regs[rd] = addr;
4510
4511				if (subr == DIF_SUBR_STRCHR)
4512					break;
4513			}
4514
4515			if (c == '\0')
4516				break;
4517		}
4518
4519		if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
4520			regs[rd] = 0;
4521			break;
4522		}
4523
4524		break;
4525	}
4526
4527	case DIF_SUBR_STRSTR:
4528	case DIF_SUBR_INDEX:
4529	case DIF_SUBR_RINDEX: {
4530		/*
4531		 * We're going to iterate over the string looking for the
4532		 * specified string.  We will iterate until we have reached
4533		 * the string length or we have found the string.  (Yes, this
4534		 * is done in the most naive way possible -- but considering
4535		 * that the string we're searching for is likely to be
4536		 * relatively short, the complexity of Rabin-Karp or similar
4537		 * hardly seems merited.)
4538		 */
4539		char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4540		char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4541		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4542		size_t len = dtrace_strlen(addr, size);
4543		size_t sublen = dtrace_strlen(substr, size);
4544		char *limit = addr + len, *orig = addr;
4545		int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4546		int inc = 1;
4547
4548		regs[rd] = notfound;
4549
4550		if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4551			regs[rd] = 0;
4552			break;
4553		}
4554
4555		if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4556		    vstate)) {
4557			regs[rd] = 0;
4558			break;
4559		}
4560
4561		/*
4562		 * strstr() and index()/rindex() have similar semantics if
4563		 * both strings are the empty string: strstr() returns a
4564		 * pointer to the (empty) string, and index() and rindex()
4565		 * both return index 0 (regardless of any position argument).
4566		 */
4567		if (sublen == 0 && len == 0) {
4568			if (subr == DIF_SUBR_STRSTR)
4569				regs[rd] = (uintptr_t)addr;
4570			else
4571				regs[rd] = 0;
4572			break;
4573		}
4574
4575		if (subr != DIF_SUBR_STRSTR) {
4576			if (subr == DIF_SUBR_RINDEX) {
4577				limit = orig - 1;
4578				addr += len;
4579				inc = -1;
4580			}
4581
4582			/*
4583			 * Both index() and rindex() take an optional position
4584			 * argument that denotes the starting position.
4585			 */
4586			if (nargs == 3) {
4587				int64_t pos = (int64_t)tupregs[2].dttk_value;
4588
4589				/*
4590				 * If the position argument to index() is
4591				 * negative, Perl implicitly clamps it at
4592				 * zero.  This semantic is a little surprising
4593				 * given the special meaning of negative
4594				 * positions to similar Perl functions like
4595				 * substr(), but it appears to reflect a
4596				 * notion that index() can start from a
4597				 * negative index and increment its way up to
4598				 * the string.  Given this notion, Perl's
4599				 * rindex() is at least self-consistent in
4600				 * that it implicitly clamps positions greater
4601				 * than the string length to be the string
4602				 * length.  Where Perl completely loses
4603				 * coherence, however, is when the specified
4604				 * substring is the empty string ("").  In
4605				 * this case, even if the position is
4606				 * negative, rindex() returns 0 -- and even if
4607				 * the position is greater than the length,
4608				 * index() returns the string length.  These
4609				 * semantics violate the notion that index()
4610				 * should never return a value less than the
4611				 * specified position and that rindex() should
4612				 * never return a value greater than the
4613				 * specified position.  (One assumes that
4614				 * these semantics are artifacts of Perl's
4615				 * implementation and not the results of
4616				 * deliberate design -- it beggars belief that
4617				 * even Larry Wall could desire such oddness.)
4618				 * While in the abstract one would wish for
4619				 * consistent position semantics across
4620				 * substr(), index() and rindex() -- or at the
4621				 * very least self-consistent position
4622				 * semantics for index() and rindex() -- we
4623				 * instead opt to keep with the extant Perl
4624				 * semantics, in all their broken glory.  (Do
4625				 * we have more desire to maintain Perl's
4626				 * semantics than Perl does?  Probably.)
4627				 */
4628				if (subr == DIF_SUBR_RINDEX) {
4629					if (pos < 0) {
4630						if (sublen == 0)
4631							regs[rd] = 0;
4632						break;
4633					}
4634
4635					if (pos > len)
4636						pos = len;
4637				} else {
4638					if (pos < 0)
4639						pos = 0;
4640
4641					if (pos >= len) {
4642						if (sublen == 0)
4643							regs[rd] = len;
4644						break;
4645					}
4646				}
4647
4648				addr = orig + pos;
4649			}
4650		}
4651
4652		for (regs[rd] = notfound; addr != limit; addr += inc) {
4653			if (dtrace_strncmp(addr, substr, sublen) == 0) {
4654				if (subr != DIF_SUBR_STRSTR) {
4655					/*
4656					 * As D index() and rindex() are
4657					 * modeled on Perl (and not on awk),
4658					 * we return a zero-based (and not a
4659					 * one-based) index.  (For you Perl
4660					 * weenies: no, we're not going to add
4661					 * $[ -- and shouldn't you be at a con
4662					 * or something?)
4663					 */
4664					regs[rd] = (uintptr_t)(addr - orig);
4665					break;
4666				}
4667
4668				ASSERT(subr == DIF_SUBR_STRSTR);
4669				regs[rd] = (uintptr_t)addr;
4670				break;
4671			}
4672		}
4673
4674		break;
4675	}
4676
4677	case DIF_SUBR_STRTOK: {
4678		uintptr_t addr = tupregs[0].dttk_value;
4679		uintptr_t tokaddr = tupregs[1].dttk_value;
4680		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4681		uintptr_t limit, toklimit = tokaddr + size;
4682		uint8_t c = 0, tokmap[32];	 /* 256 / 8 */
4683		char *dest = (char *)mstate->dtms_scratch_ptr;
4684		int i;
4685
4686		/*
4687		 * Check both the token buffer and (later) the input buffer,
4688		 * since both could be non-scratch addresses.
4689		 */
4690		if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
4691			regs[rd] = 0;
4692			break;
4693		}
4694
4695		if (!DTRACE_INSCRATCH(mstate, size)) {
4696			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4697			regs[rd] = 0;
4698			break;
4699		}
4700
4701		if (addr == 0) {
4702			/*
4703			 * If the address specified is NULL, we use our saved
4704			 * strtok pointer from the mstate.  Note that this
4705			 * means that the saved strtok pointer is _only_
4706			 * valid within multiple enablings of the same probe --
4707			 * it behaves like an implicit clause-local variable.
4708			 */
4709			addr = mstate->dtms_strtok;
4710		} else {
4711			/*
4712			 * If the user-specified address is non-NULL we must
4713			 * access check it.  This is the only time we have
4714			 * a chance to do so, since this address may reside
4715			 * in the string table of this clause -- future calls
4716			 * (when we fetch addr from mstate->dtms_strtok)
4717			 * would fail this access check.
4718			 */
4719			if (!dtrace_strcanload(addr, size, mstate, vstate)) {
4720				regs[rd] = 0;
4721				break;
4722			}
4723		}
4724
4725		/*
4726		 * First, zero the token map, and then process the token
4727		 * string -- setting a bit in the map for every character
4728		 * found in the token string.
4729		 */
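		/*
		 * As an illustrative example: with the token string "/:",
		 * bit ('/' & 7) of tokmap['/' >> 3] and bit (':' & 7) of
		 * tokmap[':' >> 3] end up set, so each later membership
		 * test is a single load and mask.
		 */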
4730		for (i = 0; i < sizeof (tokmap); i++)
4731			tokmap[i] = 0;
4732
4733		for (; tokaddr < toklimit; tokaddr++) {
4734			if ((c = dtrace_load8(tokaddr)) == '\0')
4735				break;
4736
4737			ASSERT((c >> 3) < sizeof (tokmap));
4738			tokmap[c >> 3] |= (1 << (c & 0x7));
4739		}
4740
4741		for (limit = addr + size; addr < limit; addr++) {
4742			/*
4743			 * We're looking for a character that is _not_ contained
4744			 * in the token string.
4745			 */
4746			if ((c = dtrace_load8(addr)) == '\0')
4747				break;
4748
4749			if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4750				break;
4751		}
4752
4753		if (c == '\0') {
4754			/*
4755			 * We reached the end of the string without finding
4756			 * any character that was not in the token string.
4757			 * We return NULL in this case, and we set the saved
4758			 * address to NULL as well.
4759			 */
4760			regs[rd] = 0;
4761			mstate->dtms_strtok = 0;
4762			break;
4763		}
4764
4765		/*
4766		 * From here on, we're copying into the destination string.
4767		 */
4768		for (i = 0; addr < limit && i < size - 1; addr++) {
4769			if ((c = dtrace_load8(addr)) == '\0')
4770				break;
4771
4772			if (tokmap[c >> 3] & (1 << (c & 0x7)))
4773				break;
4774
4775			ASSERT(i < size);
4776			dest[i++] = c;
4777		}
4778
4779		ASSERT(i < size);
4780		dest[i] = '\0';
4781		regs[rd] = (uintptr_t)dest;
4782		mstate->dtms_scratch_ptr += size;
4783		mstate->dtms_strtok = addr;
4784		break;
4785	}
4786
4787	case DIF_SUBR_SUBSTR: {
4788		uintptr_t s = tupregs[0].dttk_value;
4789		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4790		char *d = (char *)mstate->dtms_scratch_ptr;
4791		int64_t index = (int64_t)tupregs[1].dttk_value;
4792		int64_t remaining = (int64_t)tupregs[2].dttk_value;
4793		size_t len = dtrace_strlen((char *)s, size);
4794		int64_t i;
4795
4796		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4797			regs[rd] = 0;
4798			break;
4799		}
4800
4801		if (!DTRACE_INSCRATCH(mstate, size)) {
4802			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4803			regs[rd] = 0;
4804			break;
4805		}
4806
4807		if (nargs <= 2)
4808			remaining = (int64_t)size;
4809
4810		if (index < 0) {
4811			index += len;
4812
4813			if (index < 0 && index + remaining > 0) {
4814				remaining += index;
4815				index = 0;
4816			}
4817		}
4818
4819		if (index >= len || index < 0) {
4820			remaining = 0;
4821		} else if (remaining < 0) {
4822			remaining += len - index;
4823		} else if (index + remaining > size) {
4824			remaining = size - index;
4825		}
4826
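		/*
		 * Worked example (illustrative): substr("coffee", -3, 2)
		 * arrives at the copy below with len = 6, index normalized
		 * to 3 and remaining still 2, yielding "fe".
		 */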
4827		for (i = 0; i < remaining; i++) {
4828			if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4829				break;
4830		}
4831
4832		d[i] = '\0';
4833
4834		mstate->dtms_scratch_ptr += size;
4835		regs[rd] = (uintptr_t)d;
4836		break;
4837	}
4838
4839	case DIF_SUBR_JSON: {
4840		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4841		uintptr_t json = tupregs[0].dttk_value;
4842		size_t jsonlen = dtrace_strlen((char *)json, size);
4843		uintptr_t elem = tupregs[1].dttk_value;
4844		size_t elemlen = dtrace_strlen((char *)elem, size);
4845
4846		char *dest = (char *)mstate->dtms_scratch_ptr;
4847		char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
4848		char *ee = elemlist;
4849		int nelems = 1;
4850		uintptr_t cur;
4851
4852		if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
4853		    !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
4854			regs[rd] = 0;
4855			break;
4856		}
4857
4858		if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
4859			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4860			regs[rd] = 0;
4861			break;
4862		}
4863
4864		/*
4865		 * Read the element selector and split it up into a packed list
4866		 * of strings.
4867		 */
4868		for (cur = elem; cur < elem + elemlen; cur++) {
4869			char cc = dtrace_load8(cur);
4870
4871			if (cur == elem && cc == '[') {
4872				/*
4873				 * If the first element selector key is
4874				 * actually an array index then ignore the
4875				 * bracket.
4876				 */
4877				continue;
4878			}
4879
4880			if (cc == ']')
4881				continue;
4882
4883			if (cc == '.' || cc == '[') {
4884				nelems++;
4885				cc = '\0';
4886			}
4887
4888			*ee++ = cc;
4889		}
4890		*ee++ = '\0';
4891
4892		if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
4893		    nelems, dest)) != 0)
4894			mstate->dtms_scratch_ptr += jsonlen + 1;
4895		break;
4896	}
4897
4898	case DIF_SUBR_TOUPPER:
4899	case DIF_SUBR_TOLOWER: {
4900		uintptr_t s = tupregs[0].dttk_value;
4901		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4902		char *dest = (char *)mstate->dtms_scratch_ptr, c;
4903		size_t len = dtrace_strlen((char *)s, size);
4904		char lower, upper, convert;
4905		int64_t i;
4906
4907		if (subr == DIF_SUBR_TOUPPER) {
4908			lower = 'a';
4909			upper = 'z';
4910			convert = 'A';
4911		} else {
4912			lower = 'A';
4913			upper = 'Z';
4914			convert = 'a';
4915		}
4916
4917		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4918			regs[rd] = 0;
4919			break;
4920		}
4921
4922		if (!DTRACE_INSCRATCH(mstate, size)) {
4923			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4924			regs[rd] = 0;
4925			break;
4926		}
4927
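		/*
		 * The loop below maps any c in [lower, upper] to
		 * convert + (c - lower) -- e.g. for toupper(), 'a'
		 * becomes 'A' -- and copies all other bytes unchanged.
		 */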
4928		for (i = 0; i < size - 1; i++) {
4929			if ((c = dtrace_load8(s + i)) == '\0')
4930				break;
4931
4932			if (c >= lower && c <= upper)
4933				c = convert + (c - lower);
4934
4935			dest[i] = c;
4936		}
4937
4938		ASSERT(i < size);
4939		dest[i] = '\0';
4940		regs[rd] = (uintptr_t)dest;
4941		mstate->dtms_scratch_ptr += size;
4942		break;
4943	}
4944
4945#if defined(sun)
4946	case DIF_SUBR_GETMAJOR:
4947#ifdef _LP64
4948		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4949#else
4950		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4951#endif
4952		break;
4953
4954	case DIF_SUBR_GETMINOR:
4955#ifdef _LP64
4956		regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4957#else
4958		regs[rd] = tupregs[0].dttk_value & MAXMIN;
4959#endif
4960		break;
4961
4962	case DIF_SUBR_DDI_PATHNAME: {
4963		/*
4964		 * This one is a galactic mess.  We are going to roughly
4965		 * emulate ddi_pathname(), but it's made more complicated
4966		 * by the fact that we (a) want to include the minor name and
4967		 * (b) must proceed iteratively instead of recursively.
4968		 */
4969		uintptr_t dest = mstate->dtms_scratch_ptr;
4970		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4971		char *start = (char *)dest, *end = start + size - 1;
4972		uintptr_t daddr = tupregs[0].dttk_value;
4973		int64_t minor = (int64_t)tupregs[1].dttk_value;
4974		char *s;
4975		int i, len, depth = 0;
4976
4977		/*
4978		 * Due to all the pointer jumping we do and context we must
4979		 * rely upon, we just mandate that the user must have kernel
4980		 * read privileges to use this routine.
4981		 */
4982		if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4983			*flags |= CPU_DTRACE_KPRIV;
4984			*illval = daddr;
4985			regs[rd] = 0;
4986		}
4987
4988		if (!DTRACE_INSCRATCH(mstate, size)) {
4989			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4990			regs[rd] = 0;
4991			break;
4992		}
4993
4994		*end = '\0';
4995
4996		/*
4997		 * We want to have a name for the minor.  In order to do this,
4998		 * we need to walk the minor list from the devinfo.  We want
4999		 * to be sure that we don't infinitely walk a circular list,
5000		 * so we check for circularity by sending a scout pointer
5001		 * ahead two elements for every element that we iterate over;
5002		 * if the list is circular, these will ultimately point to the
5003		 * same element.  You may recognize this little trick as the
5004		 * answer to a stupid interview question -- one that always
5005		 * seems to be asked by those who had to have it laboriously
5006		 * explained to them, and who can't even concisely describe
5007		 * the conditions under which one would be forced to resort to
5008		 * this technique.  Needless to say, those conditions are
5009		 * found here -- and probably only here.  Is this the only use
5010		 * of this infamous trick in shipping, production code?  If it
5011		 * isn't, it probably should be...
5012		 */
5013		if (minor != -1) {
5014			uintptr_t maddr = dtrace_loadptr(daddr +
5015			    offsetof(struct dev_info, devi_minor));
5016
5017			uintptr_t next = offsetof(struct ddi_minor_data, next);
5018			uintptr_t name = offsetof(struct ddi_minor_data,
5019			    d_minor) + offsetof(struct ddi_minor, name);
5020			uintptr_t dev = offsetof(struct ddi_minor_data,
5021			    d_minor) + offsetof(struct ddi_minor, dev);
5022			uintptr_t scout;
5023
5024			if (maddr != NULL)
5025				scout = dtrace_loadptr(maddr + next);
5026
5027			while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5028				uint64_t m;
5029#ifdef _LP64
5030				m = dtrace_load64(maddr + dev) & MAXMIN64;
5031#else
5032				m = dtrace_load32(maddr + dev) & MAXMIN;
5033#endif
5034				if (m != minor) {
5035					maddr = dtrace_loadptr(maddr + next);
5036
5037					if (scout == NULL)
5038						continue;
5039
5040					scout = dtrace_loadptr(scout + next);
5041
5042					if (scout == NULL)
5043						continue;
5044
5045					scout = dtrace_loadptr(scout + next);
5046
5047					if (scout == NULL)
5048						continue;
5049
5050					if (scout == maddr) {
5051						*flags |= CPU_DTRACE_ILLOP;
5052						break;
5053					}
5054
5055					continue;
5056				}
5057
5058				/*
5059				 * We have the minor data.  Now we need to
5060				 * copy the minor's name into the end of the
5061				 * pathname.
5062				 */
5063				s = (char *)dtrace_loadptr(maddr + name);
5064				len = dtrace_strlen(s, size);
5065
5066				if (*flags & CPU_DTRACE_FAULT)
5067					break;
5068
5069				if (len != 0) {
5070					if ((end -= (len + 1)) < start)
5071						break;
5072
5073					*end = ':';
5074				}
5075
5076				for (i = 1; i <= len; i++)
5077					end[i] = dtrace_load8((uintptr_t)s++);
5078				break;
5079			}
5080		}
5081
5082		while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5083			ddi_node_state_t devi_state;
5084
5085			devi_state = dtrace_load32(daddr +
5086			    offsetof(struct dev_info, devi_node_state));
5087
5088			if (*flags & CPU_DTRACE_FAULT)
5089				break;
5090
5091			if (devi_state >= DS_INITIALIZED) {
5092				s = (char *)dtrace_loadptr(daddr +
5093				    offsetof(struct dev_info, devi_addr));
5094				len = dtrace_strlen(s, size);
5095
5096				if (*flags & CPU_DTRACE_FAULT)
5097					break;
5098
5099				if (len != 0) {
5100					if ((end -= (len + 1)) < start)
5101						break;
5102
5103					*end = '@';
5104				}
5105
5106				for (i = 1; i <= len; i++)
5107					end[i] = dtrace_load8((uintptr_t)s++);
5108			}
5109
5110			/*
5111			 * Now for the node name...
5112			 */
5113			s = (char *)dtrace_loadptr(daddr +
5114			    offsetof(struct dev_info, devi_node_name));
5115
5116			daddr = dtrace_loadptr(daddr +
5117			    offsetof(struct dev_info, devi_parent));
5118
5119			/*
5120			 * If our parent is NULL (that is, if we're the root
5121			 * node), we're going to use the special path
5122			 * "devices".
5123			 */
5124			if (daddr == 0)
5125				s = "devices";
5126
5127			len = dtrace_strlen(s, size);
5128			if (*flags & CPU_DTRACE_FAULT)
5129				break;
5130
5131			if ((end -= (len + 1)) < start)
5132				break;
5133
5134			for (i = 1; i <= len; i++)
5135				end[i] = dtrace_load8((uintptr_t)s++);
5136			*end = '/';
5137
5138			if (depth++ > dtrace_devdepth_max) {
5139				*flags |= CPU_DTRACE_ILLOP;
5140				break;
5141			}
5142		}
5143
5144		if (end < start)
5145			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5146
5147		if (daddr == 0) {
5148			regs[rd] = (uintptr_t)end;
5149			mstate->dtms_scratch_ptr += size;
5150		}
5151
5152		break;
5153	}
5154#endif
5155
5156	case DIF_SUBR_STRJOIN: {
5157		char *d = (char *)mstate->dtms_scratch_ptr;
5158		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5159		uintptr_t s1 = tupregs[0].dttk_value;
5160		uintptr_t s2 = tupregs[1].dttk_value;
5161		int i = 0;
5162
5163		if (!dtrace_strcanload(s1, size, mstate, vstate) ||
5164		    !dtrace_strcanload(s2, size, mstate, vstate)) {
5165			regs[rd] = 0;
5166			break;
5167		}
5168
5169		if (!DTRACE_INSCRATCH(mstate, size)) {
5170			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5171			regs[rd] = 0;
5172			break;
5173		}
5174
5175		for (;;) {
5176			if (i >= size) {
5177				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5178				regs[rd] = 0;
5179				break;
5180			}
5181
5182			if ((d[i++] = dtrace_load8(s1++)) == '\0') {
5183				i--;
5184				break;
5185			}
5186		}
5187
5188		for (;;) {
5189			if (i >= size) {
5190				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5191				regs[rd] = 0;
5192				break;
5193			}
5194
5195			if ((d[i++] = dtrace_load8(s2++)) == '\0')
5196				break;
5197		}
5198
5199		if (i < size) {
5200			mstate->dtms_scratch_ptr += i;
5201			regs[rd] = (uintptr_t)d;
5202		}
5203
5204		break;
5205	}
5206
5207	case DIF_SUBR_STRTOLL: {
5208		uintptr_t s = tupregs[0].dttk_value;
5209		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5210		int base = 10;
5211
5212		if (nargs > 1) {
5213			if ((base = tupregs[1].dttk_value) <= 1 ||
5214			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5215				*flags |= CPU_DTRACE_ILLOP;
5216				break;
5217			}
5218		}
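		/*
		 * (The upper bound above works out to 36: the ten digits
		 * plus the twenty-six letters 'a'-'z', one character per
		 * digit.)
		 */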
5219
5220		if (!dtrace_strcanload(s, size, mstate, vstate)) {
5221			regs[rd] = INT64_MIN;
5222			break;
5223		}
5224
5225		regs[rd] = dtrace_strtoll((char *)s, base, size);
5226		break;
5227	}
5228
5229	case DIF_SUBR_LLTOSTR: {
5230		int64_t i = (int64_t)tupregs[0].dttk_value;
5231		uint64_t val, digit;
5232		uint64_t size = 65;	/* enough room for 2^64 in binary */
5233		char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
5234		int base = 10;
5235
5236		if (nargs > 1) {
5237			if ((base = tupregs[1].dttk_value) <= 1 ||
5238			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5239				*flags |= CPU_DTRACE_ILLOP;
5240				break;
5241			}
5242		}
5243
5244		val = (base == 10 && i < 0) ? i * -1 : i;
5245
5246		if (!DTRACE_INSCRATCH(mstate, size)) {
5247			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5248			regs[rd] = 0;
5249			break;
5250		}
5251
5252		for (*end-- = '\0'; val; val /= base) {
5253			if ((digit = val % base) <= '9' - '0') {
5254				*end-- = '0' + digit;
5255			} else {
5256				*end-- = 'a' + (digit - ('9' - '0') - 1);
5257			}
5258		}
5259
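		/*
		 * The digits were emitted back to front; e.g.
		 * (illustrative) lltostr(255, 16) leaves "ff" at the end
		 * of the buffer, and the "0x" prefix is prepended below.
		 */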
5260		if (i == 0 && base == 16)
5261			*end-- = '0';
5262
5263		if (base == 16)
5264			*end-- = 'x';
5265
5266		if (i == 0 || base == 8 || base == 16)
5267			*end-- = '0';
5268
5269		if (i < 0 && base == 10)
5270			*end-- = '-';
5271
5272		regs[rd] = (uintptr_t)end + 1;
5273		mstate->dtms_scratch_ptr += size;
5274		break;
5275	}
5276
5277	case DIF_SUBR_HTONS:
5278	case DIF_SUBR_NTOHS:
5279#if BYTE_ORDER == BIG_ENDIAN
5280		regs[rd] = (uint16_t)tupregs[0].dttk_value;
5281#else
5282		regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5283#endif
5284		break;
5285
5286
5287	case DIF_SUBR_HTONL:
5288	case DIF_SUBR_NTOHL:
5289#if BYTE_ORDER == BIG_ENDIAN
5290		regs[rd] = (uint32_t)tupregs[0].dttk_value;
5291#else
5292		regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5293#endif
5294		break;
5295
5296
5297	case DIF_SUBR_HTONLL:
5298	case DIF_SUBR_NTOHLL:
5299#if BYTE_ORDER == BIG_ENDIAN
5300		regs[rd] = (uint64_t)tupregs[0].dttk_value;
5301#else
5302		regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5303#endif
5304		break;
5305
5306
5307	case DIF_SUBR_DIRNAME:
5308	case DIF_SUBR_BASENAME: {
5309		char *dest = (char *)mstate->dtms_scratch_ptr;
5310		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5311		uintptr_t src = tupregs[0].dttk_value;
5312		int i, j, len = dtrace_strlen((char *)src, size);
5313		int lastbase = -1, firstbase = -1, lastdir = -1;
5314		int start, end;
5315
5316		if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5317			regs[rd] = 0;
5318			break;
5319		}
5320
5321		if (!DTRACE_INSCRATCH(mstate, size)) {
5322			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5323			regs[rd] = 0;
5324			break;
5325		}
5326
5327		/*
5328		 * The basename and dirname of a zero-length string are
5329		 * defined to be ".".
5330		 */
5331		if (len == 0) {
5332			len = 1;
5333			src = (uintptr_t)".";
5334		}
5335
5336		/*
5337		 * Start from the back of the string, moving back toward the
5338		 * front until we see a character that isn't a slash.  That
5339		 * character is the last character in the basename.
5340		 */
5341		for (i = len - 1; i >= 0; i--) {
5342			if (dtrace_load8(src + i) != '/')
5343				break;
5344		}
5345
5346		if (i >= 0)
5347			lastbase = i;
5348
5349		/*
5350		 * Starting from the last character in the basename, move
5351		 * towards the front until we find a slash.  The character
5352		 * that we processed immediately before that is the first
5353		 * character in the basename.
5354		 */
5355		for (; i >= 0; i--) {
5356			if (dtrace_load8(src + i) == '/')
5357				break;
5358		}
5359
5360		if (i >= 0)
5361			firstbase = i + 1;
5362
5363		/*
5364		 * Now keep going until we find a non-slash character.  That
5365		 * character is the last character in the dirname.
5366		 */
5367		for (; i >= 0; i--) {
5368			if (dtrace_load8(src + i) != '/')
5369				break;
5370		}
5371
5372		if (i >= 0)
5373			lastdir = i;
5374
5375		ASSERT(!(lastbase == -1 && firstbase != -1));
5376		ASSERT(!(firstbase == -1 && lastdir != -1));
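		/*
		 * Worked example (illustrative): for src = "/usr/lib/",
		 * the scans above yield lastbase = 7 ('b'), firstbase = 5
		 * ('l') and lastdir = 3 ('r'), so dirname() copies "/usr"
		 * and basename() copies "lib".
		 */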
5377
5378		if (lastbase == -1) {
5379			/*
5380			 * We didn't find a non-slash character.  We know that
5381			 * the length is non-zero, so the whole string must be
5382			 * slashes.  In either the dirname or the basename
5383			 * case, we return '/'.
5384			 */
5385			ASSERT(firstbase == -1);
5386			firstbase = lastbase = lastdir = 0;
5387		}
5388
5389		if (firstbase == -1) {
5390			/*
5391			 * The entire string consists only of a basename
5392			 * component.  If we're looking for dirname, we need
5393			 * to change our string to be just "."; if we're
5394			 * looking for a basename, we'll just set the first
5395			 * character of the basename to be 0.
5396			 */
5397			if (subr == DIF_SUBR_DIRNAME) {
5398				ASSERT(lastdir == -1);
5399				src = (uintptr_t)".";
5400				lastdir = 0;
5401			} else {
5402				firstbase = 0;
5403			}
5404		}
5405
5406		if (subr == DIF_SUBR_DIRNAME) {
5407			if (lastdir == -1) {
5408				/*
5409				 * We know that we have a slash in the name --
5410				 * or lastdir would be set to 0, above.  And
5411				 * because lastdir is -1, we know that this
5412				 * slash must be the first character.  (That
5413				 * is, the full string must be of the form
5414				 * "/basename".)  In this case, the last
5415				 * character of the directory name is 0.
5416				 */
5417				lastdir = 0;
5418			}
5419
5420			start = 0;
5421			end = lastdir;
5422		} else {
5423			ASSERT(subr == DIF_SUBR_BASENAME);
5424			ASSERT(firstbase != -1 && lastbase != -1);
5425			start = firstbase;
5426			end = lastbase;
5427		}
5428
5429		for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
5430			dest[j] = dtrace_load8(src + i);
5431
5432		dest[j] = '\0';
5433		regs[rd] = (uintptr_t)dest;
5434		mstate->dtms_scratch_ptr += size;
5435		break;
5436	}
5437
5438	case DIF_SUBR_GETF: {
5439		uintptr_t fd = tupregs[0].dttk_value;
5440		struct filedesc *fdp;
5441		file_t *fp;
5442
5443		if (!dtrace_priv_proc(state)) {
5444			regs[rd] = 0;
5445			break;
5446		}
5447		fdp = curproc->p_fd;
5448		FILEDESC_SLOCK(fdp);
5449		fp = fget_locked(fdp, fd);
5450		mstate->dtms_getf = fp;
5451		regs[rd] = (uintptr_t)fp;
5452		FILEDESC_SUNLOCK(fdp);
5453		break;
5454	}
5455
5456	case DIF_SUBR_CLEANPATH: {
5457		char *dest = (char *)mstate->dtms_scratch_ptr, c;
5458		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5459		uintptr_t src = tupregs[0].dttk_value;
5460		int i = 0, j = 0;
5461#if defined(sun)
5462		zone_t *z;
5463#endif
5464
5465		if (!dtrace_strcanload(src, size, mstate, vstate)) {
5466			regs[rd] = 0;
5467			break;
5468		}
5469
5470		if (!DTRACE_INSCRATCH(mstate, size)) {
5471			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5472			regs[rd] = 0;
5473			break;
5474		}
5475
5476		/*
5477		 * Move forward, loading each character.
5478		 */
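		/*
		 * E.g. (illustrative): "/foo//bar/../baz/./" is cleaned
		 * to "/foo/baz/" by the loop below.
		 */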
5479		do {
5480			c = dtrace_load8(src + i++);
5481next:
5482			if (j + 5 >= size)	/* 5 = strlen("/..c\0") */
5483				break;
5484
5485			if (c != '/') {
5486				dest[j++] = c;
5487				continue;
5488			}
5489
5490			c = dtrace_load8(src + i++);
5491
5492			if (c == '/') {
5493				/*
5494				 * We have two slashes -- we can just advance
5495				 * to the next character.
5496				 */
5497				goto next;
5498			}
5499
5500			if (c != '.') {
5501				/*
5502				 * This is not "." and it's not ".." -- we can
5503				 * just store the "/" and this character and
5504				 * drive on.
5505				 */
5506				dest[j++] = '/';
5507				dest[j++] = c;
5508				continue;
5509			}
5510
5511			c = dtrace_load8(src + i++);
5512
5513			if (c == '/') {
5514				/*
5515				 * This is a "/./" component.  We're not going
5516				 * to store anything in the destination buffer;
5517				 * we're just going to go to the next component.
5518				 */
5519				goto next;
5520			}
5521
5522			if (c != '.') {
5523				/*
5524				 * This is not ".." -- we can just store the
5525				 * "/." and this character and continue
5526				 * processing.
5527				 */
5528				dest[j++] = '/';
5529				dest[j++] = '.';
5530				dest[j++] = c;
5531				continue;
5532			}
5533
5534			c = dtrace_load8(src + i++);
5535
5536			if (c != '/' && c != '\0') {
5537				/*
5538				 * This is not ".." -- it's "..[mumble]".
5539				 * We'll store the "/.." and this character
5540				 * and continue processing.
5541				 */
5542				dest[j++] = '/';
5543				dest[j++] = '.';
5544				dest[j++] = '.';
5545				dest[j++] = c;
5546				continue;
5547			}
5548
5549			/*
5550			 * This is "/../" or "/..\0".  We need to back up
5551			 * our destination pointer until we find a "/".
5552			 */
5553			i--;
5554			while (j != 0 && dest[--j] != '/')
5555				continue;
5556
5557			if (c == '\0')
5558				dest[++j] = '/';
5559		} while (c != '\0');
5560
5561		dest[j] = '\0';
5562
5563#if defined(sun)
5564		if (mstate->dtms_getf != NULL &&
5565		    !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
5566		    (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
5567			/*
5568			 * If we've done a getf() as a part of this ECB and we
5569			 * don't have kernel access (and we're not in the global
5570			 * zone), check if the path we cleaned up begins with
5571			 * the zone's root path, and trim it off if so.  Note
5572			 * that this is an output cleanliness issue, not a
5573			 * security issue: knowing one's zone root path does
5574			 * not enable privilege escalation.
5575			 */
5576			if (strstr(dest, z->zone_rootpath) == dest)
5577				dest += strlen(z->zone_rootpath) - 1;
5578		}
5579#endif
5580
5581		regs[rd] = (uintptr_t)dest;
5582		mstate->dtms_scratch_ptr += size;
5583		break;
5584	}
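	/*
	 * A worked example of the loop above: cleaning the path
	 * "/foo//bar/./baz/../qux" copies "/foo", collapses the doubled
	 * slash, drops the "/./" component, backs the destination up over
	 * "/baz" on seeing "/../", and finally appends "/qux", yielding
	 * "/foo/bar/qux".  A trailing "/.." (as in "/foo/..") backs up all
	 * the way and leaves "/".
	 */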
5585
5586	case DIF_SUBR_INET_NTOA:
5587	case DIF_SUBR_INET_NTOA6:
5588	case DIF_SUBR_INET_NTOP: {
5589		size_t size;
5590		int af, argi, i;
5591		char *base, *end;
5592
5593		if (subr == DIF_SUBR_INET_NTOP) {
5594			af = (int)tupregs[0].dttk_value;
5595			argi = 1;
5596		} else {
5597			af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5598			argi = 0;
5599		}
5600
5601		if (af == AF_INET) {
5602			ipaddr_t ip4;
5603			uint8_t *ptr8, val;
5604
5605			/*
5606			 * Safely load the IPv4 address.
5607			 */
5608			ip4 = dtrace_load32(tupregs[argi].dttk_value);
5609
5610			/*
5611			 * Check that an IPv4 string will fit in scratch.
5612			 */
5613			size = INET_ADDRSTRLEN;
5614			if (!DTRACE_INSCRATCH(mstate, size)) {
5615				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5616				regs[rd] = 0;
5617				break;
5618			}
5619			base = (char *)mstate->dtms_scratch_ptr;
5620			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5621
5622			/*
5623			 * Stringify as a dotted decimal quad.
5624			 */
5625			*end-- = '\0';
5626			ptr8 = (uint8_t *)&ip4;
5627			for (i = 3; i >= 0; i--) {
5628				val = ptr8[i];
5629
5630				if (val == 0) {
5631					*end-- = '0';
5632				} else {
5633					for (; val; val /= 10) {
5634						*end-- = '0' + (val % 10);
5635					}
5636				}
5637
5638				if (i > 0)
5639					*end-- = '.';
5640			}
5641			ASSERT(end + 1 >= base);
5642
5643		} else if (af == AF_INET6) {
5644			struct in6_addr ip6;
5645			int firstzero, tryzero, numzero, v6end;
5646			uint16_t val;
5647			const char digits[] = "0123456789abcdef";
5648
5649			/*
5650			 * Stringify using RFC 1884 convention 2: 16-bit
5651			 * hexadecimal values with zero-run compression.
5652			 * Lower-case hexadecimal digits are used,
5653			 * 	e.g. fe80::214:4fff:fe0b:76c8.
5654			 * The IPv4 embedded form is returned for inet_ntop,
5655			 * just the IPv4 string is returned for inet_ntoa6.
5656			 */
5657
5658			/*
5659			 * Safely load the IPv6 address.
5660			 */
5661			dtrace_bcopy(
5662			    (void *)(uintptr_t)tupregs[argi].dttk_value,
5663			    (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5664
5665			/*
5666			 * Check that an IPv6 string will fit in scratch.
5667			 */
5668			size = INET6_ADDRSTRLEN;
5669			if (!DTRACE_INSCRATCH(mstate, size)) {
5670				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5671				regs[rd] = 0;
5672				break;
5673			}
5674			base = (char *)mstate->dtms_scratch_ptr;
5675			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5676			*end-- = '\0';
5677
5678			/*
5679			 * Find the longest run of 16 bit zero values
5680			 * for the single allowed zero compression - "::".
5681			 */
5682			firstzero = -1;
5683			tryzero = -1;
5684			numzero = 1;
5685			for (i = 0; i < sizeof (struct in6_addr); i++) {
5686#if defined(sun)
5687				if (ip6._S6_un._S6_u8[i] == 0 &&
5688#else
5689				if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5690#endif
5691				    tryzero == -1 && i % 2 == 0) {
5692					tryzero = i;
5693					continue;
5694				}
5695
5696				if (tryzero != -1 &&
5697#if defined(sun)
5698				    (ip6._S6_un._S6_u8[i] != 0 ||
5699#else
5700				    (ip6.__u6_addr.__u6_addr8[i] != 0 ||
5701#endif
5702				    i == sizeof (struct in6_addr) - 1)) {
5703
5704					if (i - tryzero <= numzero) {
5705						tryzero = -1;
5706						continue;
5707					}
5708
5709					firstzero = tryzero;
5710					numzero = i - i % 2 - tryzero;
5711					tryzero = -1;
5712
5713#if defined(sun)
5714					if (ip6._S6_un._S6_u8[i] == 0 &&
5715#else
5716					if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5717#endif
5718					    i == sizeof (struct in6_addr) - 1)
5719						numzero += 2;
5720				}
5721			}
5722			ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
5723
5724			/*
5725			 * Check for an IPv4 embedded address.
5726			 */
5727			v6end = sizeof (struct in6_addr) - 2;
5728			if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5729			    IN6_IS_ADDR_V4COMPAT(&ip6)) {
5730				for (i = sizeof (struct in6_addr) - 1;
5731				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
5732					ASSERT(end >= base);
5733
5734#if defined(sun)
5735					val = ip6._S6_un._S6_u8[i];
5736#else
5737					val = ip6.__u6_addr.__u6_addr8[i];
5738#endif
5739
5740					if (val == 0) {
5741						*end-- = '0';
5742					} else {
5743						for (; val; val /= 10) {
5744							*end-- = '0' + val % 10;
5745						}
5746					}
5747
5748					if (i > DTRACE_V4MAPPED_OFFSET)
5749						*end-- = '.';
5750				}
5751
5752				if (subr == DIF_SUBR_INET_NTOA6)
5753					goto inetout;
5754
5755				/*
5756				 * Set v6end to skip the IPv4 address that
5757				 * we have already stringified.
5758				 */
5759				v6end = 10;
5760			}
5761
5762			/*
5763			 * Build the IPv6 string by working through the
5764			 * address in reverse.
5765			 */
5766			for (i = v6end; i >= 0; i -= 2) {
5767				ASSERT(end >= base);
5768
5769				if (i == firstzero + numzero - 2) {
5770					*end-- = ':';
5771					*end-- = ':';
5772					i -= numzero - 2;
5773					continue;
5774				}
5775
5776				if (i < 14 && i != firstzero - 2)
5777					*end-- = ':';
5778
5779#if defined(sun)
5780				val = (ip6._S6_un._S6_u8[i] << 8) +
5781				    ip6._S6_un._S6_u8[i + 1];
5782#else
5783				val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
5784				    ip6.__u6_addr.__u6_addr8[i + 1];
5785#endif
5786
5787				if (val == 0) {
5788					*end-- = '0';
5789				} else {
5790					for (; val; val /= 16) {
5791						*end-- = digits[val % 16];
5792					}
5793				}
5794			}
5795			ASSERT(end + 1 >= base);
5796
5797		} else {
5798			/*
5799			 * The user didn't use AF_INET or AF_INET6.
5800			 */
5801			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5802			regs[rd] = 0;
5803			break;
5804		}
5805
5806inetout:	regs[rd] = (uintptr_t)end + 1;
5807		mstate->dtms_scratch_ptr += size;
5808		break;
5809	}
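	/*
	 * For example, stringifying fe80:0:0:0:214:4fff:fe0b:76c8 finds the
	 * longest run of zero-valued 16-bit groups at groups 1-3 and
	 * renders "fe80::214:4fff:fe0b:76c8".  For a v4-mapped address such
	 * as ::ffff:192.168.1.1, inet_ntoa6() returns just "192.168.1.1",
	 * while inet_ntop(AF_INET6, ...) keeps going and renders the
	 * embedded form "::ffff:192.168.1.1".
	 */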
5810
5811	case DIF_SUBR_MEMREF: {
5812		uintptr_t size = 2 * sizeof(uintptr_t);
5813		uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5814		size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size;
5815
5816		/* address and length */
5817		memref[0] = tupregs[0].dttk_value;
5818		memref[1] = tupregs[1].dttk_value;
5819
5820		regs[rd] = (uintptr_t) memref;
5821		mstate->dtms_scratch_ptr += scratch_size;
5822		break;
5823	}
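	/*
	 * The two-word scratch record built here (address, then length) is
	 * the 'memref' that the DTRACEACT_PRINTM handling in dtrace_probe()
	 * later decodes: it stores the length into the principal buffer and
	 * lets the by-reference copying path capture the memory itself.
	 */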
5824
5825#if !defined(sun)
5826	case DIF_SUBR_MEMSTR: {
5827		char *str = (char *)mstate->dtms_scratch_ptr;
5828		uintptr_t mem = tupregs[0].dttk_value;
5829		char c = tupregs[1].dttk_value;
5830		size_t size = tupregs[2].dttk_value;
5831		uint8_t n;
5832		int i;
5833
5834		regs[rd] = 0;
5835
5836		if (size == 0)
5837			break;
5838
5839		if (!dtrace_canload(mem, size - 1, mstate, vstate))
5840			break;
5841
5842		if (!DTRACE_INSCRATCH(mstate, size)) {
5843			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5844			break;
5845		}
5846
5847		if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {
5848			*flags |= CPU_DTRACE_ILLOP;
5849			break;
5850		}
5851
5852		for (i = 0; i < size - 1; i++) {
5853			n = dtrace_load8(mem++);
5854			str[i] = (n == 0) ? c : n;
5855		}
5856		str[size - 1] = 0;
5857
5858		regs[rd] = (uintptr_t)str;
5859		mstate->dtms_scratch_ptr += size;
5860		break;
5861	}
5862#endif
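	/*
	 * memstr() is useful for flattening NUL-separated buffers such as
	 * argument vectors: given the seven bytes "ps\0-ax\0" in memory,
	 * memstr(addr, ' ', 7) loads the first six bytes, maps the embedded
	 * NUL to the separator character, and returns the scratch string
	 * "ps -ax".
	 */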
5863
5864	case DIF_SUBR_TYPEREF: {
5865		uintptr_t size = 4 * sizeof(uintptr_t);
5866		uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5867		size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size;
5868
5869		/* address, num_elements, type_str, type_len */
5870		typeref[0] = tupregs[0].dttk_value;
5871		typeref[1] = tupregs[1].dttk_value;
5872		typeref[2] = tupregs[2].dttk_value;
5873		typeref[3] = tupregs[3].dttk_value;
5874
5875		regs[rd] = (uintptr_t) typeref;
5876		mstate->dtms_scratch_ptr += scratch_size;
5877		break;
5878	}
5879	}
5880}
5881
5882/*
5883 * Emulate the execution of DTrace IR instructions specified by the given
5884 * DIF object.  This function is deliberately void of assertions as all of
5885 * the necessary checks are handled by a call to dtrace_difo_validate().
5886 */
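/*
 * As a rough sketch, using the sort of disassembly notation that
 * "dtrace -S" prints (the exact rendering here is illustrative, not
 * authoritative), the D expression "arg0 + 1" reaches this function as a
 * short DIF program along the lines of:
 *
 *	ldgs	arg0, %r1	! load the built-in variable arg0
 *	setx	inttab[0], %r2	! load the integer constant 1
 *	add	%r1, %r2, %r3	! %r3 = %r1 + %r2
 *	ret	%r3		! yield %r3 as the value of the DIF object
 *
 * The loop below decodes and dispatches one such instruction per
 * iteration, recording any runtime error in the per-CPU
 * cpuc_dtrace_flags rather than faulting.
 */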
5887static uint64_t
5888dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5889    dtrace_vstate_t *vstate, dtrace_state_t *state)
5890{
5891	const dif_instr_t *text = difo->dtdo_buf;
5892	const uint_t textlen = difo->dtdo_len;
5893	const char *strtab = difo->dtdo_strtab;
5894	const uint64_t *inttab = difo->dtdo_inttab;
5895
5896	uint64_t rval = 0;
5897	dtrace_statvar_t *svar;
5898	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5899	dtrace_difv_t *v;
5900	volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
5901	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
5902
5903	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5904	uint64_t regs[DIF_DIR_NREGS];
5905	uint64_t *tmp;
5906
5907	uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5908	int64_t cc_r;
5909	uint_t pc = 0, id, opc = 0;
5910	uint8_t ttop = 0;
5911	dif_instr_t instr;
5912	uint_t r1, r2, rd;
5913
5914	/*
5915	 * We stash the current DIF object into the machine state: we need it
5916	 * for subsequent access checking.
5917	 */
5918	mstate->dtms_difo = difo;
5919
5920	regs[DIF_REG_R0] = 0; 		/* %r0 is fixed at zero */
5921
5922	while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5923		opc = pc;
5924
5925		instr = text[pc++];
5926		r1 = DIF_INSTR_R1(instr);
5927		r2 = DIF_INSTR_R2(instr);
5928		rd = DIF_INSTR_RD(instr);
5929
5930		switch (DIF_INSTR_OP(instr)) {
5931		case DIF_OP_OR:
5932			regs[rd] = regs[r1] | regs[r2];
5933			break;
5934		case DIF_OP_XOR:
5935			regs[rd] = regs[r1] ^ regs[r2];
5936			break;
5937		case DIF_OP_AND:
5938			regs[rd] = regs[r1] & regs[r2];
5939			break;
5940		case DIF_OP_SLL:
5941			regs[rd] = regs[r1] << regs[r2];
5942			break;
5943		case DIF_OP_SRL:
5944			regs[rd] = regs[r1] >> regs[r2];
5945			break;
5946		case DIF_OP_SUB:
5947			regs[rd] = regs[r1] - regs[r2];
5948			break;
5949		case DIF_OP_ADD:
5950			regs[rd] = regs[r1] + regs[r2];
5951			break;
5952		case DIF_OP_MUL:
5953			regs[rd] = regs[r1] * regs[r2];
5954			break;
5955		case DIF_OP_SDIV:
5956			if (regs[r2] == 0) {
5957				regs[rd] = 0;
5958				*flags |= CPU_DTRACE_DIVZERO;
5959			} else {
5960				regs[rd] = (int64_t)regs[r1] /
5961				    (int64_t)regs[r2];
5962			}
5963			break;
5964
5965		case DIF_OP_UDIV:
5966			if (regs[r2] == 0) {
5967				regs[rd] = 0;
5968				*flags |= CPU_DTRACE_DIVZERO;
5969			} else {
5970				regs[rd] = regs[r1] / regs[r2];
5971			}
5972			break;
5973
5974		case DIF_OP_SREM:
5975			if (regs[r2] == 0) {
5976				regs[rd] = 0;
5977				*flags |= CPU_DTRACE_DIVZERO;
5978			} else {
5979				regs[rd] = (int64_t)regs[r1] %
5980				    (int64_t)regs[r2];
5981			}
5982			break;
5983
5984		case DIF_OP_UREM:
5985			if (regs[r2] == 0) {
5986				regs[rd] = 0;
5987				*flags |= CPU_DTRACE_DIVZERO;
5988			} else {
5989				regs[rd] = regs[r1] % regs[r2];
5990			}
5991			break;
5992
5993		case DIF_OP_NOT:
5994			regs[rd] = ~regs[r1];
5995			break;
5996		case DIF_OP_MOV:
5997			regs[rd] = regs[r1];
5998			break;
5999		case DIF_OP_CMP:
6000			cc_r = regs[r1] - regs[r2];
6001			cc_n = cc_r < 0;
6002			cc_z = cc_r == 0;
6003			cc_v = 0;
6004			cc_c = regs[r1] < regs[r2];
6005			break;
6006		case DIF_OP_TST:
6007			cc_n = cc_v = cc_c = 0;
6008			cc_z = regs[r1] == 0;
6009			break;
6010		case DIF_OP_BA:
6011			pc = DIF_INSTR_LABEL(instr);
6012			break;
6013		case DIF_OP_BE:
6014			if (cc_z)
6015				pc = DIF_INSTR_LABEL(instr);
6016			break;
6017		case DIF_OP_BNE:
6018			if (cc_z == 0)
6019				pc = DIF_INSTR_LABEL(instr);
6020			break;
6021		case DIF_OP_BG:
6022			if ((cc_z | (cc_n ^ cc_v)) == 0)
6023				pc = DIF_INSTR_LABEL(instr);
6024			break;
6025		case DIF_OP_BGU:
6026			if ((cc_c | cc_z) == 0)
6027				pc = DIF_INSTR_LABEL(instr);
6028			break;
6029		case DIF_OP_BGE:
6030			if ((cc_n ^ cc_v) == 0)
6031				pc = DIF_INSTR_LABEL(instr);
6032			break;
6033		case DIF_OP_BGEU:
6034			if (cc_c == 0)
6035				pc = DIF_INSTR_LABEL(instr);
6036			break;
6037		case DIF_OP_BL:
6038			if (cc_n ^ cc_v)
6039				pc = DIF_INSTR_LABEL(instr);
6040			break;
6041		case DIF_OP_BLU:
6042			if (cc_c)
6043				pc = DIF_INSTR_LABEL(instr);
6044			break;
6045		case DIF_OP_BLE:
6046			if (cc_z | (cc_n ^ cc_v))
6047				pc = DIF_INSTR_LABEL(instr);
6048			break;
6049		case DIF_OP_BLEU:
6050			if (cc_c | cc_z)
6051				pc = DIF_INSTR_LABEL(instr);
6052			break;
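		/*
		 * The conditional branches above follow the familiar
		 * condition-code idioms over the flags that DIF_OP_CMP sets:
		 * after "cmp %r1, %r2", DIF_OP_BG branches iff %r1 > %r2 as
		 * a signed comparison ((cc_z | (cc_n ^ cc_v)) == 0), while
		 * DIF_OP_BGU consults the carry flag for the unsigned
		 * equivalent ((cc_c | cc_z) == 0).
		 */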
6053		case DIF_OP_RLDSB:
6054			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6055				break;
6056			/*FALLTHROUGH*/
6057		case DIF_OP_LDSB:
6058			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
6059			break;
6060		case DIF_OP_RLDSH:
6061			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6062				break;
6063			/*FALLTHROUGH*/
6064		case DIF_OP_LDSH:
6065			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
6066			break;
6067		case DIF_OP_RLDSW:
6068			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6069				break;
6070			/*FALLTHROUGH*/
6071		case DIF_OP_LDSW:
6072			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
6073			break;
6074		case DIF_OP_RLDUB:
6075			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6076				break;
6077			/*FALLTHROUGH*/
6078		case DIF_OP_LDUB:
6079			regs[rd] = dtrace_load8(regs[r1]);
6080			break;
6081		case DIF_OP_RLDUH:
6082			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6083				break;
6084			/*FALLTHROUGH*/
6085		case DIF_OP_LDUH:
6086			regs[rd] = dtrace_load16(regs[r1]);
6087			break;
6088		case DIF_OP_RLDUW:
6089			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6090				break;
6091			/*FALLTHROUGH*/
6092		case DIF_OP_LDUW:
6093			regs[rd] = dtrace_load32(regs[r1]);
6094			break;
6095		case DIF_OP_RLDX:
6096			if (!dtrace_canload(regs[r1], 8, mstate, vstate))
6097				break;
6098			/*FALLTHROUGH*/
6099		case DIF_OP_LDX:
6100			regs[rd] = dtrace_load64(regs[r1]);
6101			break;
6102		case DIF_OP_ULDSB:
6103			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6104			regs[rd] = (int8_t)
6105			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6106			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6107			break;
6108		case DIF_OP_ULDSH:
6109			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6110			regs[rd] = (int16_t)
6111			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6112			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6113			break;
6114		case DIF_OP_ULDSW:
6115			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6116			regs[rd] = (int32_t)
6117			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6118			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6119			break;
6120		case DIF_OP_ULDUB:
6121			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6122			regs[rd] =
6123			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6124			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6125			break;
6126		case DIF_OP_ULDUH:
6127			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6128			regs[rd] =
6129			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6130			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6131			break;
6132		case DIF_OP_ULDUW:
6133			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6134			regs[rd] =
6135			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6136			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6137			break;
6138		case DIF_OP_ULDX:
6139			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6140			regs[rd] =
6141			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
6142			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6143			break;
6144		case DIF_OP_RET:
6145			rval = regs[rd];
6146			pc = textlen;
6147			break;
6148		case DIF_OP_NOP:
6149			break;
6150		case DIF_OP_SETX:
6151			regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6152			break;
6153		case DIF_OP_SETS:
6154			regs[rd] = (uint64_t)(uintptr_t)
6155			    (strtab + DIF_INSTR_STRING(instr));
6156			break;
6157		case DIF_OP_SCMP: {
6158			size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6159			uintptr_t s1 = regs[r1];
6160			uintptr_t s2 = regs[r2];
6161
6162			if (s1 != 0 &&
6163			    !dtrace_strcanload(s1, sz, mstate, vstate))
6164				break;
6165			if (s2 != 0 &&
6166			    !dtrace_strcanload(s2, sz, mstate, vstate))
6167				break;
6168
6169			cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
6170
6171			cc_n = cc_r < 0;
6172			cc_z = cc_r == 0;
6173			cc_v = cc_c = 0;
6174			break;
6175		}
6176		case DIF_OP_LDGA:
6177			regs[rd] = dtrace_dif_variable(mstate, state,
6178			    r1, regs[r2]);
6179			break;
6180		case DIF_OP_LDGS:
6181			id = DIF_INSTR_VAR(instr);
6182
6183			if (id >= DIF_VAR_OTHER_UBASE) {
6184				uintptr_t a;
6185
6186				id -= DIF_VAR_OTHER_UBASE;
6187				svar = vstate->dtvs_globals[id];
6188				ASSERT(svar != NULL);
6189				v = &svar->dtsv_var;
6190
6191				if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6192					regs[rd] = svar->dtsv_data;
6193					break;
6194				}
6195
6196				a = (uintptr_t)svar->dtsv_data;
6197
6198				if (*(uint8_t *)a == UINT8_MAX) {
6199					/*
6200					 * If the 0th byte is set to UINT8_MAX
6201					 * then this is to be treated as a
6202					 * reference to a NULL variable.
6203					 */
6204					regs[rd] = 0;
6205				} else {
6206					regs[rd] = a + sizeof (uint64_t);
6207				}
6208
6209				break;
6210			}
6211
6212			regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6213			break;
6214
6215		case DIF_OP_STGS:
6216			id = DIF_INSTR_VAR(instr);
6217
6218			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6219			id -= DIF_VAR_OTHER_UBASE;
6220
6221			svar = vstate->dtvs_globals[id];
6222			ASSERT(svar != NULL);
6223			v = &svar->dtsv_var;
6224
6225			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6226				uintptr_t a = (uintptr_t)svar->dtsv_data;
6227
6228				ASSERT(a != 0);
6229				ASSERT(svar->dtsv_size != 0);
6230
6231				if (regs[rd] == 0) {
6232					*(uint8_t *)a = UINT8_MAX;
6233					break;
6234				} else {
6235					*(uint8_t *)a = 0;
6236					a += sizeof (uint64_t);
6237				}
6238				if (!dtrace_vcanload(
6239				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6240				    mstate, vstate))
6241					break;
6242
6243				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6244				    (void *)a, &v->dtdv_type);
6245				break;
6246			}
6247
6248			svar->dtsv_data = regs[rd];
6249			break;
6250
6251		case DIF_OP_LDTA:
6252			/*
6253			 * There are no DTrace built-in thread-local arrays at
6254			 * present.  This opcode is saved for future work.
6255			 */
6256			*flags |= CPU_DTRACE_ILLOP;
6257			regs[rd] = 0;
6258			break;
6259
6260		case DIF_OP_LDLS:
6261			id = DIF_INSTR_VAR(instr);
6262
6263			if (id < DIF_VAR_OTHER_UBASE) {
6264				/*
6265				 * For now, this has no meaning.
6266				 */
6267				regs[rd] = 0;
6268				break;
6269			}
6270
6271			id -= DIF_VAR_OTHER_UBASE;
6272
6273			ASSERT(id < vstate->dtvs_nlocals);
6274			ASSERT(vstate->dtvs_locals != NULL);
6275
6276			svar = vstate->dtvs_locals[id];
6277			ASSERT(svar != NULL);
6278			v = &svar->dtsv_var;
6279
6280			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6281				uintptr_t a = (uintptr_t)svar->dtsv_data;
6282				size_t sz = v->dtdv_type.dtdt_size;
6283
6284				sz += sizeof (uint64_t);
6285				ASSERT(svar->dtsv_size == NCPU * sz);
6286				a += curcpu * sz;
6287
6288				if (*(uint8_t *)a == UINT8_MAX) {
6289					/*
6290					 * If the 0th byte is set to UINT8_MAX
6291					 * then this is to be treated as a
6292					 * reference to a NULL variable.
6293					 */
6294					regs[rd] = 0;
6295				} else {
6296					regs[rd] = a + sizeof (uint64_t);
6297				}
6298
6299				break;
6300			}
6301
6302			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6303			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6304			regs[rd] = tmp[curcpu];
6305			break;
6306
6307		case DIF_OP_STLS:
6308			id = DIF_INSTR_VAR(instr);
6309
6310			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6311			id -= DIF_VAR_OTHER_UBASE;
6312			ASSERT(id < vstate->dtvs_nlocals);
6313
6314			ASSERT(vstate->dtvs_locals != NULL);
6315			svar = vstate->dtvs_locals[id];
6316			ASSERT(svar != NULL);
6317			v = &svar->dtsv_var;
6318
6319			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6320				uintptr_t a = (uintptr_t)svar->dtsv_data;
6321				size_t sz = v->dtdv_type.dtdt_size;
6322
6323				sz += sizeof (uint64_t);
6324				ASSERT(svar->dtsv_size == NCPU * sz);
6325				a += curcpu * sz;
6326
6327				if (regs[rd] == 0) {
6328					*(uint8_t *)a = UINT8_MAX;
6329					break;
6330				} else {
6331					*(uint8_t *)a = 0;
6332					a += sizeof (uint64_t);
6333				}
6334
6335				if (!dtrace_vcanload(
6336				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6337				    mstate, vstate))
6338					break;
6339
6340				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6341				    (void *)a, &v->dtdv_type);
6342				break;
6343			}
6344
6345			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6346			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6347			tmp[curcpu] = regs[rd];
6348			break;
6349
6350		case DIF_OP_LDTS: {
6351			dtrace_dynvar_t *dvar;
6352			dtrace_key_t *key;
6353
6354			id = DIF_INSTR_VAR(instr);
6355			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6356			id -= DIF_VAR_OTHER_UBASE;
6357			v = &vstate->dtvs_tlocals[id];
6358
6359			key = &tupregs[DIF_DTR_NREGS];
6360			key[0].dttk_value = (uint64_t)id;
6361			key[0].dttk_size = 0;
6362			DTRACE_TLS_THRKEY(key[1].dttk_value);
6363			key[1].dttk_size = 0;
6364
6365			dvar = dtrace_dynvar(dstate, 2, key,
6366			    sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6367			    mstate, vstate);
6368
6369			if (dvar == NULL) {
6370				regs[rd] = 0;
6371				break;
6372			}
6373
6374			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6375				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6376			} else {
6377				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6378			}
6379
6380			break;
6381		}
6382
6383		case DIF_OP_STTS: {
6384			dtrace_dynvar_t *dvar;
6385			dtrace_key_t *key;
6386
6387			id = DIF_INSTR_VAR(instr);
6388			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6389			id -= DIF_VAR_OTHER_UBASE;
6390
6391			key = &tupregs[DIF_DTR_NREGS];
6392			key[0].dttk_value = (uint64_t)id;
6393			key[0].dttk_size = 0;
6394			DTRACE_TLS_THRKEY(key[1].dttk_value);
6395			key[1].dttk_size = 0;
6396			v = &vstate->dtvs_tlocals[id];
6397
6398			dvar = dtrace_dynvar(dstate, 2, key,
6399			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6400			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6401			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6402			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6403
6404			/*
6405			 * Given that we're storing to thread-local data,
6406			 * we need to flush our predicate cache.
6407			 */
6408			curthread->t_predcache = 0;
6409
6410			if (dvar == NULL)
6411				break;
6412
6413			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6414				if (!dtrace_vcanload(
6415				    (void *)(uintptr_t)regs[rd],
6416				    &v->dtdv_type, mstate, vstate))
6417					break;
6418
6419				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6420				    dvar->dtdv_data, &v->dtdv_type);
6421			} else {
6422				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6423			}
6424
6425			break;
6426		}
6427
6428		case DIF_OP_SRA:
6429			regs[rd] = (int64_t)regs[r1] >> regs[r2];
6430			break;
6431
6432		case DIF_OP_CALL:
6433			dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6434			    regs, tupregs, ttop, mstate, state);
6435			break;
6436
6437		case DIF_OP_PUSHTR:
6438			if (ttop == DIF_DTR_NREGS) {
6439				*flags |= CPU_DTRACE_TUPOFLOW;
6440				break;
6441			}
6442
6443			if (r1 == DIF_TYPE_STRING) {
6444				/*
6445				 * If this is a string type and the size is 0,
6446				 * we'll use the system-wide default string
6447				 * size.  Note that we are _not_ looking at
6448				 * the value of the DTRACEOPT_STRSIZE option;
6449				 * had this been set, we would expect to have
6450				 * a non-zero size value in the "pushtr".
6451				 */
6452				tupregs[ttop].dttk_size =
6453				    dtrace_strlen((char *)(uintptr_t)regs[rd],
6454				    regs[r2] ? regs[r2] :
6455				    dtrace_strsize_default) + 1;
6456			} else {
6457				tupregs[ttop].dttk_size = regs[r2];
6458			}
6459
6460			tupregs[ttop++].dttk_value = regs[rd];
6461			break;
6462
6463		case DIF_OP_PUSHTV:
6464			if (ttop == DIF_DTR_NREGS) {
6465				*flags |= CPU_DTRACE_TUPOFLOW;
6466				break;
6467			}
6468
6469			tupregs[ttop].dttk_value = regs[rd];
6470			tupregs[ttop++].dttk_size = 0;
6471			break;
6472
6473		case DIF_OP_POPTS:
6474			if (ttop != 0)
6475				ttop--;
6476			break;
6477
6478		case DIF_OP_FLUSHTS:
6479			ttop = 0;
6480			break;
6481
6482		case DIF_OP_LDGAA:
6483		case DIF_OP_LDTAA: {
6484			dtrace_dynvar_t *dvar;
6485			dtrace_key_t *key = tupregs;
6486			uint_t nkeys = ttop;
6487
6488			id = DIF_INSTR_VAR(instr);
6489			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6490			id -= DIF_VAR_OTHER_UBASE;
6491
6492			key[nkeys].dttk_value = (uint64_t)id;
6493			key[nkeys++].dttk_size = 0;
6494
6495			if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6496				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6497				key[nkeys++].dttk_size = 0;
6498				v = &vstate->dtvs_tlocals[id];
6499			} else {
6500				v = &vstate->dtvs_globals[id]->dtsv_var;
6501			}
6502
6503			dvar = dtrace_dynvar(dstate, nkeys, key,
6504			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6505			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6506			    DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6507
6508			if (dvar == NULL) {
6509				regs[rd] = 0;
6510				break;
6511			}
6512
6513			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6514				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6515			} else {
6516				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6517			}
6518
6519			break;
6520		}
6521
6522		case DIF_OP_STGAA:
6523		case DIF_OP_STTAA: {
6524			dtrace_dynvar_t *dvar;
6525			dtrace_key_t *key = tupregs;
6526			uint_t nkeys = ttop;
6527
6528			id = DIF_INSTR_VAR(instr);
6529			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6530			id -= DIF_VAR_OTHER_UBASE;
6531
6532			key[nkeys].dttk_value = (uint64_t)id;
6533			key[nkeys++].dttk_size = 0;
6534
6535			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6536				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6537				key[nkeys++].dttk_size = 0;
6538				v = &vstate->dtvs_tlocals[id];
6539			} else {
6540				v = &vstate->dtvs_globals[id]->dtsv_var;
6541			}
6542
6543			dvar = dtrace_dynvar(dstate, nkeys, key,
6544			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6545			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6546			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6547			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6548
6549			if (dvar == NULL)
6550				break;
6551
6552			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6553				if (!dtrace_vcanload(
6554				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6555				    mstate, vstate))
6556					break;
6557
6558				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6559				    dvar->dtdv_data, &v->dtdv_type);
6560			} else {
6561				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6562			}
6563
6564			break;
6565		}
6566
6567		case DIF_OP_ALLOCS: {
6568			uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6569			size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6570
6571			/*
6572			 * Rounding up the user allocation size could cause a
6573			 * large, bogus allocation (like -1ULL) to wrap around
6574			 * to 0.
6575			 */
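			/*
			 * Concretely: with 4 bytes of alignment padding, a
			 * bogus regs[r1] of -1ULL wraps size around to 3,
			 * which the size < regs[r1] test below catches.
			 */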
6576			if (size < regs[r1] ||
6577			    !DTRACE_INSCRATCH(mstate, size)) {
6578				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6579				regs[rd] = 0;
6580				break;
6581			}
6582
6583			dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6584			mstate->dtms_scratch_ptr += size;
6585			regs[rd] = ptr;
6586			break;
6587		}
6588
6589		case DIF_OP_COPYS:
6590			if (!dtrace_canstore(regs[rd], regs[r2],
6591			    mstate, vstate)) {
6592				*flags |= CPU_DTRACE_BADADDR;
6593				*illval = regs[rd];
6594				break;
6595			}
6596
6597			if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6598				break;
6599
6600			dtrace_bcopy((void *)(uintptr_t)regs[r1],
6601			    (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6602			break;
6603
6604		case DIF_OP_STB:
6605			if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6606				*flags |= CPU_DTRACE_BADADDR;
6607				*illval = regs[rd];
6608				break;
6609			}
6610			*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6611			break;
6612
6613		case DIF_OP_STH:
6614			if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6615				*flags |= CPU_DTRACE_BADADDR;
6616				*illval = regs[rd];
6617				break;
6618			}
6619			if (regs[rd] & 1) {
6620				*flags |= CPU_DTRACE_BADALIGN;
6621				*illval = regs[rd];
6622				break;
6623			}
6624			*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6625			break;
6626
6627		case DIF_OP_STW:
6628			if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6629				*flags |= CPU_DTRACE_BADADDR;
6630				*illval = regs[rd];
6631				break;
6632			}
6633			if (regs[rd] & 3) {
6634				*flags |= CPU_DTRACE_BADALIGN;
6635				*illval = regs[rd];
6636				break;
6637			}
6638			*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6639			break;
6640
6641		case DIF_OP_STX:
6642			if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6643				*flags |= CPU_DTRACE_BADADDR;
6644				*illval = regs[rd];
6645				break;
6646			}
6647			if (regs[rd] & 7) {
6648				*flags |= CPU_DTRACE_BADALIGN;
6649				*illval = regs[rd];
6650				break;
6651			}
6652			*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6653			break;
6654		}
6655	}
6656
6657	if (!(*flags & CPU_DTRACE_FAULT))
6658		return (rval);
6659
6660	mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6661	mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6662
6663	return (0);
6664}
6665
6666static void
6667dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6668{
6669	dtrace_probe_t *probe = ecb->dte_probe;
6670	dtrace_provider_t *prov = probe->dtpr_provider;
6671	char c[DTRACE_FULLNAMELEN + 80], *str;
6672	char *msg = "dtrace: breakpoint action at probe ";
6673	char *ecbmsg = " (ecb ";
6674	uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6675	uintptr_t val = (uintptr_t)ecb;
6676	int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6677
6678	if (dtrace_destructive_disallow)
6679		return;
6680
6681	/*
6682	 * It's impossible to be taking action on the NULL probe.
6683	 */
6684	ASSERT(probe != NULL);
6685
6686	/*
6687	 * This is a poor man's (destitute man's?) sprintf():  we want to
6688	 * print the provider name, module name, function name and name of
6689	 * the probe, along with the hex address of the ECB with the breakpoint
6690	 * action -- all of which we must place in the character buffer by
6691	 * hand.
6692	 */
6693	while (*msg != '\0')
6694		c[i++] = *msg++;
6695
6696	for (str = prov->dtpv_name; *str != '\0'; str++)
6697		c[i++] = *str;
6698	c[i++] = ':';
6699
6700	for (str = probe->dtpr_mod; *str != '\0'; str++)
6701		c[i++] = *str;
6702	c[i++] = ':';
6703
6704	for (str = probe->dtpr_func; *str != '\0'; str++)
6705		c[i++] = *str;
6706	c[i++] = ':';
6707
6708	for (str = probe->dtpr_name; *str != '\0'; str++)
6709		c[i++] = *str;
6710
6711	while (*ecbmsg != '\0')
6712		c[i++] = *ecbmsg++;
6713
6714	while (shift >= 0) {
6715		mask = (uintptr_t)0xf << shift;
6716
6717		if (val >= ((uintptr_t)1 << shift))
6718			c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6719		shift -= 4;
6720	}
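	/*
	 * Leading zero nibbles are suppressed by the "val >= 1 << shift"
	 * test above: an ECB address of, say, 0x1f2 emits just the three
	 * characters "1f2" rather than a full complement of sixteen digits.
	 */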
6721
6722	c[i++] = ')';
6723	c[i] = '\0';
6724
6725#if defined(sun)
6726	debug_enter(c);
6727#else
6728	kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
6729#endif
6730}
6731
6732static void
6733dtrace_action_panic(dtrace_ecb_t *ecb)
6734{
6735	dtrace_probe_t *probe = ecb->dte_probe;
6736
6737	/*
6738	 * It's impossible to be taking action on the NULL probe.
6739	 */
6740	ASSERT(probe != NULL);
6741
6742	if (dtrace_destructive_disallow)
6743		return;
6744
6745	if (dtrace_panicked != NULL)
6746		return;
6747
6748	if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
6749		return;
6750
6751	/*
6752	 * We won the right to panic.  (We want to be sure that only one
6753	 * thread calls panic() from dtrace_probe(), and that panic() is
6754	 * called exactly once.)
6755	 */
6756	dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6757	    probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6758	    probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6759}
6760
6761static void
6762dtrace_action_raise(uint64_t sig)
6763{
6764	if (dtrace_destructive_disallow)
6765		return;
6766
6767	if (sig >= NSIG) {
6768		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6769		return;
6770	}
6771
6772#if defined(sun)
6773	/*
6774	 * raise() has a queue depth of 1 -- we ignore all subsequent
6775	 * invocations of the raise() action.
6776	 */
6777	if (curthread->t_dtrace_sig == 0)
6778		curthread->t_dtrace_sig = (uint8_t)sig;
6779
6780	curthread->t_sig_check = 1;
6781	aston(curthread);
6782#else
6783	struct proc *p = curproc;
6784	PROC_LOCK(p);
6785	kern_psignal(p, sig);
6786	PROC_UNLOCK(p);
6787#endif
6788}
6789
6790static void
6791dtrace_action_stop(void)
6792{
6793	if (dtrace_destructive_disallow)
6794		return;
6795
6796#if defined(sun)
6797	if (!curthread->t_dtrace_stop) {
6798		curthread->t_dtrace_stop = 1;
6799		curthread->t_sig_check = 1;
6800		aston(curthread);
6801	}
6802#else
6803	struct proc *p = curproc;
6804	PROC_LOCK(p);
6805	kern_psignal(p, SIGSTOP);
6806	PROC_UNLOCK(p);
6807#endif
6808}
6809
6810static void
6811dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6812{
6813	hrtime_t now;
6814	volatile uint16_t *flags;
6815#if defined(sun)
6816	cpu_t *cpu = CPU;
6817#else
6818	cpu_t *cpu = &solaris_cpu[curcpu];
6819#endif
6820
6821	if (dtrace_destructive_disallow)
6822		return;
6823
6824	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6825
6826	now = dtrace_gethrtime();
6827
6828	if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6829		/*
6830		 * We need to advance the mark to the current time.
6831		 */
6832		cpu->cpu_dtrace_chillmark = now;
6833		cpu->cpu_dtrace_chilled = 0;
6834	}
6835
6836	/*
6837	 * Now check to see if the requested chill time would take us over
6838	 * the maximum amount of time allowed in the chill interval.  (Or
6839	 * worse, if the calculation itself induces overflow.)
6840	 */
6841	if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6842	    cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6843		*flags |= CPU_DTRACE_ILLOP;
6844		return;
6845	}
6846
6847	while (dtrace_gethrtime() - now < val)
6848		continue;
6849
6850	/*
6851	 * Normally, we assure that the value of the variable "timestamp" does
6852	 * not change within an ECB.  The presence of chill() represents an
6853	 * exception to this rule, however.
6854	 */
6855	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6856	cpu->cpu_dtrace_chilled += val;
6857}
6858
6859static void
6860dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6861    uint64_t *buf, uint64_t arg)
6862{
6863	int nframes = DTRACE_USTACK_NFRAMES(arg);
6864	int strsize = DTRACE_USTACK_STRSIZE(arg);
6865	uint64_t *pcs = &buf[1], *fps;
6866	char *str = (char *)&pcs[nframes];
6867	int size, offs = 0, i, j;
6868	uintptr_t old = mstate->dtms_scratch_ptr, saved;
6869	uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
6870	char *sym;
6871
6872	/*
6873	 * Should be taking a faster path if string space has not been
6874	 * We should be taking a faster path if string space has not been
6875	 */
6876	ASSERT(strsize != 0);
6877
6878	/*
6879	 * We will first allocate some temporary space for the frame pointers.
6880	 */
6881	fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6882	size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6883	    (nframes * sizeof (uint64_t));
6884
6885	if (!DTRACE_INSCRATCH(mstate, size)) {
6886		/*
6887		 * Not enough room for our frame pointers -- need to indicate
6888		 * that we ran out of scratch space.
6889		 */
6890		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6891		return;
6892	}
6893
6894	mstate->dtms_scratch_ptr += size;
6895	saved = mstate->dtms_scratch_ptr;
6896
6897	/*
6898	 * Now get a stack with both program counters and frame pointers.
6899	 */
6900	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6901	dtrace_getufpstack(buf, fps, nframes + 1);
6902	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6903
6904	/*
6905	 * If that faulted, we're cooked.
6906	 */
6907	if (*flags & CPU_DTRACE_FAULT)
6908		goto out;
6909
6910	/*
6911	 * Now we want to walk up the stack, calling the USTACK helper.  For
6912	 * each iteration, we restore the scratch pointer.
6913	 */
6914	for (i = 0; i < nframes; i++) {
6915		mstate->dtms_scratch_ptr = saved;
6916
6917		if (offs >= strsize)
6918			break;
6919
6920		sym = (char *)(uintptr_t)dtrace_helper(
6921		    DTRACE_HELPER_ACTION_USTACK,
6922		    mstate, state, pcs[i], fps[i]);
6923
6924		/*
6925		 * If we faulted while running the helper, we're going to
6926		 * clear the fault and null out the corresponding string.
6927		 */
6928		if (*flags & CPU_DTRACE_FAULT) {
6929			*flags &= ~CPU_DTRACE_FAULT;
6930			str[offs++] = '\0';
6931			continue;
6932		}
6933
6934		if (sym == NULL) {
6935			str[offs++] = '\0';
6936			continue;
6937		}
6938
6939		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6940
6941		/*
6942		 * Now copy in the string that the helper returned to us.
6943		 */
6944		for (j = 0; offs + j < strsize; j++) {
6945			if ((str[offs + j] = sym[j]) == '\0')
6946				break;
6947		}
6948
6949		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6950
6951		offs += j + 1;
6952	}
6953
6954	if (offs >= strsize) {
6955		/*
6956		 * If we didn't have room for all of the strings, we don't
6957		 * abort processing -- this needn't be a fatal error -- but we
6958		 * still want to increment a counter (dts_stkstroverflows) to
6959		 * allow this condition to be warned about.  (If this is from
6960		 * a jstack() action, it is easily tuned via jstackstrsize.)
6961		 */
6962		dtrace_error(&state->dts_stkstroverflows);
6963	}
6964
6965	while (offs < strsize)
6966		str[offs++] = '\0';
6967
6968out:
6969	mstate->dtms_scratch_ptr = old;
6970}
6971
6972static void
6973dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6974    size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6975{
6976	volatile uint16_t *flags;
6977	uint64_t val = *valp;
6978	size_t valoffs = *valoffsp;
6979
6980	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6981	ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6982
6983	/*
6984	 * If this is a string, we're going to only load until we find the zero
6985	 * If this is a string, we're only going to load until we find the zero
6986	 */
6987	if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6988		char c = '\0' + 1;
6989		size_t s;
6990
6991		for (s = 0; s < size; s++) {
6992			if (c != '\0' && dtkind == DIF_TF_BYREF) {
6993				c = dtrace_load8(val++);
6994			} else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6995				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6996				c = dtrace_fuword8((void *)(uintptr_t)val++);
6997				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6998				if (*flags & CPU_DTRACE_FAULT)
6999					break;
7000			}
7001
7002			DTRACE_STORE(uint8_t, tomax, valoffs++, c);
7003
7004			if (c == '\0' && intuple)
7005				break;
7006		}
7007	} else {
7008		uint8_t c;
7009		while (valoffs < end) {
7010			if (dtkind == DIF_TF_BYREF) {
7011				c = dtrace_load8(val++);
7012			} else if (dtkind == DIF_TF_BYUREF) {
7013				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7014				c = dtrace_fuword8((void *)(uintptr_t)val++);
7015				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7016				if (*flags & CPU_DTRACE_FAULT)
7017					break;
7018			}
7019
7020			DTRACE_STORE(uint8_t, tomax,
7021			    valoffs++, c);
7022		}
7023	}
7024
7025	*valp = val;
7026	*valoffsp = valoffs;
7027}
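/*
 * For string values the loop above NUL-pads without reading past the
 * terminator: copying "ab" with size 4 stores 'a', 'b', '\0', '\0', while
 * an in-tuple copy breaks at the first '\0' instead of padding out to the
 * full size.
 */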
7028
7029/*
7030 * If you're looking for the epicenter of DTrace, you just found it.  This
7031 * is the function called by the provider to fire a probe -- from which all
7032 * subsequent probe-context DTrace activity emanates.
7033 */
7034void
7035dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
7036    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
7037{
7038	processorid_t cpuid;
7039	dtrace_icookie_t cookie;
7040	dtrace_probe_t *probe;
7041	dtrace_mstate_t mstate;
7042	dtrace_ecb_t *ecb;
7043	dtrace_action_t *act;
7044	intptr_t offs;
7045	size_t size;
7046	int vtime, onintr;
7047	volatile uint16_t *flags;
7048	hrtime_t now;
7049
7050	if (panicstr != NULL)
7051		return;
7052
7053#if defined(sun)
7054	/*
7055	 * Kick out immediately if this CPU is still being born (in which case
7056	 * curthread will be set to -1) or the current thread can't allow
7057	 * probes in its current context.
7058	 */
7059	if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
7060		return;
7061#endif
7062
7063	cookie = dtrace_interrupt_disable();
7064	probe = dtrace_probes[id - 1];
7065	cpuid = curcpu;
7066	onintr = CPU_ON_INTR(CPU);
7067
7068	if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
7069	    probe->dtpr_predcache == curthread->t_predcache) {
7070		/*
7071		 * We have hit in the predicate cache; we know that
7072		 * this predicate would evaluate to false.
7073		 */
7074		dtrace_interrupt_enable(cookie);
7075		return;
7076	}
7077
7078#if defined(sun)
7079	if (panic_quiesce) {
7080#else
7081	if (panicstr != NULL) {
7082#endif
7083		/*
7084		 * We don't trace anything if we're panicking.
7085		 */
7086		dtrace_interrupt_enable(cookie);
7087		return;
7088	}
7089
7090	now = dtrace_gethrtime();
7091	vtime = dtrace_vtime_references != 0;
7092
7093	if (vtime && curthread->t_dtrace_start)
7094		curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
7095
7096	mstate.dtms_difo = NULL;
7097	mstate.dtms_probe = probe;
7098	mstate.dtms_strtok = 0;
7099	mstate.dtms_arg[0] = arg0;
7100	mstate.dtms_arg[1] = arg1;
7101	mstate.dtms_arg[2] = arg2;
7102	mstate.dtms_arg[3] = arg3;
7103	mstate.dtms_arg[4] = arg4;
7104
7105	flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7106
7107	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7108		dtrace_predicate_t *pred = ecb->dte_predicate;
7109		dtrace_state_t *state = ecb->dte_state;
7110		dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7111		dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7112		dtrace_vstate_t *vstate = &state->dts_vstate;
7113		dtrace_provider_t *prov = probe->dtpr_provider;
7114		uint64_t tracememsize = 0;
7115		int committed = 0;
7116		caddr_t tomax;
7117
7118		/*
7119		 * A little subtlety with the following (seemingly innocuous)
7120		 * declaration of the automatic 'val':  by looking at the
7121		 * code, you might think that it could be declared in the
7122		 * action processing loop, below.  (That is, it's only used in
7123		 * the action processing loop.)  However, it must be declared
7124		 * out of that scope because in the case of DIF expression
7125		 * arguments to aggregating actions, one iteration of the
7126		 * action loop will use the last iteration's value.
7127		 */
7128		uint64_t val = 0;
7129
7130		mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7131		mstate.dtms_getf = NULL;
7132
7133		*flags &= ~CPU_DTRACE_ERROR;
7134
7135		if (prov == dtrace_provider) {
7136			/*
7137			 * If dtrace itself is the provider of this probe,
7138			 * we're only going to continue processing the ECB if
7139			 * arg0 (the dtrace_state_t) is equal to the ECB's
7140			 * creating state.  (This prevents disjoint consumers
7141			 * from seeing one another's metaprobes.)
7142			 */
7143			if (arg0 != (uint64_t)(uintptr_t)state)
7144				continue;
7145		}
7146
7147		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7148			/*
7149			 * We're not currently active.  If our provider isn't
7150			 * the dtrace pseudo provider, we're not interested.
7151			 */
7152			if (prov != dtrace_provider)
7153				continue;
7154
7155			/*
7156			 * Now we must further check if we are in the BEGIN
7157			 * probe.  If we are, we will only continue processing
7158			 * if we're still in WARMUP -- if one BEGIN enabling
7159			 * has invoked the exit() action, we don't want to
7160			 * evaluate subsequent BEGIN enablings.
7161			 */
7162			if (probe->dtpr_id == dtrace_probeid_begin &&
7163			    state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7164				ASSERT(state->dts_activity ==
7165				    DTRACE_ACTIVITY_DRAINING);
7166				continue;
7167			}
7168		}
7169
7170		if (ecb->dte_cond) {
7171			/*
7172			 * If the dte_cond bits indicate that this
7173			 * consumer is only allowed to see user-mode firings
7174			 * of this probe, call the provider's dtps_usermode()
7175			 * entry point to check that the probe was fired
7176			 * while in a user context. Skip this ECB if that's
7177			 * not the case.
7178			 */
7179			if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7180			    prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7181			    probe->dtpr_id, probe->dtpr_arg) == 0)
7182				continue;
7183
7184#if defined(sun)
7185			/*
7186			 * This is more subtle than it looks. We have to be
7187			 * absolutely certain that CRED() isn't going to
7188			 * change out from under us so it's only legit to
7189			 * examine that structure if we're in constrained
7190			 * situations. Currently, the only time we'll do this
7191			 * check is if a non-super-user has enabled the
7192			 * profile or syscall providers -- providers that
7193			 * allow visibility of all processes. For the
7194			 * profile case, the check above will ensure that
7195			 * we're examining a user context.
7196			 */
7197			if (ecb->dte_cond & DTRACE_COND_OWNER) {
7198				cred_t *cr;
7199				cred_t *s_cr =
7200				    ecb->dte_state->dts_cred.dcr_cred;
7201				proc_t *proc;
7202
7203				ASSERT(s_cr != NULL);
7204
7205				if ((cr = CRED()) == NULL ||
7206				    s_cr->cr_uid != cr->cr_uid ||
7207				    s_cr->cr_uid != cr->cr_ruid ||
7208				    s_cr->cr_uid != cr->cr_suid ||
7209				    s_cr->cr_gid != cr->cr_gid ||
7210				    s_cr->cr_gid != cr->cr_rgid ||
7211				    s_cr->cr_gid != cr->cr_sgid ||
7212				    (proc = ttoproc(curthread)) == NULL ||
7213				    (proc->p_flag & SNOCD))
7214					continue;
7215			}
7216
7217			if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7218				cred_t *cr;
7219				cred_t *s_cr =
7220				    ecb->dte_state->dts_cred.dcr_cred;
7221
7222				ASSERT(s_cr != NULL);
7223
7224				if ((cr = CRED()) == NULL ||
7225				    s_cr->cr_zone->zone_id !=
7226				    cr->cr_zone->zone_id)
7227					continue;
7228			}
7229#endif
7230		}
7231
7232		if (now - state->dts_alive > dtrace_deadman_timeout) {
7233			/*
7234			 * We seem to be dead.  Unless we (a) have kernel
7235			 * destructive permissions, (b) have explicitly enabled
7236			 * destructive actions, and (c) destructive actions have
7237			 * not been disabled, we're going to transition into
7238			 * the KILLED state, from which no further processing
7239			 * on this state will be performed.
7240			 */
7241			if (!dtrace_priv_kernel_destructive(state) ||
7242			    !state->dts_cred.dcr_destructive ||
7243			    dtrace_destructive_disallow) {
7244				void *activity = &state->dts_activity;
7245				dtrace_activity_t current;
7246
7247				do {
7248					current = state->dts_activity;
7249				} while (dtrace_cas32(activity, current,
7250				    DTRACE_ACTIVITY_KILLED) != current);
7251
7252				continue;
7253			}
7254		}
7255
7256		if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7257		    ecb->dte_alignment, state, &mstate)) < 0)
7258			continue;
7259
7260		tomax = buf->dtb_tomax;
7261		ASSERT(tomax != NULL);
7262
7263		if (ecb->dte_size != 0) {
7264			dtrace_rechdr_t dtrh;
7265			if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7266				mstate.dtms_timestamp = dtrace_gethrtime();
7267				mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7268			}
7269			ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
7270			dtrh.dtrh_epid = ecb->dte_epid;
7271			DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
7272			    mstate.dtms_timestamp);
7273			*((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
7274		}
7275
7276		mstate.dtms_epid = ecb->dte_epid;
7277		mstate.dtms_present |= DTRACE_MSTATE_EPID;
7278
7279		if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7280			mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7281		else
7282			mstate.dtms_access = 0;
7283
7284		if (pred != NULL) {
7285			dtrace_difo_t *dp = pred->dtp_difo;
7286			int rval;
7287
7288			rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7289
7290			if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7291				dtrace_cacheid_t cid = probe->dtpr_predcache;
7292
7293				if (cid != DTRACE_CACHEIDNONE && !onintr) {
7294					/*
7295					 * Update the predicate cache...
7296					 */
7297					ASSERT(cid == pred->dtp_cacheid);
7298					curthread->t_predcache = cid;
7299				}
7300
7301				continue;
7302			}
7303		}
7304
7305		for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7306		    act != NULL; act = act->dta_next) {
7307			size_t valoffs;
7308			dtrace_difo_t *dp;
7309			dtrace_recdesc_t *rec = &act->dta_rec;
7310
7311			size = rec->dtrd_size;
7312			valoffs = offs + rec->dtrd_offset;
7313
7314			if (DTRACEACT_ISAGG(act->dta_kind)) {
7315				uint64_t v = 0xbad;
7316				dtrace_aggregation_t *agg;
7317
7318				agg = (dtrace_aggregation_t *)act;
7319
7320				if ((dp = act->dta_difo) != NULL)
7321					v = dtrace_dif_emulate(dp,
7322					    &mstate, vstate, state);
7323
7324				if (*flags & CPU_DTRACE_ERROR)
7325					continue;
7326
7327				/*
7328				 * Note that we always pass the expression
7329				 * value from the previous iteration of the
7330				 * action loop.  This value will only be used
7331				 * if there is an expression argument to the
7332				 * aggregating action, denoted by the
7333				 * dtag_hasarg field.
7334				 */
7335				dtrace_aggregate(agg, buf,
7336				    offs, aggbuf, v, val);
7337				continue;
7338			}
7339
7340			switch (act->dta_kind) {
7341			case DTRACEACT_STOP:
7342				if (dtrace_priv_proc_destructive(state))
7343					dtrace_action_stop();
7344				continue;
7345
7346			case DTRACEACT_BREAKPOINT:
7347				if (dtrace_priv_kernel_destructive(state))
7348					dtrace_action_breakpoint(ecb);
7349				continue;
7350
7351			case DTRACEACT_PANIC:
7352				if (dtrace_priv_kernel_destructive(state))
7353					dtrace_action_panic(ecb);
7354				continue;
7355
7356			case DTRACEACT_STACK:
7357				if (!dtrace_priv_kernel(state))
7358					continue;
7359
7360				dtrace_getpcstack((pc_t *)(tomax + valoffs),
7361				    size / sizeof (pc_t), probe->dtpr_aframes,
7362				    DTRACE_ANCHORED(probe) ? NULL :
7363				    (uint32_t *)arg0);
7364				continue;
7365
7366			case DTRACEACT_JSTACK:
7367			case DTRACEACT_USTACK:
7368				if (!dtrace_priv_proc(state))
7369					continue;
7370
7371				/*
7372				 * See comment in DIF_VAR_PID.
7373				 */
7374				if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7375				    CPU_ON_INTR(CPU)) {
7376					int depth = DTRACE_USTACK_NFRAMES(
7377					    rec->dtrd_arg) + 1;
7378
7379					dtrace_bzero((void *)(tomax + valoffs),
7380					    DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7381					    + depth * sizeof (uint64_t));
7382
7383					continue;
7384				}
7385
7386				if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7387				    curproc->p_dtrace_helpers != NULL) {
7388					/*
7389					 * This is the slow path -- we have
7390					 * allocated string space, and we're
7391					 * getting the stack of a process that
7392					 * has helpers.  Call into a separate
7393					 * routine to perform this processing.
7394					 */
7395					dtrace_action_ustack(&mstate, state,
7396					    (uint64_t *)(tomax + valoffs),
7397					    rec->dtrd_arg);
7398					continue;
7399				}
7400
7401				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7402				dtrace_getupcstack((uint64_t *)
7403				    (tomax + valoffs),
7404				    DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7405				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7406				continue;
7407
7408			default:
7409				break;
7410			}
7411
7412			dp = act->dta_difo;
7413			ASSERT(dp != NULL);
7414
7415			val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7416
7417			if (*flags & CPU_DTRACE_ERROR)
7418				continue;
7419
7420			switch (act->dta_kind) {
7421			case DTRACEACT_SPECULATE: {
7422				dtrace_rechdr_t *dtrh;
7423
7424				ASSERT(buf == &state->dts_buffer[cpuid]);
7425				buf = dtrace_speculation_buffer(state,
7426				    cpuid, val);
7427
7428				if (buf == NULL) {
7429					*flags |= CPU_DTRACE_DROP;
7430					continue;
7431				}
7432
7433				offs = dtrace_buffer_reserve(buf,
7434				    ecb->dte_needed, ecb->dte_alignment,
7435				    state, NULL);
7436
7437				if (offs < 0) {
7438					*flags |= CPU_DTRACE_DROP;
7439					continue;
7440				}
7441
7442				tomax = buf->dtb_tomax;
7443				ASSERT(tomax != NULL);
7444
7445				if (ecb->dte_size == 0)
7446					continue;
7447
7448				ASSERT3U(ecb->dte_size, >=,
7449				    sizeof (dtrace_rechdr_t));
7450				dtrh = ((void *)(tomax + offs));
7451				dtrh->dtrh_epid = ecb->dte_epid;
7452				/*
7453				 * When the speculation is committed, all of
7454				 * the records in the speculative buffer will
7455				 * have their timestamps set to the commit
7456				 * time.  Until then, each is set to a sentinel
7457				 * value, for debuggability.
7458				 */
7459				DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7460				continue;
7461			}
7462
7463			case DTRACEACT_PRINTM: {
7464				/* The DIF returns a 'memref'. */
7465				uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
7466
7467				/* Get the size from the memref. */
7468				size = memref[1];
7469
7470				/*
7471				 * Check if the size exceeds the allocated
7472				 * buffer size.
7473				 */
7474				if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7475					/* Flag a drop! */
7476					*flags |= CPU_DTRACE_DROP;
7477					continue;
7478				}
7479
7480				/* Store the size in the buffer first. */
7481				DTRACE_STORE(uintptr_t, tomax,
7482				    valoffs, size);
7483
7484				/*
7485				 * Offset the buffer address to the start
7486				 * of the data.
7487				 */
7488				valoffs += sizeof(uintptr_t);
7489
7490				/*
7491				 * Reset to the memory address rather than
7492				 * the memref array, then let the BYREF
7493				 * code below do the work to store the
7494				 * memory data in the buffer.
7495				 */
7496				val = memref[0];
7497				break;
7498			}
7499
7500			case DTRACEACT_PRINTT: {
7501				/* The DIF returns a 'typeref'. */
7502				uintptr_t *typeref = (uintptr_t *)(uintptr_t) val;
7503				char c = '\0' + 1;
7504				size_t s;
7505
7506				/*
7507				 * Get the type string length and round it
7508				 * up so that the data that follows is
7509				 * aligned for easy access.
7510				 */
7511				size_t typs = strlen((char *) typeref[2]) + 1;
7512				typs = roundup(typs,  sizeof(uintptr_t));
7513				typs = roundup(typs, sizeof(uintptr_t));
7514				/*
7515				 * Get the size from the typeref using the
7516				 * number of elements and the type size.
7517				 */
7518				size = typeref[1] * typeref[3];
7519
7520				/*
7521				 * Check if the size exceeds the allocated
7522				 * buffer size.
7523				 */
7524				if (size + typs + 2 * sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7525					/* Flag a drop! */
7526					*flags |= CPU_DTRACE_DROP;
7527					continue;
7528				}
7529
7530				/* Store the size in the buffer first. */
7531				DTRACE_STORE(uintptr_t, tomax,
7532				    valoffs, size);
7533				valoffs += sizeof(uintptr_t);
7534
7535				/* Store the type size in the buffer. */
7536				DTRACE_STORE(uintptr_t, tomax,
7537				    valoffs, typeref[3]);
7538				valoffs += sizeof(uintptr_t);
7539
7540				val = typeref[2];
7541
7542				for (s = 0; s < typs; s++) {
7543					if (c != '\0')
7544						c = dtrace_load8(val++);
7545
7546					DTRACE_STORE(uint8_t, tomax,
7547					    valoffs++, c);
7548				}
7549
7550				/*
7551				 * Reset to the memory address rather than
7552				 * the typeref array, then let the BYREF
7553				 * code below do the work to store the
7554				 * memory data in the buffer.
7555				 */
7556				val = typeref[0];
7557				break;
7558			}
7559
7560			case DTRACEACT_CHILL:
7561				if (dtrace_priv_kernel_destructive(state))
7562					dtrace_action_chill(&mstate, val);
7563				continue;
7564
7565			case DTRACEACT_RAISE:
7566				if (dtrace_priv_proc_destructive(state))
7567					dtrace_action_raise(val);
7568				continue;
7569
7570			case DTRACEACT_COMMIT:
7571				ASSERT(!committed);
7572
7573				/*
7574				 * We need to commit our buffer state.
7575				 */
7576				if (ecb->dte_size)
7577					buf->dtb_offset = offs + ecb->dte_size;
7578				buf = &state->dts_buffer[cpuid];
7579				dtrace_speculation_commit(state, cpuid, val);
7580				committed = 1;
7581				continue;
7582
7583			case DTRACEACT_DISCARD:
7584				dtrace_speculation_discard(state, cpuid, val);
7585				continue;
7586
7587			case DTRACEACT_DIFEXPR:
7588			case DTRACEACT_LIBACT:
7589			case DTRACEACT_PRINTF:
7590			case DTRACEACT_PRINTA:
7591			case DTRACEACT_SYSTEM:
7592			case DTRACEACT_FREOPEN:
7593			case DTRACEACT_TRACEMEM:
7594				break;
7595
7596			case DTRACEACT_TRACEMEM_DYNSIZE:
7597				tracememsize = val;
7598				break;
7599
7600			case DTRACEACT_SYM:
7601			case DTRACEACT_MOD:
7602				if (!dtrace_priv_kernel(state))
7603					continue;
7604				break;
7605
7606			case DTRACEACT_USYM:
7607			case DTRACEACT_UMOD:
7608			case DTRACEACT_UADDR: {
7609#if defined(sun)
7610				struct pid *pid = curthread->t_procp->p_pidp;
7611#endif
7612
7613				if (!dtrace_priv_proc(state))
7614					continue;
7615
7616				DTRACE_STORE(uint64_t, tomax,
7617#if defined(sun)
7618				    valoffs, (uint64_t)pid->pid_id);
7619#else
7620				    valoffs, (uint64_t) curproc->p_pid);
7621#endif
7622				DTRACE_STORE(uint64_t, tomax,
7623				    valoffs + sizeof (uint64_t), val);
7624
7625				continue;
7626			}
7627
7628			case DTRACEACT_EXIT: {
7629				/*
7630				 * For the exit action, we are going to attempt
7631				 * to atomically set our activity to be
7632				 * draining.  If this fails (either because
7633				 * another CPU has beat us to the exit action,
7634				 * or because our current activity is something
7635				 * other than ACTIVE or WARMUP), we will
7636				 * continue.  This assures that the exit action
7637				 * can be successfully recorded at most once
7638				 * when we're in the ACTIVE state.  If we're
7639				 * encountering the exit() action while in
7640				 * COOLDOWN, however, we want to honor the new
7641				 * status code.  (We know that we're the only
7642				 * thread in COOLDOWN, so there is no race.)
7643				 */
7644				void *activity = &state->dts_activity;
7645				dtrace_activity_t current = state->dts_activity;
7646
7647				if (current == DTRACE_ACTIVITY_COOLDOWN)
7648					break;
7649
7650				if (current != DTRACE_ACTIVITY_WARMUP)
7651					current = DTRACE_ACTIVITY_ACTIVE;
7652
7653				if (dtrace_cas32(activity, current,
7654				    DTRACE_ACTIVITY_DRAINING) != current) {
7655					*flags |= CPU_DTRACE_DROP;
7656					continue;
7657				}
7658
7659				break;
7660			}
7661
7662			default:
7663				ASSERT(0);
7664			}
7665
7666			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
7667			    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
7668				uintptr_t end = valoffs + size;
7669
7670				if (tracememsize != 0 &&
7671				    valoffs + tracememsize < end) {
7672					end = valoffs + tracememsize;
7673					tracememsize = 0;
7674				}
7675
7676				if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7677				    !dtrace_vcanload((void *)(uintptr_t)val,
7678				    &dp->dtdo_rtype, &mstate, vstate))
7679					continue;
7680
7681				dtrace_store_by_ref(dp, tomax, size, &valoffs,
7682				    &val, end, act->dta_intuple,
7683				    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7684				    DIF_TF_BYREF: DIF_TF_BYUREF);
7685				continue;
7686			}
7687
7688			switch (size) {
7689			case 0:
7690				break;
7691
7692			case sizeof (uint8_t):
7693				DTRACE_STORE(uint8_t, tomax, valoffs, val);
7694				break;
7695			case sizeof (uint16_t):
7696				DTRACE_STORE(uint16_t, tomax, valoffs, val);
7697				break;
7698			case sizeof (uint32_t):
7699				DTRACE_STORE(uint32_t, tomax, valoffs, val);
7700				break;
7701			case sizeof (uint64_t):
7702				DTRACE_STORE(uint64_t, tomax, valoffs, val);
7703				break;
7704			default:
7705				/*
7706				 * Any other size should have been returned by
7707				 * reference, not by value.
7708				 */
7709				ASSERT(0);
7710				break;
7711			}
7712		}
7713
7714		if (*flags & CPU_DTRACE_DROP)
7715			continue;
7716
7717		if (*flags & CPU_DTRACE_FAULT) {
7718			int ndx;
7719			dtrace_action_t *err;
7720
7721			buf->dtb_errors++;
7722
7723			if (probe->dtpr_id == dtrace_probeid_error) {
7724				/*
7725				 * There's nothing we can do -- we had an
7726				 * error on the error probe.  We bump an
7727				 * error counter to at least indicate that
7728				 * this condition happened.
7729				 */
7730				dtrace_error(&state->dts_dblerrors);
7731				continue;
7732			}
7733
7734			if (vtime) {
7735				/*
7736				 * Before recursing on dtrace_probe(), we
7737				 * need to explicitly clear out our start
7738				 * time to prevent it from being accumulated
7739				 * into t_dtrace_vtime.
7740				 */
7741				curthread->t_dtrace_start = 0;
7742			}
7743
7744			/*
7745			 * Iterate over the actions to figure out which action
7746			 * we were processing when we experienced the error.
7747			 * Note that act points _past_ the faulting action; if
7748			 * act is ecb->dte_action, the fault was in the
7749			 * predicate, if it's ecb->dte_action->dta_next it's
7750			 * in action #1, and so on.
7751			 */
7752			for (err = ecb->dte_action, ndx = 0;
7753			    err != act; err = err->dta_next, ndx++)
7754				continue;
7755
7756			dtrace_probe_error(state, ecb->dte_epid, ndx,
7757			    (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7758			    mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7759			    cpu_core[cpuid].cpuc_dtrace_illval);
7760
7761			continue;
7762		}
7763
7764		if (!committed)
7765			buf->dtb_offset = offs + ecb->dte_size;
7766	}
7767
7768	if (vtime)
7769		curthread->t_dtrace_start = dtrace_gethrtime();
7770
7771	dtrace_interrupt_enable(cookie);
7772}
7773
7774/*
7775 * DTrace Probe Hashing Functions
7776 *
7777 * The functions in this section (and indeed, the functions in remaining
7778 * sections) are not _called_ from probe context.  (Any exceptions to this are
7779 * marked with a "Note:".)  Rather, they are called from elsewhere in the
7780 * DTrace framework to look up probes in, add probes to, and remove probes
7781 * from the DTrace probe hashes.  (Each probe is hashed by each element of
7782 * the probe tuple -- allowing for fast lookups, regardless of what was
7783 * specified.)
7784 */
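/*
 * Hash a probe-tuple string.  This is essentially the classic PJW/ELF
 * string hash: shift in four bits per character and fold any bits that
 * reach the top nibble back into the low-order bits.
 */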
7785static uint_t
7786dtrace_hash_str(const char *p)
7787{
7788	unsigned int g;
7789	uint_t hval = 0;
7790
7791	while (*p) {
7792		hval = (hval << 4) + *p++;
7793		if ((g = (hval & 0xf0000000)) != 0)
7794			hval ^= g >> 24;
7795		hval &= ~g;
7796	}
7797	return (hval);
7798}
7799
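/*
 * Create a probe hash.  The hash is intrusive: stroffs is the offset
 * within dtrace_probe_t of the string to hash on, and nextoffs/prevoffs
 * locate the chain linkage for that key.  This is what lets a single
 * implementation serve the by-module, by-function and by-name hashes;
 * as a sketch, the by-module hash is created elsewhere in this file
 * along these lines:
 *
 *	dtrace_bymod = dtrace_hash_create(
 *	    offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));
 */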
7800static dtrace_hash_t *
7801dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
7802{
7803	dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7804
7805	hash->dth_stroffs = stroffs;
7806	hash->dth_nextoffs = nextoffs;
7807	hash->dth_prevoffs = prevoffs;
7808
7809	hash->dth_size = 1;
7810	hash->dth_mask = hash->dth_size - 1;
7811
7812	hash->dth_tab = kmem_zalloc(hash->dth_size *
7813	    sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7814
7815	return (hash);
7816}
7817
7818static void
7819dtrace_hash_destroy(dtrace_hash_t *hash)
7820{
7821#ifdef DEBUG
7822	int i;
7823
7824	for (i = 0; i < hash->dth_size; i++)
7825		ASSERT(hash->dth_tab[i] == NULL);
7826#endif
7827
7828	kmem_free(hash->dth_tab,
7829	    hash->dth_size * sizeof (dtrace_hashbucket_t *));
7830	kmem_free(hash, sizeof (dtrace_hash_t));
7831}
7832
7833static void
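/*
 * Double the hash table.  Each bucket chains probes that share a key
 * string, so whole buckets are simply relinked into their new slots;
 * nothing within a bucket needs to be rehashed.
 */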
7834dtrace_hash_resize(dtrace_hash_t *hash)
7835{
7836	int size = hash->dth_size, i, ndx;
7837	int new_size = hash->dth_size << 1;
7838	int new_mask = new_size - 1;
7839	dtrace_hashbucket_t **new_tab, *bucket, *next;
7840
7841	ASSERT((new_size & new_mask) == 0);
7842
7843	new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7844
7845	for (i = 0; i < size; i++) {
7846		for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7847			dtrace_probe_t *probe = bucket->dthb_chain;
7848
7849			ASSERT(probe != NULL);
7850			ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
7851
7852			next = bucket->dthb_next;
7853			bucket->dthb_next = new_tab[ndx];
7854			new_tab[ndx] = bucket;
7855		}
7856	}
7857
7858	kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7859	hash->dth_tab = new_tab;
7860	hash->dth_size = new_size;
7861	hash->dth_mask = new_mask;
7862}
7863
7864static void
7865dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
7866{
7867	int hashval = DTRACE_HASHSTR(hash, new);
7868	int ndx = hashval & hash->dth_mask;
7869	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7870	dtrace_probe_t **nextp, **prevp;
7871
7872	for (; bucket != NULL; bucket = bucket->dthb_next) {
7873		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7874			goto add;
7875	}
7876
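	/*
	 * No bucket matches this probe's key.  If the number of buckets
	 * is now more than twice the table size, double the table before
	 * adding; the recursive call re-derives the slot from the new
	 * mask.
	 */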
7877	if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7878		dtrace_hash_resize(hash);
7879		dtrace_hash_add(hash, new);
7880		return;
7881	}
7882
7883	bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7884	bucket->dthb_next = hash->dth_tab[ndx];
7885	hash->dth_tab[ndx] = bucket;
7886	hash->dth_nbuckets++;
7887
7888add:
7889	nextp = DTRACE_HASHNEXT(hash, new);
7890	ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7891	*nextp = bucket->dthb_chain;
7892
7893	if (bucket->dthb_chain != NULL) {
7894		prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7895		ASSERT(*prevp == NULL);
7896		*prevp = new;
7897	}
7898
7899	bucket->dthb_chain = new;
7900	bucket->dthb_len++;
7901}
7902
7903static dtrace_probe_t *
7904dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
7905{
7906	int hashval = DTRACE_HASHSTR(hash, template);
7907	int ndx = hashval & hash->dth_mask;
7908	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7909
7910	for (; bucket != NULL; bucket = bucket->dthb_next) {
7911		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7912			return (bucket->dthb_chain);
7913	}
7914
7915	return (NULL);
7916}
7917
7918static int
7919dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
7920{
7921	int hashval = DTRACE_HASHSTR(hash, template);
7922	int ndx = hashval & hash->dth_mask;
7923	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7924
7925	for (; bucket != NULL; bucket = bucket->dthb_next) {
7926		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7927			return (bucket->dthb_len);
7928	}
7929
7930	return (0);
7931}
7932
7933static void
7934dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
7935{
7936	int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
7937	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7938
7939	dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
7940	dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
7941
7942	/*
7943	 * Find the bucket that we're removing this probe from.
7944	 */
7945	for (; bucket != NULL; bucket = bucket->dthb_next) {
7946		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
7947			break;
7948	}
7949
7950	ASSERT(bucket != NULL);
7951
7952	if (*prevp == NULL) {
7953		if (*nextp == NULL) {
7954			/*
7955			 * The removed probe was the only probe on this
7956			 * bucket; we need to remove the bucket.
7957			 */
7958			dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7959
7960			ASSERT(bucket->dthb_chain == probe);
7961			ASSERT(b != NULL);
7962
7963			if (b == bucket) {
7964				hash->dth_tab[ndx] = bucket->dthb_next;
7965			} else {
7966				while (b->dthb_next != bucket)
7967					b = b->dthb_next;
7968				b->dthb_next = bucket->dthb_next;
7969			}
7970
7971			ASSERT(hash->dth_nbuckets > 0);
7972			hash->dth_nbuckets--;
7973			kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7974			return;
7975		}
7976
7977		bucket->dthb_chain = *nextp;
7978	} else {
7979		*(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7980	}
7981
7982	if (*nextp != NULL)
7983		*(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7984}
7985
7986/*
7987 * DTrace Utility Functions
7988 *
7989 * These are random utility functions that are _not_ called from probe context.
7990 */
7991static int
7992dtrace_badattr(const dtrace_attribute_t *a)
7993{
7994	return (a->dtat_name > DTRACE_STABILITY_MAX ||
7995	    a->dtat_data > DTRACE_STABILITY_MAX ||
7996	    a->dtat_class > DTRACE_CLASS_MAX);
7997}
7998
7999/*
8000 * Return a duplicate of a string.  If the specified string is NULL,
8001 * this function returns a zero-length string.
8002 */
8003static char *
8004dtrace_strdup(const char *str)
8005{
8006	char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
8007
8008	if (str != NULL)
8009		(void) strcpy(new, str);
8010
8011	return (new);
8012}
8013
8014#define	DTRACE_ISALPHA(c)	\
8015	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
8016
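/*
 * A name is rejected unless it is empty or matches
 * [A-Za-z_.-][A-Za-z0-9_.`-]* (the backquote appears in scoped
 * identifiers).
 */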
8017static int
8018dtrace_badname(const char *s)
8019{
8020	char c;
8021
8022	if (s == NULL || (c = *s++) == '\0')
8023		return (0);
8024
8025	if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
8026		return (1);
8027
8028	while ((c = *s++) != '\0') {
8029		if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
8030		    c != '-' && c != '_' && c != '.' && c != '`')
8031			return (1);
8032	}
8033
8034	return (0);
8035}
8036
8037static void
8038dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
8039{
8040	uint32_t priv;
8041
8042#if defined(sun)
8043	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
8044		/*
8045		 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
8046		 */
8047		priv = DTRACE_PRIV_ALL;
8048	} else {
8049		*uidp = crgetuid(cr);
8050		*zoneidp = crgetzoneid(cr);
8051
8052		priv = 0;
8053		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
8054			priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
8055		else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8056			priv |= DTRACE_PRIV_USER;
8057		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8058			priv |= DTRACE_PRIV_PROC;
8059		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8060			priv |= DTRACE_PRIV_OWNER;
8061		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8062			priv |= DTRACE_PRIV_ZONEOWNER;
8063	}
8064#else
8065	priv = DTRACE_PRIV_ALL;
8066#endif
8067
8068	*privp = priv;
8069}
8070
8071#ifdef DTRACE_ERRDEBUG
8072static void
8073dtrace_errdebug(const char *str)
8074{
8075	int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
8076	int occupied = 0;
8077
8078	mutex_enter(&dtrace_errlock);
8079	dtrace_errlast = str;
8080	dtrace_errthread = curthread;
8081
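	/*
	 * The error hash is open-addressed with linear probing and is
	 * keyed on the message pointer itself; the occupied counter
	 * bounds the scan so that a full table panics rather than
	 * spinning forever.
	 */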
8082	while (occupied++ < DTRACE_ERRHASHSZ) {
8083		if (dtrace_errhash[hval].dter_msg == str) {
8084			dtrace_errhash[hval].dter_count++;
8085			goto out;
8086		}
8087
8088		if (dtrace_errhash[hval].dter_msg != NULL) {
8089			hval = (hval + 1) % DTRACE_ERRHASHSZ;
8090			continue;
8091		}
8092
8093		dtrace_errhash[hval].dter_msg = str;
8094		dtrace_errhash[hval].dter_count = 1;
8095		goto out;
8096	}
8097
8098	panic("dtrace: undersized error hash");
8099out:
8100	mutex_exit(&dtrace_errlock);
8101}
8102#endif
8103
8104/*
8105 * DTrace Matching Functions
8106 *
8107 * These functions are used to match groups of probes, given some elements of
8108 * a probe tuple, or some globbed expressions for elements of a probe tuple.
8109 */
8110static int
8111dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8112    zoneid_t zoneid)
8113{
8114	if (priv != DTRACE_PRIV_ALL) {
8115		uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8116		uint32_t match = priv & ppriv;
8117
8118		/*
8119		 * No PRIV_DTRACE_* privileges...
8120		 */
8121		if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8122		    DTRACE_PRIV_KERNEL)) == 0)
8123			return (0);
8124
8125		/*
8126		 * No matching bits, but there were bits to match...
8127		 */
8128		if (match == 0 && ppriv != 0)
8129			return (0);
8130
8131		/*
8132		 * Need to have permissions to the process, but don't...
8133		 */
8134		if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8135		    uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8136			return (0);
8137		}
8138
8139		/*
8140		 * Need to be in the same zone unless we possess the
8141		 * privilege to examine all zones.
8142		 */
8143		if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8144		    zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8145			return (0);
8146		}
8147	}
8148
8149	return (1);
8150}
8151
8152/*
8153 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8154 * consists of input pattern strings and an ops-vector to evaluate them.
8155 * This function returns >0 for match, 0 for no match, and <0 for error.
8156 */
8157static int
8158dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8159    uint32_t priv, uid_t uid, zoneid_t zoneid)
8160{
8161	dtrace_provider_t *pvp = prp->dtpr_provider;
8162	int rv;
8163
8164	if (pvp->dtpv_defunct)
8165		return (0);
8166
8167	if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8168		return (rv);
8169
8170	if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8171		return (rv);
8172
8173	if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8174		return (rv);
8175
8176	if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8177		return (rv);
8178
8179	if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8180		return (0);
8181
8182	return (rv);
8183}
8184
8185/*
8186 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8187 * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
8188 * libc's version, the kernel version only applies to 8-bit ASCII strings.
8189 * In addition, all of the recursion cases except for '*' matching have been
8190 * unwound.  For '*', we still implement recursive evaluation, but a depth
8191 * counter is maintained and matching is aborted if we recurse too deep.
8192 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8193 */
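/*
 * For example (a sketch of the semantics implemented below):
 *
 *	dtrace_match_glob("syscall", "sys*", 0)		returns 1
 *	dtrace_match_glob("read", "r?ad", 0)		returns 1
 *	dtrace_match_glob("entry", "[a-d]*", 0)		returns 0
 */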
8194static int
8195dtrace_match_glob(const char *s, const char *p, int depth)
8196{
8197	const char *olds;
8198	char s1, c;
8199	int gs;
8200
8201	if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8202		return (-1);
8203
8204	if (s == NULL)
8205		s = ""; /* treat NULL as empty string */
8206
8207top:
8208	olds = s;
8209	s1 = *s++;
8210
8211	if (p == NULL)
8212		return (0);
8213
8214	if ((c = *p++) == '\0')
8215		return (s1 == '\0');
8216
8217	switch (c) {
8218	case '[': {
8219		int ok = 0, notflag = 0;
8220		char lc = '\0';
8221
8222		if (s1 == '\0')
8223			return (0);
8224
8225		if (*p == '!') {
8226			notflag = 1;
8227			p++;
8228		}
8229
8230		if ((c = *p++) == '\0')
8231			return (0);
8232
8233		do {
8234			if (c == '-' && lc != '\0' && *p != ']') {
8235				if ((c = *p++) == '\0')
8236					return (0);
8237				if (c == '\\' && (c = *p++) == '\0')
8238					return (0);
8239
8240				if (notflag) {
8241					if (s1 < lc || s1 > c)
8242						ok++;
8243					else
8244						return (0);
8245				} else if (lc <= s1 && s1 <= c)
8246					ok++;
8247
8248			} else if (c == '\\' && (c = *p++) == '\0')
8249				return (0);
8250
8251			lc = c; /* save left-hand 'c' for next iteration */
8252
8253			if (notflag) {
8254				if (s1 != c)
8255					ok++;
8256				else
8257					return (0);
8258			} else if (s1 == c)
8259				ok++;
8260
8261			if ((c = *p++) == '\0')
8262				return (0);
8263
8264		} while (c != ']');
8265
8266		if (ok)
8267			goto top;
8268
8269		return (0);
8270	}
8271
8272	case '\\':
8273		if ((c = *p++) == '\0')
8274			return (0);
8275		/*FALLTHRU*/
8276
8277	default:
8278		if (c != s1)
8279			return (0);
8280		/*FALLTHRU*/
8281
8282	case '?':
8283		if (s1 != '\0')
8284			goto top;
8285		return (0);
8286
8287	case '*':
8288		while (*p == '*')
8289			p++; /* consecutive *'s are identical to a single one */
8290
8291		if (*p == '\0')
8292			return (1);
8293
8294		for (s = olds; *s != '\0'; s++) {
8295			if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
8296				return (gs);
8297		}
8298
8299		return (0);
8300	}
8301}
8302
8303/*ARGSUSED*/
8304static int
8305dtrace_match_string(const char *s, const char *p, int depth)
8306{
8307	return (s != NULL && strcmp(s, p) == 0);
8308}
8309
8310/*ARGSUSED*/
8311static int
8312dtrace_match_nul(const char *s, const char *p, int depth)
8313{
8314	return (1); /* always match the empty pattern */
8315}
8316
8317/*ARGSUSED*/
8318static int
8319dtrace_match_nonzero(const char *s, const char *p, int depth)
8320{
8321	return (s != NULL && s[0] != '\0');
8322}
8323
8324static int
8325dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8326    zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
8327{
8328	dtrace_probe_t template, *probe;
8329	dtrace_hash_t *hash = NULL;
8330	int len, best = INT_MAX, nmatched = 0;
8331	dtrace_id_t i;
8332
8333	ASSERT(MUTEX_HELD(&dtrace_lock));
8334
8335	/*
8336	 * If the probe ID is specified in the key, just lookup by ID and
8337	 * invoke the match callback once if a matching probe is found.
8338	 */
8339	if (pkp->dtpk_id != DTRACE_IDNONE) {
8340		if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8341		    dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8342			(void) (*matched)(probe, arg);
8343			nmatched++;
8344		}
8345		return (nmatched);
8346	}
8347
8348	template.dtpr_mod = (char *)pkp->dtpk_mod;
8349	template.dtpr_func = (char *)pkp->dtpk_func;
8350	template.dtpr_name = (char *)pkp->dtpk_name;
8351
8352	/*
8353	 * We want to find the most distinct of the module name, function
8354	 * name, and name.  So for each one that is not a glob pattern or
8355	 * empty string, we perform a lookup in the corresponding hash and
8356	 * use the hash table with the fewest collisions to do our search.
8357	 */
8358	if (pkp->dtpk_mmatch == &dtrace_match_string &&
8359	    (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8360		best = len;
8361		hash = dtrace_bymod;
8362	}
8363
8364	if (pkp->dtpk_fmatch == &dtrace_match_string &&
8365	    (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8366		best = len;
8367		hash = dtrace_byfunc;
8368	}
8369
8370	if (pkp->dtpk_nmatch == &dtrace_match_string &&
8371	    (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8372		best = len;
8373		hash = dtrace_byname;
8374	}
8375
8376	/*
8377	 * If we did not select a hash table, iterate over every probe and
8378	 * invoke our callback for each one that matches our input probe key.
8379	 */
8380	if (hash == NULL) {
8381		for (i = 0; i < dtrace_nprobes; i++) {
8382			if ((probe = dtrace_probes[i]) == NULL ||
8383			    dtrace_match_probe(probe, pkp, priv, uid,
8384			    zoneid) <= 0)
8385				continue;
8386
8387			nmatched++;
8388
8389			if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8390				break;
8391		}
8392
8393		return (nmatched);
8394	}
8395
8396	/*
8397	 * If we selected a hash table, iterate over each probe of the same key
8398	 * name and invoke the callback for every probe that matches the other
8399	 * attributes of our input probe key.
8400	 */
8401	for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8402	    probe = *(DTRACE_HASHNEXT(hash, probe))) {
8403
8404		if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8405			continue;
8406
8407		nmatched++;
8408
8409		if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8410			break;
8411	}
8412
8413	return (nmatched);
8414}
8415
8416/*
8417 * Return the function pointer dtrace_match_probe() should use to compare the
8418 * specified pattern with a string.  For NULL or empty patterns, we select
8419 * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
8420 * For non-empty non-glob strings, we use dtrace_match_string().
8421 */
8422static dtrace_probekey_f *
8423dtrace_probekey_func(const char *p)
8424{
8425	char c;
8426
8427	if (p == NULL || *p == '\0')
8428		return (&dtrace_match_nul);
8429
8430	while ((c = *p++) != '\0') {
8431		if (c == '[' || c == '?' || c == '*' || c == '\\')
8432			return (&dtrace_match_glob);
8433	}
8434
8435	return (&dtrace_match_string);
8436}
8437
8438/*
8439 * Build a probe comparison key for use with dtrace_match_probe() from the
8440 * given probe description.  By convention, a null key only matches anchored
8441 * probes: if each field is the empty string, reset dtpk_fmatch to
8442 * dtrace_match_nonzero().
8443 */
8444static void
8445dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8446{
8447	pkp->dtpk_prov = pdp->dtpd_provider;
8448	pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8449
8450	pkp->dtpk_mod = pdp->dtpd_mod;
8451	pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
8452
8453	pkp->dtpk_func = pdp->dtpd_func;
8454	pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8455
8456	pkp->dtpk_name = pdp->dtpd_name;
8457	pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8458
8459	pkp->dtpk_id = pdp->dtpd_id;
8460
8461	if (pkp->dtpk_id == DTRACE_IDNONE &&
8462	    pkp->dtpk_pmatch == &dtrace_match_nul &&
8463	    pkp->dtpk_mmatch == &dtrace_match_nul &&
8464	    pkp->dtpk_fmatch == &dtrace_match_nul &&
8465	    pkp->dtpk_nmatch == &dtrace_match_nul)
8466		pkp->dtpk_fmatch = &dtrace_match_nonzero;
8467}
8468
8469/*
8470 * DTrace Provider-to-Framework API Functions
8471 *
8472 * These functions implement much of the Provider-to-Framework API, as
8473 * described in <sys/dtrace.h>.  The parts of the API not in this section are
8474 * the functions in the API for probe management (found below), and
8475 * dtrace_probe() itself (found above).
8476 */
8477
8478/*
8479 * Register the calling provider with the DTrace framework.  This should
8480 * generally be called by DTrace providers in their attach(9E) entry point.
8481 */
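/*
 * A minimal sketch of a call, with hypothetical names (my_attr, my_pops
 * and my_id are illustrative, not part of this file):
 *
 *	static dtrace_provider_id_t my_id;
 *
 *	if (dtrace_register("myprov", &my_attr, DTRACE_PRIV_KERNEL, NULL,
 *	    &my_pops, NULL, &my_id) != 0)
 *		return (DDI_FAILURE);
 */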
8482int
8483dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8484    cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8485{
8486	dtrace_provider_t *provider;
8487
8488	if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8489		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8490		    "arguments", name ? name : "<NULL>");
8491		return (EINVAL);
8492	}
8493
8494	if (name[0] == '\0' || dtrace_badname(name)) {
8495		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8496		    "provider name", name);
8497		return (EINVAL);
8498	}
8499
8500	if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8501	    pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8502	    pops->dtps_destroy == NULL ||
8503	    ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8504		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8505		    "provider ops", name);
8506		return (EINVAL);
8507	}
8508
8509	if (dtrace_badattr(&pap->dtpa_provider) ||
8510	    dtrace_badattr(&pap->dtpa_mod) ||
8511	    dtrace_badattr(&pap->dtpa_func) ||
8512	    dtrace_badattr(&pap->dtpa_name) ||
8513	    dtrace_badattr(&pap->dtpa_args)) {
8514		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8515		    "provider attributes", name);
8516		return (EINVAL);
8517	}
8518
8519	if (priv & ~DTRACE_PRIV_ALL) {
8520		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8521		    "privilege attributes", name);
8522		return (EINVAL);
8523	}
8524
8525	if ((priv & DTRACE_PRIV_KERNEL) &&
8526	    (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8527	    pops->dtps_usermode == NULL) {
8528		cmn_err(CE_WARN, "failed to register provider '%s': need "
8529		    "dtps_usermode() op for given privilege attributes", name);
8530		return (EINVAL);
8531	}
8532
8533	provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8534	provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8535	(void) strcpy(provider->dtpv_name, name);
8536
8537	provider->dtpv_attr = *pap;
8538	provider->dtpv_priv.dtpp_flags = priv;
8539	if (cr != NULL) {
8540		provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8541		provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8542	}
8543	provider->dtpv_pops = *pops;
8544
8545	if (pops->dtps_provide == NULL) {
8546		ASSERT(pops->dtps_provide_module != NULL);
8547		provider->dtpv_pops.dtps_provide =
8548		    (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
8549	}
8550
8551	if (pops->dtps_provide_module == NULL) {
8552		ASSERT(pops->dtps_provide != NULL);
8553		provider->dtpv_pops.dtps_provide_module =
8554		    (void (*)(void *, modctl_t *))dtrace_nullop;
8555	}
8556
8557	if (pops->dtps_suspend == NULL) {
8558		ASSERT(pops->dtps_resume == NULL);
8559		provider->dtpv_pops.dtps_suspend =
8560		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8561		provider->dtpv_pops.dtps_resume =
8562		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8563	}
8564
8565	provider->dtpv_arg = arg;
8566	*idp = (dtrace_provider_id_t)provider;
8567
8568	if (pops == &dtrace_provider_ops) {
8569		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8570		ASSERT(MUTEX_HELD(&dtrace_lock));
8571		ASSERT(dtrace_anon.dta_enabling == NULL);
8572
8573		/*
8574		 * We make sure that the DTrace provider is at the head of
8575		 * the provider chain.
8576		 */
8577		provider->dtpv_next = dtrace_provider;
8578		dtrace_provider = provider;
8579		return (0);
8580	}
8581
8582	mutex_enter(&dtrace_provider_lock);
8583	mutex_enter(&dtrace_lock);
8584
8585	/*
8586	 * If there is at least one provider registered, we'll add this
8587	 * provider after the first provider.
8588	 */
8589	if (dtrace_provider != NULL) {
8590		provider->dtpv_next = dtrace_provider->dtpv_next;
8591		dtrace_provider->dtpv_next = provider;
8592	} else {
8593		dtrace_provider = provider;
8594	}
8595
8596	if (dtrace_retained != NULL) {
8597		dtrace_enabling_provide(provider);
8598
8599		/*
8600		 * Now we need to call dtrace_enabling_matchall() -- which
8601		 * will acquire cpu_lock and dtrace_lock.  We therefore need
8602		 * to drop all of our locks before calling into it...
8603		 */
8604		mutex_exit(&dtrace_lock);
8605		mutex_exit(&dtrace_provider_lock);
8606		dtrace_enabling_matchall();
8607
8608		return (0);
8609	}
8610
8611	mutex_exit(&dtrace_lock);
8612	mutex_exit(&dtrace_provider_lock);
8613
8614	return (0);
8615}
8616
8617/*
8618 * Unregister the specified provider from the DTrace framework.  This should
8619 * generally be called by DTrace providers in their detach(9E) entry point.
8620 */
8621int
8622dtrace_unregister(dtrace_provider_id_t id)
8623{
8624	dtrace_provider_t *old = (dtrace_provider_t *)id;
8625	dtrace_provider_t *prev = NULL;
8626	int i, self = 0, noreap = 0;
8627	dtrace_probe_t *probe, *first = NULL;
8628
8629	if (old->dtpv_pops.dtps_enable ==
8630	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
8631		/*
8632		 * If DTrace itself is the provider, we're called with locks
8633		 * already held.
8634		 */
8635		ASSERT(old == dtrace_provider);
8636#if defined(sun)
8637		ASSERT(dtrace_devi != NULL);
8638#endif
8639		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8640		ASSERT(MUTEX_HELD(&dtrace_lock));
8641		self = 1;
8642
8643		if (dtrace_provider->dtpv_next != NULL) {
8644			/*
8645			 * There's another provider here; return failure.
8646			 */
8647			return (EBUSY);
8648		}
8649	} else {
8650		mutex_enter(&dtrace_provider_lock);
8651#if defined(sun)
8652		mutex_enter(&mod_lock);
8653#endif
8654		mutex_enter(&dtrace_lock);
8655	}
8656
8657	/*
8658	 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8659	 * probes, we refuse to let providers slither away, unless this
8660	 * provider has already been explicitly invalidated.
8661	 */
8662	if (!old->dtpv_defunct &&
8663	    (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8664	    dtrace_anon.dta_state->dts_necbs > 0))) {
8665		if (!self) {
8666			mutex_exit(&dtrace_lock);
8667#if defined(sun)
8668			mutex_exit(&mod_lock);
8669#endif
8670			mutex_exit(&dtrace_provider_lock);
8671		}
8672		return (EBUSY);
8673	}
8674
8675	/*
8676	 * Attempt to destroy the probes associated with this provider.
8677	 */
8678	for (i = 0; i < dtrace_nprobes; i++) {
8679		if ((probe = dtrace_probes[i]) == NULL)
8680			continue;
8681
8682		if (probe->dtpr_provider != old)
8683			continue;
8684
8685		if (probe->dtpr_ecb == NULL)
8686			continue;
8687
8688		/*
8689		 * If we are trying to unregister a defunct provider, and the
8690		 * provider was made defunct within the interval dictated by
8691		 * dtrace_unregister_defunct_reap, we'll (asynchronously)
8692		 * attempt to reap our enablings.  To denote that the provider
8693		 * should reattempt to unregister itself at some point in the
8694		 * future, we will return a differentiable error code (EAGAIN
8695		 * instead of EBUSY) in this case.
8696		 */
8697		if (dtrace_gethrtime() - old->dtpv_defunct >
8698		    dtrace_unregister_defunct_reap)
8699			noreap = 1;
8700
8701		if (!self) {
8702			mutex_exit(&dtrace_lock);
8703#if defined(sun)
8704			mutex_exit(&mod_lock);
8705#endif
8706			mutex_exit(&dtrace_provider_lock);
8707		}
8708
8709		if (noreap)
8710			return (EBUSY);
8711
8712		(void) taskq_dispatch(dtrace_taskq,
8713		    (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
8714
8715		return (EAGAIN);
8716	}
8717
8718	/*
8719	 * All of the probes for this provider are disabled; we can safely
8720	 * remove all of them from their hash chains and from the probe array.
8721	 */
8722	for (i = 0; i < dtrace_nprobes; i++) {
8723		if ((probe = dtrace_probes[i]) == NULL)
8724			continue;
8725
8726		if (probe->dtpr_provider != old)
8727			continue;
8728
8729		dtrace_probes[i] = NULL;
8730
8731		dtrace_hash_remove(dtrace_bymod, probe);
8732		dtrace_hash_remove(dtrace_byfunc, probe);
8733		dtrace_hash_remove(dtrace_byname, probe);
8734
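		/*
		 * Chain the condemned probe onto a local list through its
		 * (no longer needed) dtpr_nextmod linkage; the list is torn
		 * down after the dtrace_sync() below.
		 */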
8735		if (first == NULL) {
8736			first = probe;
8737			probe->dtpr_nextmod = NULL;
8738		} else {
8739			probe->dtpr_nextmod = first;
8740			first = probe;
8741		}
8742	}
8743
8744	/*
8745	 * The provider's probes have been removed from the hash chains and
8746	 * from the probe array.  Now issue a dtrace_sync() to be sure that
8747	 * everyone has cleared out from any probe array processing.
8748	 */
8749	dtrace_sync();
8750
8751	for (probe = first; probe != NULL; probe = first) {
8752		first = probe->dtpr_nextmod;
8753
8754		old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8755		    probe->dtpr_arg);
8756		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8757		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8758		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8759#if defined(sun)
8760		vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8761#else
8762		free_unr(dtrace_arena, probe->dtpr_id);
8763#endif
8764		kmem_free(probe, sizeof (dtrace_probe_t));
8765	}
8766
8767	if ((prev = dtrace_provider) == old) {
8768#if defined(sun)
8769		ASSERT(self || dtrace_devi == NULL);
8770		ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8771#endif
8772		dtrace_provider = old->dtpv_next;
8773	} else {
8774		while (prev != NULL && prev->dtpv_next != old)
8775			prev = prev->dtpv_next;
8776
8777		if (prev == NULL) {
8778			panic("attempt to unregister non-existent "
8779			    "dtrace provider %p\n", (void *)id);
8780		}
8781
8782		prev->dtpv_next = old->dtpv_next;
8783	}
8784
8785	if (!self) {
8786		mutex_exit(&dtrace_lock);
8787#if defined(sun)
8788		mutex_exit(&mod_lock);
8789#endif
8790		mutex_exit(&dtrace_provider_lock);
8791	}
8792
8793	kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
8794	kmem_free(old, sizeof (dtrace_provider_t));
8795
8796	return (0);
8797}
8798
8799/*
8800 * Invalidate the specified provider.  All subsequent probe lookups for the
8801 * specified provider will fail, but its probes will not be removed.
8802 */
8803void
8804dtrace_invalidate(dtrace_provider_id_t id)
8805{
8806	dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8807
8808	ASSERT(pvp->dtpv_pops.dtps_enable !=
8809	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8810
8811	mutex_enter(&dtrace_provider_lock);
8812	mutex_enter(&dtrace_lock);
8813
8814	pvp->dtpv_defunct = dtrace_gethrtime();
8815
8816	mutex_exit(&dtrace_lock);
8817	mutex_exit(&dtrace_provider_lock);
8818}
8819
8820/*
8821 * Indicate whether or not DTrace has attached.
8822 */
8823int
8824dtrace_attached(void)
8825{
8826	/*
8827	 * dtrace_provider will be non-NULL iff the DTrace driver has
8828	 * attached.  (It's non-NULL because DTrace is always itself a
8829	 * provider.)
8830	 */
8831	return (dtrace_provider != NULL);
8832}
8833
8834/*
8835 * Remove all the unenabled probes for the given provider.  This function is
8836 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8837 * -- just as many of its associated probes as it can.
8838 */
8839int
8840dtrace_condense(dtrace_provider_id_t id)
8841{
8842	dtrace_provider_t *prov = (dtrace_provider_t *)id;
8843	int i;
8844	dtrace_probe_t *probe;
8845
8846	/*
8847	 * Make sure this isn't the dtrace provider itself.
8848	 */
8849	ASSERT(prov->dtpv_pops.dtps_enable !=
8850	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8851
8852	mutex_enter(&dtrace_provider_lock);
8853	mutex_enter(&dtrace_lock);
8854
8855	/*
8856	 * Attempt to destroy the probes associated with this provider.
8857	 */
8858	for (i = 0; i < dtrace_nprobes; i++) {
8859		if ((probe = dtrace_probes[i]) == NULL)
8860			continue;
8861
8862		if (probe->dtpr_provider != prov)
8863			continue;
8864
8865		if (probe->dtpr_ecb != NULL)
8866			continue;
8867
8868		dtrace_probes[i] = NULL;
8869
8870		dtrace_hash_remove(dtrace_bymod, probe);
8871		dtrace_hash_remove(dtrace_byfunc, probe);
8872		dtrace_hash_remove(dtrace_byname, probe);
8873
8874		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
8875		    probe->dtpr_arg);
8876		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8877		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8878		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8879		kmem_free(probe, sizeof (dtrace_probe_t));
8880#if defined(sun)
8881		vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
8882#else
8883		free_unr(dtrace_arena, i + 1);
8884#endif
8885	}
8886
8887	mutex_exit(&dtrace_lock);
8888	mutex_exit(&dtrace_provider_lock);
8889
8890	return (0);
8891}
8892
8893/*
8894 * DTrace Probe Management Functions
8895 *
8896 * The functions in this section perform the DTrace probe management,
8897 * including functions to create probes, look up probes, and call into the
8898 * providers to request that probes be provided.  Some of these functions are
8899 * in the Provider-to-Framework API; these functions can be identified by the
8900 * fact that they are not declared "static".
8901 */
8902
8903/*
8904 * Create a probe with the specified module name, function name, and name.
8905 */
8906dtrace_id_t
8907dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8908    const char *func, const char *name, int aframes, void *arg)
8909{
8910	dtrace_probe_t *probe, **probes;
8911	dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8912	dtrace_id_t id;
8913
8914	if (provider == dtrace_provider) {
8915		ASSERT(MUTEX_HELD(&dtrace_lock));
8916	} else {
8917		mutex_enter(&dtrace_lock);
8918	}
8919
8920#if defined(sun)
8921	id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8922	    VM_BESTFIT | VM_SLEEP);
8923#else
8924	id = alloc_unr(dtrace_arena);
8925#endif
8926	probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
8927
8928	probe->dtpr_id = id;
8929	probe->dtpr_gen = dtrace_probegen++;
8930	probe->dtpr_mod = dtrace_strdup(mod);
8931	probe->dtpr_func = dtrace_strdup(func);
8932	probe->dtpr_name = dtrace_strdup(name);
8933	probe->dtpr_arg = arg;
8934	probe->dtpr_aframes = aframes;
8935	probe->dtpr_provider = provider;
8936
8937	dtrace_hash_add(dtrace_bymod, probe);
8938	dtrace_hash_add(dtrace_byfunc, probe);
8939	dtrace_hash_add(dtrace_byname, probe);
8940
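	/*
	 * If the new id lands beyond the current probes array, double the
	 * array.  Probe-context readers are accommodated lock-free: the
	 * grown copy is published behind a producer barrier, and the old
	 * array is freed only after dtrace_sync() assures that no CPU is
	 * still looking at it.
	 */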
8941	if (id - 1 >= dtrace_nprobes) {
8942		size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8943		size_t nsize = osize << 1;
8944
8945		if (nsize == 0) {
8946			ASSERT(osize == 0);
8947			ASSERT(dtrace_probes == NULL);
8948			nsize = sizeof (dtrace_probe_t *);
8949		}
8950
8951		probes = kmem_zalloc(nsize, KM_SLEEP);
8952
8953		if (dtrace_probes == NULL) {
8954			ASSERT(osize == 0);
8955			dtrace_probes = probes;
8956			dtrace_nprobes = 1;
8957		} else {
8958			dtrace_probe_t **oprobes = dtrace_probes;
8959
8960			bcopy(oprobes, probes, osize);
8961			dtrace_membar_producer();
8962			dtrace_probes = probes;
8963
8964			dtrace_sync();
8965
8966			/*
8967			 * All CPUs are now seeing the new probes array; we can
8968			 * safely free the old array.
8969			 */
8970			kmem_free(oprobes, osize);
8971			dtrace_nprobes <<= 1;
8972		}
8973
8974		ASSERT(id - 1 < dtrace_nprobes);
8975	}
8976
8977	ASSERT(dtrace_probes[id - 1] == NULL);
8978	dtrace_probes[id - 1] = probe;
8979
8980	if (provider != dtrace_provider)
8981		mutex_exit(&dtrace_lock);
8982
8983	return (id);
8984}
8985
8986static dtrace_probe_t *
8987dtrace_probe_lookup_id(dtrace_id_t id)
8988{
8989	ASSERT(MUTEX_HELD(&dtrace_lock));
8990
8991	if (id == 0 || id > dtrace_nprobes)
8992		return (NULL);
8993
8994	return (dtrace_probes[id - 1]);
8995}
8996
8997static int
8998dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
8999{
9000	*((dtrace_id_t *)arg) = probe->dtpr_id;
9001
9002	return (DTRACE_MATCH_DONE);
9003}
9004
9005/*
9006 * Look up a probe based on provider and one or more of module name, function
9007 * name and probe name.
9008 */
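/*
 * E.g. (a sketch), dtrace_probe_lookup(id, NULL, "malloc", "entry") finds
 * the provider's malloc:entry probe in any module: NULL fields fall back
 * to dtrace_match_nul() and so match anything.
 */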
9009dtrace_id_t
9010dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
9011    char *func, char *name)
9012{
9013	dtrace_probekey_t pkey;
9014	dtrace_id_t id;
9015	int match;
9016
9017	pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
9018	pkey.dtpk_pmatch = &dtrace_match_string;
9019	pkey.dtpk_mod = mod;
9020	pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9021	pkey.dtpk_func = func;
9022	pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9023	pkey.dtpk_name = name;
9024	pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9025	pkey.dtpk_id = DTRACE_IDNONE;
9026
9027	mutex_enter(&dtrace_lock);
9028	match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9029	    dtrace_probe_lookup_match, &id);
9030	mutex_exit(&dtrace_lock);
9031
9032	ASSERT(match == 1 || match == 0);
9033	return (match ? id : 0);
9034}
9035
9036/*
9037 * Returns the probe argument associated with the specified probe.
9038 */
9039void *
9040dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9041{
9042	dtrace_probe_t *probe;
9043	void *rval = NULL;
9044
9045	mutex_enter(&dtrace_lock);
9046
9047	if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9048	    probe->dtpr_provider == (dtrace_provider_t *)id)
9049		rval = probe->dtpr_arg;
9050
9051	mutex_exit(&dtrace_lock);
9052
9053	return (rval);
9054}
9055
9056/*
9057 * Copy a probe into a probe description.
9058 */
9059static void
9060dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
9061{
9062	bzero(pdp, sizeof (dtrace_probedesc_t));
9063	pdp->dtpd_id = prp->dtpr_id;
9064
9065	(void) strncpy(pdp->dtpd_provider,
9066	    prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
9067
9068	(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
9069	(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
9070	(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
9071}
9072
9073/*
9074 * Called to indicate that a probe -- or probes -- should be provided by a
9075 * specified provider.  If the specified description is NULL, the provider will
9076 * be told to provide all of its probes.  (This is done whenever a new
9077 * consumer comes along, or whenever a retained enabling is to be matched.) If
9078 * the specified description is non-NULL, the provider is given the
9079 * opportunity to dynamically provide the specified probe, allowing providers
9080 * to support the creation of probes on-the-fly.  (So-called _autocreated_
9081 * probes.)  If the provider is NULL, the operations will be applied to all
9082 * providers; if the provider is non-NULL the operations will only be applied
9083 * to the specified provider.  The dtrace_provider_lock must be held, and the
9084 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9085 * will need to grab the dtrace_lock when it reenters the framework through
9086 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9087 */
9088static void
9089dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
9090{
9091#if defined(sun)
9092	modctl_t *ctl;
9093#endif
9094	int all = 0;
9095
9096	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
9097
9098	if (prv == NULL) {
9099		all = 1;
9100		prv = dtrace_provider;
9101	}
9102
9103	do {
9104		/*
9105		 * First, call the blanket provide operation.
9106		 */
9107		prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9108
9109#if defined(sun)
9110		/*
9111		 * Now call the per-module provide operation.  We will grab
9112		 * mod_lock to prevent the list from being modified.  Note
9113		 * that this also prevents the mod_busy bits from changing.
9114		 * (mod_busy can only be changed with mod_lock held.)
9115		 */
9116		mutex_enter(&mod_lock);
9117
9118		ctl = &modules;
9119		do {
9120			if (ctl->mod_busy || ctl->mod_mp == NULL)
9121				continue;
9122
9123			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9124
9125		} while ((ctl = ctl->mod_next) != &modules);
9126
9127		mutex_exit(&mod_lock);
9128#endif
9129	} while (all && (prv = prv->dtpv_next) != NULL);
9130}
9131
9132#if defined(sun)
9133/*
9134 * Iterate over each probe, and call the Framework-to-Provider API function
9135 * denoted by offs.
9136 */
9137static void
9138dtrace_probe_foreach(uintptr_t offs)
9139{
9140	dtrace_provider_t *prov;
9141	void (*func)(void *, dtrace_id_t, void *);
9142	dtrace_probe_t *probe;
9143	dtrace_icookie_t cookie;
9144	int i;
9145
9146	/*
9147	 * We disable interrupts to walk through the probe array.  This is
9148	 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
9149	 * won't see stale data.
9150	 */
9151	cookie = dtrace_interrupt_disable();
9152
9153	for (i = 0; i < dtrace_nprobes; i++) {
9154		if ((probe = dtrace_probes[i]) == NULL)
9155			continue;
9156
9157		if (probe->dtpr_ecb == NULL) {
9158			/*
9159			 * This probe isn't enabled -- don't call the function.
9160			 */
9161			continue;
9162		}
9163
9164		prov = probe->dtpr_provider;
9165		func = *((void(**)(void *, dtrace_id_t, void *))
9166		    ((uintptr_t)&prov->dtpv_pops + offs));
9167
9168		func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
9169	}
9170
9171	dtrace_interrupt_enable(cookie);
9172}
9173#endif
9174
9175static int
9176dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
9177{
9178	dtrace_probekey_t pkey;
9179	uint32_t priv;
9180	uid_t uid;
9181	zoneid_t zoneid;
9182
9183	ASSERT(MUTEX_HELD(&dtrace_lock));
9184	dtrace_ecb_create_cache = NULL;
9185
9186	if (desc == NULL) {
9187		/*
9188		 * If we're passed a NULL description, we're being asked to
9189		 * create an ECB with a NULL probe.
9190		 */
9191		(void) dtrace_ecb_create_enable(NULL, enab);
9192		return (0);
9193	}
9194
9195	dtrace_probekey(desc, &pkey);
9196	dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9197	    &priv, &uid, &zoneid);
9198
9199	return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
9200	    enab));
9201}
9202
9203/*
9204 * DTrace Helper Provider Functions
9205 */
9206static void
9207dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9208{
9209	attr->dtat_name = DOF_ATTR_NAME(dofattr);
9210	attr->dtat_data = DOF_ATTR_DATA(dofattr);
9211	attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9212}
9213
9214static void
9215dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9216    const dof_provider_t *dofprov, char *strtab)
9217{
9218	hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9219	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9220	    dofprov->dofpv_provattr);
9221	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9222	    dofprov->dofpv_modattr);
9223	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9224	    dofprov->dofpv_funcattr);
9225	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9226	    dofprov->dofpv_nameattr);
9227	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9228	    dofprov->dofpv_argsattr);
9229}
9230
9231static void
9232dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9233{
9234	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9235	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9236	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9237	dof_provider_t *provider;
9238	dof_probe_t *probe;
9239	uint32_t *off, *enoff;
9240	uint8_t *arg;
9241	char *strtab;
9242	uint_t i, nprobes;
9243	dtrace_helper_provdesc_t dhpv;
9244	dtrace_helper_probedesc_t dhpb;
9245	dtrace_meta_t *meta = dtrace_meta_pid;
9246	dtrace_mops_t *mops = &meta->dtm_mops;
9247	void *parg;
9248
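	/*
	 * DOF section headers live at daddr + dofh_secoff, spaced
	 * dofh_secsize bytes apart, and a section's payload lives at
	 * daddr + its dofs_offset; the arithmetic below simply chases the
	 * section numbers recorded in the provider section.
	 */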
9249	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9250	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9251	    provider->dofpv_strtab * dof->dofh_secsize);
9252	prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9253	    provider->dofpv_probes * dof->dofh_secsize);
9254	arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9255	    provider->dofpv_prargs * dof->dofh_secsize);
9256	off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9257	    provider->dofpv_proffs * dof->dofh_secsize);
9258
9259	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9260	off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9261	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9262	enoff = NULL;
9263
9264	/*
9265	 * See dtrace_helper_provider_validate().
9266	 */
9267	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9268	    provider->dofpv_prenoffs != DOF_SECT_NONE) {
9269		enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9270		    provider->dofpv_prenoffs * dof->dofh_secsize);
9271		enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9272	}
9273
9274	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9275
9276	/*
9277	 * Create the provider.
9278	 */
9279	dtrace_dofprov2hprov(&dhpv, provider, strtab);
9280
9281	if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
9282		return;
9283
9284	meta->dtm_count++;
9285
9286	/*
9287	 * Create the probes.
9288	 */
9289	for (i = 0; i < nprobes; i++) {
9290		probe = (dof_probe_t *)(uintptr_t)(daddr +
9291		    prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9292
9293		dhpb.dthpb_mod = dhp->dofhp_mod;
9294		dhpb.dthpb_func = strtab + probe->dofpr_func;
9295		dhpb.dthpb_name = strtab + probe->dofpr_name;
9296		dhpb.dthpb_base = probe->dofpr_addr;
9297		dhpb.dthpb_offs = off + probe->dofpr_offidx;
9298		dhpb.dthpb_noffs = probe->dofpr_noffs;
9299		if (enoff != NULL) {
9300			dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
9301			dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9302		} else {
9303			dhpb.dthpb_enoffs = NULL;
9304			dhpb.dthpb_nenoffs = 0;
9305		}
9306		dhpb.dthpb_args = arg + probe->dofpr_argidx;
9307		dhpb.dthpb_nargc = probe->dofpr_nargc;
9308		dhpb.dthpb_xargc = probe->dofpr_xargc;
9309		dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9310		dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9311
9312		mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9313	}
9314}
9315
9316static void
9317dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
9318{
9319	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9320	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9321	int i;
9322
9323	ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9324
9325	for (i = 0; i < dof->dofh_secnum; i++) {
9326		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9327		    dof->dofh_secoff + i * dof->dofh_secsize);
9328
9329		if (sec->dofs_type != DOF_SECT_PROVIDER)
9330			continue;
9331
9332		dtrace_helper_provide_one(dhp, sec, pid);
9333	}
9334
9335	/*
9336	 * We may have just created probes, so we must now rematch against
9337	 * any retained enablings.  Note that this call will acquire both
9338	 * cpu_lock and dtrace_lock; the fact that we are holding
9339	 * dtrace_meta_lock now is what defines the ordering with respect to
9340	 * these three locks.
9341	 */
9342	dtrace_enabling_matchall();
9343}
9344
9345static void
9346dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9347{
9348	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9349	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9350	dof_sec_t *str_sec;
9351	dof_provider_t *provider;
9352	char *strtab;
9353	dtrace_helper_provdesc_t dhpv;
9354	dtrace_meta_t *meta = dtrace_meta_pid;
9355	dtrace_mops_t *mops = &meta->dtm_mops;
9356
9357	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9358	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9359	    provider->dofpv_strtab * dof->dofh_secsize);
9360
9361	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9362
9363	/*
9364	 * Create the provider.
9365	 */
9366	dtrace_dofprov2hprov(&dhpv, provider, strtab);
9367
9368	mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
9369
9370	meta->dtm_count--;
9371}
9372
9373static void
9374dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
9375{
9376	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9377	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9378	int i;
9379
9380	ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9381
9382	for (i = 0; i < dof->dofh_secnum; i++) {
9383		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9384		    dof->dofh_secoff + i * dof->dofh_secsize);
9385
9386		if (sec->dofs_type != DOF_SECT_PROVIDER)
9387			continue;
9388
9389		dtrace_helper_provider_remove_one(dhp, sec, pid);
9390	}
9391}
9392
9393/*
9394 * DTrace Meta Provider-to-Framework API Functions
9395 *
9396 * These functions implement the Meta Provider-to-Framework API, as described
9397 * in <sys/dtrace.h>.
9398 */
9399int
9400dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9401    dtrace_meta_provider_id_t *idp)
9402{
9403	dtrace_meta_t *meta;
9404	dtrace_helpers_t *help, *next;
9405	int i;
9406
9407	*idp = DTRACE_METAPROVNONE;
9408
9409	/*
9410	 * We strictly don't need the name, but we hold onto it for
9411	 * debuggability. All hail error queues!
9412	 */
9413	if (name == NULL) {
9414		cmn_err(CE_WARN, "failed to register meta-provider: "
9415		    "invalid name");
9416		return (EINVAL);
9417	}
9418
9419	if (mops == NULL ||
9420	    mops->dtms_create_probe == NULL ||
9421	    mops->dtms_provide_pid == NULL ||
9422	    mops->dtms_remove_pid == NULL) {
9423		cmn_err(CE_WARN, "failed to register meta-provider %s: "
9424		    "invalid ops", name);
9425		return (EINVAL);
9426	}
9427
9428	meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9429	meta->dtm_mops = *mops;
9430	meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
9431	(void) strcpy(meta->dtm_name, name);
9432	meta->dtm_arg = arg;
9433
9434	mutex_enter(&dtrace_meta_lock);
9435	mutex_enter(&dtrace_lock);
9436
9437	if (dtrace_meta_pid != NULL) {
9438		mutex_exit(&dtrace_lock);
9439		mutex_exit(&dtrace_meta_lock);
9440		cmn_err(CE_WARN, "failed to register meta-provider %s: "
9441		    "user-land meta-provider exists", name);
9442		kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
9443		kmem_free(meta, sizeof (dtrace_meta_t));
9444		return (EINVAL);
9445	}
9446
9447	dtrace_meta_pid = meta;
9448	*idp = (dtrace_meta_provider_id_t)meta;
9449
9450	/*
9451	 * If there are providers and probes ready to go, pass them
9452	 * off to the new meta provider now.
9453	 */
9454
9455	help = dtrace_deferred_pid;
9456	dtrace_deferred_pid = NULL;
9457
9458	mutex_exit(&dtrace_lock);
9459
9460	while (help != NULL) {
9461		for (i = 0; i < help->dthps_nprovs; i++) {
9462			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9463			    help->dthps_pid);
9464		}
9465
9466		next = help->dthps_next;
9467		help->dthps_next = NULL;
9468		help->dthps_prev = NULL;
9469		help->dthps_deferred = 0;
9470		help = next;
9471	}
9472
9473	mutex_exit(&dtrace_meta_lock);
9474
9475	return (0);
9476}
9477
9478int
9479dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9480{
9481	dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9482
9483	mutex_enter(&dtrace_meta_lock);
9484	mutex_enter(&dtrace_lock);
9485
9486	if (old == dtrace_meta_pid) {
9487		pp = &dtrace_meta_pid;
9488	} else {
9489		panic("attempt to unregister non-existent "
9490		    "dtrace meta-provider %p\n", (void *)old);
9491	}
9492
9493	if (old->dtm_count != 0) {
9494		mutex_exit(&dtrace_lock);
9495		mutex_exit(&dtrace_meta_lock);
9496		return (EBUSY);
9497	}
9498
9499	*pp = NULL;
9500
9501	mutex_exit(&dtrace_lock);
9502	mutex_exit(&dtrace_meta_lock);
9503
9504	kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
9505	kmem_free(old, sizeof (dtrace_meta_t));
9506
9507	return (0);
9508}
9509
9510
9511/*
9512 * DTrace DIF Object Functions
9513 */
9514static int
9515dtrace_difo_err(uint_t pc, const char *format, ...)
9516{
9517	if (dtrace_err_verbose) {
9518		va_list alist;
9519
9520		(void) uprintf("dtrace DIF object error: [%u]: ", pc);
9521		va_start(alist, format);
9522		(void) vuprintf(format, alist);
9523		va_end(alist);
9524	}
9525
9526#ifdef DTRACE_ERRDEBUG
9527	dtrace_errdebug(format);
9528#endif
9529	return (1);
9530}
9531
9532/*
9533 * Validate a DTrace DIF object by checking the IR instructions.  The following
9534 * rules are currently enforced by dtrace_difo_validate():
9535 *
9536 * 1. Each instruction must have a valid opcode
9537 * 2. Each register, string, variable, or subroutine reference must be valid
9538 * 3. No instruction can modify register %r0 (must be zero)
9539 * 4. All instruction reserved bits must be set to zero
9540 * 5. The last instruction must be a "ret" instruction
9541 * 6. All branch targets must reference a valid instruction _after_ the branch
9542 */
9543static int
9544dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9545    cred_t *cr)
9546{
9547	int err = 0, i;
9548	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9549	int kcheckload;
9550	uint_t pc;
9551
9552	kcheckload = cr == NULL ||
9553	    (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9554
9555	dp->dtdo_destructive = 0;
9556
9557	for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9558		dif_instr_t instr = dp->dtdo_buf[pc];
9559
9560		uint_t r1 = DIF_INSTR_R1(instr);
9561		uint_t r2 = DIF_INSTR_R2(instr);
9562		uint_t rd = DIF_INSTR_RD(instr);
9563		uint_t rs = DIF_INSTR_RS(instr);
9564		uint_t label = DIF_INSTR_LABEL(instr);
9565		uint_t v = DIF_INSTR_VAR(instr);
9566		uint_t subr = DIF_INSTR_SUBR(instr);
9567		uint_t type = DIF_INSTR_TYPE(instr);
9568		uint_t op = DIF_INSTR_OP(instr);
9569
9570		switch (op) {
9571		case DIF_OP_OR:
9572		case DIF_OP_XOR:
9573		case DIF_OP_AND:
9574		case DIF_OP_SLL:
9575		case DIF_OP_SRL:
9576		case DIF_OP_SRA:
9577		case DIF_OP_SUB:
9578		case DIF_OP_ADD:
9579		case DIF_OP_MUL:
9580		case DIF_OP_SDIV:
9581		case DIF_OP_UDIV:
9582		case DIF_OP_SREM:
9583		case DIF_OP_UREM:
9584		case DIF_OP_COPYS:
9585			if (r1 >= nregs)
9586				err += efunc(pc, "invalid register %u\n", r1);
9587			if (r2 >= nregs)
9588				err += efunc(pc, "invalid register %u\n", r2);
9589			if (rd >= nregs)
9590				err += efunc(pc, "invalid register %u\n", rd);
9591			if (rd == 0)
9592				err += efunc(pc, "cannot write to %r0\n");
9593			break;
9594		case DIF_OP_NOT:
9595		case DIF_OP_MOV:
9596		case DIF_OP_ALLOCS:
9597			if (r1 >= nregs)
9598				err += efunc(pc, "invalid register %u\n", r1);
9599			if (r2 != 0)
9600				err += efunc(pc, "non-zero reserved bits\n");
9601			if (rd >= nregs)
9602				err += efunc(pc, "invalid register %u\n", rd);
9603			if (rd == 0)
9604				err += efunc(pc, "cannot write to %r0\n");
9605			break;
9606		case DIF_OP_LDSB:
9607		case DIF_OP_LDSH:
9608		case DIF_OP_LDSW:
9609		case DIF_OP_LDUB:
9610		case DIF_OP_LDUH:
9611		case DIF_OP_LDUW:
9612		case DIF_OP_LDX:
9613			if (r1 >= nregs)
9614				err += efunc(pc, "invalid register %u\n", r1);
9615			if (r2 != 0)
9616				err += efunc(pc, "non-zero reserved bits\n");
9617			if (rd >= nregs)
9618				err += efunc(pc, "invalid register %u\n", rd);
9619			if (rd == 0)
9620				err += efunc(pc, "cannot write to %r0\n");
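			/*
			 * For consumers without kernel visibility, rewrite
			 * this load into its checked (RLD*) counterpart,
			 * which validates the address at emulation time.
			 */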
9621			if (kcheckload)
9622				dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9623				    DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9624			break;
9625		case DIF_OP_RLDSB:
9626		case DIF_OP_RLDSH:
9627		case DIF_OP_RLDSW:
9628		case DIF_OP_RLDUB:
9629		case DIF_OP_RLDUH:
9630		case DIF_OP_RLDUW:
9631		case DIF_OP_RLDX:
9632			if (r1 >= nregs)
9633				err += efunc(pc, "invalid register %u\n", r1);
9634			if (r2 != 0)
9635				err += efunc(pc, "non-zero reserved bits\n");
9636			if (rd >= nregs)
9637				err += efunc(pc, "invalid register %u\n", rd);
9638			if (rd == 0)
9639				err += efunc(pc, "cannot write to %r0\n");
9640			break;
9641		case DIF_OP_ULDSB:
9642		case DIF_OP_ULDSH:
9643		case DIF_OP_ULDSW:
9644		case DIF_OP_ULDUB:
9645		case DIF_OP_ULDUH:
9646		case DIF_OP_ULDUW:
9647		case DIF_OP_ULDX:
9648			if (r1 >= nregs)
9649				err += efunc(pc, "invalid register %u\n", r1);
9650			if (r2 != 0)
9651				err += efunc(pc, "non-zero reserved bits\n");
9652			if (rd >= nregs)
9653				err += efunc(pc, "invalid register %u\n", rd);
9654			if (rd == 0)
9655				err += efunc(pc, "cannot write to %r0\n");
9656			break;
9657		case DIF_OP_STB:
9658		case DIF_OP_STH:
9659		case DIF_OP_STW:
9660		case DIF_OP_STX:
9661			if (r1 >= nregs)
9662				err += efunc(pc, "invalid register %u\n", r1);
9663			if (r2 != 0)
9664				err += efunc(pc, "non-zero reserved bits\n");
9665			if (rd >= nregs)
9666				err += efunc(pc, "invalid register %u\n", rd);
9667			if (rd == 0)
9668				err += efunc(pc, "cannot write to 0 address\n");
9669			break;
9670		case DIF_OP_CMP:
9671		case DIF_OP_SCMP:
9672			if (r1 >= nregs)
9673				err += efunc(pc, "invalid register %u\n", r1);
9674			if (r2 >= nregs)
9675				err += efunc(pc, "invalid register %u\n", r2);
9676			if (rd != 0)
9677				err += efunc(pc, "non-zero reserved bits\n");
9678			break;
9679		case DIF_OP_TST:
9680			if (r1 >= nregs)
9681				err += efunc(pc, "invalid register %u\n", r1);
9682			if (r2 != 0 || rd != 0)
9683				err += efunc(pc, "non-zero reserved bits\n");
9684			break;
9685		case DIF_OP_BA:
9686		case DIF_OP_BE:
9687		case DIF_OP_BNE:
9688		case DIF_OP_BG:
9689		case DIF_OP_BGU:
9690		case DIF_OP_BGE:
9691		case DIF_OP_BGEU:
9692		case DIF_OP_BL:
9693		case DIF_OP_BLU:
9694		case DIF_OP_BLE:
9695		case DIF_OP_BLEU:
9696			if (label >= dp->dtdo_len) {
9697				err += efunc(pc, "invalid branch target %u\n",
9698				    label);
9699			}
9700			if (label <= pc) {
9701				err += efunc(pc, "backward branch to %u\n",
9702				    label);
9703			}
9704			break;
9705		case DIF_OP_RET:
9706			if (r1 != 0 || r2 != 0)
9707				err += efunc(pc, "non-zero reserved bits\n");
9708			if (rd >= nregs)
9709				err += efunc(pc, "invalid register %u\n", rd);
9710			break;
9711		case DIF_OP_NOP:
9712		case DIF_OP_POPTS:
9713		case DIF_OP_FLUSHTS:
9714			if (r1 != 0 || r2 != 0 || rd != 0)
9715				err += efunc(pc, "non-zero reserved bits\n");
9716			break;
9717		case DIF_OP_SETX:
9718			if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9719				err += efunc(pc, "invalid integer ref %u\n",
9720				    DIF_INSTR_INTEGER(instr));
9721			}
9722			if (rd >= nregs)
9723				err += efunc(pc, "invalid register %u\n", rd);
9724			if (rd == 0)
9725				err += efunc(pc, "cannot write to %r0\n");
9726			break;
9727		case DIF_OP_SETS:
9728			if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9729				err += efunc(pc, "invalid string ref %u\n",
9730				    DIF_INSTR_STRING(instr));
9731			}
9732			if (rd >= nregs)
9733				err += efunc(pc, "invalid register %u\n", rd);
9734			if (rd == 0)
9735				err += efunc(pc, "cannot write to %r0\n");
9736			break;
9737		case DIF_OP_LDGA:
9738		case DIF_OP_LDTA:
9739			if (r1 > DIF_VAR_ARRAY_MAX)
9740				err += efunc(pc, "invalid array %u\n", r1);
9741			if (r2 >= nregs)
9742				err += efunc(pc, "invalid register %u\n", r2);
9743			if (rd >= nregs)
9744				err += efunc(pc, "invalid register %u\n", rd);
9745			if (rd == 0)
9746				err += efunc(pc, "cannot write to %r0\n");
9747			break;
9748		case DIF_OP_LDGS:
9749		case DIF_OP_LDTS:
9750		case DIF_OP_LDLS:
9751		case DIF_OP_LDGAA:
9752		case DIF_OP_LDTAA:
9753			if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9754				err += efunc(pc, "invalid variable %u\n", v);
9755			if (rd >= nregs)
9756				err += efunc(pc, "invalid register %u\n", rd);
9757			if (rd == 0)
9758				err += efunc(pc, "cannot write to %r0\n");
9759			break;
9760		case DIF_OP_STGS:
9761		case DIF_OP_STTS:
9762		case DIF_OP_STLS:
9763		case DIF_OP_STGAA:
9764		case DIF_OP_STTAA:
9765			if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9766				err += efunc(pc, "invalid variable %u\n", v);
9767			if (rs >= nregs)
9768				err += efunc(pc, "invalid register %u\n", rs);
9769			break;
9770		case DIF_OP_CALL:
9771			if (subr > DIF_SUBR_MAX)
9772				err += efunc(pc, "invalid subr %u\n", subr);
9773			if (rd >= nregs)
9774				err += efunc(pc, "invalid register %u\n", rd);
9775			if (rd == 0)
9776				err += efunc(pc, "cannot write to %r0\n");
9777
9778			if (subr == DIF_SUBR_COPYOUT ||
9779			    subr == DIF_SUBR_COPYOUTSTR) {
9780				dp->dtdo_destructive = 1;
9781			}
9782
9783			if (subr == DIF_SUBR_GETF) {
9784				/*
9785				 * If we have a getf() we need to record that
9786				 * in our state.  Note that our state can be
9787				 * NULL if this is a helper -- but in that
9788				 * case, the call to getf() is itself illegal,
9789				 * and will be caught (slightly later) when
9790				 * the helper is validated.
9791				 */
9792				if (vstate->dtvs_state != NULL)
9793					vstate->dtvs_state->dts_getf++;
9794			}
9795
9796			break;
9797		case DIF_OP_PUSHTR:
9798			if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9799				err += efunc(pc, "invalid ref type %u\n", type);
9800			if (r2 >= nregs)
9801				err += efunc(pc, "invalid register %u\n", r2);
9802			if (rs >= nregs)
9803				err += efunc(pc, "invalid register %u\n", rs);
9804			break;
9805		case DIF_OP_PUSHTV:
9806			if (type != DIF_TYPE_CTF)
9807				err += efunc(pc, "invalid val type %u\n", type);
9808			if (r2 >= nregs)
9809				err += efunc(pc, "invalid register %u\n", r2);
9810			if (rs >= nregs)
9811				err += efunc(pc, "invalid register %u\n", rs);
9812			break;
9813		default:
9814			err += efunc(pc, "invalid opcode %u\n",
9815			    DIF_INSTR_OP(instr));
9816		}
9817	}
9818
9819	if (dp->dtdo_len != 0 &&
9820	    DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9821		err += efunc(dp->dtdo_len - 1,
9822		    "expected 'ret' as last DIF instruction\n");
9823	}
9824
9825	if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9826		/*
9827		 * If we're not returning by reference, the size must be either
9828		 * 0 or the size of one of the base types.
9829		 */
9830		switch (dp->dtdo_rtype.dtdt_size) {
9831		case 0:
9832		case sizeof (uint8_t):
9833		case sizeof (uint16_t):
9834		case sizeof (uint32_t):
9835		case sizeof (uint64_t):
9836			break;
9837
9838		default:
9839			err += efunc(dp->dtdo_len - 1, "bad return size\n");
9840		}
9841	}
9842
9843	for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9844		dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9845		dtrace_diftype_t *vt, *et;
9846		uint_t id, ndx;
9847
9848		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9849		    v->dtdv_scope != DIFV_SCOPE_THREAD &&
9850		    v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9851			err += efunc(i, "unrecognized variable scope %d\n",
9852			    v->dtdv_scope);
9853			break;
9854		}
9855
9856		if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9857		    v->dtdv_kind != DIFV_KIND_SCALAR) {
9858			err += efunc(i, "unrecognized variable type %d\n",
9859			    v->dtdv_kind);
9860			break;
9861		}
9862
9863		if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9864			err += efunc(i, "%d exceeds variable id limit\n", id);
9865			break;
9866		}
9867
9868		if (id < DIF_VAR_OTHER_UBASE)
9869			continue;
9870
9871		/*
9872		 * For user-defined variables, we need to check that this
9873		 * definition is identical to any previous definition that we
9874		 * encountered.
9875		 */
9876		ndx = id - DIF_VAR_OTHER_UBASE;
9877
9878		switch (v->dtdv_scope) {
9879		case DIFV_SCOPE_GLOBAL:
9880			if (ndx < vstate->dtvs_nglobals) {
9881				dtrace_statvar_t *svar;
9882
9883				if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9884					existing = &svar->dtsv_var;
9885			}
9886
9887			break;
9888
9889		case DIFV_SCOPE_THREAD:
9890			if (ndx < vstate->dtvs_ntlocals)
9891				existing = &vstate->dtvs_tlocals[ndx];
9892			break;
9893
9894		case DIFV_SCOPE_LOCAL:
9895			if (ndx < vstate->dtvs_nlocals) {
9896				dtrace_statvar_t *svar;
9897
9898				if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9899					existing = &svar->dtsv_var;
9900			}
9901
9902			break;
9903		}
9904
9905		vt = &v->dtdv_type;
9906
9907		if (vt->dtdt_flags & DIF_TF_BYREF) {
9908			if (vt->dtdt_size == 0) {
9909				err += efunc(i, "zero-sized variable\n");
9910				break;
9911			}
9912
9913			if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
9914			    vt->dtdt_size > dtrace_global_maxsize) {
9915				err += efunc(i, "oversized by-ref global\n");
9916				break;
9917			}
9918		}
9919
9920		if (existing == NULL || existing->dtdv_id == 0)
9921			continue;
9922
9923		ASSERT(existing->dtdv_id == v->dtdv_id);
9924		ASSERT(existing->dtdv_scope == v->dtdv_scope);
9925
9926		if (existing->dtdv_kind != v->dtdv_kind)
9927			err += efunc(i, "%d changed variable kind\n", id);
9928
9929		et = &existing->dtdv_type;
9930
9931		if (vt->dtdt_flags != et->dtdt_flags) {
9932			err += efunc(i, "%d changed variable type flags\n", id);
9933			break;
9934		}
9935
9936		if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9937			err += efunc(i, "%d changed variable type size\n", id);
9938			break;
9939		}
9940	}
9941
9942	return (err);
9943}
9944
9945/*
9946 * Validate a DTrace DIF object that is to be used as a helper.  Helpers
9947 * are much more constrained than normal DIFOs.  Specifically, they may
9948 * not:
9949 *
9950 * 1. Make calls to subroutines other than copyin(), copyinstr() or
9951 *    miscellaneous string routines.
9952 * 2. Access DTrace variables other than the args[] array and the
9953 *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
9954 * 3. Have thread-local variables.
9955 * 4. Have dynamic variables.
9956 */
9957static int
9958dtrace_difo_validate_helper(dtrace_difo_t *dp)
9959{
9960	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9961	int err = 0;
9962	uint_t pc;
9963
9964	for (pc = 0; pc < dp->dtdo_len; pc++) {
9965		dif_instr_t instr = dp->dtdo_buf[pc];
9966
9967		uint_t v = DIF_INSTR_VAR(instr);
9968		uint_t subr = DIF_INSTR_SUBR(instr);
9969		uint_t op = DIF_INSTR_OP(instr);
9970
9971		switch (op) {
9972		case DIF_OP_OR:
9973		case DIF_OP_XOR:
9974		case DIF_OP_AND:
9975		case DIF_OP_SLL:
9976		case DIF_OP_SRL:
9977		case DIF_OP_SRA:
9978		case DIF_OP_SUB:
9979		case DIF_OP_ADD:
9980		case DIF_OP_MUL:
9981		case DIF_OP_SDIV:
9982		case DIF_OP_UDIV:
9983		case DIF_OP_SREM:
9984		case DIF_OP_UREM:
9985		case DIF_OP_COPYS:
9986		case DIF_OP_NOT:
9987		case DIF_OP_MOV:
9988		case DIF_OP_RLDSB:
9989		case DIF_OP_RLDSH:
9990		case DIF_OP_RLDSW:
9991		case DIF_OP_RLDUB:
9992		case DIF_OP_RLDUH:
9993		case DIF_OP_RLDUW:
9994		case DIF_OP_RLDX:
9995		case DIF_OP_ULDSB:
9996		case DIF_OP_ULDSH:
9997		case DIF_OP_ULDSW:
9998		case DIF_OP_ULDUB:
9999		case DIF_OP_ULDUH:
10000		case DIF_OP_ULDUW:
10001		case DIF_OP_ULDX:
10002		case DIF_OP_STB:
10003		case DIF_OP_STH:
10004		case DIF_OP_STW:
10005		case DIF_OP_STX:
10006		case DIF_OP_ALLOCS:
10007		case DIF_OP_CMP:
10008		case DIF_OP_SCMP:
10009		case DIF_OP_TST:
10010		case DIF_OP_BA:
10011		case DIF_OP_BE:
10012		case DIF_OP_BNE:
10013		case DIF_OP_BG:
10014		case DIF_OP_BGU:
10015		case DIF_OP_BGE:
10016		case DIF_OP_BGEU:
10017		case DIF_OP_BL:
10018		case DIF_OP_BLU:
10019		case DIF_OP_BLE:
10020		case DIF_OP_BLEU:
10021		case DIF_OP_RET:
10022		case DIF_OP_NOP:
10023		case DIF_OP_POPTS:
10024		case DIF_OP_FLUSHTS:
10025		case DIF_OP_SETX:
10026		case DIF_OP_SETS:
10027		case DIF_OP_LDGA:
10028		case DIF_OP_LDLS:
10029		case DIF_OP_STGS:
10030		case DIF_OP_STLS:
10031		case DIF_OP_PUSHTR:
10032		case DIF_OP_PUSHTV:
10033			break;
10034
10035		case DIF_OP_LDGS:
10036			if (v >= DIF_VAR_OTHER_UBASE)
10037				break;
10038
10039			if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10040				break;
10041
10042			if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10043			    v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10044			    v == DIF_VAR_EXECARGS ||
10045			    v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10046			    v == DIF_VAR_UID || v == DIF_VAR_GID)
10047				break;
10048
10049			err += efunc(pc, "illegal variable %u\n", v);
10050			break;
10051
10052		case DIF_OP_LDTA:
10053		case DIF_OP_LDTS:
10054		case DIF_OP_LDGAA:
10055		case DIF_OP_LDTAA:
10056			err += efunc(pc, "illegal dynamic variable load\n");
10057			break;
10058
10059		case DIF_OP_STTS:
10060		case DIF_OP_STGAA:
10061		case DIF_OP_STTAA:
10062			err += efunc(pc, "illegal dynamic variable store\n");
10063			break;
10064
10065		case DIF_OP_CALL:
10066			if (subr == DIF_SUBR_ALLOCA ||
10067			    subr == DIF_SUBR_BCOPY ||
10068			    subr == DIF_SUBR_COPYIN ||
10069			    subr == DIF_SUBR_COPYINTO ||
10070			    subr == DIF_SUBR_COPYINSTR ||
10071			    subr == DIF_SUBR_INDEX ||
10072			    subr == DIF_SUBR_INET_NTOA ||
10073			    subr == DIF_SUBR_INET_NTOA6 ||
10074			    subr == DIF_SUBR_INET_NTOP ||
10075			    subr == DIF_SUBR_JSON ||
10076			    subr == DIF_SUBR_LLTOSTR ||
10077			    subr == DIF_SUBR_STRTOLL ||
10078			    subr == DIF_SUBR_RINDEX ||
10079			    subr == DIF_SUBR_STRCHR ||
10080			    subr == DIF_SUBR_STRJOIN ||
10081			    subr == DIF_SUBR_STRRCHR ||
10082			    subr == DIF_SUBR_STRSTR ||
10083			    subr == DIF_SUBR_HTONS ||
10084			    subr == DIF_SUBR_HTONL ||
10085			    subr == DIF_SUBR_HTONLL ||
10086			    subr == DIF_SUBR_NTOHS ||
10087			    subr == DIF_SUBR_NTOHL ||
10088			    subr == DIF_SUBR_NTOHLL ||
10089			    subr == DIF_SUBR_MEMREF ||
10090#if !defined(sun)
10091			    subr == DIF_SUBR_MEMSTR ||
10092#endif
10093			    subr == DIF_SUBR_TYPEREF)
10094				break;
10095
10096			err += efunc(pc, "invalid subr %u\n", subr);
10097			break;
10098
10099		default:
10100			err += efunc(pc, "invalid opcode %u\n",
10101			    DIF_INSTR_OP(instr));
10102		}
10103	}
10104
10105	return (err);
10106}
10107
10108/*
10109 * Returns 1 if the expression in the DIF object can be cached on a per-thread
10110 * basis; 0 if not.
10111 */
10112static int
10113dtrace_difo_cacheable(dtrace_difo_t *dp)
10114{
10115	int i;
10116
10117	if (dp == NULL)
10118		return (0);
10119
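	/*
	 * Any global variable this DIFO references must be one of the
	 * thread-invariant built-ins below; anything else defeats
	 * per-thread caching.
	 */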
10120	for (i = 0; i < dp->dtdo_varlen; i++) {
10121		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10122
10123		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10124			continue;
10125
10126		switch (v->dtdv_id) {
10127		case DIF_VAR_CURTHREAD:
10128		case DIF_VAR_PID:
10129		case DIF_VAR_TID:
10130		case DIF_VAR_EXECARGS:
10131		case DIF_VAR_EXECNAME:
10132		case DIF_VAR_ZONENAME:
10133			break;
10134
10135		default:
10136			return (0);
10137		}
10138	}
10139
10140	/*
10141	 * This DIF object may be cacheable.  Now we need to look for any
10142	 * array loading instructions, any memory loading instructions, or
10143	 * any stores to thread-local variables.
10144	 */
10145	for (i = 0; i < dp->dtdo_len; i++) {
10146		uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10147
10148		if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10149		    (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10150		    (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10151		    op == DIF_OP_LDGA || op == DIF_OP_STTS)
10152			return (0);
10153	}
10154
10155	return (1);
10156}
10157
10158static void
10159dtrace_difo_hold(dtrace_difo_t *dp)
10160{
10161	int i;
10162
10163	ASSERT(MUTEX_HELD(&dtrace_lock));
10164
10165	dp->dtdo_refcnt++;
10166	ASSERT(dp->dtdo_refcnt != 0);
10167
10168	/*
10169	 * We need to check this DIF object for references to the variable
10170	 * DIF_VAR_VTIMESTAMP.
10171	 */
10172	for (i = 0; i < dp->dtdo_varlen; i++) {
10173		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10174
10175		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10176			continue;
10177
10178		if (dtrace_vtime_references++ == 0)
10179			dtrace_vtime_enable();
10180	}
10181}
10182
10183/*
10184 * This routine calculates the dynamic variable chunksize for a given DIF
10185 * object.  The calculation is not fool-proof, and can probably be tricked by
10186 * malicious DIF -- but it works for all compiler-generated DIF.  Because this
10187 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10188 * if a dynamic variable size exceeds the chunksize.
10189 */
10190static void
10191dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10192{
10193	uint64_t sval = 0;
10194	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10195	const dif_instr_t *text = dp->dtdo_buf;
10196	uint_t pc, srd = 0;
10197	uint_t ttop = 0;
10198	size_t size, ksize;
10199	uint_t id, i;
10200
10201	for (pc = 0; pc < dp->dtdo_len; pc++) {
10202		dif_instr_t instr = text[pc];
10203		uint_t op = DIF_INSTR_OP(instr);
10204		uint_t rd = DIF_INSTR_RD(instr);
10205		uint_t r1 = DIF_INSTR_R1(instr);
10206		uint_t nkeys = 0;
10207		uchar_t scope = 0;
10208
10209		dtrace_key_t *key = tupregs;
10210
10211		switch (op) {
10212		case DIF_OP_SETX:
10213			sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10214			srd = rd;
10215			continue;
10216
10217		case DIF_OP_STTS:
10218			key = &tupregs[DIF_DTR_NREGS];
10219			key[0].dttk_size = 0;
10220			key[1].dttk_size = 0;
10221			nkeys = 2;
10222			scope = DIFV_SCOPE_THREAD;
10223			break;
10224
10225		case DIF_OP_STGAA:
10226		case DIF_OP_STTAA:
10227			nkeys = ttop;
10228
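			/*
			 * Beyond the keys on the tuple stack, a
			 * thread-local store gets a key for the thread,
			 * and both flavors get a key for the variable
			 * itself (cf. the "+2 for thread and id" in the
			 * tupregs declaration above).
			 */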
10229			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
10230				key[nkeys++].dttk_size = 0;
10231
10232			key[nkeys++].dttk_size = 0;
10233
10234			if (op == DIF_OP_STTAA) {
10235				scope = DIFV_SCOPE_THREAD;
10236			} else {
10237				scope = DIFV_SCOPE_GLOBAL;
10238			}
10239
10240			break;
10241
10242		case DIF_OP_PUSHTR:
10243			if (ttop == DIF_DTR_NREGS)
10244				return;
10245
10246			if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10247				/*
10248				 * If the register for the size of the "pushtr"
10249				 * is %r0 (or the value is 0) and the type is
10250				 * a string, we'll use the system-wide default
10251				 * string size.
10252				 */
10253				tupregs[ttop++].dttk_size =
10254				    dtrace_strsize_default;
10255			} else {
10256				if (srd == 0)
10257					return;
10258
10259				tupregs[ttop++].dttk_size = sval;
10260			}
10261
10262			break;
10263
10264		case DIF_OP_PUSHTV:
10265			if (ttop == DIF_DTR_NREGS)
10266				return;
10267
10268			tupregs[ttop++].dttk_size = 0;
10269			break;
10270
10271		case DIF_OP_FLUSHTS:
10272			ttop = 0;
10273			break;
10274
10275		case DIF_OP_POPTS:
10276			if (ttop != 0)
10277				ttop--;
10278			break;
10279		}
10280
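		/*
		 * Any instruction other than a setx invalidates any size
		 * value that we may have been tracking:  only a setx
		 * immediately preceding its consumer is honored.
		 */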
10281		sval = 0;
10282		srd = 0;
10283
10284		if (nkeys == 0)
10285			continue;
10286
10287		/*
10288		 * We have a dynamic variable allocation; calculate its size.
10289		 */
10290		for (ksize = 0, i = 0; i < nkeys; i++)
10291			ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10292
10293		size = sizeof (dtrace_dynvar_t);
10294		size += sizeof (dtrace_key_t) * (nkeys - 1);
10295		size += ksize;
10296
10297		/*
10298		 * Now we need to determine the size of the stored data.
10299		 */
10300		id = DIF_INSTR_VAR(instr);
10301
10302		for (i = 0; i < dp->dtdo_varlen; i++) {
10303			dtrace_difv_t *v = &dp->dtdo_vartab[i];
10304
10305			if (v->dtdv_id == id && v->dtdv_scope == scope) {
10306				size += v->dtdv_type.dtdt_size;
10307				break;
10308			}
10309		}
10310
10311		if (i == dp->dtdo_varlen)
10312			return;
10313
10314		/*
10315		 * We have the size.  If this is larger than the chunk size
10316		 * for our dynamic variable state, reset the chunk size.
10317		 */
10318		size = P2ROUNDUP(size, sizeof (uint64_t));
10319
10320		if (size > vstate->dtvs_dynvars.dtds_chunksize)
10321			vstate->dtvs_dynvars.dtds_chunksize = size;
10322	}
10323}
10324
10325static void
10326dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10327{
10328	int i, oldsvars, osz, nsz, otlocals, ntlocals;
10329	uint_t id;
10330
10331	ASSERT(MUTEX_HELD(&dtrace_lock));
10332	ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10333
10334	for (i = 0; i < dp->dtdo_varlen; i++) {
10335		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10336		dtrace_statvar_t *svar, ***svarp = NULL;
10337		size_t dsize = 0;
10338		uint8_t scope = v->dtdv_scope;
10339		int *np = NULL;
10340
10341		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10342			continue;
10343
10344		id -= DIF_VAR_OTHER_UBASE;
10345
10346		switch (scope) {
10347		case DIFV_SCOPE_THREAD:
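			/*
			 * Grow the thread-local variable table by
			 * doubling until this variable's id fits.
			 */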
10348			while (id >= (otlocals = vstate->dtvs_ntlocals)) {
10349				dtrace_difv_t *tlocals;
10350
10351				if ((ntlocals = (otlocals << 1)) == 0)
10352					ntlocals = 1;
10353
10354				osz = otlocals * sizeof (dtrace_difv_t);
10355				nsz = ntlocals * sizeof (dtrace_difv_t);
10356
10357				tlocals = kmem_zalloc(nsz, KM_SLEEP);
10358
10359				if (osz != 0) {
10360					bcopy(vstate->dtvs_tlocals,
10361					    tlocals, osz);
10362					kmem_free(vstate->dtvs_tlocals, osz);
10363				}
10364
10365				vstate->dtvs_tlocals = tlocals;
10366				vstate->dtvs_ntlocals = ntlocals;
10367			}
10368
10369			vstate->dtvs_tlocals[id] = *v;
10370			continue;
10371
10372		case DIFV_SCOPE_LOCAL:
10373			np = &vstate->dtvs_nlocals;
10374			svarp = &vstate->dtvs_locals;
10375
10376			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10377				dsize = NCPU * (v->dtdv_type.dtdt_size +
10378				    sizeof (uint64_t));
10379			else
10380				dsize = NCPU * sizeof (uint64_t);
10381
10382			break;
10383
10384		case DIFV_SCOPE_GLOBAL:
10385			np = &vstate->dtvs_nglobals;
10386			svarp = &vstate->dtvs_globals;
10387
10388			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10389				dsize = v->dtdv_type.dtdt_size +
10390				    sizeof (uint64_t);
10391
10392			break;
10393
10394		default:
10395			ASSERT(0);
10396		}
10397
10398		while (id >= (oldsvars = *np)) {
10399			dtrace_statvar_t **statics;
10400			int newsvars, oldsize, newsize;
10401
10402			if ((newsvars = (oldsvars << 1)) == 0)
10403				newsvars = 1;
10404
10405			oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10406			newsize = newsvars * sizeof (dtrace_statvar_t *);
10407
10408			statics = kmem_zalloc(newsize, KM_SLEEP);
10409
10410			if (oldsize != 0) {
10411				bcopy(*svarp, statics, oldsize);
10412				kmem_free(*svarp, oldsize);
10413			}
10414
10415			*svarp = statics;
10416			*np = newsvars;
10417		}
10418
10419		if ((svar = (*svarp)[id]) == NULL) {
10420			svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10421			svar->dtsv_var = *v;
10422
10423			if ((svar->dtsv_size = dsize) != 0) {
10424				svar->dtsv_data = (uint64_t)(uintptr_t)
10425				    kmem_zalloc(dsize, KM_SLEEP);
10426			}
10427
10428			(*svarp)[id] = svar;
10429		}
10430
10431		svar->dtsv_refcnt++;
10432	}
10433
10434	dtrace_difo_chunksize(dp, vstate);
10435	dtrace_difo_hold(dp);
10436}
10437
10438static dtrace_difo_t *
10439dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10440{
10441	dtrace_difo_t *new;
10442	size_t sz;
10443
10444	ASSERT(dp->dtdo_buf != NULL);
10445	ASSERT(dp->dtdo_refcnt != 0);
10446
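	/*
	 * Deep-copy the instruction buffer and the integer, string and
	 * variable tables; dtrace_difo_init() below takes the hold on
	 * the new DIFO.
	 */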
10447	new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10448
10449	ASSERT(dp->dtdo_buf != NULL);
10450	sz = dp->dtdo_len * sizeof (dif_instr_t);
10451	new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10452	bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10453	new->dtdo_len = dp->dtdo_len;
10454
10455	if (dp->dtdo_strtab != NULL) {
10456		ASSERT(dp->dtdo_strlen != 0);
10457		new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10458		bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10459		new->dtdo_strlen = dp->dtdo_strlen;
10460	}
10461
10462	if (dp->dtdo_inttab != NULL) {
10463		ASSERT(dp->dtdo_intlen != 0);
10464		sz = dp->dtdo_intlen * sizeof (uint64_t);
10465		new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10466		bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10467		new->dtdo_intlen = dp->dtdo_intlen;
10468	}
10469
10470	if (dp->dtdo_vartab != NULL) {
10471		ASSERT(dp->dtdo_varlen != 0);
10472		sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10473		new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10474		bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10475		new->dtdo_varlen = dp->dtdo_varlen;
10476	}
10477
10478	dtrace_difo_init(new, vstate);
10479	return (new);
10480}
10481
10482static void
10483dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10484{
10485	int i;
10486
10487	ASSERT(dp->dtdo_refcnt == 0);
10488
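	/*
	 * Drop this DIFO's reference on each static variable that it
	 * uses, freeing the variable's storage when the last reference
	 * goes away.
	 */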
10489	for (i = 0; i < dp->dtdo_varlen; i++) {
10490		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10491		dtrace_statvar_t *svar, **svarp = NULL;
10492		uint_t id;
10493		uint8_t scope = v->dtdv_scope;
10494		int *np = NULL;
10495
10496		switch (scope) {
10497		case DIFV_SCOPE_THREAD:
10498			continue;
10499
10500		case DIFV_SCOPE_LOCAL:
10501			np = &vstate->dtvs_nlocals;
10502			svarp = vstate->dtvs_locals;
10503			break;
10504
10505		case DIFV_SCOPE_GLOBAL:
10506			np = &vstate->dtvs_nglobals;
10507			svarp = vstate->dtvs_globals;
10508			break;
10509
10510		default:
10511			ASSERT(0);
10512		}
10513
10514		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10515			continue;
10516
10517		id -= DIF_VAR_OTHER_UBASE;
10518		ASSERT(id < *np);
10519
10520		svar = svarp[id];
10521		ASSERT(svar != NULL);
10522		ASSERT(svar->dtsv_refcnt > 0);
10523
10524		if (--svar->dtsv_refcnt > 0)
10525			continue;
10526
10527		if (svar->dtsv_size != 0) {
10528			ASSERT(svar->dtsv_data != 0);
10529			kmem_free((void *)(uintptr_t)svar->dtsv_data,
10530			    svar->dtsv_size);
10531		}
10532
10533		kmem_free(svar, sizeof (dtrace_statvar_t));
10534		svarp[id] = NULL;
10535	}
10536
10537	if (dp->dtdo_buf != NULL)
10538		kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10539	if (dp->dtdo_inttab != NULL)
10540		kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10541	if (dp->dtdo_strtab != NULL)
10542		kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10543	if (dp->dtdo_vartab != NULL)
10544		kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10545
10546	kmem_free(dp, sizeof (dtrace_difo_t));
10547}
10548
10549static void
10550dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10551{
10552	int i;
10553
10554	ASSERT(MUTEX_HELD(&dtrace_lock));
10555	ASSERT(dp->dtdo_refcnt != 0);
10556
10557	for (i = 0; i < dp->dtdo_varlen; i++) {
10558		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10559
10560		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10561			continue;
10562
10563		ASSERT(dtrace_vtime_references > 0);
10564		if (--dtrace_vtime_references == 0)
10565			dtrace_vtime_disable();
10566	}
10567
10568	if (--dp->dtdo_refcnt == 0)
10569		dtrace_difo_destroy(dp, vstate);
10570}
10571
10572/*
10573 * DTrace Format Functions
10574 */
10575static uint16_t
10576dtrace_format_add(dtrace_state_t *state, char *str)
10577{
10578	char *fmt, **new;
10579	uint16_t ndx, len = strlen(str) + 1;
10580
10581	fmt = kmem_zalloc(len, KM_SLEEP);
10582	bcopy(str, fmt, len);
10583
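	/*
	 * Reuse a vacated slot if one exists.  Format indices are
	 * 1-based; 0 is reserved to mean "no format".
	 */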
10584	for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10585		if (state->dts_formats[ndx] == NULL) {
10586			state->dts_formats[ndx] = fmt;
10587			return (ndx + 1);
10588		}
10589	}
10590
10591	if (state->dts_nformats == USHRT_MAX) {
10592		/*
10593		 * This is only likely if a denial-of-service attack is being
10594		 * attempted.  As such, it's okay to fail silently here.
10595		 */
10596		kmem_free(fmt, len);
10597		return (0);
10598	}
10599
10600	/*
10601	 * For simplicity, we always resize the formats array to be exactly the
10602	 * number of formats.
10603	 */
10604	ndx = state->dts_nformats++;
10605	new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10606
10607	if (state->dts_formats != NULL) {
10608		ASSERT(ndx != 0);
10609		bcopy(state->dts_formats, new, ndx * sizeof (char *));
10610		kmem_free(state->dts_formats, ndx * sizeof (char *));
10611	}
10612
10613	state->dts_formats = new;
10614	state->dts_formats[ndx] = fmt;
10615
10616	return (ndx + 1);
10617}
10618
10619static void
10620dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10621{
10622	char *fmt;
10623
10624	ASSERT(state->dts_formats != NULL);
10625	ASSERT(format <= state->dts_nformats);
10626	ASSERT(state->dts_formats[format - 1] != NULL);
10627
10628	fmt = state->dts_formats[format - 1];
10629	kmem_free(fmt, strlen(fmt) + 1);
10630	state->dts_formats[format - 1] = NULL;
10631}
10632
10633static void
10634dtrace_format_destroy(dtrace_state_t *state)
10635{
10636	int i;
10637
10638	if (state->dts_nformats == 0) {
10639		ASSERT(state->dts_formats == NULL);
10640		return;
10641	}
10642
10643	ASSERT(state->dts_formats != NULL);
10644
10645	for (i = 0; i < state->dts_nformats; i++) {
10646		char *fmt = state->dts_formats[i];
10647
10648		if (fmt == NULL)
10649			continue;
10650
10651		kmem_free(fmt, strlen(fmt) + 1);
10652	}
10653
10654	kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10655	state->dts_nformats = 0;
10656	state->dts_formats = NULL;
10657}
10658
10659/*
10660 * DTrace Predicate Functions
10661 */
10662static dtrace_predicate_t *
10663dtrace_predicate_create(dtrace_difo_t *dp)
10664{
10665	dtrace_predicate_t *pred;
10666
10667	ASSERT(MUTEX_HELD(&dtrace_lock));
10668	ASSERT(dp->dtdo_refcnt != 0);
10669
10670	pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10671	pred->dtp_difo = dp;
10672	pred->dtp_refcnt = 1;
10673
10674	if (!dtrace_difo_cacheable(dp))
10675		return (pred);
10676
10677	if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10678		/*
10679		 * This is only theoretically possible -- we have had 2^32
10680		 * cacheable predicates on this machine.  We cannot allow any
10681		 * more predicates to become cacheable:  as unlikely as it is,
10682		 * there may be a thread caching a (now stale) predicate cache
10683		 * ID. (N.B.: the temptation is being successfully resisted to
10684		 * have this cmn_err() "Holy shit -- we executed this code!")
10685		 */
10686		return (pred);
10687	}
10688
10689	pred->dtp_cacheid = dtrace_predcache_id++;
10690
10691	return (pred);
10692}
10693
10694static void
10695dtrace_predicate_hold(dtrace_predicate_t *pred)
10696{
10697	ASSERT(MUTEX_HELD(&dtrace_lock));
10698	ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10699	ASSERT(pred->dtp_refcnt > 0);
10700
10701	pred->dtp_refcnt++;
10702}
10703
10704static void
10705dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10706{
10707	dtrace_difo_t *dp = pred->dtp_difo;
10708
10709	ASSERT(MUTEX_HELD(&dtrace_lock));
10710	ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10711	ASSERT(pred->dtp_refcnt > 0);
10712
10713	if (--pred->dtp_refcnt == 0) {
10714		dtrace_difo_release(pred->dtp_difo, vstate);
10715		kmem_free(pred, sizeof (dtrace_predicate_t));
10716	}
10717}
10718
10719/*
10720 * DTrace Action Description Functions
10721 */
10722static dtrace_actdesc_t *
10723dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10724    uint64_t uarg, uint64_t arg)
10725{
10726	dtrace_actdesc_t *act;
10727
10728#if defined(sun)
10729	ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
10730	    arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
10731#endif
10732
10733	act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10734	act->dtad_kind = kind;
10735	act->dtad_ntuple = ntuple;
10736	act->dtad_uarg = uarg;
10737	act->dtad_arg = arg;
10738	act->dtad_refcnt = 1;
10739
10740	return (act);
10741}
10742
10743static void
10744dtrace_actdesc_hold(dtrace_actdesc_t *act)
10745{
10746	ASSERT(act->dtad_refcnt >= 1);
10747	act->dtad_refcnt++;
10748}
10749
10750static void
10751dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10752{
10753	dtrace_actkind_t kind = act->dtad_kind;
10754	dtrace_difo_t *dp;
10755
10756	ASSERT(act->dtad_refcnt >= 1);
10757
10758	if (--act->dtad_refcnt != 0)
10759		return;
10760
10761	if ((dp = act->dtad_difo) != NULL)
10762		dtrace_difo_release(dp, vstate);
10763
10764	if (DTRACEACT_ISPRINTFLIKE(kind)) {
10765		char *str = (char *)(uintptr_t)act->dtad_arg;
10766
10767#if defined(sun)
10768		ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10769		    (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10770#endif
10771
10772		if (str != NULL)
10773			kmem_free(str, strlen(str) + 1);
10774	}
10775
10776	kmem_free(act, sizeof (dtrace_actdesc_t));
10777}
10778
10779/*
10780 * DTrace ECB Functions
10781 */
10782static dtrace_ecb_t *
10783dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10784{
10785	dtrace_ecb_t *ecb;
10786	dtrace_epid_t epid;
10787
10788	ASSERT(MUTEX_HELD(&dtrace_lock));
10789
10790	ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10791	ecb->dte_predicate = NULL;
10792	ecb->dte_probe = probe;
10793
10794	/*
10795	 * The default size is the size of the default action: recording
10796	 * the header.
10797	 */
10798	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10799	ecb->dte_alignment = sizeof (dtrace_epid_t);
10800
10801	epid = state->dts_epid++;
10802
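	/*
	 * Grow the ECB array by doubling.  Probe context indexes this
	 * array without locks, hence the producer barriers below (and
	 * the dtrace_sync() when the state is active).
	 */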
10803	if (epid - 1 >= state->dts_necbs) {
10804		dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10805		int necbs = state->dts_necbs << 1;
10806
10807		ASSERT(epid == state->dts_necbs + 1);
10808
10809		if (necbs == 0) {
10810			ASSERT(oecbs == NULL);
10811			necbs = 1;
10812		}
10813
10814		ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10815
10816		if (oecbs != NULL)
10817			bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10818
10819		dtrace_membar_producer();
10820		state->dts_ecbs = ecbs;
10821
10822		if (oecbs != NULL) {
10823			/*
10824			 * If this state is active, we must dtrace_sync()
10825			 * before we can free the old dts_ecbs array:  we're
10826			 * coming in hot, and there may be active ring
10827			 * buffer processing (which indexes into the dts_ecbs
10828			 * array) on another CPU.
10829			 */
10830			if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10831				dtrace_sync();
10832
10833			kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10834		}
10835
10836		dtrace_membar_producer();
10837		state->dts_necbs = necbs;
10838	}
10839
10840	ecb->dte_state = state;
10841
10842	ASSERT(state->dts_ecbs[epid - 1] == NULL);
10843	dtrace_membar_producer();
10844	state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10845
10846	return (ecb);
10847}
10848
10849static void
10850dtrace_ecb_enable(dtrace_ecb_t *ecb)
10851{
10852	dtrace_probe_t *probe = ecb->dte_probe;
10853
10854	ASSERT(MUTEX_HELD(&cpu_lock));
10855	ASSERT(MUTEX_HELD(&dtrace_lock));
10856	ASSERT(ecb->dte_next == NULL);
10857
10858	if (probe == NULL) {
10859		/*
10860		 * This is the NULL probe -- there's nothing to do.
10861		 */
10862		return;
10863	}
10864
10865	if (probe->dtpr_ecb == NULL) {
10866		dtrace_provider_t *prov = probe->dtpr_provider;
10867
10868		/*
10869		 * We're the first ECB on this probe.
10870		 */
10871		probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10872
10873		if (ecb->dte_predicate != NULL)
10874			probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10875
10876		prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10877		    probe->dtpr_id, probe->dtpr_arg);
10878	} else {
10879		/*
10880		 * This probe is already active.  Swing the last pointer to
10881		 * point to the new ECB, and issue a dtrace_sync() to assure
10882		 * that all CPUs have seen the change.
10883		 */
10884		ASSERT(probe->dtpr_ecb_last != NULL);
10885		probe->dtpr_ecb_last->dte_next = ecb;
10886		probe->dtpr_ecb_last = ecb;
10887		probe->dtpr_predcache = 0;
10888
10889		dtrace_sync();
10890	}
10891}
10892
10893static void
10894dtrace_ecb_resize(dtrace_ecb_t *ecb)
10895{
10896	dtrace_action_t *act;
10897	uint32_t curneeded = UINT32_MAX;
10898	uint32_t aggbase = UINT32_MAX;
10899
10900	/*
10901	 * If we record anything, we always record the dtrace_rechdr_t.  (And
10902	 * we always record it first.)
10903	 */
10904	ecb->dte_size = sizeof (dtrace_rechdr_t);
10905	ecb->dte_alignment = sizeof (dtrace_epid_t);
10906
10907	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10908		dtrace_recdesc_t *rec = &act->dta_rec;
10909		ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10910
10911		ecb->dte_alignment = MAX(ecb->dte_alignment,
10912		    rec->dtrd_alignment);
10913
10914		if (DTRACEACT_ISAGG(act->dta_kind)) {
10915			dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10916
10917			ASSERT(rec->dtrd_size != 0);
10918			ASSERT(agg->dtag_first != NULL);
10919			ASSERT(act->dta_prev->dta_intuple);
10920			ASSERT(aggbase != UINT32_MAX);
10921			ASSERT(curneeded != UINT32_MAX);
10922
10923			agg->dtag_base = aggbase;
10924
10925			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10926			rec->dtrd_offset = curneeded;
10927			curneeded += rec->dtrd_size;
10928			ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
10929
10930			aggbase = UINT32_MAX;
10931			curneeded = UINT32_MAX;
10932		} else if (act->dta_intuple) {
10933			if (curneeded == UINT32_MAX) {
10934				/*
10935				 * This is the first record in a tuple.  Align
10936				 * curneeded to be at offset 4 in an 8-byte
10937				 * aligned block.
10938				 */
10939				ASSERT(act->dta_prev == NULL ||
10940				    !act->dta_prev->dta_intuple);
10941				ASSERT3U(aggbase, ==, UINT32_MAX);
10942				curneeded = P2PHASEUP(ecb->dte_size,
10943				    sizeof (uint64_t), sizeof (dtrace_aggid_t));
10944
10945				aggbase = curneeded - sizeof (dtrace_aggid_t);
10946				ASSERT(IS_P2ALIGNED(aggbase,
10947				    sizeof (uint64_t)));
10948			}
10949			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10950			rec->dtrd_offset = curneeded;
10951			curneeded += rec->dtrd_size;
10952		} else {
10953			/* tuples must be followed by an aggregation */
10954			ASSERT(act->dta_prev == NULL ||
10955			    !act->dta_prev->dta_intuple);
10956
10957			ecb->dte_size = P2ROUNDUP(ecb->dte_size,
10958			    rec->dtrd_alignment);
10959			rec->dtrd_offset = ecb->dte_size;
10960			ecb->dte_size += rec->dtrd_size;
10961			ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
10962		}
10963	}
10964
10965	if ((act = ecb->dte_action) != NULL &&
10966	    !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
10967	    ecb->dte_size == sizeof (dtrace_rechdr_t)) {
10968		/*
10969		 * If the size is still sizeof (dtrace_rechdr_t), then all
10970		 * actions store no data; set the size to 0.
10971		 */
10972		ecb->dte_size = 0;
10973	}
10974
10975	ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
10976	ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
10977	ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
10978	    ecb->dte_needed);
10979}
10980
10981static dtrace_action_t *
10982dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10983{
10984	dtrace_aggregation_t *agg;
10985	size_t size = sizeof (uint64_t);
10986	int ntuple = desc->dtad_ntuple;
10987	dtrace_action_t *act;
10988	dtrace_recdesc_t *frec;
10989	dtrace_aggid_t aggid;
10990	dtrace_state_t *state = ecb->dte_state;
10991
10992	agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10993	agg->dtag_ecb = ecb;
10994
10995	ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10996
10997	switch (desc->dtad_kind) {
10998	case DTRACEAGG_MIN:
10999		agg->dtag_initial = INT64_MAX;
11000		agg->dtag_aggregate = dtrace_aggregate_min;
11001		break;
11002
11003	case DTRACEAGG_MAX:
11004		agg->dtag_initial = INT64_MIN;
11005		agg->dtag_aggregate = dtrace_aggregate_max;
11006		break;
11007
11008	case DTRACEAGG_COUNT:
11009		agg->dtag_aggregate = dtrace_aggregate_count;
11010		break;
11011
11012	case DTRACEAGG_QUANTIZE:
11013		agg->dtag_aggregate = dtrace_aggregate_quantize;
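		/*
		 * Power-of-two quantization:  63 buckets for each sign
		 * plus one for zero, i.e. 127 64-bit counters in all.
		 */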
11014		size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
11015		    sizeof (uint64_t);
11016		break;
11017
11018	case DTRACEAGG_LQUANTIZE: {
11019		uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11020		uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11021
11022		agg->dtag_initial = desc->dtad_arg;
11023		agg->dtag_aggregate = dtrace_aggregate_lquantize;
11024
11025		if (step == 0 || levels == 0)
11026			goto err;
11027
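		/*
		 * One word for the encoded argument, plus one counter
		 * per level and (it appears) the underflow and overflow
		 * buckets.
		 */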
11028		size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
11029		break;
11030	}
11031
11032	case DTRACEAGG_LLQUANTIZE: {
11033		uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11034		uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11035		uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11036		uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11037		int64_t v;
11038
11039		agg->dtag_initial = desc->dtad_arg;
11040		agg->dtag_aggregate = dtrace_aggregate_llquantize;
11041
11042		if (factor < 2 || low >= high || nsteps < factor)
11043			goto err;
11044
11045		/*
11046		 * Now check that the number of steps evenly divides a power
11047		 * of the factor.  (This assures both integer bucket size and
11048		 * linearity within each magnitude.)
11049		 */
11050		for (v = factor; v < nsteps; v *= factor)
11051			continue;
11052
11053		if ((v % nsteps) || (nsteps % factor))
11054			goto err;
11055
11056		size = (dtrace_aggregate_llquantize_bucket(factor,
11057		    low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
11058		break;
11059	}
11060
11061	case DTRACEAGG_AVG:
11062		agg->dtag_aggregate = dtrace_aggregate_avg;
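		/* A running count and a running total. */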
11063		size = sizeof (uint64_t) * 2;
11064		break;
11065
11066	case DTRACEAGG_STDDEV:
11067		agg->dtag_aggregate = dtrace_aggregate_stddev;
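		/*
		 * A count, a sum, and a 128-bit sum of squares (stored
		 * as two 64-bit words).
		 */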
11068		size = sizeof (uint64_t) * 4;
11069		break;
11070
11071	case DTRACEAGG_SUM:
11072		agg->dtag_aggregate = dtrace_aggregate_sum;
11073		break;
11074
11075	default:
11076		goto err;
11077	}
11078
11079	agg->dtag_action.dta_rec.dtrd_size = size;
11080
11081	if (ntuple == 0)
11082		goto err;
11083
11084	/*
11085	 * We must make sure that we have enough actions for the n-tuple.
11086	 */
11087	for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11088		if (DTRACEACT_ISAGG(act->dta_kind))
11089			break;
11090
11091		if (--ntuple == 0) {
11092			/*
11093			 * This is the action with which our n-tuple begins.
11094			 */
11095			agg->dtag_first = act;
11096			goto success;
11097		}
11098	}
11099
11100	/*
11101	 * This n-tuple is short by ntuple elements.  Return failure.
11102	 */
11103	ASSERT(ntuple != 0);
11104err:
11105	kmem_free(agg, sizeof (dtrace_aggregation_t));
11106	return (NULL);
11107
11108success:
11109	/*
11110	 * If the last action in the tuple has a size of zero, it's actually
11111	 * an expression argument for the aggregating action.
11112	 */
11113	ASSERT(ecb->dte_action_last != NULL);
11114	act = ecb->dte_action_last;
11115
11116	if (act->dta_kind == DTRACEACT_DIFEXPR) {
11117		ASSERT(act->dta_difo != NULL);
11118
11119		if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11120			agg->dtag_hasarg = 1;
11121	}
11122
11123	/*
11124	 * We need to allocate an id for this aggregation.
11125	 */
11126#if defined(sun)
11127	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11128	    VM_BESTFIT | VM_SLEEP);
11129#else
11130	aggid = alloc_unr(state->dts_aggid_arena);
11131#endif
11132
11133	if (aggid - 1 >= state->dts_naggregations) {
11134		dtrace_aggregation_t **oaggs = state->dts_aggregations;
11135		dtrace_aggregation_t **aggs;
11136		int naggs = state->dts_naggregations << 1;
11137		int onaggs = state->dts_naggregations;
11138
11139		ASSERT(aggid == state->dts_naggregations + 1);
11140
11141		if (naggs == 0) {
11142			ASSERT(oaggs == NULL);
11143			naggs = 1;
11144		}
11145
11146		aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11147
11148		if (oaggs != NULL) {
11149			bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11150			kmem_free(oaggs, onaggs * sizeof (*aggs));
11151		}
11152
11153		state->dts_aggregations = aggs;
11154		state->dts_naggregations = naggs;
11155	}
11156
11157	ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11158	state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11159
11160	frec = &agg->dtag_first->dta_rec;
11161	if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11162		frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11163
11164	for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11165		ASSERT(!act->dta_intuple);
11166		act->dta_intuple = 1;
11167	}
11168
11169	return (&agg->dtag_action);
11170}
11171
11172static void
11173dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11174{
11175	dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11176	dtrace_state_t *state = ecb->dte_state;
11177	dtrace_aggid_t aggid = agg->dtag_id;
11178
11179	ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11180#if defined(sun)
11181	vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11182#else
11183	free_unr(state->dts_aggid_arena, aggid);
11184#endif
11185
11186	ASSERT(state->dts_aggregations[aggid - 1] == agg);
11187	state->dts_aggregations[aggid - 1] = NULL;
11188
11189	kmem_free(agg, sizeof (dtrace_aggregation_t));
11190}
11191
11192static int
11193dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11194{
11195	dtrace_action_t *action, *last;
11196	dtrace_difo_t *dp = desc->dtad_difo;
11197	uint32_t size = 0, align = sizeof (uint8_t), mask;
11198	uint16_t format = 0;
11199	dtrace_recdesc_t *rec;
11200	dtrace_state_t *state = ecb->dte_state;
11201	dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
11202	uint64_t arg = desc->dtad_arg;
11203
11204	ASSERT(MUTEX_HELD(&dtrace_lock));
11205	ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11206
11207	if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11208		/*
11209		 * If this is an aggregating action, there must be neither
11210		 * a speculate nor a commit on the action chain.
11211		 */
11212		dtrace_action_t *act;
11213
11214		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11215			if (act->dta_kind == DTRACEACT_COMMIT)
11216				return (EINVAL);
11217
11218			if (act->dta_kind == DTRACEACT_SPECULATE)
11219				return (EINVAL);
11220		}
11221
11222		action = dtrace_ecb_aggregation_create(ecb, desc);
11223
11224		if (action == NULL)
11225			return (EINVAL);
11226	} else {
11227		if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11228		    (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11229		    dp != NULL && dp->dtdo_destructive)) {
11230			state->dts_destructive = 1;
11231		}
11232
11233		switch (desc->dtad_kind) {
11234		case DTRACEACT_PRINTF:
11235		case DTRACEACT_PRINTA:
11236		case DTRACEACT_SYSTEM:
11237		case DTRACEACT_FREOPEN:
11238		case DTRACEACT_DIFEXPR:
11239			/*
11240			 * We know that our arg is a string -- turn it into a
11241			 * format.
11242			 */
11243			if (arg == 0) {
11244				ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11245				    desc->dtad_kind == DTRACEACT_DIFEXPR);
11246				format = 0;
11247			} else {
11248				ASSERT(arg != 0);
11249#if defined(sun)
11250				ASSERT(arg > KERNELBASE);
11251#endif
11252				format = dtrace_format_add(state,
11253				    (char *)(uintptr_t)arg);
11254			}
11255
11256			/*FALLTHROUGH*/
11257		case DTRACEACT_LIBACT:
11258		case DTRACEACT_TRACEMEM:
11259		case DTRACEACT_TRACEMEM_DYNSIZE:
11260			if (dp == NULL)
11261				return (EINVAL);
11262
11263			if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11264				break;
11265
11266			if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11267				if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11268					return (EINVAL);
11269
11270				size = opt[DTRACEOPT_STRSIZE];
11271			}
11272
11273			break;
11274
11275		case DTRACEACT_STACK:
11276			if ((nframes = arg) == 0) {
11277				nframes = opt[DTRACEOPT_STACKFRAMES];
11278				ASSERT(nframes > 0);
11279				arg = nframes;
11280			}
11281
11282			size = nframes * sizeof (pc_t);
11283			break;
11284
11285		case DTRACEACT_JSTACK:
11286			if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11287				strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11288
11289			if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11290				nframes = opt[DTRACEOPT_JSTACKFRAMES];
11291
11292			arg = DTRACE_USTACK_ARG(nframes, strsize);
11293
11294			/*FALLTHROUGH*/
11295		case DTRACEACT_USTACK:
11296			if (desc->dtad_kind != DTRACEACT_JSTACK &&
11297			    (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11298				strsize = DTRACE_USTACK_STRSIZE(arg);
11299				nframes = opt[DTRACEOPT_USTACKFRAMES];
11300				ASSERT(nframes > 0);
11301				arg = DTRACE_USTACK_ARG(nframes, strsize);
11302			}
11303
11304			/*
11305			 * Save a slot for the pid.
11306			 */
11307			size = (nframes + 1) * sizeof (uint64_t);
11308			size += DTRACE_USTACK_STRSIZE(arg);
11309			size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11310
11311			break;
11312
11313		case DTRACEACT_SYM:
11314		case DTRACEACT_MOD:
11315			if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11316			    sizeof (uint64_t)) ||
11317			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11318				return (EINVAL);
11319			break;
11320
11321		case DTRACEACT_USYM:
11322		case DTRACEACT_UMOD:
11323		case DTRACEACT_UADDR:
11324			if (dp == NULL ||
11325			    (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11326			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11327				return (EINVAL);
11328
11329			/*
11330			 * We have a slot for the pid, plus a slot for the
11331			 * argument.  To keep things simple (aligned with
11332			 * bitness-neutral sizing), we store each as a 64-bit
11333			 * quantity.
11334			 */
11335			size = 2 * sizeof (uint64_t);
11336			break;
11337
11338		case DTRACEACT_STOP:
11339		case DTRACEACT_BREAKPOINT:
11340		case DTRACEACT_PANIC:
11341			break;
11342
11343		case DTRACEACT_CHILL:
11344		case DTRACEACT_DISCARD:
11345		case DTRACEACT_RAISE:
11346			if (dp == NULL)
11347				return (EINVAL);
11348			break;
11349
11350		case DTRACEACT_EXIT:
11351			if (dp == NULL ||
11352			    (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11353			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11354				return (EINVAL);
11355			break;
11356
11357		case DTRACEACT_SPECULATE:
11358			if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11359				return (EINVAL);
11360
11361			if (dp == NULL)
11362				return (EINVAL);
11363
11364			state->dts_speculates = 1;
11365			break;
11366
11367		case DTRACEACT_PRINTM:
11368		case DTRACEACT_PRINTT:
11369			if (dp == NULL)
11370				return (EINVAL);
11371
11372			size = dp->dtdo_rtype.dtdt_size;
11373			break;
11374
11375		case DTRACEACT_COMMIT: {
11376			dtrace_action_t *act = ecb->dte_action;
11377
11378			for (; act != NULL; act = act->dta_next) {
11379				if (act->dta_kind == DTRACEACT_COMMIT)
11380					return (EINVAL);
11381			}
11382
11383			if (dp == NULL)
11384				return (EINVAL);
11385			break;
11386		}
11387
11388		default:
11389			return (EINVAL);
11390		}
11391
11392		if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11393			/*
11394			 * If this is a data-storing action or a speculate,
11395			 * we must be sure that there isn't a commit on the
11396			 * action chain.
11397			 */
11398			dtrace_action_t *act = ecb->dte_action;
11399
11400			for (; act != NULL; act = act->dta_next) {
11401				if (act->dta_kind == DTRACEACT_COMMIT)
11402					return (EINVAL);
11403			}
11404		}
11405
11406		action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11407		action->dta_rec.dtrd_size = size;
11408	}
11409
11410	action->dta_refcnt = 1;
11411	rec = &action->dta_rec;
11412	size = rec->dtrd_size;
11413
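	/*
	 * Derive the record alignment from its size:  use the largest
	 * power of two (up to eight bytes) that evenly divides the
	 * size.
	 */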
11414	for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11415		if (!(size & mask)) {
11416			align = mask + 1;
11417			break;
11418		}
11419	}
11420
11421	action->dta_kind = desc->dtad_kind;
11422
11423	if ((action->dta_difo = dp) != NULL)
11424		dtrace_difo_hold(dp);
11425
11426	rec->dtrd_action = action->dta_kind;
11427	rec->dtrd_arg = arg;
11428	rec->dtrd_uarg = desc->dtad_uarg;
11429	rec->dtrd_alignment = (uint16_t)align;
11430	rec->dtrd_format = format;
11431
11432	if ((last = ecb->dte_action_last) != NULL) {
11433		ASSERT(ecb->dte_action != NULL);
11434		action->dta_prev = last;
11435		last->dta_next = action;
11436	} else {
11437		ASSERT(ecb->dte_action == NULL);
11438		ecb->dte_action = action;
11439	}
11440
11441	ecb->dte_action_last = action;
11442
11443	return (0);
11444}
11445
11446static void
11447dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11448{
11449	dtrace_action_t *act = ecb->dte_action, *next;
11450	dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11451	dtrace_difo_t *dp;
11452	uint16_t format;
11453
11454	if (act != NULL && act->dta_refcnt > 1) {
11455		ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11456		act->dta_refcnt--;
11457	} else {
11458		for (; act != NULL; act = next) {
11459			next = act->dta_next;
11460			ASSERT(next != NULL || act == ecb->dte_action_last);
11461			ASSERT(act->dta_refcnt == 1);
11462
11463			if ((format = act->dta_rec.dtrd_format) != 0)
11464				dtrace_format_remove(ecb->dte_state, format);
11465
11466			if ((dp = act->dta_difo) != NULL)
11467				dtrace_difo_release(dp, vstate);
11468
11469			if (DTRACEACT_ISAGG(act->dta_kind)) {
11470				dtrace_ecb_aggregation_destroy(ecb, act);
11471			} else {
11472				kmem_free(act, sizeof (dtrace_action_t));
11473			}
11474		}
11475	}
11476
11477	ecb->dte_action = NULL;
11478	ecb->dte_action_last = NULL;
11479	ecb->dte_size = 0;
11480}
11481
11482static void
11483dtrace_ecb_disable(dtrace_ecb_t *ecb)
11484{
11485	/*
11486	 * We disable the ECB by removing it from its probe.
11487	 */
11488	dtrace_ecb_t *pecb, *prev = NULL;
11489	dtrace_probe_t *probe = ecb->dte_probe;
11490
11491	ASSERT(MUTEX_HELD(&dtrace_lock));
11492
11493	if (probe == NULL) {
11494		/*
11495		 * This is the NULL probe; there is nothing to disable.
11496		 */
11497		return;
11498	}
11499
11500	for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11501		if (pecb == ecb)
11502			break;
11503		prev = pecb;
11504	}
11505
11506	ASSERT(pecb != NULL);
11507
11508	if (prev == NULL) {
11509		probe->dtpr_ecb = ecb->dte_next;
11510	} else {
11511		prev->dte_next = ecb->dte_next;
11512	}
11513
11514	if (ecb == probe->dtpr_ecb_last) {
11515		ASSERT(ecb->dte_next == NULL);
11516		probe->dtpr_ecb_last = prev;
11517	}
11518
11519	/*
11520	 * The ECB has been disconnected from the probe; now sync to assure
11521	 * that all CPUs have seen the change before returning.
11522	 */
11523	dtrace_sync();
11524
11525	if (probe->dtpr_ecb == NULL) {
11526		/*
11527		 * That was the last ECB on the probe; clear the predicate
11528		 * cache ID for the probe, disable it and sync one more time
11529		 * to assure that we'll never hit it again.
11530		 */
11531		dtrace_provider_t *prov = probe->dtpr_provider;
11532
11533		ASSERT(ecb->dte_next == NULL);
11534		ASSERT(probe->dtpr_ecb_last == NULL);
11535		probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11536		prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11537		    probe->dtpr_id, probe->dtpr_arg);
11538		dtrace_sync();
11539	} else {
11540		/*
11541		 * There is at least one ECB remaining on the probe.  If there
11542		 * is _exactly_ one, set the probe's predicate cache ID to be
11543		 * the predicate cache ID of the remaining ECB.
11544		 */
11545		ASSERT(probe->dtpr_ecb_last != NULL);
11546		ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11547
11548		if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11549			dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11550
11551			ASSERT(probe->dtpr_ecb->dte_next == NULL);
11552
11553			if (p != NULL)
11554				probe->dtpr_predcache = p->dtp_cacheid;
11555		}
11556
11557		ecb->dte_next = NULL;
11558	}
11559}
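
/*
 * A note on the synchronization above:  the first dtrace_sync()
 * guarantees that no CPU is still executing a dtrace_probe() that
 * could see the unlinked ECB; the second dtrace_sync() (performed only
 * once the probe has no ECBs left) guarantees that the provider's
 * disable entry point has taken effect everywhere before the caller
 * proceeds to destroy the ECB.
 */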
11560
11561static void
11562dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11563{
11564	dtrace_state_t *state = ecb->dte_state;
11565	dtrace_vstate_t *vstate = &state->dts_vstate;
11566	dtrace_predicate_t *pred;
11567	dtrace_epid_t epid = ecb->dte_epid;
11568
11569	ASSERT(MUTEX_HELD(&dtrace_lock));
11570	ASSERT(ecb->dte_next == NULL);
11571	ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11572
11573	if ((pred = ecb->dte_predicate) != NULL)
11574		dtrace_predicate_release(pred, vstate);
11575
11576	dtrace_ecb_action_remove(ecb);
11577
11578	ASSERT(state->dts_ecbs[epid - 1] == ecb);
11579	state->dts_ecbs[epid - 1] = NULL;
11580
11581	kmem_free(ecb, sizeof (dtrace_ecb_t));
11582}
11583
11584static dtrace_ecb_t *
11585dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11586    dtrace_enabling_t *enab)
11587{
11588	dtrace_ecb_t *ecb;
11589	dtrace_predicate_t *pred;
11590	dtrace_actdesc_t *act;
11591	dtrace_provider_t *prov;
11592	dtrace_ecbdesc_t *desc = enab->dten_current;
11593
11594	ASSERT(MUTEX_HELD(&dtrace_lock));
11595	ASSERT(state != NULL);
11596
11597	ecb = dtrace_ecb_add(state, probe);
11598	ecb->dte_uarg = desc->dted_uarg;
11599
11600	if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11601		dtrace_predicate_hold(pred);
11602		ecb->dte_predicate = pred;
11603	}
11604
11605	if (probe != NULL) {
11606		/*
11607		 * If the provider shows more leg than the consumer is old
11608		 * enough to see, we need to enable the appropriate implicit
11609		 * predicate bits to prevent the ecb from activating at
11610		 * revealing times.
11611		 *
11612		 * Providers specifying DTRACE_PRIV_USER at register time
11613		 * are stating that they need the /proc-style privilege
11614		 * model to be enforced, and this is what DTRACE_COND_OWNER
11615		 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11616		 */
11617		prov = probe->dtpr_provider;
11618		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11619		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11620			ecb->dte_cond |= DTRACE_COND_OWNER;
11621
11622		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11623		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11624			ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11625
11626		/*
11627		 * If the provider shows us kernel innards and the user
11628		 * is lacking sufficient privilege, enable the
11629		 * DTRACE_COND_USERMODE implicit predicate.
11630		 */
11631		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11632		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11633			ecb->dte_cond |= DTRACE_COND_USERMODE;
11634	}
11635
11636	if (dtrace_ecb_create_cache != NULL) {
11637		/*
11638		 * If we have a cached ecb, we'll use its action list instead
11639		 * of creating our own (saving both time and space).
11640		 */
11641		dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11642		dtrace_action_t *act = cached->dte_action;
11643
11644		if (act != NULL) {
11645			ASSERT(act->dta_refcnt > 0);
11646			act->dta_refcnt++;
11647			ecb->dte_action = act;
11648			ecb->dte_action_last = cached->dte_action_last;
11649			ecb->dte_needed = cached->dte_needed;
11650			ecb->dte_size = cached->dte_size;
11651			ecb->dte_alignment = cached->dte_alignment;
11652		}
11653
11654		return (ecb);
11655	}
11656
11657	for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11658		if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11659			dtrace_ecb_destroy(ecb);
11660			return (NULL);
11661		}
11662	}
11663
11664	dtrace_ecb_resize(ecb);
11665
11666	return (dtrace_ecb_create_cache = ecb);
11667}
11668
11669static int
11670dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
11671{
11672	dtrace_ecb_t *ecb;
11673	dtrace_enabling_t *enab = arg;
11674	dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11675
11676	ASSERT(state != NULL);
11677
11678	if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
11679		/*
11680		 * This probe was created in a generation for which this
11681		 * enabling has previously created ECBs; we don't want to
11682		 * enable it again, so just kick out.
11683		 */
11684		return (DTRACE_MATCH_NEXT);
11685	}
11686
11687	if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11688		return (DTRACE_MATCH_DONE);
11689
11690	dtrace_ecb_enable(ecb);
11691	return (DTRACE_MATCH_NEXT);
11692}
11693
11694static dtrace_ecb_t *
11695dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11696{
11697	dtrace_ecb_t *ecb;
11698
11699	ASSERT(MUTEX_HELD(&dtrace_lock));
11700
11701	if (id == 0 || id > state->dts_necbs)
11702		return (NULL);
11703
11704	ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11705	ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11706
11707	return (state->dts_ecbs[id - 1]);
11708}
11709
11710static dtrace_aggregation_t *
11711dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11712{
11713	dtrace_aggregation_t *agg;
11714
11715	ASSERT(MUTEX_HELD(&dtrace_lock));
11716
11717	if (id == 0 || id > state->dts_naggregations)
11718		return (NULL);
11719
11720	ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11721	ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11722	    agg->dtag_id == id);
11723
11724	return (state->dts_aggregations[id - 1]);
11725}
11726
11727/*
11728 * DTrace Buffer Functions
11729 *
11730 * The following functions manipulate DTrace buffers.  Most of these functions
11731 * are called in the context of establishing or processing consumer state;
11732 * exceptions are explicitly noted.
11733 */
11734
11735/*
11736 * Note:  called from cross call context.  This function switches the two
11737 * buffers on a given CPU.  The atomicity of this operation is assured by
11738 * disabling interrupts while the actual switch takes place; the disabling of
11739 * interrupts serializes the execution with any execution of dtrace_probe() on
11740 * the same CPU.
11741 */
11742static void
11743dtrace_buffer_switch(dtrace_buffer_t *buf)
11744{
11745	caddr_t tomax = buf->dtb_tomax;
11746	caddr_t xamot = buf->dtb_xamot;
11747	dtrace_icookie_t cookie;
11748	hrtime_t now;
11749
11750	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11751	ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11752
11753	cookie = dtrace_interrupt_disable();
11754	now = dtrace_gethrtime();
11755	buf->dtb_tomax = xamot;
11756	buf->dtb_xamot = tomax;
11757	buf->dtb_xamot_drops = buf->dtb_drops;
11758	buf->dtb_xamot_offset = buf->dtb_offset;
11759	buf->dtb_xamot_errors = buf->dtb_errors;
11760	buf->dtb_xamot_flags = buf->dtb_flags;
11761	buf->dtb_offset = 0;
11762	buf->dtb_drops = 0;
11763	buf->dtb_errors = 0;
11764	buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11765	buf->dtb_interval = now - buf->dtb_switched;
11766	buf->dtb_switched = now;
11767	dtrace_interrupt_enable(cookie);
11768}
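
/*
 * A sketch of how this switch is typically driven (the authoritative
 * consumer-facing path is the DTRACEIOC_BUFSNAP handling in
 * dtrace_ioctl()):  because the switch must be serialized against
 * dtrace_probe() on the buffer's CPU, it is always invoked via a cross
 * call to that CPU, e.g.:
 *
 *	buf = &state->dts_buffer[cpu];
 *	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
 */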
11769
11770/*
11771 * Note:  called from cross call context.  This function activates a buffer
11772 * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
11773 * is guaranteed by the disabling of interrupts.
11774 */
11775static void
11776dtrace_buffer_activate(dtrace_state_t *state)
11777{
11778	dtrace_buffer_t *buf;
11779	dtrace_icookie_t cookie = dtrace_interrupt_disable();
11780
11781	buf = &state->dts_buffer[curcpu];
11782
11783	if (buf->dtb_tomax != NULL) {
11784		/*
11785		 * We might like to assert that the buffer is marked inactive,
11786		 * but this isn't necessarily true:  the CPU that processes
11787		 * the BEGIN probe has its buffer activated manually.  In this
11788		 * case, we take the (harmless) action of re-clearing the
11789		 * INACTIVE bit.
11790		 */
11791		buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11792	}
11793
11794	dtrace_interrupt_enable(cookie);
11795}
11796
11797static int
11798dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
11799    processorid_t cpu, int *factor)
11800{
11801#if defined(sun)
11802	cpu_t *cp;
11803#endif
11804	dtrace_buffer_t *buf;
11805	int allocated = 0, desired = 0;
11806
11807#if defined(sun)
11808	ASSERT(MUTEX_HELD(&cpu_lock));
11809	ASSERT(MUTEX_HELD(&dtrace_lock));
11810
11811	*factor = 1;
11812
11813	if (size > dtrace_nonroot_maxsize &&
11814	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11815		return (EFBIG);
11816
11817	cp = cpu_list;
11818
11819	do {
11820		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11821			continue;
11822
11823		buf = &bufs[cp->cpu_id];
11824
11825		/*
11826		 * If a buffer is already allocated for this CPU, it can only
11827		 * be a DR event; the buffer size must match our specified size.
11828		 */
11829		if (buf->dtb_tomax != NULL) {
11830			ASSERT(buf->dtb_size == size);
11831			continue;
11832		}
11833
11834		ASSERT(buf->dtb_xamot == NULL);
11835
11836		if ((buf->dtb_tomax = kmem_zalloc(size,
11837		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11838			goto err;
11839
11840		buf->dtb_size = size;
11841		buf->dtb_flags = flags;
11842		buf->dtb_offset = 0;
11843		buf->dtb_drops = 0;
11844
11845		if (flags & DTRACEBUF_NOSWITCH)
11846			continue;
11847
11848		if ((buf->dtb_xamot = kmem_zalloc(size,
11849		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11850			goto err;
11851	} while ((cp = cp->cpu_next) != cpu_list);
11852
11853	return (0);
11854
11855err:
11856	cp = cpu_list;
11857
11858	do {
11859		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11860			continue;
11861
11862		buf = &bufs[cp->cpu_id];
11863		desired += 2;
11864
11865		if (buf->dtb_xamot != NULL) {
11866			ASSERT(buf->dtb_tomax != NULL);
11867			ASSERT(buf->dtb_size == size);
11868			kmem_free(buf->dtb_xamot, size);
11869			allocated++;
11870		}
11871
11872		if (buf->dtb_tomax != NULL) {
11873			ASSERT(buf->dtb_size == size);
11874			kmem_free(buf->dtb_tomax, size);
11875			allocated++;
11876		}
11877
11878		buf->dtb_tomax = NULL;
11879		buf->dtb_xamot = NULL;
11880		buf->dtb_size = 0;
11881	} while ((cp = cp->cpu_next) != cpu_list);
11882#else
11883	int i;
11884
11885	*factor = 1;
11886#if defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
11887	/*
11888	 * FreeBSD isn't good at limiting the amount of memory we
11889	 * ask to malloc, so let's place a limit here before trying
11890	 * to do something that might well end in tears at bedtime.
11891	 */
11892	if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
11893		return (ENOMEM);
11894#endif
11895
11896	ASSERT(MUTEX_HELD(&dtrace_lock));
11897	CPU_FOREACH(i) {
11898		if (cpu != DTRACE_CPUALL && cpu != i)
11899			continue;
11900
11901		buf = &bufs[i];
11902
11903		/*
11904		 * If there is already a buffer allocated for this CPU, it
11905		 * is only possible that this is a DR event.  In this case,
11906		 * the buffer size must match our specified size.
11907		 */
11908		if (buf->dtb_tomax != NULL) {
11909			ASSERT(buf->dtb_size == size);
11910			continue;
11911		}
11912
11913		ASSERT(buf->dtb_xamot == NULL);
11914
11915		if ((buf->dtb_tomax = kmem_zalloc(size,
11916		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11917			goto err;
11918
11919		buf->dtb_size = size;
11920		buf->dtb_flags = flags;
11921		buf->dtb_offset = 0;
11922		buf->dtb_drops = 0;
11923
11924		if (flags & DTRACEBUF_NOSWITCH)
11925			continue;
11926
11927		if ((buf->dtb_xamot = kmem_zalloc(size,
11928		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11929			goto err;
11930	}
11931
11932	return (0);
11933
11934err:
11935	/*
11936	 * Error allocating memory, so free the buffers that were
11937	 * allocated before the failed allocation.
11938	 */
11939	CPU_FOREACH(i) {
11940		if (cpu != DTRACE_CPUALL && cpu != i)
11941			continue;
11942
11943		buf = &bufs[i];
11944		desired += 2;
11945
11946		if (buf->dtb_xamot != NULL) {
11947			ASSERT(buf->dtb_tomax != NULL);
11948			ASSERT(buf->dtb_size == size);
11949			kmem_free(buf->dtb_xamot, size);
11950			allocated++;
11951		}
11952
11953		if (buf->dtb_tomax != NULL) {
11954			ASSERT(buf->dtb_size == size);
11955			kmem_free(buf->dtb_tomax, size);
11956			allocated++;
11957		}
11958
11959		buf->dtb_tomax = NULL;
11960		buf->dtb_xamot = NULL;
11961		buf->dtb_size = 0;
11962
11963	}
11964#endif
11965	*factor = desired / (allocated > 0 ? allocated : 1);
11966
11967	return (ENOMEM);
11968}
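
/*
 * On ENOMEM, *factor reports how far short of the request we fell
 * (buffers desired over buffers successfully allocated).  A minimal
 * sketch of a back-off retry, assuming hypothetical "size" and "min"
 * variables -- the real policy, which also honors the "bufresize"
 * option, lives in dtrace_state_buffer():
 *
 *	while ((rval = dtrace_buffer_alloc(bufs, size, flags,
 *	    cpu, &factor)) == ENOMEM) {
 *		if ((size /= MAX(factor, 2)) < min)
 *			return (ENOMEM);
 *	}
 */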
11969
11970/*
11971 * Note:  called from probe context.  This function just increments the drop
11972 * count on a buffer.  It has been made a function to allow for the
11973 * possibility of understanding the source of mysterious drop counts.  (A
11974 * problem for which one may be particularly disappointed that DTrace cannot
11975 * be used to understand DTrace.)
11976 */
11977static void
11978dtrace_buffer_drop(dtrace_buffer_t *buf)
11979{
11980	buf->dtb_drops++;
11981}
11982
11983/*
11984 * Note:  called from probe context.  This function is called to reserve space
11985 * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
11986 * mstate.  Returns the new offset in the buffer, or a negative value if an
11987 * error has occurred.
11988 */
11989static intptr_t
11990dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11991    dtrace_state_t *state, dtrace_mstate_t *mstate)
11992{
11993	intptr_t offs = buf->dtb_offset, soffs;
11994	intptr_t woffs;
11995	caddr_t tomax;
11996	size_t total;
11997
11998	if (buf->dtb_flags & DTRACEBUF_INACTIVE)
11999		return (-1);
12000
12001	if ((tomax = buf->dtb_tomax) == NULL) {
12002		dtrace_buffer_drop(buf);
12003		return (-1);
12004	}
12005
12006	if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12007		while (offs & (align - 1)) {
12008			/*
12009			 * Assert that our alignment is off by a number which
12010			 * is itself sizeof (uint32_t) aligned.
12011			 */
12012			ASSERT(!((align - (offs & (align - 1))) &
12013			    (sizeof (uint32_t) - 1)));
12014			DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12015			offs += sizeof (uint32_t);
12016		}
12017
12018		if ((soffs = offs + needed) > buf->dtb_size) {
12019			dtrace_buffer_drop(buf);
12020			return (-1);
12021		}
12022
12023		if (mstate == NULL)
12024			return (offs);
12025
12026		mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12027		mstate->dtms_scratch_size = buf->dtb_size - soffs;
12028		mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12029
12030		return (offs);
12031	}
12032
12033	if (buf->dtb_flags & DTRACEBUF_FILL) {
12034		if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12035		    (buf->dtb_flags & DTRACEBUF_FULL))
12036			return (-1);
12037		goto out;
12038	}
12039
12040	total = needed + (offs & (align - 1));
12041
12042	/*
12043	 * For a ring buffer, life is quite a bit more complicated.  Before
12044	 * we can store any padding, we need to adjust our wrapping offset.
12045	 * (If we've never before wrapped or we're not about to, no adjustment
12046	 * is required.)
12047	 */
12048	if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12049	    offs + total > buf->dtb_size) {
12050		woffs = buf->dtb_xamot_offset;
12051
12052		if (offs + total > buf->dtb_size) {
12053			/*
12054			 * We can't fit in the end of the buffer.  First, a
12055			 * sanity check that we can fit in the buffer at all.
12056			 */
12057			if (total > buf->dtb_size) {
12058				dtrace_buffer_drop(buf);
12059				return (-1);
12060			}
12061
12062			/*
12063			 * We're going to be storing at the top of the buffer,
12064			 * so now we need to deal with the wrapped offset.  We
12065			 * only reset our wrapped offset to 0 if it is
12066			 * currently greater than the current offset.  If it
12067			 * is less than the current offset, it is because a
12068			 * previous allocation induced a wrap -- but the
12069			 * allocation didn't subsequently take the space due
12070			 * to an error or false predicate evaluation.  In this
12071			 * case, we'll just leave the wrapped offset alone: if
12072			 * the wrapped offset hasn't been advanced far enough
12073			 * for this allocation, it will be adjusted in the
12074			 * lower loop.
12075			 */
12076			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12077				if (woffs >= offs)
12078					woffs = 0;
12079			} else {
12080				woffs = 0;
12081			}
12082
12083			/*
12084			 * Now we know that we're going to be storing to the
12085			 * top of the buffer and that there is room for us
12086			 * there.  We need to clear the buffer from the current
12087			 * offset to the end (there may be old gunk there).
12088			 */
12089			while (offs < buf->dtb_size)
12090				tomax[offs++] = 0;
12091
12092			/*
12093			 * We need to set our offset to zero.  And because we
12094			 * are wrapping, we need to set the bit indicating as
12095			 * much.  We can also adjust our needed space back
12096			 * down to the space required by the ECB -- we know
12097			 * that the top of the buffer is aligned.
12098			 */
12099			offs = 0;
12100			total = needed;
12101			buf->dtb_flags |= DTRACEBUF_WRAPPED;
12102		} else {
12103			/*
12104			 * There is room for us in the buffer, so we simply
12105			 * need to check the wrapped offset.
12106			 */
12107			if (woffs < offs) {
12108				/*
12109				 * The wrapped offset is less than the offset.
12110				 * This can happen if we allocated buffer space
12111				 * that induced a wrap, but then we didn't
12112				 * subsequently take the space due to an error
12113				 * or false predicate evaluation.  This is
12114				 * okay; we know that _this_ allocation isn't
12115				 * going to induce a wrap.  We still can't
12116				 * reset the wrapped offset to be zero,
12117				 * however: the space may have been trashed in
12118				 * the previous failed probe attempt.  But at
12119				 * least the wrapped offset doesn't need to
12120				 * be adjusted at all...
12121				 */
12122				goto out;
12123			}
12124		}
12125
12126		while (offs + total > woffs) {
12127			dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12128			size_t size;
12129
12130			if (epid == DTRACE_EPIDNONE) {
12131				size = sizeof (uint32_t);
12132			} else {
12133				ASSERT3U(epid, <=, state->dts_necbs);
12134				ASSERT(state->dts_ecbs[epid - 1] != NULL);
12135
12136				size = state->dts_ecbs[epid - 1]->dte_size;
12137			}
12138
12139			ASSERT(woffs + size <= buf->dtb_size);
12140			ASSERT(size != 0);
12141
12142			if (woffs + size == buf->dtb_size) {
12143				/*
12144				 * We've reached the end of the buffer; we want
12145				 * to set the wrapped offset to 0 and break
12146				 * out.  However, if the offs is 0, then we're
12147				 * out.  However, if offs is 0, then we're
12148				 * space that we want to reserve plus the size
12149				 * of the record that we're overwriting is
12150				 * greater than the size of the buffer.  This
12151				 * is problematic because if we reserve the
12152				 * space but subsequently don't consume it (due
12153				 * to a failed predicate or error) the wrapped
12154				 * offset will be 0 -- yet the EPID at offset 0
12155				 * will not be committed.  This situation is
12156				 * relatively easy to deal with:  if we're in
12157				 * this case, the buffer is indistinguishable
12158				 * from one that hasn't wrapped; we need only
12159				 * finish the job by clearing the wrapped bit,
12160				 * explicitly setting the offset to be 0, and
12161				 * zero'ing out the old data in the buffer.
12162				 */
12163				if (offs == 0) {
12164					buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12165					buf->dtb_offset = 0;
12166					woffs = total;
12167
12168					while (woffs < buf->dtb_size)
12169						tomax[woffs++] = 0;
12170				}
12171
12172				woffs = 0;
12173				break;
12174			}
12175
12176			woffs += size;
12177		}
12178
12179		/*
12180		 * We have a wrapped offset.  It may be that the wrapped offset
12181		 * has become zero -- that's okay.
12182		 */
12183		buf->dtb_xamot_offset = woffs;
12184	}
12185
12186out:
12187	/*
12188	 * Now we can plow the buffer with any necessary padding.
12189	 */
12190	while (offs & (align - 1)) {
12191		/*
12192		 * Assert that our alignment is off by a number which
12193		 * is itself sizeof (uint32_t) aligned.
12194		 */
12195		ASSERT(!((align - (offs & (align - 1))) &
12196		    (sizeof (uint32_t) - 1)));
12197		DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12198		offs += sizeof (uint32_t);
12199	}
12200
12201	if (buf->dtb_flags & DTRACEBUF_FILL) {
12202		if (offs + needed > buf->dtb_size - state->dts_reserve) {
12203			buf->dtb_flags |= DTRACEBUF_FULL;
12204			return (-1);
12205		}
12206	}
12207
12208	if (mstate == NULL)
12209		return (offs);
12210
12211	/*
12212	 * For ring buffers and fill buffers, the scratch space is always
12213	 * the inactive buffer.
12214	 */
12215	mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12216	mstate->dtms_scratch_size = buf->dtb_size;
12217	mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12218
12219	return (offs);
12220}
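
/*
 * For reference, a record reserved in a principal buffer begins with a
 * four-byte EPID (stored by the caller) followed by the ECB's payload;
 * alignment gaps are plugged with DTRACE_EPIDNONE words so that a
 * consumer scanning the buffer can always resynchronize.  A sketch,
 * not to scale:
 *
 *	+------+-------------+----------+----------+------+----------
 *	| EPID | payload ... | EPIDNONE | EPIDNONE | EPID | payload
 *	+------+-------------+----------+----------+------+----------
 */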
12221
12222static void
12223dtrace_buffer_polish(dtrace_buffer_t *buf)
12224{
12225	ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12226	ASSERT(MUTEX_HELD(&dtrace_lock));
12227
12228	if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12229		return;
12230
12231	/*
12232	 * We need to polish the ring buffer.  There are three cases:
12233	 *
12234	 * - The first (and presumably most common) is that there is no gap
12235	 *   between the buffer offset and the wrapped offset.  In this case,
12236	 *   there is nothing in the buffer that isn't valid data; we can
12237	 *   mark the buffer as polished and return.
12238	 *
12239	 * - The second (less common than the first but still more common
12240	 *   than the third) is that there is a gap between the buffer offset
12241	 *   and the wrapped offset, and the wrapped offset is larger than the
12242	 *   buffer offset.  This can happen because of an alignment issue, or
12243	 *   can happen because of a call to dtrace_buffer_reserve() that
12244	 *   didn't subsequently consume the buffer space.  In this case,
12245	 *   we need to zero the data from the buffer offset to the wrapped
12246	 *   offset.
12247	 *
12248	 * - The third (and least common) is that there is a gap between the
12249	 *   buffer offset and the wrapped offset, but the wrapped offset is
12250	 *   _less_ than the buffer offset.  This can only happen because a
12251	 *   call to dtrace_buffer_reserve() induced a wrap, but the space
12252	 *   was not subsequently consumed.  In this case, we need to zero the
12253	 *   space from the offset to the end of the buffer _and_ from the
12254	 *   top of the buffer to the wrapped offset.
12255	 */
12256	if (buf->dtb_offset < buf->dtb_xamot_offset) {
12257		bzero(buf->dtb_tomax + buf->dtb_offset,
12258		    buf->dtb_xamot_offset - buf->dtb_offset);
12259	}
12260
12261	if (buf->dtb_offset > buf->dtb_xamot_offset) {
12262		bzero(buf->dtb_tomax + buf->dtb_offset,
12263		    buf->dtb_size - buf->dtb_offset);
12264		bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12265	}
12266}
12267
12268/*
12269 * This routine determines if data generated at the specified time has likely
12270 * been entirely consumed at user-level.  This routine is called to determine
12271 * if an ECB on a defunct probe (but for an active enabling) can be safely
12272 * disabled and destroyed.
12273 */
12274static int
12275dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
12276{
12277	int i;
12278
12279	for (i = 0; i < NCPU; i++) {
12280		dtrace_buffer_t *buf = &bufs[i];
12281
12282		if (buf->dtb_size == 0)
12283			continue;
12284
12285		if (buf->dtb_flags & DTRACEBUF_RING)
12286			return (0);
12287
12288		if (!buf->dtb_switched && buf->dtb_offset != 0)
12289			return (0);
12290
12291		if (buf->dtb_switched - buf->dtb_interval < when)
12292			return (0);
12293	}
12294
12295	return (1);
12296}
12297
12298static void
12299dtrace_buffer_free(dtrace_buffer_t *bufs)
12300{
12301	int i;
12302
12303	for (i = 0; i < NCPU; i++) {
12304		dtrace_buffer_t *buf = &bufs[i];
12305
12306		if (buf->dtb_tomax == NULL) {
12307			ASSERT(buf->dtb_xamot == NULL);
12308			ASSERT(buf->dtb_size == 0);
12309			continue;
12310		}
12311
12312		if (buf->dtb_xamot != NULL) {
12313			ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12314			kmem_free(buf->dtb_xamot, buf->dtb_size);
12315		}
12316
12317		kmem_free(buf->dtb_tomax, buf->dtb_size);
12318		buf->dtb_size = 0;
12319		buf->dtb_tomax = NULL;
12320		buf->dtb_xamot = NULL;
12321	}
12322}
12323
12324/*
12325 * DTrace Enabling Functions
12326 */
12327static dtrace_enabling_t *
12328dtrace_enabling_create(dtrace_vstate_t *vstate)
12329{
12330	dtrace_enabling_t *enab;
12331
12332	enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12333	enab->dten_vstate = vstate;
12334
12335	return (enab);
12336}
12337
12338static void
12339dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12340{
12341	dtrace_ecbdesc_t **ndesc;
12342	size_t osize, nsize;
12343
12344	/*
12345	 * We can't add to enablings after we've enabled them, or after we've
12346	 * retained them.
12347	 */
12348	ASSERT(enab->dten_probegen == 0);
12349	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12350
12351	if (enab->dten_ndesc < enab->dten_maxdesc) {
12352		enab->dten_desc[enab->dten_ndesc++] = ecb;
12353		return;
12354	}
12355
12356	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12357
12358	if (enab->dten_maxdesc == 0) {
12359		enab->dten_maxdesc = 1;
12360	} else {
12361		enab->dten_maxdesc <<= 1;
12362	}
12363
12364	ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12365
12366	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12367	ndesc = kmem_zalloc(nsize, KM_SLEEP);
12368	bcopy(enab->dten_desc, ndesc, osize);
12369	if (enab->dten_desc != NULL)
12370		kmem_free(enab->dten_desc, osize);
12371
12372	enab->dten_desc = ndesc;
12373	enab->dten_desc[enab->dten_ndesc++] = ecb;
12374}
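
/*
 * Note that dtrace_enabling_add() grows the descriptor array
 * geometrically (1, 2, 4, ...), so adding N ECB descriptions costs
 * O(N) amortized copying despite the bcopy() on each resize.
 */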
12375
12376static void
12377dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12378    dtrace_probedesc_t *pd)
12379{
12380	dtrace_ecbdesc_t *new;
12381	dtrace_predicate_t *pred;
12382	dtrace_actdesc_t *act;
12383
12384	/*
12385	 * We're going to create a new ECB description that matches the
12386	 * specified ECB in every way, but has the specified probe description.
12387	 */
12388	new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12389
12390	if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12391		dtrace_predicate_hold(pred);
12392
12393	for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12394		dtrace_actdesc_hold(act);
12395
12396	new->dted_action = ecb->dted_action;
12397	new->dted_pred = ecb->dted_pred;
12398	new->dted_probe = *pd;
12399	new->dted_uarg = ecb->dted_uarg;
12400
12401	dtrace_enabling_add(enab, new);
12402}
12403
12404static void
12405dtrace_enabling_dump(dtrace_enabling_t *enab)
12406{
12407	int i;
12408
12409	for (i = 0; i < enab->dten_ndesc; i++) {
12410		dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12411
12412		cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12413		    desc->dtpd_provider, desc->dtpd_mod,
12414		    desc->dtpd_func, desc->dtpd_name);
12415	}
12416}
12417
12418static void
12419dtrace_enabling_destroy(dtrace_enabling_t *enab)
12420{
12421	int i;
12422	dtrace_ecbdesc_t *ep;
12423	dtrace_vstate_t *vstate = enab->dten_vstate;
12424
12425	ASSERT(MUTEX_HELD(&dtrace_lock));
12426
12427	for (i = 0; i < enab->dten_ndesc; i++) {
12428		dtrace_actdesc_t *act, *next;
12429		dtrace_predicate_t *pred;
12430
12431		ep = enab->dten_desc[i];
12432
12433		if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12434			dtrace_predicate_release(pred, vstate);
12435
12436		for (act = ep->dted_action; act != NULL; act = next) {
12437			next = act->dtad_next;
12438			dtrace_actdesc_release(act, vstate);
12439		}
12440
12441		kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12442	}
12443
12444	if (enab->dten_desc != NULL)
12445		kmem_free(enab->dten_desc,
12446		    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
12447
12448	/*
12449	 * If this was a retained enabling, decrement the dts_nretained count
12450	 * and take it off of the dtrace_retained list.
12451	 */
12452	if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12453	    dtrace_retained == enab) {
12454		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12455		ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12456		enab->dten_vstate->dtvs_state->dts_nretained--;
12457		dtrace_retained_gen++;
12458	}
12459
12460	if (enab->dten_prev == NULL) {
12461		if (dtrace_retained == enab) {
12462			dtrace_retained = enab->dten_next;
12463
12464			if (dtrace_retained != NULL)
12465				dtrace_retained->dten_prev = NULL;
12466		}
12467	} else {
12468		ASSERT(enab != dtrace_retained);
12469		ASSERT(dtrace_retained != NULL);
12470		enab->dten_prev->dten_next = enab->dten_next;
12471	}
12472
12473	if (enab->dten_next != NULL) {
12474		ASSERT(dtrace_retained != NULL);
12475		enab->dten_next->dten_prev = enab->dten_prev;
12476	}
12477
12478	kmem_free(enab, sizeof (dtrace_enabling_t));
12479}
12480
12481static int
12482dtrace_enabling_retain(dtrace_enabling_t *enab)
12483{
12484	dtrace_state_t *state;
12485
12486	ASSERT(MUTEX_HELD(&dtrace_lock));
12487	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12488	ASSERT(enab->dten_vstate != NULL);
12489
12490	state = enab->dten_vstate->dtvs_state;
12491	ASSERT(state != NULL);
12492
12493	/*
12494	 * We only allow each state to retain dtrace_retain_max enablings.
12495	 */
12496	if (state->dts_nretained >= dtrace_retain_max)
12497		return (ENOSPC);
12498
12499	state->dts_nretained++;
12500	dtrace_retained_gen++;
12501
12502	if (dtrace_retained == NULL) {
12503		dtrace_retained = enab;
12504		return (0);
12505	}
12506
12507	enab->dten_next = dtrace_retained;
12508	dtrace_retained->dten_prev = enab;
12509	dtrace_retained = enab;
12510
12511	return (0);
12512}
12513
12514static int
12515dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12516    dtrace_probedesc_t *create)
12517{
12518	dtrace_enabling_t *new, *enab;
12519	int found = 0, err = ENOENT;
12520
12521	ASSERT(MUTEX_HELD(&dtrace_lock));
12522	ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12523	ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12524	ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12525	ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12526
12527	new = dtrace_enabling_create(&state->dts_vstate);
12528
12529	/*
12530	 * Iterate over all retained enablings, looking for enablings that
12531	 * match the specified state.
12532	 */
12533	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12534		int i;
12535
12536		/*
12537		 * dtvs_state can only be NULL for helper enablings -- and
12538		 * helper enablings can't be retained.
12539		 */
12540		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12541
12542		if (enab->dten_vstate->dtvs_state != state)
12543			continue;
12544
12545		/*
12546		 * Now iterate over each probe description; we're looking for
12547		 * an exact match to the specified probe description.
12548		 */
12549		for (i = 0; i < enab->dten_ndesc; i++) {
12550			dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12551			dtrace_probedesc_t *pd = &ep->dted_probe;
12552
12553			if (strcmp(pd->dtpd_provider, match->dtpd_provider))
12554				continue;
12555
12556			if (strcmp(pd->dtpd_mod, match->dtpd_mod))
12557				continue;
12558
12559			if (strcmp(pd->dtpd_func, match->dtpd_func))
12560				continue;
12561
12562			if (strcmp(pd->dtpd_name, match->dtpd_name))
12563				continue;
12564
12565			/*
12566			 * We have a winning probe!  Add it to our growing
12567			 * enabling.
12568			 */
12569			found = 1;
12570			dtrace_enabling_addlike(new, ep, create);
12571		}
12572	}
12573
12574	if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12575		dtrace_enabling_destroy(new);
12576		return (err);
12577	}
12578
12579	return (0);
12580}
12581
12582static void
12583dtrace_enabling_retract(dtrace_state_t *state)
12584{
12585	dtrace_enabling_t *enab, *next;
12586
12587	ASSERT(MUTEX_HELD(&dtrace_lock));
12588
12589	/*
12590	 * Iterate over all retained enablings, destroying the enablings retained
12591	 * for the specified state.
12592	 */
12593	for (enab = dtrace_retained; enab != NULL; enab = next) {
12594		next = enab->dten_next;
12595
12596		/*
12597		 * dtvs_state can only be NULL for helper enablings -- and
12598		 * helper enablings can't be retained.
12599		 */
12600		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12601
12602		if (enab->dten_vstate->dtvs_state == state) {
12603			ASSERT(state->dts_nretained > 0);
12604			dtrace_enabling_destroy(enab);
12605		}
12606	}
12607
12608	ASSERT(state->dts_nretained == 0);
12609}
12610
12611static int
12612dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
12613{
12614	int i = 0;
12615	int matched = 0;
12616
12617	ASSERT(MUTEX_HELD(&cpu_lock));
12618	ASSERT(MUTEX_HELD(&dtrace_lock));
12619
12620	for (i = 0; i < enab->dten_ndesc; i++) {
12621		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12622
12623		enab->dten_current = ep;
12624		enab->dten_error = 0;
12625
12626		matched += dtrace_probe_enable(&ep->dted_probe, enab);
12627
12628		if (enab->dten_error != 0) {
12629			/*
12630			 * If we get an error half-way through enabling the
12631			 * probes, we kick out -- perhaps with some number of
12632			 * them enabled.  Leaving enabled probes enabled may
12633			 * be slightly confusing for user-level, but we expect
12634			 * that no one will attempt to actually drive on in
12635			 * the face of such errors.  If this is an anonymous
12636			 * enabling (indicated with a NULL nmatched pointer),
12637			 * we cmn_err() a message.  We aren't expecting to
12638			 * get such an error -- insofar as it can exist at all,
12639			 * it would be a result of corrupted DOF in the driver
12640			 * properties.
12641			 */
12642			if (nmatched == NULL) {
12643				cmn_err(CE_WARN, "dtrace_enabling_match() "
12644				    "error on %p: %d", (void *)ep,
12645				    enab->dten_error);
12646			}
12647
12648			return (enab->dten_error);
12649		}
12650	}
12651
12652	enab->dten_probegen = dtrace_probegen;
12653	if (nmatched != NULL)
12654		*nmatched = matched;
12655
12656	return (0);
12657}
12658
12659static void
12660dtrace_enabling_matchall(void)
12661{
12662	dtrace_enabling_t *enab;
12663
12664	mutex_enter(&cpu_lock);
12665	mutex_enter(&dtrace_lock);
12666
12667	/*
12668	 * Iterate over all retained enablings to see if any probes match
12669	 * against them.  We only perform this operation on enablings for which
12670	 * we have sufficient permissions by virtue of being in the global zone
12671	 * or in the same zone as the DTrace client.  Because we can be called
12672	 * after dtrace_detach() has been called, we cannot assert that there
12673	 * are retained enablings.  We can safely load from dtrace_retained,
12674	 * however:  the taskq_destroy() at the end of dtrace_detach() will
12675	 * block pending our completion.
12676	 */
12677	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12678#if defined(sun)
12679		cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
12680
12681		if (INGLOBALZONE(curproc) ||
12682		    (cr != NULL && getzoneid() == crgetzoneid(cr)))
12683#endif
12684			(void) dtrace_enabling_match(enab, NULL);
12685	}
12686
12687	mutex_exit(&dtrace_lock);
12688	mutex_exit(&cpu_lock);
12689}
12690
12691/*
12692 * If an enabling is to be enabled without having matched probes (that is, if
12693 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12694 * enabling must be _primed_ by creating an ECB for every ECB description.
12695 * This must be done to assure that we know the number of speculations, the
12696 * number of aggregations, the minimum buffer size needed, etc. before we
12697 * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
12698 * enabling any probes, we create ECBs for every ECB description, but with a
12699 * NULL probe -- which is exactly what this function does.
12700 */
12701static void
12702dtrace_enabling_prime(dtrace_state_t *state)
12703{
12704	dtrace_enabling_t *enab;
12705	int i;
12706
12707	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12708		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12709
12710		if (enab->dten_vstate->dtvs_state != state)
12711			continue;
12712
12713		/*
12714		 * We don't want to prime an enabling more than once, lest
12715		 * we allow a malicious user to induce resource exhaustion.
12716		 * (The ECBs that result from priming an enabling aren't
12717		 * leaked -- but they also aren't deallocated until the
12718		 * consumer state is destroyed.)
12719		 */
12720		if (enab->dten_primed)
12721			continue;
12722
12723		for (i = 0; i < enab->dten_ndesc; i++) {
12724			enab->dten_current = enab->dten_desc[i];
12725			(void) dtrace_probe_enable(NULL, enab);
12726		}
12727
12728		enab->dten_primed = 1;
12729	}
12730}
12731
12732/*
12733 * Called to indicate that probes should be provided due to retained
12734 * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
12735 * must take an initial lap through the enabling calling the dtps_provide()
12736 * entry point explicitly to allow for autocreated probes.
12737 */
12738static void
12739dtrace_enabling_provide(dtrace_provider_t *prv)
12740{
12741	int i, all = 0;
12742	dtrace_probedesc_t desc;
12743	dtrace_genid_t gen;
12744
12745	ASSERT(MUTEX_HELD(&dtrace_lock));
12746	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
12747
12748	if (prv == NULL) {
12749		all = 1;
12750		prv = dtrace_provider;
12751	}
12752
12753	do {
12754		dtrace_enabling_t *enab;
12755		void *parg = prv->dtpv_arg;
12756
12757retry:
12758		gen = dtrace_retained_gen;
12759		for (enab = dtrace_retained; enab != NULL;
12760		    enab = enab->dten_next) {
12761			for (i = 0; i < enab->dten_ndesc; i++) {
12762				desc = enab->dten_desc[i]->dted_probe;
12763				mutex_exit(&dtrace_lock);
12764				prv->dtpv_pops.dtps_provide(parg, &desc);
12765				mutex_enter(&dtrace_lock);
12766				/*
12767				 * Process the retained enablings again if
12768				 * they have changed while we weren't holding
12769				 * dtrace_lock.
12770				 */
12771				if (gen != dtrace_retained_gen)
12772					goto retry;
12773			}
12774		}
12775	} while (all && (prv = prv->dtpv_next) != NULL);
12776
12777	mutex_exit(&dtrace_lock);
12778	dtrace_probe_provide(NULL, all ? NULL : prv);
12779	mutex_enter(&dtrace_lock);
12780}
12781
12782/*
12783 * Called to reap ECBs that are attached to probes from defunct providers.
12784 */
12785static void
12786dtrace_enabling_reap(void)
12787{
12788	dtrace_provider_t *prov;
12789	dtrace_probe_t *probe;
12790	dtrace_ecb_t *ecb;
12791	hrtime_t when;
12792	int i;
12793
12794	mutex_enter(&cpu_lock);
12795	mutex_enter(&dtrace_lock);
12796
12797	for (i = 0; i < dtrace_nprobes; i++) {
12798		if ((probe = dtrace_probes[i]) == NULL)
12799			continue;
12800
12801		if (probe->dtpr_ecb == NULL)
12802			continue;
12803
12804		prov = probe->dtpr_provider;
12805
12806		if ((when = prov->dtpv_defunct) == 0)
12807			continue;
12808
12809		/*
12810		 * We have ECBs on a defunct provider:  we want to reap these
12811		 * ECBs to allow the provider to unregister.  The destruction
12812		 * of these ECBs must be done carefully:  if we destroy the ECB
12813		 * and the consumer later wishes to consume an EPID that
12814		 * corresponds to the destroyed ECB (and if the EPID metadata
12815		 * has not been previously consumed), the consumer will abort
12816		 * processing on the unknown EPID.  To reduce (but not, sadly,
12817		 * eliminate) the possibility of this, we will only destroy an
12818		 * ECB for a defunct provider if, for the state that
12819		 * corresponds to the ECB:
12820		 *
12821		 *  (a)	There is no speculative tracing (which can effectively
12822		 *	cache an EPID for an arbitrary amount of time).
12823		 *
12824		 *  (b)	The principal buffers have been switched twice since the
12825		 *	provider became defunct.
12826		 *
12827		 *  (c)	The aggregation buffers are of zero size or have been
12828		 *	switched twice since the provider became defunct.
12829		 *
12830		 * We use dts_speculates to determine (a) and call a function
12831		 * (dtrace_buffer_consumed()) to determine (b) and (c).  Note
12832		 * that as soon as we've been unable to destroy one of the ECBs
12833		 * associated with the probe, we quit trying -- reaping is only
12834		 * fruitful in as much as we can destroy all ECBs associated
12835		 * with the defunct provider's probes.
12836		 */
12837		while ((ecb = probe->dtpr_ecb) != NULL) {
12838			dtrace_state_t *state = ecb->dte_state;
12839			dtrace_buffer_t *buf = state->dts_buffer;
12840			dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
12841
12842			if (state->dts_speculates)
12843				break;
12844
12845			if (!dtrace_buffer_consumed(buf, when))
12846				break;
12847
12848			if (!dtrace_buffer_consumed(aggbuf, when))
12849				break;
12850
12851			dtrace_ecb_disable(ecb);
12852			ASSERT(probe->dtpr_ecb != ecb);
12853			dtrace_ecb_destroy(ecb);
12854		}
12855	}
12856
12857	mutex_exit(&dtrace_lock);
12858	mutex_exit(&cpu_lock);
12859}
12860
12861/*
12862 * DTrace DOF Functions
12863 */
12864/*ARGSUSED*/
12865static void
12866dtrace_dof_error(dof_hdr_t *dof, const char *str)
12867{
12868	if (dtrace_err_verbose)
12869		cmn_err(CE_WARN, "failed to process DOF: %s", str);
12870
12871#ifdef DTRACE_ERRDEBUG
12872	dtrace_errdebug(str);
12873#endif
12874}
12875
12876/*
12877 * Create DOF out of a currently enabled state.  Right now, we only create
12878 * DOF containing the run-time options -- but this could be expanded to create
12879 * complete DOF representing the enabled state.
12880 */
12881static dof_hdr_t *
12882dtrace_dof_create(dtrace_state_t *state)
12883{
12884	dof_hdr_t *dof;
12885	dof_sec_t *sec;
12886	dof_optdesc_t *opt;
12887	int i, len = sizeof (dof_hdr_t) +
12888	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12889	    sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12890
12891	ASSERT(MUTEX_HELD(&dtrace_lock));
12892
12893	dof = kmem_zalloc(len, KM_SLEEP);
12894	dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12895	dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12896	dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12897	dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12898
12899	dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12900	dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12901	dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12902	dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12903	dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12904	dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12905
12906	dof->dofh_flags = 0;
12907	dof->dofh_hdrsize = sizeof (dof_hdr_t);
12908	dof->dofh_secsize = sizeof (dof_sec_t);
12909	dof->dofh_secnum = 1;	/* only DOF_SECT_OPTDESC */
12910	dof->dofh_secoff = sizeof (dof_hdr_t);
12911	dof->dofh_loadsz = len;
12912	dof->dofh_filesz = len;
12913	dof->dofh_pad = 0;
12914
12915	/*
12916	 * Fill in the option section header...
12917	 */
12918	sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12919	sec->dofs_type = DOF_SECT_OPTDESC;
12920	sec->dofs_align = sizeof (uint64_t);
12921	sec->dofs_flags = DOF_SECF_LOAD;
12922	sec->dofs_entsize = sizeof (dof_optdesc_t);
12923
12924	opt = (dof_optdesc_t *)((uintptr_t)sec +
12925	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12926
12927	sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12928	sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12929
12930	for (i = 0; i < DTRACEOPT_MAX; i++) {
12931		opt[i].dofo_option = i;
12932		opt[i].dofo_strtab = DOF_SECIDX_NONE;
12933		opt[i].dofo_value = state->dts_options[i];
12934	}
12935
12936	return (dof);
12937}
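
/*
 * The DOF constructed above has the following layout (derived from the
 * code; all values are native-endian):
 *
 *	dof_hdr_t			header; dofh_secnum is 1
 *	dof_sec_t			DOF_SECT_OPTDESC section header
 *	(pad to an 8-byte boundary)
 *	dof_optdesc_t[DTRACEOPT_MAX]	one entry per run-time option
 */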
12938
12939static dof_hdr_t *
12940dtrace_dof_copyin(uintptr_t uarg, int *errp)
12941{
12942	dof_hdr_t hdr, *dof;
12943
12944	ASSERT(!MUTEX_HELD(&dtrace_lock));
12945
12946	/*
12947	 * First, we're going to copyin() the sizeof (dof_hdr_t).
12948	 */
12949	if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
12950		dtrace_dof_error(NULL, "failed to copyin DOF header");
12951		*errp = EFAULT;
12952		return (NULL);
12953	}
12954
12955	/*
12956	 * Now we'll allocate the entire DOF and copy it in -- provided
12957	 * that the length isn't outrageous.
12958	 */
12959	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
12960		dtrace_dof_error(&hdr, "load size exceeds maximum");
12961		*errp = E2BIG;
12962		return (NULL);
12963	}
12964
12965	if (hdr.dofh_loadsz < sizeof (hdr)) {
12966		dtrace_dof_error(&hdr, "invalid load size");
12967		*errp = EINVAL;
12968		return (NULL);
12969	}
12970
12971	dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
12972
12973	if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
12974	    dof->dofh_loadsz != hdr.dofh_loadsz) {
12975		kmem_free(dof, hdr.dofh_loadsz);
12976		*errp = EFAULT;
12977		return (NULL);
12978	}
12979
12980	return (dof);
12981}
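
/*
 * Note the second check of dofh_loadsz above:  the header is fetched
 * from user memory twice, so a racing user thread could change the
 * advertised load size between the two copyin() calls.  Re-validating
 * the copied-in header closes that time-of-check-to-time-of-use
 * window.
 */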
12982
12983#if !defined(sun)
12984static __inline uchar_t
12985dtrace_dof_char(char c) {
12986	switch (c) {
12987	case '0':
12988	case '1':
12989	case '2':
12990	case '3':
12991	case '4':
12992	case '5':
12993	case '6':
12994	case '7':
12995	case '8':
12996	case '9':
12997		return (c - '0');
12998	case 'A':
12999	case 'B':
13000	case 'C':
13001	case 'D':
13002	case 'E':
13003	case 'F':
13004		return (c - 'A' + 10);
13005	case 'a':
13006	case 'b':
13007	case 'c':
13008	case 'd':
13009	case 'e':
13010	case 'f':
13011		return (c - 'a' + 10);
13012	}
13013	/* Should not reach here. */
13014	return (0);
13015}
13016#endif
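
/*
 * On FreeBSD, anonymous-enabling DOF reaches the kernel as a hex
 * string in the kernel environment; dtrace_dof_property() below
 * decodes two characters per byte using dtrace_dof_char().  A sketch,
 * with a hypothetical variable name and a truncated value:
 *
 *	# loader.conf (illustrative only):
 *	# dof-data-0="7F444F46..."
 *
 * Here "7F" decodes to 0x7f, the first byte of the DOF magic.
 */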
13017
13018static dof_hdr_t *
13019dtrace_dof_property(const char *name)
13020{
13021	uchar_t *buf;
13022	uint64_t loadsz;
13023	unsigned int len, i;
13024	dof_hdr_t *dof;
13025
13026#if defined(sun)
13027	/*
13028	 * Unfortunately, arrays of values in .conf files are always (and
13029	 * only) interpreted to be integer arrays.  We must read our DOF
13030	 * as an integer array, and then squeeze it into a byte array.
13031	 */
13032	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
13033	    (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
13034		return (NULL);
13035
13036	for (i = 0; i < len; i++)
13037		buf[i] = (uchar_t)(((int *)buf)[i]);
13038
13039	if (len < sizeof (dof_hdr_t)) {
13040		ddi_prop_free(buf);
13041		dtrace_dof_error(NULL, "truncated header");
13042		return (NULL);
13043	}
13044
13045	if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
13046		ddi_prop_free(buf);
13047		dtrace_dof_error(NULL, "truncated DOF");
13048		return (NULL);
13049	}
13050
13051	if (loadsz >= dtrace_dof_maxsize) {
13052		ddi_prop_free(buf);
13053		dtrace_dof_error(NULL, "oversized DOF");
13054		return (NULL);
13055	}
13056
13057	dof = kmem_alloc(loadsz, KM_SLEEP);
13058	bcopy(buf, dof, loadsz);
13059	ddi_prop_free(buf);
13060#else
13061	char *p;
13062	char *p_env;
13063
13064	if ((p_env = getenv(name)) == NULL)
13065		return (NULL);
13066
13067	len = strlen(p_env) / 2;
13068
13069	buf = kmem_alloc(len, KM_SLEEP);
13070
13071	dof = (dof_hdr_t *) buf;
13072
13073	p = p_env;
13074
13075	for (i = 0; i < len; i++) {
13076		buf[i] = (dtrace_dof_char(p[0]) << 4) |
13077		     dtrace_dof_char(p[1]);
13078		p += 2;
13079	}
13080
13081	freeenv(p_env);
13082
13083	if (len < sizeof (dof_hdr_t)) {
13084		kmem_free(buf, len);
13085		dtrace_dof_error(NULL, "truncated header");
13086		return (NULL);
13087	}
13088
13089	if (len < (loadsz = dof->dofh_loadsz)) {
13090		kmem_free(buf, len);
13091		dtrace_dof_error(NULL, "truncated DOF");
13092		return (NULL);
13093	}
13094
13095	if (loadsz >= dtrace_dof_maxsize) {
13096		kmem_free(buf, len);
13097		dtrace_dof_error(NULL, "oversized DOF");
13098		return (NULL);
13099	}
13100#endif
13101
13102	return (dof);
13103}
13104
13105static void
13106dtrace_dof_destroy(dof_hdr_t *dof)
13107{
13108	kmem_free(dof, dof->dofh_loadsz);
13109}
13110
13111/*
13112 * Return the dof_sec_t pointer corresponding to a given section index.  If the
13113 * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
13114 * a type other than DOF_SECT_NONE is specified, the header is checked against
13115 * this type and NULL is returned if the types do not match.
13116 */
13117static dof_sec_t *
13118dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13119{
13120	dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13121	    ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13122
13123	if (i >= dof->dofh_secnum) {
13124		dtrace_dof_error(dof, "referenced section index is invalid");
13125		return (NULL);
13126	}
13127
13128	if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13129		dtrace_dof_error(dof, "referenced section is not loadable");
13130		return (NULL);
13131	}
13132
13133	if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13134		dtrace_dof_error(dof, "referenced section is the wrong type");
13135		return (NULL);
13136	}
13137
13138	return (sec);
13139}
13140
13141static dtrace_probedesc_t *
13142dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13143{
13144	dof_probedesc_t *probe;
13145	dof_sec_t *strtab;
13146	uintptr_t daddr = (uintptr_t)dof;
13147	uintptr_t str;
13148	size_t size;
13149
13150	if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13151		dtrace_dof_error(dof, "invalid probe section");
13152		return (NULL);
13153	}
13154
13155	if (sec->dofs_align != sizeof (dof_secidx_t)) {
13156		dtrace_dof_error(dof, "bad alignment in probe description");
13157		return (NULL);
13158	}
13159
13160	if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13161		dtrace_dof_error(dof, "truncated probe description");
13162		return (NULL);
13163	}
13164
13165	probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13166	strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13167
13168	if (strtab == NULL)
13169		return (NULL);
13170
13171	str = daddr + strtab->dofs_offset;
13172	size = strtab->dofs_size;
13173
13174	if (probe->dofp_provider >= strtab->dofs_size) {
13175		dtrace_dof_error(dof, "corrupt probe provider");
13176		return (NULL);
13177	}
13178
13179	(void) strncpy(desc->dtpd_provider,
13180	    (char *)(str + probe->dofp_provider),
13181	    MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13182
13183	if (probe->dofp_mod >= strtab->dofs_size) {
13184		dtrace_dof_error(dof, "corrupt probe module");
13185		return (NULL);
13186	}
13187
13188	(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13189	    MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13190
13191	if (probe->dofp_func >= strtab->dofs_size) {
13192		dtrace_dof_error(dof, "corrupt probe function");
13193		return (NULL);
13194	}
13195
13196	(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13197	    MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13198
13199	if (probe->dofp_name >= strtab->dofs_size) {
13200		dtrace_dof_error(dof, "corrupt probe name");
13201		return (NULL);
13202	}
13203
13204	(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13205	    MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13206
13207	return (desc);
13208}
13209
13210static dtrace_difo_t *
13211dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13212    cred_t *cr)
13213{
13214	dtrace_difo_t *dp;
13215	size_t ttl = 0;
13216	dof_difohdr_t *dofd;
13217	uintptr_t daddr = (uintptr_t)dof;
13218	size_t max = dtrace_difo_maxsize;
13219	int i, l, n;
13220
13221	static const struct {
13222		int section;
13223		int bufoffs;
13224		int lenoffs;
13225		int entsize;
13226		int align;
13227		const char *msg;
13228	} difo[] = {
13229		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
13230		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
13231		sizeof (dif_instr_t), "multiple DIF sections" },
13232
13233		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
13234		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
13235		sizeof (uint64_t), "multiple integer tables" },
13236
13237		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
13238		offsetof(dtrace_difo_t, dtdo_strlen), 0,
13239		sizeof (char), "multiple string tables" },
13240
13241		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
13242		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
13243		sizeof (uint_t), "multiple variable tables" },
13244
13245		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
13246	};
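
	/*
	 * The table above drives the validation loop below:  each loadable
	 * DIFO sub-section type is paired with the offsets of the buffer
	 * pointer and length fields it populates within dtrace_difo_t,
	 * along with its expected entry size and alignment.  This keeps
	 * the per-section checks uniform.
	 */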
13247
13248	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
13249		dtrace_dof_error(dof, "invalid DIFO header section");
13250		return (NULL);
13251	}
13252
13253	if (sec->dofs_align != sizeof (dof_secidx_t)) {
13254		dtrace_dof_error(dof, "bad alignment in DIFO header");
13255		return (NULL);
13256	}
13257
13258	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
13259	    sec->dofs_size % sizeof (dof_secidx_t)) {
13260		dtrace_dof_error(dof, "bad size in DIFO header");
13261		return (NULL);
13262	}
13263
13264	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13265	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
13266
13267	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
13268	dp->dtdo_rtype = dofd->dofd_rtype;
13269
13270	for (l = 0; l < n; l++) {
13271		dof_sec_t *subsec;
13272		void **bufp;
13273		uint32_t *lenp;
13274
13275		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
13276		    dofd->dofd_links[l])) == NULL)
13277			goto err; /* invalid section link */
13278
13279		if (ttl + subsec->dofs_size > max) {
13280			dtrace_dof_error(dof, "exceeds maximum size");
13281			goto err;
13282		}
13283
13284		ttl += subsec->dofs_size;
13285
13286		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
13287			if (subsec->dofs_type != difo[i].section)
13288				continue;
13289
13290			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
13291				dtrace_dof_error(dof, "section not loaded");
13292				goto err;
13293			}
13294
13295			if (subsec->dofs_align != difo[i].align) {
13296				dtrace_dof_error(dof, "bad alignment");
13297				goto err;
13298			}
13299
13300			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
13301			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
13302
13303			if (*bufp != NULL) {
13304				dtrace_dof_error(dof, difo[i].msg);
13305				goto err;
13306			}
13307
13308			if (difo[i].entsize != subsec->dofs_entsize) {
13309				dtrace_dof_error(dof, "entry size mismatch");
13310				goto err;
13311			}
13312
13313			if (subsec->dofs_entsize != 0 &&
13314			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
13315				dtrace_dof_error(dof, "corrupt entry size");
13316				goto err;
13317			}
13318
13319			*lenp = subsec->dofs_size;
13320			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
13321			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
13322			    *bufp, subsec->dofs_size);
13323
13324			if (subsec->dofs_entsize != 0)
13325				*lenp /= subsec->dofs_entsize;
13326
13327			break;
13328		}
13329
13330		/*
13331		 * If we encounter a loadable DIFO sub-section that is not
13332		 * known to us, assume this is a broken program and fail.
13333		 */
13334		if (difo[i].section == DOF_SECT_NONE &&
13335		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
13336			dtrace_dof_error(dof, "unrecognized DIFO subsection");
13337			goto err;
13338		}
13339	}
13340
13341	if (dp->dtdo_buf == NULL) {
13342		/*
13343		 * We can't have a DIF object without DIF text.
13344		 */
13345		dtrace_dof_error(dof, "missing DIF text");
13346		goto err;
13347	}
13348
13349	/*
13350	 * Before we validate the DIF object, run through the variable table
13351	 * looking for string variables -- if any of their sizes are zero, we
13352	 * set their size to be the system-wide default string size.  Note that
13353	 * this should _not_ happen if the "strsize" option has been set --
13354	 * in this case, the compiler should have set the size to reflect the
13355	 * setting of the option.
13356	 */
13357	for (i = 0; i < dp->dtdo_varlen; i++) {
13358		dtrace_difv_t *v = &dp->dtdo_vartab[i];
13359		dtrace_diftype_t *t = &v->dtdv_type;
13360
13361		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
13362			continue;
13363
13364		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
13365			t->dtdt_size = dtrace_strsize_default;
13366	}
13367
13368	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
13369		goto err;
13370
13371	dtrace_difo_init(dp, vstate);
13372	return (dp);
13373
13374err:
13375	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
13376	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
13377	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
13378	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
13379
13380	kmem_free(dp, sizeof (dtrace_difo_t));
13381	return (NULL);
13382}
13383
13384static dtrace_predicate_t *
13385dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13386    cred_t *cr)
13387{
13388	dtrace_difo_t *dp;
13389
13390	if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13391		return (NULL);
13392
13393	return (dtrace_predicate_create(dp));
13394}
13395
13396static dtrace_actdesc_t *
13397dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13398    cred_t *cr)
13399{
13400	dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13401	dof_actdesc_t *desc;
13402	dof_sec_t *difosec;
13403	size_t offs;
13404	uintptr_t daddr = (uintptr_t)dof;
13405	uint64_t arg;
13406	dtrace_actkind_t kind;
13407
13408	if (sec->dofs_type != DOF_SECT_ACTDESC) {
13409		dtrace_dof_error(dof, "invalid action section");
13410		return (NULL);
13411	}
13412
13413	if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13414		dtrace_dof_error(dof, "truncated action description");
13415		return (NULL);
13416	}
13417
13418	if (sec->dofs_align != sizeof (uint64_t)) {
13419		dtrace_dof_error(dof, "bad alignment in action description");
13420		return (NULL);
13421	}
13422
13423	if (sec->dofs_size < sec->dofs_entsize) {
13424		dtrace_dof_error(dof, "section entry size exceeds total size");
13425		return (NULL);
13426	}
13427
13428	if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13429		dtrace_dof_error(dof, "bad entry size in action description");
13430		return (NULL);
13431	}
13432
13433	if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13434		dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13435		return (NULL);
13436	}
13437
13438	for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13439		desc = (dof_actdesc_t *)(daddr +
13440		    (uintptr_t)sec->dofs_offset + offs);
13441		kind = (dtrace_actkind_t)desc->dofa_kind;
13442
13443		if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13444		    (kind != DTRACEACT_PRINTA ||
13445		    desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13446		    (kind == DTRACEACT_DIFEXPR &&
13447		    desc->dofa_strtab != DOF_SECIDX_NONE)) {
13448			dof_sec_t *strtab;
13449			char *str, *fmt;
13450			uint64_t i;
13451
13452			/*
13453			 * The argument to these actions is an index into the
13454			 * DOF string table.  For printf()-like actions, this
13455			 * is the format string.  For print(), this is the
13456			 * CTF type of the expression result.
13457			 */
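			/*
			 * For example, for an action compiled from
			 * printf("%d\n", x), dofa_arg would be the string
			 * table offset of "%d\n"; the string is copied out
			 * below and becomes the action's format.
			 */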
13458			if ((strtab = dtrace_dof_sect(dof,
13459			    DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13460				goto err;
13461
13462			str = (char *)((uintptr_t)dof +
13463			    (uintptr_t)strtab->dofs_offset);
13464
13465			for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13466				if (str[i] == '\0')
13467					break;
13468			}
13469
13470			if (i >= strtab->dofs_size) {
13471				dtrace_dof_error(dof, "bogus format string");
13472				goto err;
13473			}
13474
13475			if (i == desc->dofa_arg) {
13476				dtrace_dof_error(dof, "empty format string");
13477				goto err;
13478			}
13479
13480			i -= desc->dofa_arg;
13481			fmt = kmem_alloc(i + 1, KM_SLEEP);
13482			bcopy(&str[desc->dofa_arg], fmt, i + 1);
13483			arg = (uint64_t)(uintptr_t)fmt;
13484		} else {
13485			if (kind == DTRACEACT_PRINTA) {
13486				ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13487				arg = 0;
13488			} else {
13489				arg = desc->dofa_arg;
13490			}
13491		}
13492
13493		act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13494		    desc->dofa_uarg, arg);
13495
13496		if (last != NULL) {
13497			last->dtad_next = act;
13498		} else {
13499			first = act;
13500		}
13501
13502		last = act;
13503
13504		if (desc->dofa_difo == DOF_SECIDX_NONE)
13505			continue;
13506
13507		if ((difosec = dtrace_dof_sect(dof,
13508		    DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13509			goto err;
13510
13511		act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13512
13513		if (act->dtad_difo == NULL)
13514			goto err;
13515	}
13516
13517	ASSERT(first != NULL);
13518	return (first);
13519
13520err:
13521	for (act = first; act != NULL; act = next) {
13522		next = act->dtad_next;
13523		dtrace_actdesc_release(act, vstate);
13524	}
13525
13526	return (NULL);
13527}
13528
13529static dtrace_ecbdesc_t *
13530dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13531    cred_t *cr)
13532{
13533	dtrace_ecbdesc_t *ep;
13534	dof_ecbdesc_t *ecb;
13535	dtrace_probedesc_t *desc;
13536	dtrace_predicate_t *pred = NULL;
13537
13538	if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13539		dtrace_dof_error(dof, "truncated ECB description");
13540		return (NULL);
13541	}
13542
13543	if (sec->dofs_align != sizeof (uint64_t)) {
13544		dtrace_dof_error(dof, "bad alignment in ECB description");
13545		return (NULL);
13546	}
13547
13548	ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13549	sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13550
13551	if (sec == NULL)
13552		return (NULL);
13553
13554	ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13555	ep->dted_uarg = ecb->dofe_uarg;
13556	desc = &ep->dted_probe;
13557
13558	if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13559		goto err;
13560
13561	if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13562		if ((sec = dtrace_dof_sect(dof,
13563		    DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13564			goto err;
13565
13566		if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13567			goto err;
13568
13569		ep->dted_pred.dtpdd_predicate = pred;
13570	}
13571
13572	if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13573		if ((sec = dtrace_dof_sect(dof,
13574		    DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13575			goto err;
13576
13577		ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13578
13579		if (ep->dted_action == NULL)
13580			goto err;
13581	}
13582
13583	return (ep);
13584
13585err:
13586	if (pred != NULL)
13587		dtrace_predicate_release(pred, vstate);
13588	kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13589	return (NULL);
13590}
13591
13592/*
13593 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
13594 * specified DOF.  At present, this amounts to simply adding 'ubase' to the
13595 * site of any user SETX relocations to account for load object base address.
13596 * In the future, if we need other relocations, this function can be extended.
13597 */
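/*
 * A worked example (illustrative values only): a relocation entry of
 * { dofr_type = DOF_RELO_SETX, dofr_offset = 0x28 } applied with a load
 * object base of 0x800000000 simply adds 0x800000000 to the 64-bit value
 * at offset 0x28 of the target section.
 */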
13598static int
13599dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
13600{
13601	uintptr_t daddr = (uintptr_t)dof;
13602	dof_relohdr_t *dofr =
13603	    (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13604	dof_sec_t *ss, *rs, *ts;
13605	dof_relodesc_t *r;
13606	uint_t i, n;
13607
13608	if (sec->dofs_size < sizeof (dof_relohdr_t) ||
13609	    sec->dofs_align != sizeof (dof_secidx_t)) {
13610		dtrace_dof_error(dof, "invalid relocation header");
13611		return (-1);
13612	}
13613
13614	ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
13615	rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
13616	ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
13617
13618	if (ss == NULL || rs == NULL || ts == NULL)
13619		return (-1); /* dtrace_dof_error() has been called already */
13620
13621	if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
13622	    rs->dofs_align != sizeof (uint64_t)) {
13623		dtrace_dof_error(dof, "invalid relocation section");
13624		return (-1);
13625	}
13626
13627	r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
13628	n = rs->dofs_size / rs->dofs_entsize;
13629
13630	for (i = 0; i < n; i++) {
13631		uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
13632
13633		switch (r->dofr_type) {
13634		case DOF_RELO_NONE:
13635			break;
13636		case DOF_RELO_SETX:
13637			if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
13638			    sizeof (uint64_t) > ts->dofs_size) {
13639				dtrace_dof_error(dof, "bad relocation offset");
13640				return (-1);
13641			}
13642
13643			if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
13644				dtrace_dof_error(dof, "misaligned setx relo");
13645				return (-1);
13646			}
13647
13648			*(uint64_t *)taddr += ubase;
13649			break;
13650		default:
13651			dtrace_dof_error(dof, "invalid relocation type");
13652			return (-1);
13653		}
13654
13655		r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
13656	}
13657
13658	return (0);
13659}
13660
13661/*
13662 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13663 * header:  it should be at the front of a memory region that is at least
13664 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13665 * size.  It need not be validated in any other way.
13666 */
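/*
 * For a caller sketch, see dtrace_anon_property() below, which invokes
 * dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
 * &dtrace_anon.dta_enabling, 0, B_TRUE) and then applies any DOF-carried
 * options via dtrace_dof_options().
 */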
13667static int
13668dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13669    dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13670{
13671	uint64_t len = dof->dofh_loadsz, seclen;
13672	uintptr_t daddr = (uintptr_t)dof;
13673	dtrace_ecbdesc_t *ep;
13674	dtrace_enabling_t *enab;
13675	uint_t i;
13676
13677	ASSERT(MUTEX_HELD(&dtrace_lock));
13678	ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13679
13680	/*
13681	 * Check the DOF header identification bytes.  In addition to checking
13682	 * valid settings, we also verify that unused bits/bytes are zeroed so
13683	 * we can use them later without fear of regressing existing binaries.
13684	 */
13685	if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13686	    DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13687		dtrace_dof_error(dof, "DOF magic string mismatch");
13688		return (-1);
13689	}
13690
13691	if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13692	    dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13693		dtrace_dof_error(dof, "DOF has invalid data model");
13694		return (-1);
13695	}
13696
13697	if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13698		dtrace_dof_error(dof, "DOF encoding mismatch");
13699		return (-1);
13700	}
13701
13702	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13703	    dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
13704		dtrace_dof_error(dof, "DOF version mismatch");
13705		return (-1);
13706	}
13707
13708	if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13709		dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13710		return (-1);
13711	}
13712
13713	if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13714		dtrace_dof_error(dof, "DOF uses too many integer registers");
13715		return (-1);
13716	}
13717
13718	if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13719		dtrace_dof_error(dof, "DOF uses too many tuple registers");
13720		return (-1);
13721	}
13722
13723	for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13724		if (dof->dofh_ident[i] != 0) {
13725			dtrace_dof_error(dof, "DOF has invalid ident byte set");
13726			return (-1);
13727		}
13728	}
13729
13730	if (dof->dofh_flags & ~DOF_FL_VALID) {
13731		dtrace_dof_error(dof, "DOF has invalid flag bits set");
13732		return (-1);
13733	}
13734
13735	if (dof->dofh_secsize == 0) {
13736		dtrace_dof_error(dof, "zero section header size");
13737		return (-1);
13738	}
13739
13740	/*
13741	 * Check that the section headers don't exceed the amount of DOF
13742	 * data.  Note that we cast the section size and number of sections
13743	 * to uint64_t's to prevent possible overflow in the multiplication.
13744	 */
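	/*
	 * (Illustrative values: a 32-bit product of dofh_secnum = 0x20000000
	 * and dofh_secsize = 0x20 would wrap to 0 and sidestep the check
	 * below; the 64-bit product, 0x400000000, does not.)
	 */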
13745	seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13746
13747	if (dof->dofh_secoff > len || seclen > len ||
13748	    dof->dofh_secoff + seclen > len) {
13749		dtrace_dof_error(dof, "truncated section headers");
13750		return (-1);
13751	}
13752
13753	if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13754		dtrace_dof_error(dof, "misaligned section headers");
13755		return (-1);
13756	}
13757
13758	if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13759		dtrace_dof_error(dof, "misaligned section size");
13760		return (-1);
13761	}
13762
13763	/*
13764	 * Take an initial pass through the section headers to be sure that
13765	 * the headers don't have stray offsets.  If the 'noprobes' flag is
13766	 * set, do not permit sections relating to providers, probes, or args.
13767	 */
13768	for (i = 0; i < dof->dofh_secnum; i++) {
13769		dof_sec_t *sec = (dof_sec_t *)(daddr +
13770		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13771
13772		if (noprobes) {
13773			switch (sec->dofs_type) {
13774			case DOF_SECT_PROVIDER:
13775			case DOF_SECT_PROBES:
13776			case DOF_SECT_PRARGS:
13777			case DOF_SECT_PROFFS:
13778				dtrace_dof_error(dof, "illegal sections "
13779				    "for enabling");
13780				return (-1);
13781			}
13782		}
13783
13784		if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
13785		    !(sec->dofs_flags & DOF_SECF_LOAD)) {
13786			dtrace_dof_error(dof, "loadable section with load "
13787			    "flag unset");
13788			return (-1);
13789		}
13790
13791		if (!(sec->dofs_flags & DOF_SECF_LOAD))
13792			continue; /* just ignore non-loadable sections */
13793
13794		if (!ISP2(sec->dofs_align)) {
13795			dtrace_dof_error(dof, "bad section alignment");
13796			return (-1);
13797		}
13798
13799		if (sec->dofs_offset & (sec->dofs_align - 1)) {
13800			dtrace_dof_error(dof, "misaligned section");
13801			return (-1);
13802		}
13803
13804		if (sec->dofs_offset > len || sec->dofs_size > len ||
13805		    sec->dofs_offset + sec->dofs_size > len) {
13806			dtrace_dof_error(dof, "corrupt section header");
13807			return (-1);
13808		}
13809
13810		if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13811		    sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13812			dtrace_dof_error(dof, "non-terminating string table");
13813			return (-1);
13814		}
13815	}
13816
13817	/*
13818	 * Take a second pass through the sections and locate and perform any
13819	 * relocations that are present.  We do this after the first pass to
13820	 * be sure that all sections have had their headers validated.
13821	 */
13822	for (i = 0; i < dof->dofh_secnum; i++) {
13823		dof_sec_t *sec = (dof_sec_t *)(daddr +
13824		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13825
13826		if (!(sec->dofs_flags & DOF_SECF_LOAD))
13827			continue; /* skip sections that are not loadable */
13828
13829		switch (sec->dofs_type) {
13830		case DOF_SECT_URELHDR:
13831			if (dtrace_dof_relocate(dof, sec, ubase) != 0)
13832				return (-1);
13833			break;
13834		}
13835	}
13836
13837	if ((enab = *enabp) == NULL)
13838		enab = *enabp = dtrace_enabling_create(vstate);
13839
13840	for (i = 0; i < dof->dofh_secnum; i++) {
13841		dof_sec_t *sec = (dof_sec_t *)(daddr +
13842		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13843
13844		if (sec->dofs_type != DOF_SECT_ECBDESC)
13845			continue;
13846
13847		if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
13848			dtrace_enabling_destroy(enab);
13849			*enabp = NULL;
13850			return (-1);
13851		}
13852
13853		dtrace_enabling_add(enab, ep);
13854	}
13855
13856	return (0);
13857}
13858
13859/*
13860 * Process DOF for any options.  This routine assumes that the DOF has been
13861 * at least processed by dtrace_dof_slurp().
13862 */
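/*
 * For example (illustrative values), a DOF_SECT_OPTDESC entry of
 * { dofo_option = DTRACEOPT_BUFSIZE, dofo_strtab = DOF_SECIDX_NONE,
 * dofo_value = 4194304 } is applied below as dtrace_state_option(state,
 * DTRACEOPT_BUFSIZE, 4194304).
 */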
13863static int
13864dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13865{
13866	int i, rval;
13867	uint32_t entsize;
13868	size_t offs;
13869	dof_optdesc_t *desc;
13870
13871	for (i = 0; i < dof->dofh_secnum; i++) {
13872		dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13873		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13874
13875		if (sec->dofs_type != DOF_SECT_OPTDESC)
13876			continue;
13877
13878		if (sec->dofs_align != sizeof (uint64_t)) {
13879			dtrace_dof_error(dof, "bad alignment in "
13880			    "option description");
13881			return (EINVAL);
13882		}
13883
13884		if ((entsize = sec->dofs_entsize) == 0) {
13885			dtrace_dof_error(dof, "zeroed option entry size");
13886			return (EINVAL);
13887		}
13888
13889		if (entsize < sizeof (dof_optdesc_t)) {
13890			dtrace_dof_error(dof, "bad option entry size");
13891			return (EINVAL);
13892		}
13893
13894		for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13895			desc = (dof_optdesc_t *)((uintptr_t)dof +
13896			    (uintptr_t)sec->dofs_offset + offs);
13897
13898			if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13899				dtrace_dof_error(dof, "non-zero option string");
13900				return (EINVAL);
13901			}
13902
13903			if (desc->dofo_value == DTRACEOPT_UNSET) {
13904				dtrace_dof_error(dof, "unset option");
13905				return (EINVAL);
13906			}
13907
13908			if ((rval = dtrace_state_option(state,
13909			    desc->dofo_option, desc->dofo_value)) != 0) {
13910				dtrace_dof_error(dof, "rejected option");
13911				return (rval);
13912			}
13913		}
13914	}
13915
13916	return (0);
13917}
13918
13919/*
13920 * DTrace Consumer State Functions
13921 */
13922static int
13923dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13924{
13925	size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
13926	void *base;
13927	uintptr_t limit;
13928	dtrace_dynvar_t *dvar, *next, *start;
13929	int i;
13930
13931	ASSERT(MUTEX_HELD(&dtrace_lock));
13932	ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13933
13934	bzero(dstate, sizeof (dtrace_dstate_t));
13935
13936	if ((dstate->dtds_chunksize = chunksize) == 0)
13937		dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13938
13939	if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13940		size = min;
13941
13942	if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
13943		return (ENOMEM);
13944
13945	dstate->dtds_size = size;
13946	dstate->dtds_base = base;
13947	dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13948	bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
13949
13950	hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13951
13952	if (hashsize != 1 && (hashsize & 1))
13953		hashsize--;
13954
13955	dstate->dtds_hashsize = hashsize;
13956	dstate->dtds_hash = dstate->dtds_base;
13957
13958	/*
13959	 * Set all of our hash buckets to point to the single sink, and (if
13960	 * it hasn't already been set) set the sink's hash value to be the
13961	 * sink sentinel value.  The sink is needed for dynamic variable
13962	 * lookups to know that they have iterated over an entire, valid hash
13963	 * chain.
13964	 */
13965	for (i = 0; i < hashsize; i++)
13966		dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13967
13968	if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13969		dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13970
13971	/*
13972	 * Determine number of active CPUs.  Divide free list evenly among
13973	 * active CPUs.
13974	 */
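	/*
	 * For example (illustrative numbers): with 4 CPUs and room for 960
	 * chunks after the hash table, maxper covers 240 chunks; CPUs 0-2
	 * each receive a 240-chunk free list and the last CPU receives
	 * whatever remains of the region.
	 */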
13975	start = (dtrace_dynvar_t *)
13976	    ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13977	limit = (uintptr_t)base + size;
13978
13979	maxper = (limit - (uintptr_t)start) / NCPU;
13980	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13981
13982#if !defined(sun)
13983	CPU_FOREACH(i) {
13984#else
13985	for (i = 0; i < NCPU; i++) {
13986#endif
13987		dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13988
13989		/*
13990		 * If we don't even have enough chunks to make it once through
13991		 * NCPUs, we're just going to allocate everything to the first
13992		 * CPU.  And if we're on the last CPU, we're going to allocate
13993		 * whatever is left over.  In either case, we set the limit to
13994		 * be the limit of the dynamic variable space.
13995		 */
13996		if (maxper == 0 || i == NCPU - 1) {
13997			limit = (uintptr_t)base + size;
13998			start = NULL;
13999		} else {
14000			limit = (uintptr_t)start + maxper;
14001			start = (dtrace_dynvar_t *)limit;
14002		}
14003
14004		ASSERT(limit <= (uintptr_t)base + size);
14005
14006		for (;;) {
14007			next = (dtrace_dynvar_t *)((uintptr_t)dvar +
14008			    dstate->dtds_chunksize);
14009
14010			if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
14011				break;
14012
14013			dvar->dtdv_next = next;
14014			dvar = next;
14015		}
14016
14017		if (maxper == 0)
14018			break;
14019	}
14020
14021	return (0);
14022}
14023
14024static void
14025dtrace_dstate_fini(dtrace_dstate_t *dstate)
14026{
14027	ASSERT(MUTEX_HELD(&cpu_lock));
14028
14029	if (dstate->dtds_base == NULL)
14030		return;
14031
14032	kmem_free(dstate->dtds_base, dstate->dtds_size);
14033	kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
14034}
14035
14036static void
14037dtrace_vstate_fini(dtrace_vstate_t *vstate)
14038{
14039	/*
14040	 * Logical XOR, where are you?
14041	 */
14042	ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
14043
14044	if (vstate->dtvs_nglobals > 0) {
14045		kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
14046		    sizeof (dtrace_statvar_t *));
14047	}
14048
14049	if (vstate->dtvs_ntlocals > 0) {
14050		kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
14051		    sizeof (dtrace_difv_t));
14052	}
14053
14054	ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
14055
14056	if (vstate->dtvs_nlocals > 0) {
14057		kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
14058		    sizeof (dtrace_statvar_t *));
14059	}
14060}
14061
14062#if defined(sun)
14063static void
14064dtrace_state_clean(dtrace_state_t *state)
14065{
14066	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14067		return;
14068
14069	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14070	dtrace_speculation_clean(state);
14071}
14072
14073static void
14074dtrace_state_deadman(dtrace_state_t *state)
14075{
14076	hrtime_t now;
14077
14078	dtrace_sync();
14079
14080	now = dtrace_gethrtime();
14081
14082	if (state != dtrace_anon.dta_state &&
14083	    now - state->dts_laststatus >= dtrace_deadman_user)
14084		return;
14085
14086	/*
14087	 * We must be sure that dts_alive never appears to be less than the
14088	 * value upon entry to dtrace_state_deadman(), and because we lack a
14089	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
14090	 * store INT64_MAX to it, followed by a memory barrier, followed by
14091	 * the new value.  This assures that dts_alive never appears to be
14092	 * less than its true value, regardless of the order in which the
14093	 * stores to the underlying storage are issued.
14094	 */
14095	state->dts_alive = INT64_MAX;
14096	dtrace_membar_producer();
14097	state->dts_alive = now;
14098}
14099#else
14100static void
14101dtrace_state_clean(void *arg)
14102{
14103	dtrace_state_t *state = arg;
14104	dtrace_optval_t *opt = state->dts_options;
14105
14106	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14107		return;
14108
14109	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14110	dtrace_speculation_clean(state);
14111
14112	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14113	    dtrace_state_clean, state);
14114}
14115
14116static void
14117dtrace_state_deadman(void *arg)
14118{
14119	dtrace_state_t *state = arg;
14120	hrtime_t now;
14121
14122	dtrace_sync();
14123
14124	dtrace_debug_output();
14125
14126	now = dtrace_gethrtime();
14127
14128	if (state != dtrace_anon.dta_state &&
14129	    now - state->dts_laststatus >= dtrace_deadman_user)
14130		return;
14131
14132	/*
14133	 * We must be sure that dts_alive never appears to be less than the
14134	 * value upon entry to dtrace_state_deadman(), and because we lack a
14135	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
14136	 * store INT64_MAX to it, followed by a memory barrier, followed by
14137	 * the new value.  This assures that dts_alive never appears to be
14138	 * less than its true value, regardless of the order in which the
14139	 * stores to the underlying storage are issued.
14140	 */
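	/*
	 * (A racing reader therefore observes the old value, INT64_MAX, or
	 * the new value -- never a value below the true liveness time.)
	 */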
14141	state->dts_alive = INT64_MAX;
14142	dtrace_membar_producer();
14143	state->dts_alive = now;
14144
14145	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14146	    dtrace_state_deadman, state);
14147}
14148#endif
14149
14150static dtrace_state_t *
14151#if defined(sun)
14152dtrace_state_create(dev_t *devp, cred_t *cr)
14153#else
14154dtrace_state_create(struct cdev *dev)
14155#endif
14156{
14157#if defined(sun)
14158	minor_t minor;
14159	major_t major;
14160#else
14161	cred_t *cr = NULL;
14162	int m = 0;
14163#endif
14164	char c[30];
14165	dtrace_state_t *state;
14166	dtrace_optval_t *opt;
14167	int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
14168
14169	ASSERT(MUTEX_HELD(&dtrace_lock));
14170	ASSERT(MUTEX_HELD(&cpu_lock));
14171
14172#if defined(sun)
14173	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
14174	    VM_BESTFIT | VM_SLEEP);
14175
14176	if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
14177		vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14178		return (NULL);
14179	}
14180
14181	state = ddi_get_soft_state(dtrace_softstate, minor);
14182#else
14183	if (dev != NULL) {
14184		cr = dev->si_cred;
14185		m = dev2unit(dev);
14186	}
14187
14188	/* Allocate memory for the state. */
14189	state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
14190#endif
14191
14192	state->dts_epid = DTRACE_EPIDNONE + 1;
14193
14194	#if defined(sun)
14195	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
14196	state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
14197	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14198
14199	if (devp != NULL) {
14200		major = getemajor(*devp);
14201	} else {
14202		major = ddi_driver_major(dtrace_devi);
14203	}
14204
14205	state->dts_dev = makedevice(major, minor);
14206
14207	if (devp != NULL)
14208		*devp = state->dts_dev;
14209	#else
	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
14210	state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
14211	state->dts_dev = dev;
14212#endif
14213
14214	/*
14215	 * We allocate NCPU buffers.  On the one hand, this can be quite
14216	 * a bit of memory per instance (nearly 36K on a Starcat).  On the
14217	 * other hand, it saves an additional memory reference in the probe
14218	 * path.
14219	 */
14220	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14221	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14222
14223#if defined(sun)
14224	state->dts_cleaner = CYCLIC_NONE;
14225	state->dts_deadman = CYCLIC_NONE;
14226#else
14227	callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
14228	callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
14229#endif
14230	state->dts_vstate.dtvs_state = state;
14231
14232	for (i = 0; i < DTRACEOPT_MAX; i++)
14233		state->dts_options[i] = DTRACEOPT_UNSET;
14234
14235	/*
14236	 * Set the default options.
14237	 */
14238	opt = state->dts_options;
14239	opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14240	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14241	opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14242	opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14243	opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14244	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14245	opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14246	opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14247	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14248	opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14249	opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14250	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14251	opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14252	opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14253
14254	state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
14255
14256	/*
14257	 * Depending on the user credentials, we set flag bits which alter probe
14258	 * visibility or the amount of destructiveness allowed.  In the case of
14259	 * actual anonymous tracing, or the possession of all privileges, all of
14260	 * the normal checks are bypassed.
14261	 */
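	/*
	 * Roughly, the cascade below grants (see the code for the
	 * authoritative logic):
	 *
	 *	dtrace_kernel	visibility into the kernel, all processes
	 *			and all zones
	 *	dtrace_user	the syscall and profile providers, with
	 *			proc_owner and/or proc_zone widening the
	 *			destructive scope
	 *	dtrace_proc	the fasttrap and pid providers
	 */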
14262	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14263		state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14264		state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14265	} else {
14266		/*
14267		 * Set up the credentials for this instantiation.  We take a
14268		 * hold on the credential to prevent it from disappearing on
14269		 * us; this in turn prevents the zone_t referenced by this
14270		 * credential from disappearing.  This means that we can
14271		 * examine the credential and the zone from probe context.
14272		 */
14273		crhold(cr);
14274		state->dts_cred.dcr_cred = cr;
14275
14276		/*
14277		 * CRA_PROC means "we have *some* privilege for dtrace" and
14278		 * unlocks the use of variables like pid, zonename, etc.
14279		 */
14280		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14281		    PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14282			state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14283		}
14284
14285		/*
14286		 * dtrace_user allows use of syscall and profile providers.
14287		 * If the user also has proc_owner and/or proc_zone, we
14288		 * extend the scope to include additional visibility and
14289		 * destructive power.
14290		 */
14291		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14292			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14293				state->dts_cred.dcr_visible |=
14294				    DTRACE_CRV_ALLPROC;
14295
14296				state->dts_cred.dcr_action |=
14297				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14298			}
14299
14300			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14301				state->dts_cred.dcr_visible |=
14302				    DTRACE_CRV_ALLZONE;
14303
14304				state->dts_cred.dcr_action |=
14305				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14306			}
14307
14308			/*
14309			 * If we have all privs in whatever zone this is,
14310			 * we can do destructive things to processes which
14311			 * have altered credentials.
14312			 */
14313#if defined(sun)
14314			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14315			    cr->cr_zone->zone_privset)) {
14316				state->dts_cred.dcr_action |=
14317				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14318			}
14319#endif
14320		}
14321
14322		/*
14323		 * Holding the dtrace_kernel privilege also implies that
14324		 * the user has the dtrace_user privilege from a visibility
14325		 * perspective.  But without further privileges, some
14326		 * destructive actions are not available.
14327		 */
14328		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14329			/*
14330			 * Make all probes in all zones visible.  However,
14331			 * this doesn't mean that all actions become available
14332			 * to all zones.
14333			 */
14334			state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14335			    DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14336
14337			state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14338			    DTRACE_CRA_PROC;
14339			/*
14340			 * Holding proc_owner means that destructive actions
14341			 * for *this* zone are allowed.
14342			 */
14343			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14344				state->dts_cred.dcr_action |=
14345				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14346
14347			/*
14348			 * Holding proc_zone means that destructive actions
14349			 * for this user/group ID in all zones are allowed.
14350			 */
14351			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14352				state->dts_cred.dcr_action |=
14353				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14354
14355#if defined(sun)
14356			/*
14357			 * If we have all privs in whatever zone this is,
14358			 * we can do destructive things to processes which
14359			 * have altered credentials.
14360			 */
14361			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14362			    cr->cr_zone->zone_privset)) {
14363				state->dts_cred.dcr_action |=
14364				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14365			}
14366#endif
14367		}
14368
14369		/*
14370		 * Holding the dtrace_proc privilege gives control over fasttrap
14371		 * and pid providers.  We need to grant wider destructive
14372		 * privileges in the event that the user has proc_owner and/or
14373		 * proc_zone.
14374		 */
14375		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14376			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14377				state->dts_cred.dcr_action |=
14378				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14379
14380			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14381				state->dts_cred.dcr_action |=
14382				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14383		}
14384	}
14385
14386	return (state);
14387}
14388
14389static int
14390dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14391{
14392	dtrace_optval_t *opt = state->dts_options, size;
14393	processorid_t cpu = 0;
14394	int flags = 0, rval, factor, divisor = 1;
14395
14396	ASSERT(MUTEX_HELD(&dtrace_lock));
14397	ASSERT(MUTEX_HELD(&cpu_lock));
14398	ASSERT(which < DTRACEOPT_MAX);
14399	ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14400	    (state == dtrace_anon.dta_state &&
14401	    state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14402
14403	if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14404		return (0);
14405
14406	if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14407		cpu = opt[DTRACEOPT_CPU];
14408
14409	if (which == DTRACEOPT_SPECSIZE)
14410		flags |= DTRACEBUF_NOSWITCH;
14411
14412	if (which == DTRACEOPT_BUFSIZE) {
14413		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14414			flags |= DTRACEBUF_RING;
14415
14416		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14417			flags |= DTRACEBUF_FILL;
14418
14419		if (state != dtrace_anon.dta_state ||
14420		    state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14421			flags |= DTRACEBUF_INACTIVE;
14422	}
14423
14424	for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
14425		/*
14426		 * The size must be 8-byte aligned.  If the size is not 8-byte
14427		 * aligned, drop it down by the difference.
14428		 */
14429		if (size & (sizeof (uint64_t) - 1))
14430			size -= size & (sizeof (uint64_t) - 1);
14431
14432		if (size < state->dts_reserve) {
14433			/*
14434			 * Buffers must always be large enough to accommodate
14435			 * their prereserved space.  We return E2BIG instead
14436			 * of ENOMEM in this case so that user-level software
14437			 * can differentiate between the two cases.
14438			 */
14439			return (E2BIG);
14440		}
14441
14442		rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
14443
14444		if (rval != ENOMEM) {
14445			opt[which] = size;
14446			return (rval);
14447		}
14448
14449		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14450			return (rval);
14451
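		/*
		 * Shrink by the smallest power of two (at least 2) that is
		 * no less than the allocation-failure factor reported by
		 * dtrace_buffer_alloc().
		 */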
14452		for (divisor = 2; divisor < factor; divisor <<= 1)
14453			continue;
14454	}
14455
14456	return (ENOMEM);
14457}
14458
14459static int
14460dtrace_state_buffers(dtrace_state_t *state)
14461{
14462	dtrace_speculation_t *spec = state->dts_speculations;
14463	int rval, i;
14464
14465	if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14466	    DTRACEOPT_BUFSIZE)) != 0)
14467		return (rval);
14468
14469	if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14470	    DTRACEOPT_AGGSIZE)) != 0)
14471		return (rval);
14472
14473	for (i = 0; i < state->dts_nspeculations; i++) {
14474		if ((rval = dtrace_state_buffer(state,
14475		    spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14476			return (rval);
14477	}
14478
14479	return (0);
14480}
14481
14482static void
14483dtrace_state_prereserve(dtrace_state_t *state)
14484{
14485	dtrace_ecb_t *ecb;
14486	dtrace_probe_t *probe;
14487
14488	state->dts_reserve = 0;
14489
14490	if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14491		return;
14492
14493	/*
14494	 * If our buffer policy is a "fill" buffer policy, we need to set the
14495	 * prereserved space to be the space required by the END probes.
14496	 */
14497	probe = dtrace_probes[dtrace_probeid_end - 1];
14498	ASSERT(probe != NULL);
14499
14500	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14501		if (ecb->dte_state != state)
14502			continue;
14503
14504		state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14505	}
14506}
14507
14508static int
14509dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14510{
14511	dtrace_optval_t *opt = state->dts_options, sz, nspec;
14512	dtrace_speculation_t *spec;
14513	dtrace_buffer_t *buf;
14514#if defined(sun)
14515	cyc_handler_t hdlr;
14516	cyc_time_t when;
14517#endif
14518	int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
14519	dtrace_icookie_t cookie;
14520
14521	mutex_enter(&cpu_lock);
14522	mutex_enter(&dtrace_lock);
14523
14524	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14525		rval = EBUSY;
14526		goto out;
14527	}
14528
14529	/*
14530	 * Before we can perform any checks, we must prime all of the
14531	 * retained enablings that correspond to this state.
14532	 */
14533	dtrace_enabling_prime(state);
14534
14535	if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14536		rval = EACCES;
14537		goto out;
14538	}
14539
14540	dtrace_state_prereserve(state);
14541
14542	/*
14543	 * Now we want to try to allocate our speculations.
14544	 * We do not automatically resize the number of speculations; if
14545	 * this fails, we will fail the operation.
14546	 */
14547	nspec = opt[DTRACEOPT_NSPEC];
14548	ASSERT(nspec != DTRACEOPT_UNSET);
14549
14550	if (nspec > INT_MAX) {
14551		rval = ENOMEM;
14552		goto out;
14553	}
14554
14555	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
14556	    KM_NOSLEEP | KM_NORMALPRI);
14557
14558	if (spec == NULL) {
14559		rval = ENOMEM;
14560		goto out;
14561	}
14562
14563	state->dts_speculations = spec;
14564	state->dts_nspeculations = (int)nspec;
14565
14566	for (i = 0; i < nspec; i++) {
14567		if ((buf = kmem_zalloc(bufsize,
14568		    KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
14569			rval = ENOMEM;
14570			goto err;
14571		}
14572
14573		spec[i].dtsp_buffer = buf;
14574	}
14575
14576	if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14577		if (dtrace_anon.dta_state == NULL) {
14578			rval = ENOENT;
14579			goto out;
14580		}
14581
14582		if (state->dts_necbs != 0) {
14583			rval = EALREADY;
14584			goto out;
14585		}
14586
14587		state->dts_anon = dtrace_anon_grab();
14588		ASSERT(state->dts_anon != NULL);
14589		state = state->dts_anon;
14590
14591		/*
14592		 * We want "grabanon" to be set in the grabbed state, so we'll
14593		 * copy that option value from the grabbing state into the
14594		 * grabbed state.
14595		 */
14596		state->dts_options[DTRACEOPT_GRABANON] =
14597		    opt[DTRACEOPT_GRABANON];
14598
14599		*cpu = dtrace_anon.dta_beganon;
14600
14601		/*
14602		 * If the anonymous state is active (as it almost certainly
14603		 * is if the anonymous enabling ultimately matched anything),
14604		 * we don't allow any further option processing -- but we
14605		 * don't return failure.
14606		 */
14607		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14608			goto out;
14609	}
14610
14611	if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14612	    opt[DTRACEOPT_AGGSIZE] != 0) {
14613		if (state->dts_aggregations == NULL) {
14614			/*
14615			 * We're not going to create an aggregation buffer
14616			 * because we don't have any ECBs that contain
14617			 * aggregations -- set this option to 0.
14618			 */
14619			opt[DTRACEOPT_AGGSIZE] = 0;
14620		} else {
14621			/*
14622			 * If we have an aggregation buffer, we must also have
14623			 * a buffer to use as scratch.
14624			 */
14625			if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14626			    opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14627				opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14628			}
14629		}
14630	}
14631
14632	if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14633	    opt[DTRACEOPT_SPECSIZE] != 0) {
14634		if (!state->dts_speculates) {
14635			/*
14636			 * We're not going to create speculation buffers
14637			 * because we don't have any ECBs that actually
14638			 * speculate -- set the speculation size to 0.
14639			 */
14640			opt[DTRACEOPT_SPECSIZE] = 0;
14641		}
14642	}
14643
14644	/*
14645	 * The bare minimum size for any buffer that we're actually going to
14646	 * do anything to is sizeof (uint64_t).
14647	 */
14648	sz = sizeof (uint64_t);
14649
14650	if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
14651	    (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
14652	    (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14653		/*
14654		 * A buffer size has been explicitly set to 0 (or to a size
14655		 * that will be adjusted to 0) and we need the space -- we
14656		 * need to return failure.  We return ENOSPC to differentiate
14657		 * it from failing to allocate a buffer due to failure to meet
14658		 * the reserve (for which we return E2BIG).
14659		 */
14660		rval = ENOSPC;
14661		goto out;
14662	}
14663
14664	if ((rval = dtrace_state_buffers(state)) != 0)
14665		goto err;
14666
14667	if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14668		sz = dtrace_dstate_defsize;
14669
14670	do {
14671		rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
14672
14673		if (rval == 0)
14674			break;
14675
14676		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14677			goto err;
14678	} while (sz >>= 1);
14679
14680	opt[DTRACEOPT_DYNVARSIZE] = sz;
14681
14682	if (rval != 0)
14683		goto err;
14684
14685	if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14686		opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14687
14688	if (opt[DTRACEOPT_CLEANRATE] == 0)
14689		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14690
14691	if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14692		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14693
14694	if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14695		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14696
14697	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14698#if defined(sun)
14699	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14700	hdlr.cyh_arg = state;
14701	hdlr.cyh_level = CY_LOW_LEVEL;
14702
14703	when.cyt_when = 0;
14704	when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14705
14706	state->dts_cleaner = cyclic_add(&hdlr, &when);
14707
14708	hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14709	hdlr.cyh_arg = state;
14710	hdlr.cyh_level = CY_LOW_LEVEL;
14711
14712	when.cyt_when = 0;
14713	when.cyt_interval = dtrace_deadman_interval;
14714
14715	state->dts_deadman = cyclic_add(&hdlr, &when);
14716#else
14717	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14718	    dtrace_state_clean, state);
14719	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14720	    dtrace_state_deadman, state);
14721#endif
14722
14723	state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14724
14725#if defined(sun)
14726	if (state->dts_getf != 0 &&
14727	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
14728		/*
14729		 * We don't have kernel privs but we have at least one call
14730		 * to getf(); we need to bump our zone's count, and (if
14731		 * this is the first enabling to have an unprivileged call
14732		 * to getf()) we need to hook into closef().
14733		 */
14734		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
14735
14736		if (dtrace_getf++ == 0) {
14737			ASSERT(dtrace_closef == NULL);
14738			dtrace_closef = dtrace_getf_barrier;
14739		}
14740	}
14741#endif
14742
14743	/*
14744	 * Now it's time to actually fire the BEGIN probe.  We need to disable
14745	 * interrupts here both to record the CPU on which we fired the BEGIN
14746	 * probe (the data from this CPU will be processed first at user
14747	 * level) and to manually activate the buffer for this CPU.
14748	 */
14749	cookie = dtrace_interrupt_disable();
14750	*cpu = curcpu;
14751	ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14752	state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14753
14754	dtrace_probe(dtrace_probeid_begin,
14755	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14756	dtrace_interrupt_enable(cookie);
14757	/*
14758	 * We may have had an exit action from a BEGIN probe; only change our
14759	 * state to ACTIVE if we're still in WARMUP.
14760	 */
14761	ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
14762	    state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14763
14764	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14765		state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14766
14767	/*
14768	 * Regardless of whether we're now in ACTIVE or DRAINING, we
14769	 * want each CPU to transition its principal buffer out of the
14770	 * INACTIVE state.  Doing this assures that no CPU will suddenly begin
14771	 * processing an ECB halfway down a probe's ECB chain; all CPUs will
14772	 * atomically transition from processing none of a state's ECBs to
14773	 * processing all of them.
14774	 */
14775	dtrace_xcall(DTRACE_CPUALL,
14776	    (dtrace_xcall_t)dtrace_buffer_activate, state);
14777	goto out;
14778
14779err:
14780	dtrace_buffer_free(state->dts_buffer);
14781	dtrace_buffer_free(state->dts_aggbuffer);
14782
14783	if ((nspec = state->dts_nspeculations) == 0) {
14784		ASSERT(state->dts_speculations == NULL);
14785		goto out;
14786	}
14787
14788	spec = state->dts_speculations;
14789	ASSERT(spec != NULL);
14790
14791	for (i = 0; i < state->dts_nspeculations; i++) {
14792		if ((buf = spec[i].dtsp_buffer) == NULL)
14793			break;
14794
14795		dtrace_buffer_free(buf);
14796		kmem_free(buf, bufsize);
14797	}
14798
14799	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14800	state->dts_nspeculations = 0;
14801	state->dts_speculations = NULL;
14802
14803out:
14804	mutex_exit(&dtrace_lock);
14805	mutex_exit(&cpu_lock);
14806
14807	return (rval);
14808}
14809
14810static int
14811dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14812{
14813	dtrace_icookie_t cookie;
14814
14815	ASSERT(MUTEX_HELD(&dtrace_lock));
14816
14817	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14818	    state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14819		return (EINVAL);
14820
14821	/*
14822	 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14823	 * to be sure that every CPU has seen it.  See below for the details
14824	 * on why this is done.
14825	 */
14826	state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14827	dtrace_sync();
14828
14829	/*
14830	 * By this point, it is impossible for any CPU to be still processing
14831	 * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
14832	 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14833	 * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
14834	 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14835	 * iff we're in the END probe.
14836	 */
14837	state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14838	dtrace_sync();
14839	ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14840
14841	/*
14842	 * Finally, we can release the reserve and call the END probe.  We
14843	 * disable interrupts across calling the END probe to allow us to
14844	 * return the CPU on which we actually called the END probe.  This
14845	 * allows user-land to be sure that this CPU's principal buffer is
14846	 * processed last.
14847	 */
14848	state->dts_reserve = 0;
14849
14850	cookie = dtrace_interrupt_disable();
14851	*cpu = curcpu;
14852	dtrace_probe(dtrace_probeid_end,
14853	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14854	dtrace_interrupt_enable(cookie);
14855
14856	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14857	dtrace_sync();
14858
14859#if defined(sun)
14860	if (state->dts_getf != 0 &&
14861	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
14862		/*
14863		 * We don't have kernel privs but we have at least one call
14864		 * to getf(); we need to lower our zone's count, and (if
14865		 * this is the last enabling to have an unprivileged call
14866		 * to getf()) we need to clear the closef() hook.
14867		 */
14868		ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
14869		ASSERT(dtrace_closef == dtrace_getf_barrier);
14870		ASSERT(dtrace_getf > 0);
14871
14872		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
14873
14874		if (--dtrace_getf == 0)
14875			dtrace_closef = NULL;
14876	}
14877#endif
14878
14879	return (0);
14880}
14881
14882static int
14883dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14884    dtrace_optval_t val)
14885{
14886	ASSERT(MUTEX_HELD(&dtrace_lock));
14887
14888	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14889		return (EBUSY);
14890
14891	if (option >= DTRACEOPT_MAX)
14892		return (EINVAL);
14893
14894	if (option != DTRACEOPT_CPU && val < 0)
14895		return (EINVAL);
14896
14897	switch (option) {
14898	case DTRACEOPT_DESTRUCTIVE:
14899		if (dtrace_destructive_disallow)
14900			return (EACCES);
14901
14902		state->dts_cred.dcr_destructive = 1;
14903		break;
14904
14905	case DTRACEOPT_BUFSIZE:
14906	case DTRACEOPT_DYNVARSIZE:
14907	case DTRACEOPT_AGGSIZE:
14908	case DTRACEOPT_SPECSIZE:
14909	case DTRACEOPT_STRSIZE:
14910		if (val < 0)
14911			return (EINVAL);
14912
14913		if (val >= LONG_MAX) {
14914			/*
14915			 * If this is an otherwise negative value, set it to
14916			 * the highest multiple of 128m less than LONG_MAX.
14917			 * Technically, we're adjusting the size without
14918			 * regard to the buffer resizing policy, but in fact,
14919			 * this has no effect -- if we set the buffer size to
14920			 * ~LONG_MAX and the buffer policy is ultimately set to
14921			 * be "manual", the buffer allocation is guaranteed to
14922			 * fail, if only because the allocation requires two
14923			 * buffers.  (We set the size to the highest
14924			 * multiple of 128m because it ensures that the size
14925			 * will remain a multiple of a megabyte when
14926			 * repeatedly halved -- all the way down to 15m.)
14927			 */
14928			val = LONG_MAX - (1 << 27) + 1;
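			/* (the largest multiple of 128m representable in a long) */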
14929		}
14930	}
14931
14932	state->dts_options[option] = val;
14933
14934	return (0);
14935}
14936
14937static void
14938dtrace_state_destroy(dtrace_state_t *state)
14939{
14940	dtrace_ecb_t *ecb;
14941	dtrace_vstate_t *vstate = &state->dts_vstate;
14942#if defined(sun)
14943	minor_t minor = getminor(state->dts_dev);
14944#endif
14945	int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
14946	dtrace_speculation_t *spec = state->dts_speculations;
14947	int nspec = state->dts_nspeculations;
14948	uint32_t match;
14949
14950	ASSERT(MUTEX_HELD(&dtrace_lock));
14951	ASSERT(MUTEX_HELD(&cpu_lock));
14952
14953	/*
14954	 * First, retract any retained enablings for this state.
14955	 */
14956	dtrace_enabling_retract(state);
14957	ASSERT(state->dts_nretained == 0);
14958
14959	if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14960	    state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14961		/*
14962		 * We have managed to come into dtrace_state_destroy() on a
14963		 * hot enabling -- almost certainly because of a disorderly
14964		 * shutdown of a consumer.  (That is, a consumer that is
14965		 * exiting without having called dtrace_stop().) In this case,
14966		 * we're going to set our activity to be KILLED, and then
14967		 * issue a sync to be sure that everyone is out of probe
14968		 * context before we start blowing away ECBs.
14969		 */
14970		state->dts_activity = DTRACE_ACTIVITY_KILLED;
14971		dtrace_sync();
14972	}
14973
14974	/*
14975	 * Release the credential hold we took in dtrace_state_create().
14976	 */
14977	if (state->dts_cred.dcr_cred != NULL)
14978		crfree(state->dts_cred.dcr_cred);
14979
14980	/*
14981	 * Now we can safely disable and destroy any enabled probes.  Because
14982	 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14983	 * (especially if they're all enabled), we take two passes through the
14984	 * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14985	 * in the second we disable whatever is left over.
14986	 */
14987	for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14988		for (i = 0; i < state->dts_necbs; i++) {
14989			if ((ecb = state->dts_ecbs[i]) == NULL)
14990				continue;
14991
14992			if (match && ecb->dte_probe != NULL) {
14993				dtrace_probe_t *probe = ecb->dte_probe;
14994				dtrace_provider_t *prov = probe->dtpr_provider;
14995
14996				if (!(prov->dtpv_priv.dtpp_flags & match))
14997					continue;
14998			}
14999
15000			dtrace_ecb_disable(ecb);
15001			dtrace_ecb_destroy(ecb);
15002		}
15003
15004		if (!match)
15005			break;
15006	}
15007
15008	/*
15009	 * Before we free the buffers, perform one more sync to assure that
15010	 * every CPU is out of probe context.
15011	 */
15012	dtrace_sync();
15013
15014	dtrace_buffer_free(state->dts_buffer);
15015	dtrace_buffer_free(state->dts_aggbuffer);
15016
15017	for (i = 0; i < nspec; i++)
15018		dtrace_buffer_free(spec[i].dtsp_buffer);
15019
15020#if defined(sun)
15021	if (state->dts_cleaner != CYCLIC_NONE)
15022		cyclic_remove(state->dts_cleaner);
15023
15024	if (state->dts_deadman != CYCLIC_NONE)
15025		cyclic_remove(state->dts_deadman);
15026#else
15027	callout_stop(&state->dts_cleaner);
15028	callout_drain(&state->dts_cleaner);
15029	callout_stop(&state->dts_deadman);
15030	callout_drain(&state->dts_deadman);
15031#endif
15032
15033	dtrace_dstate_fini(&vstate->dtvs_dynvars);
15034	dtrace_vstate_fini(vstate);
15035	if (state->dts_ecbs != NULL)
15036		kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
15037
15038	if (state->dts_aggregations != NULL) {
15039#ifdef DEBUG
15040		for (i = 0; i < state->dts_naggregations; i++)
15041			ASSERT(state->dts_aggregations[i] == NULL);
15042#endif
15043		ASSERT(state->dts_naggregations > 0);
15044		kmem_free(state->dts_aggregations,
15045		    state->dts_naggregations * sizeof (dtrace_aggregation_t *));
15046	}
15047
15048	kmem_free(state->dts_buffer, bufsize);
15049	kmem_free(state->dts_aggbuffer, bufsize);
15050
15051	for (i = 0; i < nspec; i++)
15052		kmem_free(spec[i].dtsp_buffer, bufsize);
15053
15054	if (spec != NULL)
15055		kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
15056
15057	dtrace_format_destroy(state);
15058
15059	if (state->dts_aggid_arena != NULL) {
15060#if defined(sun)
15061		vmem_destroy(state->dts_aggid_arena);
15062#else
15063		delete_unrhdr(state->dts_aggid_arena);
15064#endif
15065		state->dts_aggid_arena = NULL;
15066	}
15067#if defined(sun)
15068	ddi_soft_state_free(dtrace_softstate, minor);
15069	vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
15070#endif
15071}
15072
15073/*
15074 * DTrace Anonymous Enabling Functions
15075 */
15076static dtrace_state_t *
15077dtrace_anon_grab(void)
15078{
15079	dtrace_state_t *state;
15080
15081	ASSERT(MUTEX_HELD(&dtrace_lock));
15082
15083	if ((state = dtrace_anon.dta_state) == NULL) {
15084		ASSERT(dtrace_anon.dta_enabling == NULL);
15085		return (NULL);
15086	}
15087
15088	ASSERT(dtrace_anon.dta_enabling != NULL);
15089	ASSERT(dtrace_retained != NULL);
15090
15091	dtrace_enabling_destroy(dtrace_anon.dta_enabling);
15092	dtrace_anon.dta_enabling = NULL;
15093	dtrace_anon.dta_state = NULL;
15094
15095	return (state);
15096}
15097
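/*
 * Consume any "dof-data-N" properties -- typically installed by a privileged
 * "dtrace -A" invocation -- and turn them into an anonymous enabling, so
 * that tracing can be active from early in boot, before any consumer runs.
 */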
15098static void
15099dtrace_anon_property(void)
15100{
15101	int i, rv;
15102	dtrace_state_t *state;
15103	dof_hdr_t *dof;
15104	char c[32];		/* enough for "dof-data-" + digits */
15105
15106	ASSERT(MUTEX_HELD(&dtrace_lock));
15107	ASSERT(MUTEX_HELD(&cpu_lock));
15108
15109	for (i = 0; ; i++) {
15110		(void) snprintf(c, sizeof (c), "dof-data-%d", i);
15111
15112		dtrace_err_verbose = 1;
15113
15114		if ((dof = dtrace_dof_property(c)) == NULL) {
15115			dtrace_err_verbose = 0;
15116			break;
15117		}
15118
15119#if defined(sun)
15120		/*
15121		 * We want to create anonymous state, so we need to transition
15122		 * the kernel debugger to indicate that DTrace is active.  If
15123		 * this fails (e.g. because the debugger has modified text in
15124		 * some way), we won't continue with the processing.
15125		 */
15126		if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15127			cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15128			    "enabling ignored.");
15129			dtrace_dof_destroy(dof);
15130			break;
15131		}
15132#endif
15133
15134		/*
15135		 * If we haven't allocated an anonymous state, we'll do so now.
15136		 */
15137		if ((state = dtrace_anon.dta_state) == NULL) {
15138#if defined(sun)
15139			state = dtrace_state_create(NULL, NULL);
15140#else
15141			state = dtrace_state_create(NULL);
15142#endif
15143			dtrace_anon.dta_state = state;
15144
15145			if (state == NULL) {
15146				/*
15147				 * This basically shouldn't happen:  the only
15148				 * failure mode from dtrace_state_create() is a
15149				 * failure of ddi_soft_state_zalloc() that
15150				 * itself should never happen.  Still, the
15151				 * interface allows for a failure mode, and
15152				 * we want to fail as gracefully as possible:
15153				 * we'll emit an error message and cease
15154				 * processing anonymous state in this case.
15155				 */
15156				cmn_err(CE_WARN, "failed to create "
15157				    "anonymous state");
15158				dtrace_dof_destroy(dof);
15159				break;
15160			}
15161		}
15162
15163		rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
15164		    &dtrace_anon.dta_enabling, 0, B_TRUE);
15165
15166		if (rv == 0)
15167			rv = dtrace_dof_options(dof, state);
15168
15169		dtrace_err_verbose = 0;
15170		dtrace_dof_destroy(dof);
15171
15172		if (rv != 0) {
15173			/*
15174			 * This is malformed DOF; chuck any anonymous state
15175			 * that we created.
15176			 */
15177			ASSERT(dtrace_anon.dta_enabling == NULL);
15178			dtrace_state_destroy(state);
15179			dtrace_anon.dta_state = NULL;
15180			break;
15181		}
15182
15183		ASSERT(dtrace_anon.dta_enabling != NULL);
15184	}
15185
15186	if (dtrace_anon.dta_enabling != NULL) {
15187		int rval;
15188
15189		/*
15190		 * dtrace_enabling_retain() can only fail because we are
15191		 * trying to retain more enablings than are allowed -- but
15192		 * we only have one anonymous enabling, and we are guaranteed
15193		 * to be allowed at least one retained enabling; we assert
15194		 * that dtrace_enabling_retain() returns success.
15195		 */
15196		rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
15197		ASSERT(rval == 0);
15198
15199		dtrace_enabling_dump(dtrace_anon.dta_enabling);
15200	}
15201}
15202
15203/*
15204 * DTrace Helper Functions
15205 */
15206static void
15207dtrace_helper_trace(dtrace_helper_action_t *helper,
15208    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15209{
15210	uint32_t size, next, nnext, i;
15211	dtrace_helptrace_t *ent, *buffer;
15212	uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
15213
15214	if ((buffer = dtrace_helptrace_buffer) == NULL)
15215		return;
15216
15217	ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15218
15219	/*
15220	 * What would a tracing framework be without its own tracing
15221	 * framework?  (Well, a hell of a lot simpler, for starters...)
15222	 */
15223	size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15224	    sizeof (uint64_t) - sizeof (uint64_t);
15225
15226	/*
15227	 * Iterate until we can allocate a slot in the trace buffer.
15228	 */
15229	do {
15230		next = dtrace_helptrace_next;
15231
15232		if (next + size < dtrace_helptrace_bufsize) {
15233			nnext = next + size;
15234		} else {
15235			nnext = size;
15236		}
15237	} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
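
	/*
	 * A successful compare-and-swap has reserved [next, next + size) for
	 * us.  Note that if the cursor wrapped, nnext is the size of a single
	 * entry and our slot actually begins at offset zero -- hence the
	 * fixup below.
	 */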
15238
15239	/*
15240	 * We have our slot; fill it in.
15241	 */
15242	if (nnext == size) {
15243		dtrace_helptrace_wrapped++;
15244		next = 0;
15245	}
15246
15247	ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next);
15248	ent->dtht_helper = helper;
15249	ent->dtht_where = where;
15250	ent->dtht_nlocals = vstate->dtvs_nlocals;
15251
15252	ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15253	    mstate->dtms_fltoffs : -1;
15254	ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15255	ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
15256
15257	for (i = 0; i < vstate->dtvs_nlocals; i++) {
15258		dtrace_statvar_t *svar;
15259
15260		if ((svar = vstate->dtvs_locals[i]) == NULL)
15261			continue;
15262
15263		ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
15264		ent->dtht_locals[i] =
15265		    ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
15266	}
15267}
15268
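/*
 * Invoke the helper actions of the given kind (e.g.
 * DTRACE_HELPER_ACTION_USTACK) for the current process.  For each helper in
 * the chain we evaluate the predicate, if any, and then emulate each action;
 * the value of the last action emulated is returned, and any fault causes an
 * early return of zero.
 */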
15269static uint64_t
15270dtrace_helper(int which, dtrace_mstate_t *mstate,
15271    dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15272{
15273	uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
15274	uint64_t sarg0 = mstate->dtms_arg[0];
15275	uint64_t sarg1 = mstate->dtms_arg[1];
15276	uint64_t rval = 0;
15277	dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15278	dtrace_helper_action_t *helper;
15279	dtrace_vstate_t *vstate;
15280	dtrace_difo_t *pred;
15281	int i, trace = dtrace_helptrace_buffer != NULL;
15282
15283	ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15284
15285	if (helpers == NULL)
15286		return (0);
15287
15288	if ((helper = helpers->dthps_actions[which]) == NULL)
15289		return (0);
15290
15291	vstate = &helpers->dthps_vstate;
15292	mstate->dtms_arg[0] = arg0;
15293	mstate->dtms_arg[1] = arg1;
15294
15295	/*
15296	 * Now iterate over each helper.  If its predicate evaluates to 'true',
15297	 * we'll call the corresponding actions.  Note that the below calls
15298	 * to dtrace_dif_emulate() may set faults in machine state.  This is
15299	 * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
15300	 * the stored DIF offset with its own (which is the desired behavior).
15301	 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15302	 * from machine state; this is okay, too.
15303	 */
15304	for (; helper != NULL; helper = helper->dtha_next) {
15305		if ((pred = helper->dtha_predicate) != NULL) {
15306			if (trace)
15307				dtrace_helper_trace(helper, mstate, vstate, 0);
15308
15309			if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15310				goto next;
15311
15312			if (*flags & CPU_DTRACE_FAULT)
15313				goto err;
15314		}
15315
15316		for (i = 0; i < helper->dtha_nactions; i++) {
15317			if (trace)
15318				dtrace_helper_trace(helper,
15319				    mstate, vstate, i + 1);
15320
15321			rval = dtrace_dif_emulate(helper->dtha_actions[i],
15322			    mstate, vstate, state);
15323
15324			if (*flags & CPU_DTRACE_FAULT)
15325				goto err;
15326		}
15327
15328next:
15329		if (trace)
15330			dtrace_helper_trace(helper, mstate, vstate,
15331			    DTRACE_HELPTRACE_NEXT);
15332	}
15333
15334	if (trace)
15335		dtrace_helper_trace(helper, mstate, vstate,
15336		    DTRACE_HELPTRACE_DONE);
15337
15338	/*
15339	 * Restore the arg0 that we saved upon entry.
15340	 */
15341	mstate->dtms_arg[0] = sarg0;
15342	mstate->dtms_arg[1] = sarg1;
15343
15344	return (rval);
15345
15346err:
15347	if (trace)
15348		dtrace_helper_trace(helper, mstate, vstate,
15349		    DTRACE_HELPTRACE_ERR);
15350
15351	/*
15352	 * Restore the arg0 that we saved upon entry.
15353	 */
15354	mstate->dtms_arg[0] = sarg0;
15355	mstate->dtms_arg[1] = sarg1;
15356
15357	return (0);
15358}
15359
15360static void
15361dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15362    dtrace_vstate_t *vstate)
15363{
15364	int i;
15365
15366	if (helper->dtha_predicate != NULL)
15367		dtrace_difo_release(helper->dtha_predicate, vstate);
15368
15369	for (i = 0; i < helper->dtha_nactions; i++) {
15370		ASSERT(helper->dtha_actions[i] != NULL);
15371		dtrace_difo_release(helper->dtha_actions[i], vstate);
15372	}
15373
15374	kmem_free(helper->dtha_actions,
15375	    helper->dtha_nactions * sizeof (dtrace_difo_t *));
15376	kmem_free(helper, sizeof (dtrace_helper_action_t));
15377}
15378
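/*
 * Destroy all helper actions and helper providers of the given generation in
 * the current process.  Each successful dtrace_helper_slurp() constitutes a
 * generation; this is the corresponding teardown, used by DTRACEHIOC_REMOVE.
 */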
15379static int
15380dtrace_helper_destroygen(int gen)
15381{
15382	proc_t *p = curproc;
15383	dtrace_helpers_t *help = p->p_dtrace_helpers;
15384	dtrace_vstate_t *vstate;
15385	int i;
15386
15387	ASSERT(MUTEX_HELD(&dtrace_lock));
15388
15389	if (help == NULL || gen > help->dthps_generation)
15390		return (EINVAL);
15391
15392	vstate = &help->dthps_vstate;
15393
15394	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15395		dtrace_helper_action_t *last = NULL, *h, *next;
15396
15397		for (h = help->dthps_actions[i]; h != NULL; h = next) {
15398			next = h->dtha_next;
15399
15400			if (h->dtha_generation == gen) {
15401				if (last != NULL) {
15402					last->dtha_next = next;
15403				} else {
15404					help->dthps_actions[i] = next;
15405				}
15406
15407				dtrace_helper_action_destroy(h, vstate);
15408			} else {
15409				last = h;
15410			}
15411		}
15412	}
15413
15414	/*
	 * Iterate until we've cleared out all helper providers with the
15416	 * given generation number.
15417	 */
15418	for (;;) {
15419		dtrace_helper_provider_t *prov;
15420
15421		/*
15422		 * Look for a helper provider with the right generation. We
15423		 * have to start back at the beginning of the list each time
15424		 * because we drop dtrace_lock. It's unlikely that we'll make
15425		 * more than two passes.
15426		 */
15427		for (i = 0; i < help->dthps_nprovs; i++) {
15428			prov = help->dthps_provs[i];
15429
15430			if (prov->dthp_generation == gen)
15431				break;
15432		}
15433
15434		/*
15435		 * If there were no matches, we're done.
15436		 */
15437		if (i == help->dthps_nprovs)
15438			break;
15439
15440		/*
15441		 * Move the last helper provider into this slot.
15442		 */
15443		help->dthps_nprovs--;
15444		help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15445		help->dthps_provs[help->dthps_nprovs] = NULL;
15446
15447		mutex_exit(&dtrace_lock);
15448
15449		/*
15450		 * If we have a meta provider, remove this helper provider.
15451		 */
15452		mutex_enter(&dtrace_meta_lock);
15453		if (dtrace_meta_pid != NULL) {
15454			ASSERT(dtrace_deferred_pid == NULL);
15455			dtrace_helper_provider_remove(&prov->dthp_prov,
15456			    p->p_pid);
15457		}
15458		mutex_exit(&dtrace_meta_lock);
15459
15460		dtrace_helper_provider_destroy(prov);
15461
15462		mutex_enter(&dtrace_lock);
15463	}
15464
15465	return (0);
15466}
15467
15468static int
15469dtrace_helper_validate(dtrace_helper_action_t *helper)
15470{
15471	int err = 0, i;
15472	dtrace_difo_t *dp;
15473
15474	if ((dp = helper->dtha_predicate) != NULL)
15475		err += dtrace_difo_validate_helper(dp);
15476
15477	for (i = 0; i < helper->dtha_nactions; i++)
15478		err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15479
15480	return (err == 0);
15481}
15482
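/*
 * Build a helper action of the given kind from an ECB description and append
 * it to the process's chain for that kind.  Only DIF expression actions are
 * permitted, and no more than dtrace_helper_actions_max helpers may exist
 * for any one kind.
 */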
15483static int
15484dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
15485{
15486	dtrace_helpers_t *help;
15487	dtrace_helper_action_t *helper, *last;
15488	dtrace_actdesc_t *act;
15489	dtrace_vstate_t *vstate;
15490	dtrace_predicate_t *pred;
15491	int count = 0, nactions = 0, i;
15492
15493	if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15494		return (EINVAL);
15495
15496	help = curproc->p_dtrace_helpers;
15497	last = help->dthps_actions[which];
15498	vstate = &help->dthps_vstate;
15499
15500	for (count = 0; last != NULL; last = last->dtha_next) {
15501		count++;
15502		if (last->dtha_next == NULL)
15503			break;
15504	}
15505
15506	/*
15507	 * If we already have dtrace_helper_actions_max helper actions for this
15508	 * helper action type, we'll refuse to add a new one.
15509	 */
15510	if (count >= dtrace_helper_actions_max)
15511		return (ENOSPC);
15512
15513	helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15514	helper->dtha_generation = help->dthps_generation;
15515
15516	if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15517		ASSERT(pred->dtp_difo != NULL);
15518		dtrace_difo_hold(pred->dtp_difo);
15519		helper->dtha_predicate = pred->dtp_difo;
15520	}
15521
15522	for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15523		if (act->dtad_kind != DTRACEACT_DIFEXPR)
15524			goto err;
15525
15526		if (act->dtad_difo == NULL)
15527			goto err;
15528
15529		nactions++;
15530	}
15531
15532	helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15533	    (helper->dtha_nactions = nactions), KM_SLEEP);
15534
15535	for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15536		dtrace_difo_hold(act->dtad_difo);
15537		helper->dtha_actions[i++] = act->dtad_difo;
15538	}
15539
15540	if (!dtrace_helper_validate(helper))
15541		goto err;
15542
15543	if (last == NULL) {
15544		help->dthps_actions[which] = helper;
15545	} else {
15546		last->dtha_next = helper;
15547	}
15548
15549	if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15550		dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15551		dtrace_helptrace_next = 0;
15552	}
15553
15554	return (0);
15555err:
15556	dtrace_helper_action_destroy(helper, vstate);
15557	return (EINVAL);
15558}
15559
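/*
 * Hand helper provider descriptions off to the meta provider.  If DTrace is
 * not yet attached or no meta provider is registered, the process is instead
 * placed on the deferred list so that its providers can be created when a
 * meta provider arrives.
 */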
15560static void
15561dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15562    dof_helper_t *dofhp)
15563{
15564	ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
15565
15566	mutex_enter(&dtrace_meta_lock);
15567	mutex_enter(&dtrace_lock);
15568
15569	if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15570		/*
15571		 * If the dtrace module is loaded but not attached, or if
		 * there isn't a meta provider registered to deal with
15573		 * these provider descriptions, we need to postpone creating
15574		 * the actual providers until later.
15575		 */
15576
15577		if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15578		    dtrace_deferred_pid != help) {
15579			help->dthps_deferred = 1;
15580			help->dthps_pid = p->p_pid;
15581			help->dthps_next = dtrace_deferred_pid;
15582			help->dthps_prev = NULL;
15583			if (dtrace_deferred_pid != NULL)
15584				dtrace_deferred_pid->dthps_prev = help;
15585			dtrace_deferred_pid = help;
15586		}
15587
15588		mutex_exit(&dtrace_lock);
15589
15590	} else if (dofhp != NULL) {
15591		/*
15592		 * If the dtrace module is loaded and we have a particular
15593		 * helper provider description, pass that off to the
15594		 * meta provider.
15595		 */
15596
15597		mutex_exit(&dtrace_lock);
15598
15599		dtrace_helper_provide(dofhp, p->p_pid);
15600
15601	} else {
15602		/*
15603		 * Otherwise, just pass all the helper provider descriptions
15604		 * off to the meta provider.
15605		 */
15606
15607		int i;
15608		mutex_exit(&dtrace_lock);
15609
15610		for (i = 0; i < help->dthps_nprovs; i++) {
15611			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15612			    p->p_pid);
15613		}
15614	}
15615
15616	mutex_exit(&dtrace_meta_lock);
15617}
15618
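/*
 * Record a helper provider description for the current process.  Duplicate
 * DOF is rejected with EALREADY, the number of providers is bounded by
 * dtrace_helper_providers_max, and the provider table is grown by doubling
 * as needed.
 */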
15619static int
15620dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
15621{
15622	dtrace_helpers_t *help;
15623	dtrace_helper_provider_t *hprov, **tmp_provs;
15624	uint_t tmp_maxprovs, i;
15625
15626	ASSERT(MUTEX_HELD(&dtrace_lock));
15627
15628	help = curproc->p_dtrace_helpers;
15629	ASSERT(help != NULL);
15630
15631	/*
15632	 * If we already have dtrace_helper_providers_max helper providers,
	 * we'll refuse to add a new one.
15634	 */
15635	if (help->dthps_nprovs >= dtrace_helper_providers_max)
15636		return (ENOSPC);
15637
15638	/*
15639	 * Check to make sure this isn't a duplicate.
15640	 */
15641	for (i = 0; i < help->dthps_nprovs; i++) {
15642		if (dofhp->dofhp_dof ==
15643		    help->dthps_provs[i]->dthp_prov.dofhp_dof)
15644			return (EALREADY);
15645	}
15646
15647	hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15648	hprov->dthp_prov = *dofhp;
15649	hprov->dthp_ref = 1;
15650	hprov->dthp_generation = gen;
15651
15652	/*
15653	 * Allocate a bigger table for helper providers if it's already full.
15654	 */
15655	if (help->dthps_maxprovs == help->dthps_nprovs) {
15656		tmp_maxprovs = help->dthps_maxprovs;
15657		tmp_provs = help->dthps_provs;
15658
15659		if (help->dthps_maxprovs == 0)
15660			help->dthps_maxprovs = 2;
15661		else
15662			help->dthps_maxprovs *= 2;
15663		if (help->dthps_maxprovs > dtrace_helper_providers_max)
15664			help->dthps_maxprovs = dtrace_helper_providers_max;
15665
15666		ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15667
15668		help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15669		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15670
15671		if (tmp_provs != NULL) {
15672			bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15673			    sizeof (dtrace_helper_provider_t *));
15674			kmem_free(tmp_provs, tmp_maxprovs *
15675			    sizeof (dtrace_helper_provider_t *));
15676		}
15677	}
15678
15679	help->dthps_provs[help->dthps_nprovs] = hprov;
15680	help->dthps_nprovs++;
15681
15682	return (0);
15683}
15684
15685static void
15686dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15687{
15688	mutex_enter(&dtrace_lock);
15689
15690	if (--hprov->dthp_ref == 0) {
15691		dof_hdr_t *dof;
15692		mutex_exit(&dtrace_lock);
15693		dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15694		dtrace_dof_destroy(dof);
15695		kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15696	} else {
15697		mutex_exit(&dtrace_lock);
15698	}
15699}
15700
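/*
 * Sanity-check a DOF provider section before it is offered to the meta
 * provider:  validate section sizes and alignment, string table references,
 * probe offsets, is-enabled offsets and argument mappings.  Returns 0 if the
 * description is well-formed; otherwise calls dtrace_dof_error() and
 * returns -1.
 */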
15701static int
15702dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15703{
15704	uintptr_t daddr = (uintptr_t)dof;
15705	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15706	dof_provider_t *provider;
15707	dof_probe_t *probe;
15708	uint8_t *arg;
15709	char *strtab, *typestr;
15710	dof_stridx_t typeidx;
15711	size_t typesz;
15712	uint_t nprobes, j, k;
15713
15714	ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15715
15716	if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15717		dtrace_dof_error(dof, "misaligned section offset");
15718		return (-1);
15719	}
15720
15721	/*
15722	 * The section needs to be large enough to contain the DOF provider
15723	 * structure appropriate for the given version.
15724	 */
15725	if (sec->dofs_size <
15726	    ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15727	    offsetof(dof_provider_t, dofpv_prenoffs) :
15728	    sizeof (dof_provider_t))) {
15729		dtrace_dof_error(dof, "provider section too small");
15730		return (-1);
15731	}
15732
15733	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15734	str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15735	prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15736	arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15737	off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15738
15739	if (str_sec == NULL || prb_sec == NULL ||
15740	    arg_sec == NULL || off_sec == NULL)
15741		return (-1);
15742
15743	enoff_sec = NULL;
15744
15745	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15746	    provider->dofpv_prenoffs != DOF_SECT_NONE &&
15747	    (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15748	    provider->dofpv_prenoffs)) == NULL)
15749		return (-1);
15750
15751	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15752
15753	if (provider->dofpv_name >= str_sec->dofs_size ||
15754	    strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15755		dtrace_dof_error(dof, "invalid provider name");
15756		return (-1);
15757	}
15758
15759	if (prb_sec->dofs_entsize == 0 ||
15760	    prb_sec->dofs_entsize > prb_sec->dofs_size) {
15761		dtrace_dof_error(dof, "invalid entry size");
15762		return (-1);
15763	}
15764
15765	if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15766		dtrace_dof_error(dof, "misaligned entry size");
15767		return (-1);
15768	}
15769
15770	if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15771		dtrace_dof_error(dof, "invalid entry size");
15772		return (-1);
15773	}
15774
15775	if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15776		dtrace_dof_error(dof, "misaligned section offset");
15777		return (-1);
15778	}
15779
15780	if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15781		dtrace_dof_error(dof, "invalid entry size");
15782		return (-1);
15783	}
15784
15785	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15786
15787	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15788
15789	/*
15790	 * Take a pass through the probes to check for errors.
15791	 */
15792	for (j = 0; j < nprobes; j++) {
15793		probe = (dof_probe_t *)(uintptr_t)(daddr +
15794		    prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15795
15796		if (probe->dofpr_func >= str_sec->dofs_size) {
15797			dtrace_dof_error(dof, "invalid function name");
15798			return (-1);
15799		}
15800
15801		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15802			dtrace_dof_error(dof, "function name too long");
15803			return (-1);
15804		}
15805
15806		if (probe->dofpr_name >= str_sec->dofs_size ||
15807		    strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15808			dtrace_dof_error(dof, "invalid probe name");
15809			return (-1);
15810		}
15811
15812		/*
15813		 * The offset count must not wrap the index, and the offsets
15814		 * must also not overflow the section's data.
15815		 */
15816		if (probe->dofpr_offidx + probe->dofpr_noffs <
15817		    probe->dofpr_offidx ||
15818		    (probe->dofpr_offidx + probe->dofpr_noffs) *
15819		    off_sec->dofs_entsize > off_sec->dofs_size) {
15820			dtrace_dof_error(dof, "invalid probe offset");
15821			return (-1);
15822		}
15823
15824		if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15825			/*
15826			 * If there's no is-enabled offset section, make sure
15827			 * there aren't any is-enabled offsets. Otherwise
15828			 * perform the same checks as for probe offsets
15829			 * (immediately above).
15830			 */
15831			if (enoff_sec == NULL) {
15832				if (probe->dofpr_enoffidx != 0 ||
15833				    probe->dofpr_nenoffs != 0) {
15834					dtrace_dof_error(dof, "is-enabled "
15835					    "offsets with null section");
15836					return (-1);
15837				}
15838			} else if (probe->dofpr_enoffidx +
15839			    probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15840			    (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15841			    enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15842				dtrace_dof_error(dof, "invalid is-enabled "
15843				    "offset");
15844				return (-1);
15845			}
15846
15847			if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15848				dtrace_dof_error(dof, "zero probe and "
15849				    "is-enabled offsets");
15850				return (-1);
15851			}
15852		} else if (probe->dofpr_noffs == 0) {
15853			dtrace_dof_error(dof, "zero probe offsets");
15854			return (-1);
15855		}
15856
15857		if (probe->dofpr_argidx + probe->dofpr_xargc <
15858		    probe->dofpr_argidx ||
15859		    (probe->dofpr_argidx + probe->dofpr_xargc) *
15860		    arg_sec->dofs_entsize > arg_sec->dofs_size) {
15861			dtrace_dof_error(dof, "invalid args");
15862			return (-1);
15863		}
15864
15865		typeidx = probe->dofpr_nargv;
15866		typestr = strtab + probe->dofpr_nargv;
15867		for (k = 0; k < probe->dofpr_nargc; k++) {
15868			if (typeidx >= str_sec->dofs_size) {
15869				dtrace_dof_error(dof, "bad "
15870				    "native argument type");
15871				return (-1);
15872			}
15873
15874			typesz = strlen(typestr) + 1;
15875			if (typesz > DTRACE_ARGTYPELEN) {
15876				dtrace_dof_error(dof, "native "
15877				    "argument type too long");
15878				return (-1);
15879			}
15880			typeidx += typesz;
15881			typestr += typesz;
15882		}
15883
15884		typeidx = probe->dofpr_xargv;
15885		typestr = strtab + probe->dofpr_xargv;
15886		for (k = 0; k < probe->dofpr_xargc; k++) {
15887			if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15888				dtrace_dof_error(dof, "bad "
15889				    "native argument index");
15890				return (-1);
15891			}
15892
15893			if (typeidx >= str_sec->dofs_size) {
15894				dtrace_dof_error(dof, "bad "
15895				    "translated argument type");
15896				return (-1);
15897			}
15898
15899			typesz = strlen(typestr) + 1;
15900			if (typesz > DTRACE_ARGTYPELEN) {
15901				dtrace_dof_error(dof, "translated argument "
15902				    "type too long");
15903				return (-1);
15904			}
15905
15906			typeidx += typesz;
15907			typestr += typesz;
15908		}
15909	}
15910
15911	return (0);
15912}
15913
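/*
 * Consume helper DOF on behalf of the current process:  validate any
 * provider sections, add a helper action for each dtrace:helper:ustack ECB,
 * and register helper providers with the meta provider.  This function takes
 * responsibility for the DOF:  it may be destroyed here or retained for the
 * providers.  Returns the new generation number on success; on failure,
 * returns -1 or the error from dtrace_dof_slurp().
 */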
15914static int
15915dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
15916{
15917	dtrace_helpers_t *help;
15918	dtrace_vstate_t *vstate;
15919	dtrace_enabling_t *enab = NULL;
15920	int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15921	uintptr_t daddr = (uintptr_t)dof;
15922
15923	ASSERT(MUTEX_HELD(&dtrace_lock));
15924
15925	if ((help = curproc->p_dtrace_helpers) == NULL)
15926		help = dtrace_helpers_create(curproc);
15927
15928	vstate = &help->dthps_vstate;
15929
15930	if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15931	    dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15932		dtrace_dof_destroy(dof);
15933		return (rv);
15934	}
15935
15936	/*
15937	 * Look for helper providers and validate their descriptions.
15938	 */
15939	if (dhp != NULL) {
15940		for (i = 0; i < dof->dofh_secnum; i++) {
15941			dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15942			    dof->dofh_secoff + i * dof->dofh_secsize);
15943
15944			if (sec->dofs_type != DOF_SECT_PROVIDER)
15945				continue;
15946
15947			if (dtrace_helper_provider_validate(dof, sec) != 0) {
15948				dtrace_enabling_destroy(enab);
15949				dtrace_dof_destroy(dof);
15950				return (-1);
15951			}
15952
15953			nprovs++;
15954		}
15955	}
15956
15957	/*
15958	 * Now we need to walk through the ECB descriptions in the enabling.
15959	 */
15960	for (i = 0; i < enab->dten_ndesc; i++) {
15961		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15962		dtrace_probedesc_t *desc = &ep->dted_probe;
15963
15964		if (strcmp(desc->dtpd_provider, "dtrace") != 0)
15965			continue;
15966
15967		if (strcmp(desc->dtpd_mod, "helper") != 0)
15968			continue;
15969
15970		if (strcmp(desc->dtpd_func, "ustack") != 0)
15971			continue;
15972
15973		if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
15974		    ep)) != 0) {
15975			/*
15976			 * Adding this helper action failed -- we are now going
15977			 * to rip out the entire generation and return failure.
15978			 */
15979			(void) dtrace_helper_destroygen(help->dthps_generation);
15980			dtrace_enabling_destroy(enab);
15981			dtrace_dof_destroy(dof);
15982			return (-1);
15983		}
15984
15985		nhelpers++;
15986	}
15987
15988	if (nhelpers < enab->dten_ndesc)
15989		dtrace_dof_error(dof, "unmatched helpers");
15990
15991	gen = help->dthps_generation++;
15992	dtrace_enabling_destroy(enab);
15993
15994	if (dhp != NULL && nprovs > 0) {
15995		dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15996		if (dtrace_helper_provider_add(dhp, gen) == 0) {
15997			mutex_exit(&dtrace_lock);
15998			dtrace_helper_provider_register(curproc, help, dhp);
15999			mutex_enter(&dtrace_lock);
16000
16001			destroy = 0;
16002		}
16003	}
16004
16005	if (destroy)
16006		dtrace_dof_destroy(dof);
16007
16008	return (gen);
16009}
16010
16011static dtrace_helpers_t *
16012dtrace_helpers_create(proc_t *p)
16013{
16014	dtrace_helpers_t *help;
16015
16016	ASSERT(MUTEX_HELD(&dtrace_lock));
16017	ASSERT(p->p_dtrace_helpers == NULL);
16018
16019	help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16020	help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16021	    DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16022
16023	p->p_dtrace_helpers = help;
16024	dtrace_helpers++;
16025
16026	return (help);
16027}
16028
16029#if defined(sun)
16030static
16031#endif
16032void
16033dtrace_helpers_destroy(proc_t *p)
16034{
16035	dtrace_helpers_t *help;
16036	dtrace_vstate_t *vstate;
16040	int i;
16041
16042	mutex_enter(&dtrace_lock);
16043
16044	ASSERT(p->p_dtrace_helpers != NULL);
16045	ASSERT(dtrace_helpers > 0);
16046
16047	help = p->p_dtrace_helpers;
16048	vstate = &help->dthps_vstate;
16049
16050	/*
16051	 * We're now going to lose the help from this process.
16052	 */
16053	p->p_dtrace_helpers = NULL;
16054	dtrace_sync();
16055
16056	/*
	 * Destroy the helper actions.
16058	 */
16059	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16060		dtrace_helper_action_t *h, *next;
16061
16062		for (h = help->dthps_actions[i]; h != NULL; h = next) {
16063			next = h->dtha_next;
16064			dtrace_helper_action_destroy(h, vstate);
16066		}
16067	}
16068
16069	mutex_exit(&dtrace_lock);
16070
16071	/*
16072	 * Destroy the helper providers.
16073	 */
16074	if (help->dthps_maxprovs > 0) {
16075		mutex_enter(&dtrace_meta_lock);
16076		if (dtrace_meta_pid != NULL) {
16077			ASSERT(dtrace_deferred_pid == NULL);
16078
16079			for (i = 0; i < help->dthps_nprovs; i++) {
16080				dtrace_helper_provider_remove(
16081				    &help->dthps_provs[i]->dthp_prov, p->p_pid);
16082			}
16083		} else {
16084			mutex_enter(&dtrace_lock);
16085			ASSERT(help->dthps_deferred == 0 ||
16086			    help->dthps_next != NULL ||
16087			    help->dthps_prev != NULL ||
16088			    help == dtrace_deferred_pid);
16089
16090			/*
16091			 * Remove the helper from the deferred list.
16092			 */
16093			if (help->dthps_next != NULL)
16094				help->dthps_next->dthps_prev = help->dthps_prev;
16095			if (help->dthps_prev != NULL)
16096				help->dthps_prev->dthps_next = help->dthps_next;
16097			if (dtrace_deferred_pid == help) {
16098				dtrace_deferred_pid = help->dthps_next;
16099				ASSERT(help->dthps_prev == NULL);
16100			}
16101
16102			mutex_exit(&dtrace_lock);
16103		}
16104
16105		mutex_exit(&dtrace_meta_lock);
16106
16107		for (i = 0; i < help->dthps_nprovs; i++) {
16108			dtrace_helper_provider_destroy(help->dthps_provs[i]);
16109		}
16110
16111		kmem_free(help->dthps_provs, help->dthps_maxprovs *
16112		    sizeof (dtrace_helper_provider_t *));
16113	}
16114
16115	mutex_enter(&dtrace_lock);
16116
16117	dtrace_vstate_fini(&help->dthps_vstate);
16118	kmem_free(help->dthps_actions,
16119	    sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16120	kmem_free(help, sizeof (dtrace_helpers_t));
16121
16122	--dtrace_helpers;
16123	mutex_exit(&dtrace_lock);
16124}
16125
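/*
 * Duplicate a process's helper actions and helper providers into another
 * process; called on fork.  Actions (and their DIFOs) are deep-copied, while
 * providers are shared between the two processes by reference count.
 */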
16126#if defined(sun)
16127static
16128#endif
16129void
16130dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16131{
16132	dtrace_helpers_t *help, *newhelp;
16133	dtrace_helper_action_t *helper, *new, *last;
16134	dtrace_difo_t *dp;
16135	dtrace_vstate_t *vstate;
16136	int i, j, sz, hasprovs = 0;
16137
16138	mutex_enter(&dtrace_lock);
16139	ASSERT(from->p_dtrace_helpers != NULL);
16140	ASSERT(dtrace_helpers > 0);
16141
16142	help = from->p_dtrace_helpers;
16143	newhelp = dtrace_helpers_create(to);
16144	ASSERT(to->p_dtrace_helpers != NULL);
16145
16146	newhelp->dthps_generation = help->dthps_generation;
16147	vstate = &newhelp->dthps_vstate;
16148
16149	/*
16150	 * Duplicate the helper actions.
16151	 */
16152	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16153		if ((helper = help->dthps_actions[i]) == NULL)
16154			continue;
16155
16156		for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16157			new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16158			    KM_SLEEP);
16159			new->dtha_generation = helper->dtha_generation;
16160
16161			if ((dp = helper->dtha_predicate) != NULL) {
16162				dp = dtrace_difo_duplicate(dp, vstate);
16163				new->dtha_predicate = dp;
16164			}
16165
16166			new->dtha_nactions = helper->dtha_nactions;
16167			sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16168			new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16169
16170			for (j = 0; j < new->dtha_nactions; j++) {
16171				dtrace_difo_t *dp = helper->dtha_actions[j];
16172
16173				ASSERT(dp != NULL);
16174				dp = dtrace_difo_duplicate(dp, vstate);
16175				new->dtha_actions[j] = dp;
16176			}
16177
16178			if (last != NULL) {
16179				last->dtha_next = new;
16180			} else {
16181				newhelp->dthps_actions[i] = new;
16182			}
16183
16184			last = new;
16185		}
16186	}
16187
16188	/*
16189	 * Duplicate the helper providers and register them with the
16190	 * DTrace framework.
16191	 */
16192	if (help->dthps_nprovs > 0) {
16193		newhelp->dthps_nprovs = help->dthps_nprovs;
16194		newhelp->dthps_maxprovs = help->dthps_nprovs;
16195		newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16196		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16197		for (i = 0; i < newhelp->dthps_nprovs; i++) {
16198			newhelp->dthps_provs[i] = help->dthps_provs[i];
16199			newhelp->dthps_provs[i]->dthp_ref++;
16200		}
16201
16202		hasprovs = 1;
16203	}
16204
16205	mutex_exit(&dtrace_lock);
16206
16207	if (hasprovs)
16208		dtrace_helper_provider_register(to, newhelp, NULL);
16209}
16210
16211/*
16212 * DTrace Hook Functions
16213 */
16214static void
16215dtrace_module_loaded(modctl_t *ctl)
16216{
16217	dtrace_provider_t *prv;
16218
16219	mutex_enter(&dtrace_provider_lock);
16220#if defined(sun)
16221	mutex_enter(&mod_lock);
16222#endif
16223
16224#if defined(sun)
16225	ASSERT(ctl->mod_busy);
16226#endif
16227
16228	/*
	 * We're going to call each provider's per-module provide operation
16230	 * specifying only this module.
16231	 */
16232	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16233		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16234
16235#if defined(sun)
16236	mutex_exit(&mod_lock);
16237#endif
16238	mutex_exit(&dtrace_provider_lock);
16239
16240	/*
16241	 * If we have any retained enablings, we need to match against them.
16242	 * Enabling probes requires that cpu_lock be held, and we cannot hold
16243	 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16244	 * module.  (In particular, this happens when loading scheduling
16245	 * classes.)  So if we have any retained enablings, we need to dispatch
16246	 * our task queue to do the match for us.
16247	 */
16248	mutex_enter(&dtrace_lock);
16249
16250	if (dtrace_retained == NULL) {
16251		mutex_exit(&dtrace_lock);
16252		return;
16253	}
16254
16255	(void) taskq_dispatch(dtrace_taskq,
16256	    (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
16257
16258	mutex_exit(&dtrace_lock);
16259
16260	/*
16261	 * And now, for a little heuristic sleaze:  in general, we want to
16262	 * match modules as soon as they load.  However, we cannot guarantee
16263	 * this, because it would lead us to the lock ordering violation
16264	 * outlined above.  The common case, of course, is that cpu_lock is
16265	 * _not_ held -- so we delay here for a clock tick, hoping that that's
16266	 * long enough for the task queue to do its work.  If it's not, it's
16267	 * not a serious problem -- it just means that the module that we
16268	 * just loaded may not be immediately instrumentable.
16269	 */
16270	delay(1);
16271}
16272
16273static void
16274#if defined(sun)
16275dtrace_module_unloaded(modctl_t *ctl)
16276#else
16277dtrace_module_unloaded(modctl_t *ctl, int *error)
16278#endif
16279{
16280	dtrace_probe_t template, *probe, *first, *next;
16281	dtrace_provider_t *prov;
16282#if !defined(sun)
16283	char modname[DTRACE_MODNAMELEN];
16284	size_t len;
16285#endif
16286
16287#if defined(sun)
16288	template.dtpr_mod = ctl->mod_modname;
16289#else
16290	/* Handle the fact that ctl->filename may end in ".ko". */
16291	strlcpy(modname, ctl->filename, sizeof(modname));
16292	len = strlen(ctl->filename);
16293	if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
16294		modname[len - 3] = '\0';
16295	template.dtpr_mod = modname;
16296#endif
16297
16298	mutex_enter(&dtrace_provider_lock);
16299#if defined(sun)
16300	mutex_enter(&mod_lock);
16301#endif
16302	mutex_enter(&dtrace_lock);
16303
16304#if !defined(sun)
16305	if (ctl->nenabled > 0) {
16306		/* Don't allow unloads if a probe is enabled. */
16307		mutex_exit(&dtrace_provider_lock);
16308		mutex_exit(&dtrace_lock);
16309		*error = -1;
16310		printf(
16311	"kldunload: attempt to unload module that has DTrace probes enabled\n");
16312		return;
16313	}
16314#endif
16315
16316	if (dtrace_bymod == NULL) {
16317		/*
16318		 * The DTrace module is loaded (obviously) but not attached;
16319		 * we don't have any work to do.
16320		 */
16321		mutex_exit(&dtrace_provider_lock);
16322#if defined(sun)
16323		mutex_exit(&mod_lock);
16324#endif
16325		mutex_exit(&dtrace_lock);
16326		return;
16327	}
16328
16329	for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16330	    probe != NULL; probe = probe->dtpr_nextmod) {
16331		if (probe->dtpr_ecb != NULL) {
16332			mutex_exit(&dtrace_provider_lock);
16333#if defined(sun)
16334			mutex_exit(&mod_lock);
16335#endif
16336			mutex_exit(&dtrace_lock);
16337
16338			/*
16339			 * This shouldn't _actually_ be possible -- we're
16340			 * unloading a module that has an enabled probe in it.
16341			 * (It's normally up to the provider to make sure that
16342			 * this can't happen.)  However, because dtps_enable()
16343			 * doesn't have a failure mode, there can be an
16344			 * enable/unload race.  Upshot:  we don't want to
16345			 * assert, but we're not going to disable the
16346			 * probe, either.
16347			 */
16348			if (dtrace_err_verbose) {
16349#if defined(sun)
16350				cmn_err(CE_WARN, "unloaded module '%s' had "
16351				    "enabled probes", ctl->mod_modname);
16352#else
16353				cmn_err(CE_WARN, "unloaded module '%s' had "
16354				    "enabled probes", modname);
16355#endif
16356			}
16357
16358			return;
16359		}
16360	}
16361
16362	probe = first;
16363
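	/*
	 * Unhook each of the module's probes from the lookup hashes and the
	 * probe array, chaining them onto a private list (through
	 * dtpr_nextmod) so that they can be destroyed after the sync below.
	 */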
16364	for (first = NULL; probe != NULL; probe = next) {
16365		ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16366
16367		dtrace_probes[probe->dtpr_id - 1] = NULL;
16368
16369		next = probe->dtpr_nextmod;
16370		dtrace_hash_remove(dtrace_bymod, probe);
16371		dtrace_hash_remove(dtrace_byfunc, probe);
16372		dtrace_hash_remove(dtrace_byname, probe);
16373
16374		if (first == NULL) {
16375			first = probe;
16376			probe->dtpr_nextmod = NULL;
16377		} else {
16378			probe->dtpr_nextmod = first;
16379			first = probe;
16380		}
16381	}
16382
16383	/*
16384	 * We've removed all of the module's probes from the hash chains and
16385	 * from the probe array.  Now issue a dtrace_sync() to be sure that
16386	 * everyone has cleared out from any probe array processing.
16387	 */
16388	dtrace_sync();
16389
16390	for (probe = first; probe != NULL; probe = first) {
16391		first = probe->dtpr_nextmod;
16392		prov = probe->dtpr_provider;
16393		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16394		    probe->dtpr_arg);
16395		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16396		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16397		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16398#if defined(sun)
16399		vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16400#else
16401		free_unr(dtrace_arena, probe->dtpr_id);
16402#endif
16403		kmem_free(probe, sizeof (dtrace_probe_t));
16404	}
16405
16406	mutex_exit(&dtrace_lock);
16407#if defined(sun)
16408	mutex_exit(&mod_lock);
16409#endif
16410	mutex_exit(&dtrace_provider_lock);
16411}
16412
16413#if !defined(sun)
16414static void
16415dtrace_kld_load(void *arg __unused, linker_file_t lf)
16416{
16417
16418	dtrace_module_loaded(lf);
16419}
16420
16421static void
16422dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
16423{
16424
16425	if (*error != 0)
16426		/* We already have an error, so don't do anything. */
16427		return;
16428	dtrace_module_unloaded(lf, error);
16429}
16430#endif
16431
16432#if defined(sun)
16433static void
16434dtrace_suspend(void)
16435{
16436	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16437}
16438
16439static void
16440dtrace_resume(void)
16441{
16442	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16443}
16444#endif
16445
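/*
 * CPU configuration hook.  When a CPU is configured, allocate buffers on it
 * for any active anonymous state that should be tracing that CPU; nothing is
 * freed on unconfiguration (the buffers go away when the consumer exits).
 */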
16446static int
16447dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16448{
16449	ASSERT(MUTEX_HELD(&cpu_lock));
16450	mutex_enter(&dtrace_lock);
16451
16452	switch (what) {
16453	case CPU_CONFIG: {
16454		dtrace_state_t *state;
16455		dtrace_optval_t *opt, rs, c;
16456
16457		/*
16458		 * For now, we only allocate a new buffer for anonymous state.
16459		 */
16460		if ((state = dtrace_anon.dta_state) == NULL)
16461			break;
16462
16463		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16464			break;
16465
16466		opt = state->dts_options;
16467		c = opt[DTRACEOPT_CPU];
16468
16469		if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16470			break;
16471
16472		/*
16473		 * Regardless of what the actual policy is, we're going to
16474		 * temporarily set our resize policy to be manual.  We're
16475		 * also going to temporarily set our CPU option to denote
16476		 * the newly configured CPU.
16477		 */
16478		rs = opt[DTRACEOPT_BUFRESIZE];
16479		opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16480		opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16481
16482		(void) dtrace_state_buffers(state);
16483
16484		opt[DTRACEOPT_BUFRESIZE] = rs;
16485		opt[DTRACEOPT_CPU] = c;
16486
16487		break;
16488	}
16489
16490	case CPU_UNCONFIG:
16491		/*
16492		 * We don't free the buffer in the CPU_UNCONFIG case.  (The
16493		 * buffer will be freed when the consumer exits.)
16494		 */
16495		break;
16496
16497	default:
16498		break;
16499	}
16500
16501	mutex_exit(&dtrace_lock);
16502	return (0);
16503}
16504
16505#if defined(sun)
16506static void
16507dtrace_cpu_setup_initial(processorid_t cpu)
16508{
16509	(void) dtrace_cpu_setup(CPU_CONFIG, cpu);
16510}
16511#endif
16512
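/*
 * Add [base, limit) to the set of toxic ranges -- memory that must never be
 * read from probe context.  The range table is grown by doubling as needed.
 */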
16513static void
16514dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16515{
16516	if (dtrace_toxranges >= dtrace_toxranges_max) {
16517		int osize, nsize;
16518		dtrace_toxrange_t *range;
16519
16520		osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16521
16522		if (osize == 0) {
16523			ASSERT(dtrace_toxrange == NULL);
16524			ASSERT(dtrace_toxranges_max == 0);
16525			dtrace_toxranges_max = 1;
16526		} else {
16527			dtrace_toxranges_max <<= 1;
16528		}
16529
16530		nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16531		range = kmem_zalloc(nsize, KM_SLEEP);
16532
16533		if (dtrace_toxrange != NULL) {
16534			ASSERT(osize != 0);
16535			bcopy(dtrace_toxrange, range, osize);
16536			kmem_free(dtrace_toxrange, osize);
16537		}
16538
16539		dtrace_toxrange = range;
16540	}
16541
16542	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
16543	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
16544
16545	dtrace_toxrange[dtrace_toxranges].dtt_base = base;
16546	dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
16547	dtrace_toxranges++;
16548}
16549
16550static void
dtrace_getf_barrier(void)
16552{
16553#if defined(sun)
16554	/*
16555	 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
16556	 * that contain calls to getf(), this routine will be called on every
16557	 * closef() before either the underlying vnode is released or the
16558	 * file_t itself is freed.  By the time we are here, it is essential
16559	 * that the file_t can no longer be accessed from a call to getf()
16560	 * in probe context -- that assures that a dtrace_sync() can be used
16561	 * to clear out any enablings referring to the old structures.
16562	 */
16563	if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
16564	    kcred->cr_zone->zone_dtrace_getf != 0)
16565		dtrace_sync();
16566#endif
16567}
16568
16569/*
16570 * DTrace Driver Cookbook Functions
16571 */
16572#if defined(sun)
16573/*ARGSUSED*/
16574static int
16575dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
16576{
16577	dtrace_provider_id_t id;
16578	dtrace_state_t *state = NULL;
16579	dtrace_enabling_t *enab;
16580
16581	mutex_enter(&cpu_lock);
16582	mutex_enter(&dtrace_provider_lock);
16583	mutex_enter(&dtrace_lock);
16584
16585	if (ddi_soft_state_init(&dtrace_softstate,
16586	    sizeof (dtrace_state_t), 0) != 0) {
16587		cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
16588		mutex_exit(&cpu_lock);
16589		mutex_exit(&dtrace_provider_lock);
16590		mutex_exit(&dtrace_lock);
16591		return (DDI_FAILURE);
16592	}
16593
16594	if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
16595	    DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
16596	    ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
16597	    DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
16598		cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
16599		ddi_remove_minor_node(devi, NULL);
16600		ddi_soft_state_fini(&dtrace_softstate);
16601		mutex_exit(&cpu_lock);
16602		mutex_exit(&dtrace_provider_lock);
16603		mutex_exit(&dtrace_lock);
16604		return (DDI_FAILURE);
16605	}
16606
16607	ddi_report_dev(devi);
16608	dtrace_devi = devi;
16609
16610	dtrace_modload = dtrace_module_loaded;
16611	dtrace_modunload = dtrace_module_unloaded;
16612	dtrace_cpu_init = dtrace_cpu_setup_initial;
16613	dtrace_helpers_cleanup = dtrace_helpers_destroy;
16614	dtrace_helpers_fork = dtrace_helpers_duplicate;
16615	dtrace_cpustart_init = dtrace_suspend;
16616	dtrace_cpustart_fini = dtrace_resume;
16617	dtrace_debugger_init = dtrace_suspend;
16618	dtrace_debugger_fini = dtrace_resume;
16619
16620	register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16621
16622	ASSERT(MUTEX_HELD(&cpu_lock));
16623
16624	dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
16625	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
16626	dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
16627	    UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
16628	    VM_SLEEP | VMC_IDENTIFIER);
16629	dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
16630	    1, INT_MAX, 0);
16631
16632	dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
16633	    sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
16634	    NULL, NULL, NULL, NULL, NULL, 0);
16635
16636	ASSERT(MUTEX_HELD(&cpu_lock));
16637	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
16638	    offsetof(dtrace_probe_t, dtpr_nextmod),
16639	    offsetof(dtrace_probe_t, dtpr_prevmod));
16640
16641	dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
16642	    offsetof(dtrace_probe_t, dtpr_nextfunc),
16643	    offsetof(dtrace_probe_t, dtpr_prevfunc));
16644
16645	dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
16646	    offsetof(dtrace_probe_t, dtpr_nextname),
16647	    offsetof(dtrace_probe_t, dtpr_prevname));
16648
16649	if (dtrace_retain_max < 1) {
16650		cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
16651		    "setting to 1", dtrace_retain_max);
16652		dtrace_retain_max = 1;
16653	}
16654
16655	/*
16656	 * Now discover our toxic ranges.
16657	 */
16658	dtrace_toxic_ranges(dtrace_toxrange_add);
16659
16660	/*
16661	 * Before we register ourselves as a provider to our own framework,
16662	 * we would like to assert that dtrace_provider is NULL -- but that's
16663	 * not true if we were loaded as a dependency of a DTrace provider.
16664	 * Once we've registered, we can assert that dtrace_provider is our
16665	 * pseudo provider.
16666	 */
16667	(void) dtrace_register("dtrace", &dtrace_provider_attr,
16668	    DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
16669
16670	ASSERT(dtrace_provider != NULL);
16671	ASSERT((dtrace_provider_id_t)dtrace_provider == id);
16672
16673	dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16674	    dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
16675	dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16676	    dtrace_provider, NULL, NULL, "END", 0, NULL);
16677	dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16678	    dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
16679
16680	dtrace_anon_property();
16681	mutex_exit(&cpu_lock);
16682
16683	/*
16684	 * If there are already providers, we must ask them to provide their
16685	 * probes, and then match any anonymous enabling against them.  Note
16686	 * that there should be no other retained enablings at this time:
16687	 * the only retained enablings at this time should be the anonymous
16688	 * enabling.
16689	 */
16690	if (dtrace_anon.dta_enabling != NULL) {
16691		ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
16692
16693		dtrace_enabling_provide(NULL);
16694		state = dtrace_anon.dta_state;
16695
16696		/*
16697		 * We couldn't hold cpu_lock across the above call to
16698		 * dtrace_enabling_provide(), but we must hold it to actually
16699		 * enable the probes.  We have to drop all of our locks, pick
16700		 * up cpu_lock, and regain our locks before matching the
16701		 * retained anonymous enabling.
16702		 */
16703		mutex_exit(&dtrace_lock);
16704		mutex_exit(&dtrace_provider_lock);
16705
16706		mutex_enter(&cpu_lock);
16707		mutex_enter(&dtrace_provider_lock);
16708		mutex_enter(&dtrace_lock);
16709
16710		if ((enab = dtrace_anon.dta_enabling) != NULL)
16711			(void) dtrace_enabling_match(enab, NULL);
16712
16713		mutex_exit(&cpu_lock);
16714	}
16715
16716	mutex_exit(&dtrace_lock);
16717	mutex_exit(&dtrace_provider_lock);
16718
16719	if (state != NULL) {
16720		/*
16721		 * If we created any anonymous state, set it going now.
16722		 */
16723		(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
16724	}
16725
16726	return (DDI_SUCCESS);
16727}
16728#endif
16729
16730#if !defined(sun)
16731static void dtrace_dtr(void *);
16732#endif
16733
16734/*ARGSUSED*/
16735static int
16736#if defined(sun)
16737dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
16738#else
16739dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
16740#endif
16741{
16742	dtrace_state_t *state;
16743	uint32_t priv;
16744	uid_t uid;
16745	zoneid_t zoneid;
16746
16747#if defined(sun)
16748	if (getminor(*devp) == DTRACEMNRN_HELPER)
16749		return (0);
16750
16751	/*
16752	 * If this wasn't an open with the "helper" minor, then it must be
16753	 * the "dtrace" minor.
16754	 */
16755	if (getminor(*devp) == DTRACEMNRN_DTRACE)
16756		return (ENXIO);
16757#else
	cred_t *cred_p = dev->si_cred;
16760
16761	/*
16762	 * If no DTRACE_PRIV_* bits are set in the credential, then the
16763	 * caller lacks sufficient permission to do anything with DTrace.
16764	 */
16765	dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
16766	if (priv == DTRACE_PRIV_NONE) {
16767#endif
16768
16769		return (EACCES);
16770	}
16771
16772	/*
16773	 * Ask all providers to provide all their probes.
16774	 */
16775	mutex_enter(&dtrace_provider_lock);
16776	dtrace_probe_provide(NULL, NULL);
16777	mutex_exit(&dtrace_provider_lock);
16778
16779	mutex_enter(&cpu_lock);
16780	mutex_enter(&dtrace_lock);
16781	dtrace_opens++;
16782	dtrace_membar_producer();
16783
16784#if defined(sun)
16785	/*
16786	 * If the kernel debugger is active (that is, if the kernel debugger
16787	 * modified text in some way), we won't allow the open.
16788	 */
16789	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
16790		dtrace_opens--;
16791		mutex_exit(&cpu_lock);
16792		mutex_exit(&dtrace_lock);
16793		return (EBUSY);
16794	}
16795
16796	if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) {
16797		/*
16798		 * If DTrace helper tracing is enabled, we need to allocate the
16799		 * trace buffer and initialize the values.
16800		 */
16801		dtrace_helptrace_buffer =
16802		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
16803		dtrace_helptrace_next = 0;
16804		dtrace_helptrace_wrapped = 0;
16805		dtrace_helptrace_enable = 0;
16806	}
16807
16808	state = dtrace_state_create(devp, cred_p);
16809#else
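	/*
	 * On FreeBSD there is no clone device:  per-consumer state is
	 * attached to this open file via devfs_set_cdevpriv(), and
	 * dtrace_dtr() is invoked to destroy it when the file is closed.
	 */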
16810	state = dtrace_state_create(dev);
16811	devfs_set_cdevpriv(state, dtrace_dtr);
16812#endif
16813
16814	mutex_exit(&cpu_lock);
16815
16816	if (state == NULL) {
16817#if defined(sun)
16818		if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
16819			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16820#else
16821		--dtrace_opens;
16822#endif
16823		mutex_exit(&dtrace_lock);
16824		return (EAGAIN);
16825	}
16826
16827	mutex_exit(&dtrace_lock);
16828
16829	return (0);
16830}
16831
16832/*ARGSUSED*/
16833#if defined(sun)
16834static int
16835dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
16836#else
16837static void
16838dtrace_dtr(void *data)
16839#endif
16840{
16841#if defined(sun)
16842	minor_t minor = getminor(dev);
16843	dtrace_state_t *state;
16844#endif
16845	dtrace_helptrace_t *buf = NULL;
16846
#if defined(sun)
16848	if (minor == DTRACEMNRN_HELPER)
16849		return (0);
16850
16851	state = ddi_get_soft_state(dtrace_softstate, minor);
16852#else
16853	dtrace_state_t *state = data;
16854#endif
16855
16856	mutex_enter(&cpu_lock);
16857	mutex_enter(&dtrace_lock);
16858
#if defined(sun)
16860	if (state->dts_anon)
16861#else
16862	if (state != NULL && state->dts_anon)
16863#endif
16864	{
16865		/*
16866		 * There is anonymous state. Destroy that first.
16867		 */
16868		ASSERT(dtrace_anon.dta_state == NULL);
16869		dtrace_state_destroy(state->dts_anon);
16870	}
16871
16872	if (dtrace_helptrace_disable) {
16873		/*
16874		 * If we have been told to disable helper tracing, set the
16875		 * buffer to NULL before calling into dtrace_state_destroy();
16876		 * we take advantage of its dtrace_sync() to know that no
16877		 * CPU is in probe context with enabled helper tracing
16878		 * after it returns.
16879		 */
16880		buf = dtrace_helptrace_buffer;
16881		dtrace_helptrace_buffer = NULL;
16882	}
16883
#if defined(sun)
16885	dtrace_state_destroy(state);
16886#else
16887	if (state != NULL) {
16888		dtrace_state_destroy(state);
16889		kmem_free(state, 0);
16890	}
16891#endif
16892	ASSERT(dtrace_opens > 0);
16893
16894#if defined(sun)
16895	/*
16896	 * Only relinquish control of the kernel debugger interface when there
16897	 * are no consumers and no anonymous enablings.
16898	 */
16899	if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
16900		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16901#else
16902	--dtrace_opens;
16903#endif
16904
16905	if (buf != NULL) {
16906		kmem_free(buf, dtrace_helptrace_bufsize);
16907		dtrace_helptrace_disable = 0;
16908	}
16909
16910	mutex_exit(&dtrace_lock);
16911	mutex_exit(&cpu_lock);
16912
16913#if defined(sun)
16914	return (0);
16915#endif
16916}
16917
16918#if defined(sun)
16919/*ARGSUSED*/
16920static int
16921dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
16922{
16923	int rval;
16924	dof_helper_t help, *dhp = NULL;
16925
16926	switch (cmd) {
16927	case DTRACEHIOC_ADDDOF:
16928		if (copyin((void *)arg, &help, sizeof (help)) != 0) {
16929			dtrace_dof_error(NULL, "failed to copyin DOF helper");
16930			return (EFAULT);
16931		}
16932
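		/*
		 * dofhp_dof currently holds the user address of the DOF;
		 * repoint arg at it and fall into DTRACEHIOC_ADD to copy
		 * the DOF itself in.
		 */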
16933		dhp = &help;
16934		arg = (intptr_t)help.dofhp_dof;
16935		/*FALLTHROUGH*/
16936
16937	case DTRACEHIOC_ADD: {
16938		dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
16939
16940		if (dof == NULL)
16941			return (rval);
16942
16943		mutex_enter(&dtrace_lock);
16944
16945		/*
16946		 * dtrace_helper_slurp() takes responsibility for the dof --
16947		 * it may free it now or it may save it and free it later.
16948		 */
16949		if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
16950			*rv = rval;
16951			rval = 0;
16952		} else {
16953			rval = EINVAL;
16954		}
16955
16956		mutex_exit(&dtrace_lock);
16957		return (rval);
16958	}
16959
16960	case DTRACEHIOC_REMOVE: {
16961		mutex_enter(&dtrace_lock);
16962		rval = dtrace_helper_destroygen(arg);
16963		mutex_exit(&dtrace_lock);
16964
16965		return (rval);
16966	}
16967
16968	default:
16969		break;
16970	}
16971
16972	return (ENOTTY);
16973}
16974
16975/*ARGSUSED*/
16976static int
16977dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
16978{
16979	minor_t minor = getminor(dev);
16980	dtrace_state_t *state;
16981	int rval;
16982
16983	if (minor == DTRACEMNRN_HELPER)
16984		return (dtrace_ioctl_helper(cmd, arg, rv));
16985
16986	state = ddi_get_soft_state(dtrace_softstate, minor);
16987
16988	if (state->dts_anon) {
16989		ASSERT(dtrace_anon.dta_state == NULL);
16990		state = state->dts_anon;
16991	}
16992
16993	switch (cmd) {
16994	case DTRACEIOC_PROVIDER: {
16995		dtrace_providerdesc_t pvd;
16996		dtrace_provider_t *pvp;
16997
16998		if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
16999			return (EFAULT);
17000
17001		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17002		mutex_enter(&dtrace_provider_lock);
17003
17004		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17005			if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
17006				break;
17007		}
17008
17009		mutex_exit(&dtrace_provider_lock);
17010
17011		if (pvp == NULL)
17012			return (ESRCH);
17013
17014		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17015		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17016
17017		if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
17018			return (EFAULT);
17019
17020		return (0);
17021	}
17022
17023	case DTRACEIOC_EPROBE: {
17024		dtrace_eprobedesc_t epdesc;
17025		dtrace_ecb_t *ecb;
17026		dtrace_action_t *act;
17027		void *buf;
17028		size_t size;
17029		uintptr_t dest;
17030		int nrecs;
17031
17032		if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
17033			return (EFAULT);
17034
17035		mutex_enter(&dtrace_lock);
17036
17037		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17038			mutex_exit(&dtrace_lock);
17039			return (EINVAL);
17040		}
17041
17042		if (ecb->dte_probe == NULL) {
17043			mutex_exit(&dtrace_lock);
17044			return (EINVAL);
17045		}
17046
17047		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17048		epdesc.dtepd_uarg = ecb->dte_uarg;
17049		epdesc.dtepd_size = ecb->dte_size;
17050
17051		nrecs = epdesc.dtepd_nrecs;
17052		epdesc.dtepd_nrecs = 0;
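		/*
		 * Count only the records that will be copied out:
		 * aggregating actions and tuple members are described via
		 * DTRACEIOC_AGGDESC, not as ECB records.
		 */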
17053		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17054			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17055				continue;
17056
17057			epdesc.dtepd_nrecs++;
17058		}
17059
17060		/*
17061		 * Now that we have the size, we need to allocate a temporary
17062		 * buffer in which to store the complete description.  We need
17063		 * the temporary buffer to be able to drop dtrace_lock
17064		 * across the copyout(), below.
17065		 */
17066		size = sizeof (dtrace_eprobedesc_t) +
17067		    (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17068
17069		buf = kmem_alloc(size, KM_SLEEP);
17070		dest = (uintptr_t)buf;
17071
17072		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17073		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17074
17075		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17076			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17077				continue;
17078
17079			if (nrecs-- == 0)
17080				break;
17081
17082			bcopy(&act->dta_rec, (void *)dest,
17083			    sizeof (dtrace_recdesc_t));
17084			dest += sizeof (dtrace_recdesc_t);
17085		}
17086
17087		mutex_exit(&dtrace_lock);
17088
17089		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17090			kmem_free(buf, size);
17091			return (EFAULT);
17092		}
17093
17094		kmem_free(buf, size);
17095		return (0);
17096	}
17097
17098	case DTRACEIOC_AGGDESC: {
17099		dtrace_aggdesc_t aggdesc;
17100		dtrace_action_t *act;
17101		dtrace_aggregation_t *agg;
17102		int nrecs;
17103		uint32_t offs;
17104		dtrace_recdesc_t *lrec;
17105		void *buf;
17106		size_t size;
17107		uintptr_t dest;
17108
17109		if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
17110			return (EFAULT);
17111
17112		mutex_enter(&dtrace_lock);
17113
17114		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17115			mutex_exit(&dtrace_lock);
17116			return (EINVAL);
17117		}
17118
17119		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17120
17121		nrecs = aggdesc.dtagd_nrecs;
17122		aggdesc.dtagd_nrecs = 0;
17123
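		/*
		 * The aggregation's data runs from its base offset through
		 * the end of its final record; that span is the size
		 * reported to user level.
		 */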
17124		offs = agg->dtag_base;
17125		lrec = &agg->dtag_action.dta_rec;
17126		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17127
17128		for (act = agg->dtag_first; ; act = act->dta_next) {
17129			ASSERT(act->dta_intuple ||
17130			    DTRACEACT_ISAGG(act->dta_kind));
17131
17132			/*
17133			 * If this action has a record size of zero, it
17134			 * denotes an argument to the aggregating action.
17135			 * Because the presence of this record doesn't (or
17136			 * shouldn't) affect the way the data is interpreted,
17137			 * we don't copy it out, sparing user level the
17138			 * confusion of dealing with a zero-length record.
17139			 */
17140			if (act->dta_rec.dtrd_size == 0) {
17141				ASSERT(agg->dtag_hasarg);
17142				continue;
17143			}
17144
17145			aggdesc.dtagd_nrecs++;
17146
17147			if (act == &agg->dtag_action)
17148				break;
17149		}
17150
17151		/*
17152		 * Now that we have the size, we need to allocate a temporary
17153		 * buffer in which to store the complete description.  We need
17154		 * the temporary buffer to be able to drop dtrace_lock
17155		 * across the copyout(), below.
17156		 */
17157		size = sizeof (dtrace_aggdesc_t) +
17158		    (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17159
17160		buf = kmem_alloc(size, KM_SLEEP);
17161		dest = (uintptr_t)buf;
17162
17163		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17164		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17165
17166		for (act = agg->dtag_first; ; act = act->dta_next) {
17167			dtrace_recdesc_t rec = act->dta_rec;
17168
17169			/*
17170			 * See the comment in the above loop for why we pass
17171			 * over zero-length records.
17172			 */
17173			if (rec.dtrd_size == 0) {
17174				ASSERT(agg->dtag_hasarg);
17175				continue;
17176			}
17177
17178			if (nrecs-- == 0)
17179				break;
17180
17181			rec.dtrd_offset -= offs;
17182			bcopy(&rec, (void *)dest, sizeof (rec));
17183			dest += sizeof (dtrace_recdesc_t);
17184
17185			if (act == &agg->dtag_action)
17186				break;
17187		}
17188
17189		mutex_exit(&dtrace_lock);
17190
17191		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17192			kmem_free(buf, size);
17193			return (EFAULT);
17194		}
17195
17196		kmem_free(buf, size);
17197		return (0);
17198	}
17199
17200	case DTRACEIOC_ENABLE: {
17201		dof_hdr_t *dof;
17202		dtrace_enabling_t *enab = NULL;
17203		dtrace_vstate_t *vstate;
17204		int err = 0;
17205
17206		*rv = 0;
17207
17208		/*
17209		 * If a NULL argument has been passed, we take this as our
17210		 * cue to reevaluate our enablings.
17211		 */
17212		if (arg == 0) {
17213			dtrace_enabling_matchall();
17214
17215			return (0);
17216		}
17217
17218		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17219			return (rval);
17220
17221		mutex_enter(&cpu_lock);
17222		mutex_enter(&dtrace_lock);
17223		vstate = &state->dts_vstate;
17224
17225		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17226			mutex_exit(&dtrace_lock);
17227			mutex_exit(&cpu_lock);
17228			dtrace_dof_destroy(dof);
17229			return (EBUSY);
17230		}
17231
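		/*
		 * Slurp the DOF into an enabling, apply any options that it
		 * carries, and attempt to match the enabling against the
		 * installed probes.  On success the enabling is retained so
		 * that it is reevaluated as new probes are provided.
		 */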
17232		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17233			mutex_exit(&dtrace_lock);
17234			mutex_exit(&cpu_lock);
17235			dtrace_dof_destroy(dof);
17236			return (EINVAL);
17237		}
17238
17239		if ((rval = dtrace_dof_options(dof, state)) != 0) {
17240			dtrace_enabling_destroy(enab);
17241			mutex_exit(&dtrace_lock);
17242			mutex_exit(&cpu_lock);
17243			dtrace_dof_destroy(dof);
17244			return (rval);
17245		}
17246
17247		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
17248			err = dtrace_enabling_retain(enab);
17249		} else {
17250			dtrace_enabling_destroy(enab);
17251		}
17252
17253		mutex_exit(&cpu_lock);
17254		mutex_exit(&dtrace_lock);
17255		dtrace_dof_destroy(dof);
17256
17257		return (err);
17258	}
17259
17260	case DTRACEIOC_REPLICATE: {
17261		dtrace_repldesc_t desc;
17262		dtrace_probedesc_t *match = &desc.dtrpd_match;
17263		dtrace_probedesc_t *create = &desc.dtrpd_create;
17264		int err;
17265
17266		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17267			return (EFAULT);
17268
17269		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17270		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17271		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17272		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17273
17274		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17275		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17276		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17277		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17278
17279		mutex_enter(&dtrace_lock);
17280		err = dtrace_enabling_replicate(state, match, create);
17281		mutex_exit(&dtrace_lock);
17282
17283		return (err);
17284	}
17285
17286	case DTRACEIOC_PROBEMATCH:
17287	case DTRACEIOC_PROBES: {
17288		dtrace_probe_t *probe = NULL;
17289		dtrace_probedesc_t desc;
17290		dtrace_probekey_t pkey;
17291		dtrace_id_t i;
17292		int m = 0;
17293		uint32_t priv;
17294		uid_t uid;
17295		zoneid_t zoneid;
17296
17297		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17298			return (EFAULT);
17299
17300		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17301		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17302		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17303		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17304
17305		/*
17306		 * Before we attempt to match this probe, we want to give
17307		 * all providers the opportunity to provide it.
17308		 */
17309		if (desc.dtpd_id == DTRACE_IDNONE) {
17310			mutex_enter(&dtrace_provider_lock);
17311			dtrace_probe_provide(&desc, NULL);
17312			mutex_exit(&dtrace_provider_lock);
17313			desc.dtpd_id++;
17314		}
17315
17316		if (cmd == DTRACEIOC_PROBEMATCH)  {
17317			dtrace_probekey(&desc, &pkey);
17318			pkey.dtpk_id = DTRACE_IDNONE;
17319		}
17320
17321		dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17322
17323		mutex_enter(&dtrace_lock);
17324
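		/*
		 * Probe identifiers are one-based:  probe i is found in
		 * dtrace_probes[i - 1].  Walk forward from the requested
		 * identifier and stop at the first visible match.
		 */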
17325		if (cmd == DTRACEIOC_PROBEMATCH) {
17326			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17327				if ((probe = dtrace_probes[i - 1]) != NULL &&
17328				    (m = dtrace_match_probe(probe, &pkey,
17329				    priv, uid, zoneid)) != 0)
17330					break;
17331			}
17332
17333			if (m < 0) {
17334				mutex_exit(&dtrace_lock);
17335				return (EINVAL);
17336			}
17337
17338		} else {
17339			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17340				if ((probe = dtrace_probes[i - 1]) != NULL &&
17341				    dtrace_match_priv(probe, priv, uid, zoneid))
17342					break;
17343			}
17344		}
17345
17346		if (probe == NULL) {
17347			mutex_exit(&dtrace_lock);
17348			return (ESRCH);
17349		}
17350
17351		dtrace_probe_description(probe, &desc);
17352		mutex_exit(&dtrace_lock);
17353
17354		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17355			return (EFAULT);
17356
17357		return (0);
17358	}
17359
17360	case DTRACEIOC_PROBEARG: {
17361		dtrace_argdesc_t desc;
17362		dtrace_probe_t *probe;
17363		dtrace_provider_t *prov;
17364
17365		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17366			return (EFAULT);
17367
17368		if (desc.dtargd_id == DTRACE_IDNONE)
17369			return (EINVAL);
17370
17371		if (desc.dtargd_ndx == DTRACE_ARGNONE)
17372			return (EINVAL);
17373
17374		mutex_enter(&dtrace_provider_lock);
17375		mutex_enter(&mod_lock);
17376		mutex_enter(&dtrace_lock);
17377
17378		if (desc.dtargd_id > dtrace_nprobes) {
17379			mutex_exit(&dtrace_lock);
17380			mutex_exit(&mod_lock);
17381			mutex_exit(&dtrace_provider_lock);
17382			return (EINVAL);
17383		}
17384
17385		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
17386			mutex_exit(&dtrace_lock);
17387			mutex_exit(&mod_lock);
17388			mutex_exit(&dtrace_provider_lock);
17389			return (EINVAL);
17390		}
17391
17392		mutex_exit(&dtrace_lock);
17393
17394		prov = probe->dtpr_provider;
17395
17396		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
17397			/*
17398			 * There isn't any typed information for this probe.
17399			 * Set the argument number to DTRACE_ARGNONE.
17400			 */
17401			desc.dtargd_ndx = DTRACE_ARGNONE;
17402		} else {
17403			desc.dtargd_native[0] = '\0';
17404			desc.dtargd_xlate[0] = '\0';
17405			desc.dtargd_mapping = desc.dtargd_ndx;
17406
17407			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
17408			    probe->dtpr_id, probe->dtpr_arg, &desc);
17409		}
17410
17411		mutex_exit(&mod_lock);
17412		mutex_exit(&dtrace_provider_lock);
17413
17414		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17415			return (EFAULT);
17416
17417		return (0);
17418	}
17419
17420	case DTRACEIOC_GO: {
17421		processorid_t cpuid;
17422		rval = dtrace_state_go(state, &cpuid);
17423
17424		if (rval != 0)
17425			return (rval);
17426
17427		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17428			return (EFAULT);
17429
17430		return (0);
17431	}
17432
17433	case DTRACEIOC_STOP: {
17434		processorid_t cpuid;
17435
17436		mutex_enter(&dtrace_lock);
17437		rval = dtrace_state_stop(state, &cpuid);
17438		mutex_exit(&dtrace_lock);
17439
17440		if (rval != 0)
17441			return (rval);
17442
17443		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17444			return (EFAULT);
17445
17446		return (0);
17447	}
17448
17449	case DTRACEIOC_DOFGET: {
17450		dof_hdr_t hdr, *dof;
17451		uint64_t len;
17452
17453		if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
17454			return (EFAULT);
17455
17456		mutex_enter(&dtrace_lock);
17457		dof = dtrace_dof_create(state);
17458		mutex_exit(&dtrace_lock);
17459
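		/*
		 * Copy out no more than the smaller of the consumer's buffer
		 * and the generated DOF; the consumer can examine the
		 * copied-out header's dofh_loadsz to learn the full size.
		 */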
17460		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
17461		rval = copyout(dof, (void *)arg, len);
17462		dtrace_dof_destroy(dof);
17463
17464		return (rval == 0 ? 0 : EFAULT);
17465	}
17466
17467	case DTRACEIOC_AGGSNAP:
17468	case DTRACEIOC_BUFSNAP: {
17469		dtrace_bufdesc_t desc;
17470		caddr_t cached;
17471		dtrace_buffer_t *buf;
17472
17473		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17474			return (EFAULT);
17475
17476		if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
17477			return (EINVAL);
17478
17479		mutex_enter(&dtrace_lock);
17480
17481		if (cmd == DTRACEIOC_BUFSNAP) {
17482			buf = &state->dts_buffer[desc.dtbd_cpu];
17483		} else {
17484			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
17485		}
17486
17487		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
17488			size_t sz = buf->dtb_offset;
17489
17490			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
17491				mutex_exit(&dtrace_lock);
17492				return (EBUSY);
17493			}
17494
17495			/*
17496			 * If this buffer has already been consumed, we're
17497			 * going to indicate that there's nothing left here
17498			 * to consume.
17499			 */
17500			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
17501				mutex_exit(&dtrace_lock);
17502
17503				desc.dtbd_size = 0;
17504				desc.dtbd_drops = 0;
17505				desc.dtbd_errors = 0;
17506				desc.dtbd_oldest = 0;
17507				sz = sizeof (desc);
17508
17509				if (copyout(&desc, (void *)arg, sz) != 0)
17510					return (EFAULT);
17511
17512				return (0);
17513			}
17514
17515			/*
17516			 * If this is a ring buffer that has wrapped, we want
17517			 * to copy the whole thing out.
17518			 */
17519			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
17520				dtrace_buffer_polish(buf);
17521				sz = buf->dtb_size;
17522			}
17523
17524			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
17525				mutex_exit(&dtrace_lock);
17526				return (EFAULT);
17527			}
17528
17529			desc.dtbd_size = sz;
17530			desc.dtbd_drops = buf->dtb_drops;
17531			desc.dtbd_errors = buf->dtb_errors;
17532			desc.dtbd_oldest = buf->dtb_xamot_offset;
17533			desc.dtbd_timestamp = dtrace_gethrtime();
17534
17535			mutex_exit(&dtrace_lock);
17536
17537			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17538				return (EFAULT);
17539
17540			buf->dtb_flags |= DTRACEBUF_CONSUMED;
17541
17542			return (0);
17543		}
17544
17545		if (buf->dtb_tomax == NULL) {
17546			ASSERT(buf->dtb_xamot == NULL);
17547			mutex_exit(&dtrace_lock);
17548			return (ENOENT);
17549		}
17550
17551		cached = buf->dtb_tomax;
17552		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
17553
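		/*
		 * Cross call the target CPU to switch the active buffer with
		 * its inactive (xamot) counterpart; the snapshot can then be
		 * copied out without racing probe context.
		 */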
17554		dtrace_xcall(desc.dtbd_cpu,
17555		    (dtrace_xcall_t)dtrace_buffer_switch, buf);
17556
17557		state->dts_errors += buf->dtb_xamot_errors;
17558
17559		/*
17560		 * If the buffers did not actually switch, then the cross call
17561		 * did not take place -- presumably because the given CPU is
17562		 * not in the ready set.  If this is the case, we'll return
17563		 * ENOENT.
17564		 */
17565		if (buf->dtb_tomax == cached) {
17566			ASSERT(buf->dtb_xamot != cached);
17567			mutex_exit(&dtrace_lock);
17568			return (ENOENT);
17569		}
17570
17571		ASSERT(cached == buf->dtb_xamot);
17572
17573		/*
17574		 * We have our snapshot; now copy it out.
17575		 */
17576		if (copyout(buf->dtb_xamot, desc.dtbd_data,
17577		    buf->dtb_xamot_offset) != 0) {
17578			mutex_exit(&dtrace_lock);
17579			return (EFAULT);
17580		}
17581
17582		desc.dtbd_size = buf->dtb_xamot_offset;
17583		desc.dtbd_drops = buf->dtb_xamot_drops;
17584		desc.dtbd_errors = buf->dtb_xamot_errors;
17585		desc.dtbd_oldest = 0;
17586		desc.dtbd_timestamp = buf->dtb_switched;
17587
17588		mutex_exit(&dtrace_lock);
17589
17590		/*
17591		 * Finally, copy out the buffer description.
17592		 */
17593		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17594			return (EFAULT);
17595
17596		return (0);
17597	}
17598
17599	case DTRACEIOC_CONF: {
17600		dtrace_conf_t conf;
17601
17602		bzero(&conf, sizeof (conf));
17603		conf.dtc_difversion = DIF_VERSION;
17604		conf.dtc_difintregs = DIF_DIR_NREGS;
17605		conf.dtc_diftupregs = DIF_DTR_NREGS;
17606		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
17607
17608		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
17609			return (EFAULT);
17610
17611		return (0);
17612	}
17613
17614	case DTRACEIOC_STATUS: {
17615		dtrace_status_t stat;
17616		dtrace_dstate_t *dstate;
17617		int i, j;
17618		uint64_t nerrs;
17619
17620		/*
17621		 * See the comment in dtrace_state_deadman() for the reason
17622		 * for setting dts_laststatus to INT64_MAX before setting
17623		 * it to the correct value.
17624		 */
17625		state->dts_laststatus = INT64_MAX;
17626		dtrace_membar_producer();
17627		state->dts_laststatus = dtrace_gethrtime();
17628
17629		bzero(&stat, sizeof (stat));
17630
17631		mutex_enter(&dtrace_lock);
17632
17633		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
17634			mutex_exit(&dtrace_lock);
17635			return (ENOENT);
17636		}
17637
17638		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
17639			stat.dtst_exiting = 1;
17640
17641		nerrs = state->dts_errors;
17642		dstate = &state->dts_vstate.dtvs_dynvars;
17643
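		/*
		 * Accumulate each CPU's dynamic variable drops, filled
		 * buffers, errors and speculation drops into the totals
		 * reported to the consumer.
		 */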
17644		for (i = 0; i < NCPU; i++) {
17645			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
17646
17647			stat.dtst_dyndrops += dcpu->dtdsc_drops;
17648			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
17649			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
17650
17651			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
17652				stat.dtst_filled++;
17653
17654			nerrs += state->dts_buffer[i].dtb_errors;
17655
17656			for (j = 0; j < state->dts_nspeculations; j++) {
17657				dtrace_speculation_t *spec;
17658				dtrace_buffer_t *buf;
17659
17660				spec = &state->dts_speculations[j];
17661				buf = &spec->dtsp_buffer[i];
17662				stat.dtst_specdrops += buf->dtb_xamot_drops;
17663			}
17664		}
17665
17666		stat.dtst_specdrops_busy = state->dts_speculations_busy;
17667		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
17668		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
17669		stat.dtst_dblerrors = state->dts_dblerrors;
17670		stat.dtst_killed =
17671		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
17672		stat.dtst_errors = nerrs;
17673
17674		mutex_exit(&dtrace_lock);
17675
17676		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
17677			return (EFAULT);
17678
17679		return (0);
17680	}
17681
17682	case DTRACEIOC_FORMAT: {
17683		dtrace_fmtdesc_t fmt;
17684		char *str;
17685		int len;
17686
17687		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
17688			return (EFAULT);
17689
17690		mutex_enter(&dtrace_lock);
17691
17692		if (fmt.dtfd_format == 0 ||
17693		    fmt.dtfd_format > state->dts_nformats) {
17694			mutex_exit(&dtrace_lock);
17695			return (EINVAL);
17696		}
17697
17698		/*
17699		 * Format strings are allocated contiguously and they are
17700		 * never freed; if a format index is no greater than the number
17701		 * of formats, we can assert that the format map is non-NULL
17702		 * and that the format for the specified index is non-NULL.
17703		 */
17704		ASSERT(state->dts_formats != NULL);
17705		str = state->dts_formats[fmt.dtfd_format - 1];
17706		ASSERT(str != NULL);
17707
17708		len = strlen(str) + 1;
17709
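		/*
		 * If the consumer's buffer is too small, copy the required
		 * length back out so that it can reallocate and retry;
		 * otherwise copy out the format string itself.
		 */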
17710		if (len > fmt.dtfd_length) {
17711			fmt.dtfd_length = len;
17712
17713			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
17714				mutex_exit(&dtrace_lock);
17715				return (EFAULT);
17716			}
17717		} else {
17718			if (copyout(str, fmt.dtfd_string, len) != 0) {
17719				mutex_exit(&dtrace_lock);
17720				return (EFAULT);
17721			}
17722		}
17723
17724		mutex_exit(&dtrace_lock);
17725		return (0);
17726	}
17727
17728	default:
17729		break;
17730	}
17731
17732	return (ENOTTY);
17733}
17734
17735/*ARGSUSED*/
17736static int
17737dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
17738{
17739	dtrace_state_t *state;
17740
17741	switch (cmd) {
17742	case DDI_DETACH:
17743		break;
17744
17745	case DDI_SUSPEND:
17746		return (DDI_SUCCESS);
17747
17748	default:
17749		return (DDI_FAILURE);
17750	}
17751
17752	mutex_enter(&cpu_lock);
17753	mutex_enter(&dtrace_provider_lock);
17754	mutex_enter(&dtrace_lock);
17755
17756	ASSERT(dtrace_opens == 0);
17757
17758	if (dtrace_helpers > 0) {
17759		mutex_exit(&dtrace_provider_lock);
17760		mutex_exit(&dtrace_lock);
17761		mutex_exit(&cpu_lock);
17762		return (DDI_FAILURE);
17763	}
17764
17765	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
17766		mutex_exit(&dtrace_provider_lock);
17767		mutex_exit(&dtrace_lock);
17768		mutex_exit(&cpu_lock);
17769		return (DDI_FAILURE);
17770	}
17771
17772	dtrace_provider = NULL;
17773
17774	if ((state = dtrace_anon_grab()) != NULL) {
17775		/*
17776		 * If there were ECBs on this state, the provider should
17777		 * not have been allowed to detach; assert that there are
17778		 * none.
17779		 */
17780		ASSERT(state->dts_necbs == 0);
17781		dtrace_state_destroy(state);
17782
17783		/*
17784		 * If we're being detached with anonymous state, we need to
17785		 * indicate to the kernel debugger that DTrace is now inactive.
17786		 */
17787		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17788	}
17789
17790	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
17791	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
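	/*
	 * Clear the hooks through which the rest of the kernel calls into
	 * the framework for CPU, helper, debugger and module events.
	 */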
17792	dtrace_cpu_init = NULL;
17793	dtrace_helpers_cleanup = NULL;
17794	dtrace_helpers_fork = NULL;
17795	dtrace_cpustart_init = NULL;
17796	dtrace_cpustart_fini = NULL;
17797	dtrace_debugger_init = NULL;
17798	dtrace_debugger_fini = NULL;
17799	dtrace_modload = NULL;
17800	dtrace_modunload = NULL;
17801
17802	ASSERT(dtrace_getf == 0);
17803	ASSERT(dtrace_closef == NULL);
17804
17805	mutex_exit(&cpu_lock);
17806
17807	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
17808	dtrace_probes = NULL;
17809	dtrace_nprobes = 0;
17810
17811	dtrace_hash_destroy(dtrace_bymod);
17812	dtrace_hash_destroy(dtrace_byfunc);
17813	dtrace_hash_destroy(dtrace_byname);
17814	dtrace_bymod = NULL;
17815	dtrace_byfunc = NULL;
17816	dtrace_byname = NULL;
17817
17818	kmem_cache_destroy(dtrace_state_cache);
17819	vmem_destroy(dtrace_minor);
17820	vmem_destroy(dtrace_arena);
17821
17822	if (dtrace_toxrange != NULL) {
17823		kmem_free(dtrace_toxrange,
17824		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
17825		dtrace_toxrange = NULL;
17826		dtrace_toxranges = 0;
17827		dtrace_toxranges_max = 0;
17828	}
17829
17830	ddi_remove_minor_node(dtrace_devi, NULL);
17831	dtrace_devi = NULL;
17832
17833	ddi_soft_state_fini(&dtrace_softstate);
17834
17835	ASSERT(dtrace_vtime_references == 0);
17836	ASSERT(dtrace_opens == 0);
17837	ASSERT(dtrace_retained == NULL);
17838
17839	mutex_exit(&dtrace_lock);
17840	mutex_exit(&dtrace_provider_lock);
17841
17842	/*
17843	 * We don't destroy the task queue until after we have dropped our
17844	 * locks (taskq_destroy() may block on running tasks).  To prevent
17845	 * attempting to do work after we have effectively detached but before
17846	 * the task queue has been destroyed, all tasks dispatched via the
17847	 * task queue must check that DTrace is still attached before
17848	 * performing any operation.
17849	 */
17850	taskq_destroy(dtrace_taskq);
17851	dtrace_taskq = NULL;
17852
17853	return (DDI_SUCCESS);
17854}
17855#endif
17856
17857#if defined(sun)
17858/*ARGSUSED*/
17859static int
17860dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
17861{
17862	int error;
17863
17864	switch (infocmd) {
17865	case DDI_INFO_DEVT2DEVINFO:
17866		*result = (void *)dtrace_devi;
17867		error = DDI_SUCCESS;
17868		break;
17869	case DDI_INFO_DEVT2INSTANCE:
17870		*result = (void *)0;
17871		error = DDI_SUCCESS;
17872		break;
17873	default:
17874		error = DDI_FAILURE;
17875	}
17876	return (error);
17877}
17878#endif
17879
17880#if defined(sun)
17881static struct cb_ops dtrace_cb_ops = {
17882	dtrace_open,		/* open */
17883	dtrace_close,		/* close */
17884	nulldev,		/* strategy */
17885	nulldev,		/* print */
17886	nodev,			/* dump */
17887	nodev,			/* read */
17888	nodev,			/* write */
17889	dtrace_ioctl,		/* ioctl */
17890	nodev,			/* devmap */
17891	nodev,			/* mmap */
17892	nodev,			/* segmap */
17893	nochpoll,		/* poll */
17894	ddi_prop_op,		/* cb_prop_op */
17895	0,			/* streamtab  */
17896	D_NEW | D_MP		/* Driver compatibility flag */
17897};
17898
17899static struct dev_ops dtrace_ops = {
17900	DEVO_REV,		/* devo_rev */
17901	0,			/* refcnt */
17902	dtrace_info,		/* get_dev_info */
17903	nulldev,		/* identify */
17904	nulldev,		/* probe */
17905	dtrace_attach,		/* attach */
17906	dtrace_detach,		/* detach */
17907	nodev,			/* reset */
17908	&dtrace_cb_ops,		/* driver operations */
17909	NULL,			/* bus operations */
17910	nodev			/* dev power */
17911};
17912
17913static struct modldrv modldrv = {
17914	&mod_driverops,		/* module type (this is a pseudo driver) */
17915	"Dynamic Tracing",	/* name of module */
17916	&dtrace_ops,		/* driver ops */
17917};
17918
17919static struct modlinkage modlinkage = {
17920	MODREV_1,
17921	(void *)&modldrv,
17922	NULL
17923};
17924
17925int
17926_init(void)
17927{
17928	return (mod_install(&modlinkage));
17929}
17930
17931int
17932_info(struct modinfo *modinfop)
17933{
17934	return (mod_info(&modlinkage, modinfop));
17935}
17936
17937int
17938_fini(void)
17939{
17940	return (mod_remove(&modlinkage));
17941}
17942#else
17943
17944static d_ioctl_t	dtrace_ioctl;
17945static d_ioctl_t	dtrace_ioctl_helper;
17946static void		dtrace_load(void *);
17947static int		dtrace_unload(void);
17948static struct cdev	*dtrace_dev;
17949static struct cdev	*helper_dev;
17950
17951void dtrace_invop_init(void);
17952void dtrace_invop_uninit(void);
17953
17954static struct cdevsw dtrace_cdevsw = {
17955	.d_version	= D_VERSION,
17956	.d_ioctl	= dtrace_ioctl,
17957	.d_open		= dtrace_open,
17958	.d_name		= "dtrace",
17959};
17960
17961static struct cdevsw helper_cdevsw = {
17962	.d_version	= D_VERSION,
17963	.d_ioctl	= dtrace_ioctl_helper,
17964	.d_name		= "helper",
17965};
17966
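/*
 * The FreeBSD port keeps its platform-specific pieces in separate source
 * files that are textually included here rather than compiled on their own.
 */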
17967#include <dtrace_anon.c>
17968#include <dtrace_ioctl.c>
17969#include <dtrace_load.c>
17970#include <dtrace_modevent.c>
17971#include <dtrace_sysctl.c>
17972#include <dtrace_unload.c>
17973#include <dtrace_vtime.c>
17974#include <dtrace_hacks.c>
17975#include <dtrace_isa.c>
17976
17977SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
17978SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
17979SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);
17980
17981DEV_MODULE(dtrace, dtrace_modevent, NULL);
17982MODULE_VERSION(dtrace, 1);
17983MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
17984#endif
17985