/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 268578 2014-07-12 22:56:41Z rpaulo $
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace).  The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file.  The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#include <sys/errno.h>
#if !defined(sun)
#include <sys/time.h>
#endif
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#if defined(sun)
#include <sys/ddi.h>
#include <sys/sunddi.h>
#endif
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#if defined(sun)
#include <sys/strsubr.h>
#endif
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#if defined(sun)
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#endif
#include <sys/ctf_api.h>
#if defined(sun)
#include <sys/panic.h>
#include <sys/priv_impl.h>
#endif
#include <sys/policy.h>
#if defined(sun)
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#endif
#include <sys/taskq.h>
#if defined(sun)
#include <sys/mkdev.h>
#include <sys/kdi.h>
#endif
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include "strtolctype.h"

/* FreeBSD includes: */
#if !defined(sun)
#include <sys/callout.h>
#include <sys/ctype.h>
#include <sys/eventhandler.h>
#include <sys/limits.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/dtrace_bsd.h>
#include <netinet/in.h>
#include "dtrace_cddl.h"
#include "dtrace_debug.c"
#endif

/*
 * DTrace Tunable Variables
 *
 * The following variables may be tuned by adding a line to /etc/system that
 * includes both the name of the DTrace module ("dtrace") and the name of the
 * variable.  For example:
 *
 *   set dtrace:dtrace_destructive_disallow = 1
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable.  Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively.  Further, these variables should not be tuned
 * dynamically via "mdb -kw" or other means; they should only be tuned via
 * /etc/system.
 */
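
/*
 * A note for the FreeBSD build (illustrative; not from the original source):
 * FreeBSD has no /etc/system, so system-wide tunables like these are
 * conventionally set as loader tunables or via sysctl(8) -- see the
 * SYSCTL_DECL(_kern_dtrace) declaration below.  A hypothetical
 * /boot/loader.conf line might look like:
 *
 *	kern.dtrace.destructive_disallow=1
 *
 * The sysctl name above is assumed for the sake of the example; the names
 * actually registered live in the FreeBSD-specific DTrace glue.
 */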
int		dtrace_destructive_disallow = 0;
dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t		dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t	dtrace_dof_maxsize = (8 * 1024 * 1024);
size_t		dtrace_global_maxsize = (16 * 1024);
size_t		dtrace_actions_max = (16 * 1024);
size_t		dtrace_retain_max = 1024;
dtrace_optval_t	dtrace_helper_actions_max = 128;
dtrace_optval_t	dtrace_helper_providers_max = 32;
dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t		dtrace_strsize_default = 256;
dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	 /* 6/minute */
dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_nspec_default = 1;
dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int		dtrace_msgdsize_max = 128;
hrtime_t	dtrace_chill_max = 500 * (NANOSEC / MILLISEC);	/* 500 ms */
hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
int		dtrace_devdepth_max = 32;
int		dtrace_err_verbose;
hrtime_t	dtrace_deadman_interval = NANOSEC;
hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
hrtime_t	dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax.  One of these,
 * dtrace_zero, is made deliberately so:  it is provided as a source of
 * well-known, zero-filled memory.  While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */

/*
 * DTrace Internal Variables
 */
#if defined(sun)
static dev_info_t	*dtrace_devi;		/* device info */
#endif
#if defined(sun)
static vmem_t		*dtrace_arena;		/* probe ID arena */
static vmem_t		*dtrace_minor;		/* minor number arena */
#else
static taskq_t		*dtrace_taskq;		/* task queue */
static struct unrhdr	*dtrace_arena;		/* Probe ID number.     */
#endif
static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
static int		dtrace_nprobes;		/* number of probes */
static dtrace_provider_t *dtrace_provider;	/* provider list */
static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
static int		dtrace_opens;		/* number of opens */
static int		dtrace_helpers;		/* number of helpers */
static int		dtrace_getf;		/* number of unpriv getf()s */
#if defined(sun)
static void		*dtrace_softstate;	/* softstate pointer */
#endif
static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
static int		dtrace_toxranges;	/* number of toxic ranges */
static int		dtrace_toxranges_max;	/* size of toxic range array */
static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t	*dtrace_panicked;	/* panicking thread */
static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
#if !defined(sun)
static struct mtx	dtrace_unr_mtx;
MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
int		dtrace_in_probe;	/* non-zero if executing a probe */
#if defined(__i386__) || defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
uintptr_t	dtrace_in_probe_addr;	/* Address of invop when already in probe */
#endif
static eventhandler_tag	dtrace_kld_load_tag;
static eventhandler_tag	dtrace_kld_unload_try_tag;
#endif

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
static kmutex_t		dtrace_lock;		/* probe state lock */
static kmutex_t		dtrace_provider_lock;	/* provider state lock */
static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */

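/*
 * Illustrative sketch (not part of the original source): per the ordering
 * described above, a hypothetical framework path that needed all three
 * locks would acquire and release them as follows:
 *
 *	mutex_enter(&dtrace_meta_lock);
 *	mutex_enter(&dtrace_provider_lock);
 *	mutex_enter(&dtrace_lock);
 *	...
 *	mutex_exit(&dtrace_lock);
 *	mutex_exit(&dtrace_provider_lock);
 *	mutex_exit(&dtrace_meta_lock);
 *
 * If cpu_lock were also required, it would be taken after dtrace_meta_lock
 * but before dtrace_provider_lock and dtrace_lock.
 */
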
#if !defined(sun)
/* XXX FreeBSD hacks. */
#define cr_suid		cr_svuid
#define cr_sgid		cr_svgid
#define	ipaddr_t	in_addr_t
#define mod_modname	pathname
#define vuprintf	vprintf
#define ttoproc(_a)	((_a)->td_proc)
#define crgetzoneid(_a)	0
#define	NCPU		MAXCPU
#define SNOCD		0
#define CPU_ON_INTR(_a)	0

#define PRIV_EFFECTIVE		(1 << 0)
#define PRIV_DTRACE_KERNEL	(1 << 1)
#define PRIV_DTRACE_PROC	(1 << 2)
#define PRIV_DTRACE_USER	(1 << 3)
#define PRIV_PROC_OWNER		(1 << 4)
#define PRIV_PROC_ZONE		(1 << 5)
#define PRIV_ALL		~0

SYSCTL_DECL(_debug_dtrace);
SYSCTL_DECL(_kern_dtrace);
#endif

#if defined(sun)
#define curcpu	CPU->cpu_id
#endif


/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t	dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static dtrace_pops_t	dtrace_provider_ops = {
	(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
	(void (*)(void *, modctl_t *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	NULL,
	NULL,
	NULL,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};

static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 */
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char	*dtrace_helptrace_buffer;
int	dtrace_helptrace_bufsize = 512 * 1024;

#ifdef DEBUG
int	dtrace_helptrace_enabled = 1;
#else
int	dtrace_helptrace_enabled = 0;
#endif

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table.  This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation.  The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#ifdef DEBUG
static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation.  There is no real structure to this cpp
 * mishmash -- but is there ever?
 */
#define	DTRACE_HASHSTR(hash, probe)	\
	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))

#define	DTRACE_HASHNEXT(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)

#define	DTRACE_HASHPREV(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)

#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)

#define	DTRACE_AGGHASHSIZE_SLEW		17

#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier.  This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables.  To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables.  That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#if defined(sun)
#define	DTRACE_TLS_THRKEY(where) { \
	uint_t intr = 0; \
	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#define	DTRACE_TLS_THRKEY(where) { \
	solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
	uint_t intr = 0; \
	uint_t actv = _c->cpu_intr_actv; \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#endif
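
/*
 * Worked example (illustrative, with made-up numbers): for a thread with
 * t_did (or td_tid) of 42 running at interrupt nesting level 2, the macro
 * above computes
 *
 *	key = ((42 + DIF_VARIABLE_MAX) & ((1ULL << 61) - 1)) | (2ULL << 61)
 *
 * i.e., the offset thread ID in the low 61 bits and the interrupt level in
 * the top 3 bits -- so the same thread at different interrupt levels gets
 * distinct thread-local storage keys.
 */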

#define	DT_BSWAP_8(x)	((x) & 0xff)
#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
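
/*
 * For example (illustrative): DT_BSWAP_16(0x1234) evaluates to 0x3412, and
 * DT_BSWAP_32(0x12345678) to 0x78563412 -- each macro swaps the byte order
 * of its argument by recursively swapping the two halves.
 */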

#define	DT_MASK_LO 0x00000000FFFFFFFFULL

#define	DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#ifndef __x86
#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
	if (addr & (size - 1)) {					\
		*flags |= CPU_DTRACE_BADALIGN;				\
		cpu_core[curcpu].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}
#else
#define	DTRACE_ALIGNCHECK(addr, size, flags)
#endif

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by addr, sz.  We take care to avoid
 * problems with overflow and underflow of the unsigned quantities, and
 * disallow all negative sizes.  Ranges of size 0 are allowed.
 */
#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
	((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
	(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
	(testaddr) + (testsz) >= (testaddr))
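
/*
 * Worked example of the overflow concern (illustrative): a naive check such
 * as (testaddr >= baseaddr && testaddr + testsz <= baseaddr + basesz) can be
 * fooled by wraparound.  With a 32-bit uintptr_t, testaddr = 0xffffff00 and
 * testsz = 0x200 gives testaddr + testsz == 0x100, which would pass a naive
 * upper-bound test against almost any region.  The third clause above,
 * (testaddr) + (testsz) >= (testaddr), rejects exactly these wrapped ranges,
 * while the first two clauses compare relative to baseaddr so that an
 * out-of-region address fails against basesz.
 */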

/*
 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it.  This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range.  Allocations of size zero are allowed.
 */
#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
	(mstate)->dtms_scratch_ptr >= (alloc_sz))
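
/*
 * For example (illustrative, with made-up numbers): with scratch_base =
 * 0x1000, scratch_size = 0x100 and scratch_ptr = 0x10c0, the left-hand side
 * evaluates to 0x40, so any alloc_sz of 0x40 bytes or fewer fits in the
 * remaining scratch space.  Because scratch_ptr always lies within the
 * region, the subtraction cannot underflow.
 */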

#define	DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval;						\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[curcpu].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[curcpu].cpuc_dtrace_illval = addr;		\
		return (0);						\
	}								\
									\
	*flags |= CPU_DTRACE_NOFAULT;					\
	/*CSTYLED*/							\
	rval = *((volatile uint##bits##_t *)addr);			\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
									\
	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
}

#ifdef _LP64
#define	dtrace_loadptr	dtrace_load64
#else
#define	dtrace_loadptr	dtrace_load32
#endif

#define	DTRACE_DYNHASH_FREE	0
#define	DTRACE_DYNHASH_SINK	1
#define	DTRACE_DYNHASH_VALID	2

#define	DTRACE_MATCH_NEXT	0
#define	DTRACE_MATCH_DONE	1
#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
#define	DTRACE_STATE_ALIGN	64

#define	DTRACE_FLAGS2FLT(flags)						\
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
	((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :		\
	((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :		\
	((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :	\
	((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :		\
	DTRACEFLT_UNKNOWN)

#define	DTRACEACT_ISSTRING(act)						\
	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)

/* Function prototype definitions: */
static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static void dtrace_enabling_reap(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
uint16_t dtrace_load16(uintptr_t);
uint32_t dtrace_load32(uintptr_t);
uint64_t dtrace_load64(uintptr_t);
uint8_t dtrace_load8(uintptr_t);
void dtrace_dynvar_clean(dtrace_dstate_t *);
dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
    size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_priv_proc(dtrace_state_t *);
static void dtrace_getf_barrier(void);

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context.  Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note:  not called from probe context."
 */
void
dtrace_panic(const char *format, ...)
{
	va_list alist;

	va_start(alist, format);
	dtrace_vpanic(format, alist);
	va_end(alist);
}

int
dtrace_assfail(const char *a, const char *f, int l)
{
	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
	/*
	 * Most counters stored to in probe context are per-CPU counters.
	 * However, there are some error conditions that are sufficiently
	 * arcane that they don't merit per-CPU storage.  If these counters
	 * are incremented concurrently on different CPUs, scalability will be
	 * adversely affected -- but we don't expect them to be white-hot in a
	 * correctly constructed enabling...
	 */
	uint32_t oval, nval;

	do {
		oval = *counter;

		if ((nval = oval + 1) == 0) {
			/*
			 * If the counter would wrap, set it to 1 -- assuring
			 * that the counter is never zero when we have seen
			 * errors.  (The counter must be 32-bits because we
			 * aren't guaranteed a 64-bit compare&swap operation.)
			 * To save this code both the infamy of being fingered
			 * by a priggish news story and the indignity of being
			 * the target of a neo-puritan witch trial, we're
			 * carefully avoiding any colorful description of the
			 * likelihood of this condition -- but suffice it to
			 * say that it is only slightly more likely than the
			 * overflow of predicate cache IDs, as discussed in
			 * dtrace_predicate_create().
			 */
			nval = 1;
		}
	} while (dtrace_cas32(counter, oval, nval) != oval);
}

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)

static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
	if (dest < mstate->dtms_scratch_base)
		return (0);

	if (dest + size < dest)
		return (0);

	if (dest + size > mstate->dtms_scratch_ptr)
		return (0);

	return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz,
    dtrace_statvar_t **svars, int nsvars)
{
	int i;

	for (i = 0; i < nsvars; i++) {
		dtrace_statvar_t *svar = svars[i];

		if (svar == NULL || svar->dtsv_size == 0)
			continue;

		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
			return (1);
	}

	return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 * region.  The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	/*
	 * First, check to see if the address is in scratch space...
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
	    mstate->dtms_scratch_size))
		return (1);

	/*
	 * Now check to see if it's a dynamic variable.  This check will pick
	 * up both thread-local variables and any global dynamically-allocated
	 * variables.
	 */
	if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
	    vstate->dtvs_dynvars.dtds_size)) {
		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
		uintptr_t base = (uintptr_t)dstate->dtds_base +
		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
		uintptr_t chunkoffs;

		/*
		 * Before we assume that we can store here, we need to make
		 * sure that it isn't in our metadata -- storing to our
		 * dynamic variable metadata would corrupt our state.  For
		 * the range to not include any dynamic variable metadata,
		 * it must:
		 *
		 *	(1) Start above the hash table that is at the base of
		 *	the dynamic variable space
		 *
		 *	(2) Have a starting chunk offset that is beyond the
		 *	dtrace_dynvar_t that is at the base of every chunk
		 *
		 *	(3) Not span a chunk boundary
		 *
		 */
		if (addr < base)
			return (0);

		chunkoffs = (addr - base) % dstate->dtds_chunksize;

		if (chunkoffs < sizeof (dtrace_dynvar_t))
			return (0);

		if (chunkoffs + sz > dstate->dtds_chunksize)
			return (0);

		return (1);
	}

	/*
	 * Finally, check the static local and global variables.  These checks
	 * take the longest, so we perform them last.
	 */
	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_locals, vstate->dtvs_nlocals))
		return (1);

	if (dtrace_canstore_statvar(addr, sz,
	    vstate->dtvs_globals, vstate->dtvs_nglobals))
		return (1);

	return (0);
}


/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
	file_t *fp;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	/*
	 * You can obviously read that which you can store.
	 */
	if (dtrace_canstore(addr, sz, mstate, vstate))
		return (1);

	/*
	 * We're allowed to read from our own string table.
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
	    mstate->dtms_difo->dtdo_strlen))
		return (1);

	if (vstate->dtvs_state != NULL &&
	    dtrace_priv_proc(vstate->dtvs_state)) {
		proc_t *p;

		/*
		 * When we have privileges to the current process, there are
		 * several context-related kernel structures that are safe to
		 * read, even absent the privilege to read from kernel memory.
		 * These reads are safe because these structures contain only
		 * state that (1) we're permitted to read, (2) is harmless or
		 * (3) contains pointers to additional kernel state that we're
		 * not permitted to read (and as such, do not present an
		 * opportunity for privilege escalation).  Finally (and
		 * critically), because of the nature of their relation with
		 * the current thread context, the memory associated with these
		 * structures cannot change over the duration of probe context,
		 * and it is therefore impossible for this memory to be
		 * deallocated and reallocated as something else while it's
		 * being operated upon.
		 */
		if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
			return (1);

		if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
		    sz, curthread->t_procp, sizeof (proc_t))) {
			return (1);
		}

		if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
		    curthread->t_cred, sizeof (cred_t))) {
			return (1);
		}

#if defined(sun)
		if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
		    &(p->p_pidp->pid_id), sizeof (pid_t))) {
			return (1);
		}

		if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
		    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
			return (1);
		}
#endif
	}

	if ((fp = mstate->dtms_getf) != NULL) {
		uintptr_t psz = sizeof (void *);
		vnode_t *vp;
		vnodeops_t *op;

		/*
		 * When getf() returns a file_t, the enabling is implicitly
		 * granted the (transient) right to read the returned file_t
		 * as well as the v_path and v_op->vnop_name of the underlying
		 * vnode.  These accesses are allowed after a successful
		 * getf() because the members that they refer to cannot change
		 * once set -- and the barrier logic in the kernel's closef()
		 * path assures that the file_t and its referenced vnode_t
		 * cannot themselves be stale (that is, it is impossible for
		 * either dtms_getf itself or its f_vnode member to reference
		 * freed memory).
		 */
		if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
			return (1);

		if ((vp = fp->f_vnode) != NULL) {
#if defined(sun)
			if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
				return (1);
			if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
			    vp->v_path, strlen(vp->v_path) + 1)) {
				return (1);
			}
#endif

			if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
				return (1);

#if defined(sun)
			if ((op = vp->v_op) != NULL &&
			    DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
				return (1);
			}

			if (op != NULL && op->vnop_name != NULL &&
			    DTRACE_INRANGE(addr, sz, op->vnop_name,
			    strlen(op->vnop_name) + 1)) {
				return (1);
			}
#endif
		}
	}

	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
	*illval = addr;
	return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	size_t strsz;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
	if (dtrace_canload(addr, strsz, mstate, vstate))
		return (1);

	return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	size_t sz;
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (1);

	if (type->dtdt_kind == DIF_TYPE_STRING)
		sz = dtrace_strlen(src,
		    vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
	else
		sz = type->dtdt_size;

	return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
}

/*
 * Convert a string to a signed integer using safe loads.
 *
 * NOTE: This function uses various macros from strtolctype.h to manipulate
 * digit values, etc -- these have all been checked to ensure they make
 * no additional function calls.
 */
static int64_t
dtrace_strtoll(char *input, int base, size_t limit)
{
	uintptr_t pos = (uintptr_t)input;
	int64_t val = 0;
	int x;
	boolean_t neg = B_FALSE;
	char c, cc, ccc;
	uintptr_t end = pos + limit;

	/*
	 * Consume any whitespace preceding digits.
	 */
	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
		pos++;

	/*
	 * Handle an explicit sign if one is present.
	 */
	if (c == '-' || c == '+') {
		if (c == '-')
			neg = B_TRUE;
		c = dtrace_load8(++pos);
	}

	/*
	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
	 * if present.
	 */
	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
		pos += 2;
		c = ccc;
	}

	/*
	 * Read in contiguous digits until the first non-digit character.
	 */
	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
	    c = dtrace_load8(++pos))
		val = val * base + x;

	return (neg ? -val : val);
}
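
/*
 * For example (illustrative): dtrace_strtoll("  -0x1f", 16, 8) consumes the
 * leading whitespace, records the sign, skips the "0x" prefix, accumulates
 * the digits 1 and f, and returns -31.  Parsing also stops once the limit
 * is reached, so an unterminated input cannot be scanned past its end.
 */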

/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
	uint8_t c1, c2;
	volatile uint16_t *flags;

	if (s1 == s2 || limit == 0)
		return (0);

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	do {
		if (s1 == NULL) {
			c1 = '\0';
		} else {
			c1 = dtrace_load8((uintptr_t)s1++);
		}

		if (s2 == NULL) {
			c2 = '\0';
		} else {
			c2 = dtrace_load8((uintptr_t)s2++);
		}

		if (c1 != c2)
			return (c1 - c2);
	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

	return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * lim parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	uint_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
	uintptr_t taddr, tsize;
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		taddr = dtrace_toxrange[i].dtt_base;
		tsize = dtrace_toxrange[i].dtt_limit - taddr;

		if (kaddr - taddr < tsize) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
			return (1);
		}

		if (taddr - kaddr < size) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = taddr;
			return (1);
		}
	}

	return (0);
}
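
/*
 * A note on the comparisons above (illustrative): both tests exploit
 * unsigned modular arithmetic.  Assuming the ranges themselves do not wrap,
 * (kaddr - taddr < tsize) holds exactly when taddr <= kaddr < taddr + tsize,
 * i.e. when the start of the access lies inside the toxic range; similarly,
 * (taddr - kaddr < size) holds exactly when kaddr <= taddr < kaddr + size,
 * i.e. when the toxic range begins inside the accessed region.  Together the
 * two tests detect any overlap without risking overflow.
 */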

/*
 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 * memory specified by the DIF program.  The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace.  As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst;
		const uint8_t *s2 = src;

		if (s1 <= s2) {
			do {
				*s1++ = dtrace_load8((uintptr_t)s2++);
			} while (--len != 0);
		} else {
			s2 += len;
			s1 += len;

			do {
				*--s1 = dtrace_load8((uintptr_t)--s2);
			} while (--len != 0);
		}
	}
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered.  The src is assumed to
 * be unsafe memory specified by the DIF program.  The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst, c;
		const uint8_t *s2 = src;

		do {
			*s1++ = c = dtrace_load8((uintptr_t)s2++);
		} while (--len != 0 && c != '\0');
	}
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type.  The src is assumed to be unsafe memory specified by the DIF
 * program.  The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
{
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_strcpy(src, dst, type->dtdt_size);
	} else {
		dtrace_bcopy(src, dst, type->dtdt_size);
	}
}

/*
 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
	volatile uint16_t *flags;

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	if (s1 == s2)
		return (0);

	if (s1 == NULL || s2 == NULL)
		return (1);

	if (s1 != s2 && len != 0) {
		const uint8_t *ps1 = s1;
		const uint8_t *ps2 = s2;

		do {
			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
				return (1);
		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
	}
	return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop.  Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uchar_t *cp;

	for (cp = dst; len != 0; len--)
		*cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t result[2];

	result[0] = addend1[0] + addend2[0];
	result[1] = addend1[1] + addend2[1] +
	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

	sum[0] = result[0];
	sum[1] = result[1];
}

/*
 * Shift the 128-bit value in a by b. If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	uint64_t mask;

	if (b == 0)
		return;

	if (b < 0) {
		b = -b;
		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			a[0] >>= b;
			mask = 1LL << (64 - b);
			mask -= 1;
			a[0] |= ((a[1] & mask) << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			a[1] <<= b;
			mask = a[0] >> (64 - b);
			a[1] |= mask;
			a[0] <<= b;
		}
	}
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
	uint64_t hi1, hi2, lo1, lo2;
	uint64_t tmp[2];

	hi1 = factor1 >> 32;
	hi2 = factor2 >> 32;

	lo1 = factor1 & DT_MASK_LO;
	lo2 = factor2 & DT_MASK_LO;

	product[0] = lo1 * lo2;
	product[1] = hi1 * hi2;

	tmp[0] = hi1 * lo2;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);

	tmp[0] = hi2 * lo1;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);
}
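
/*
 * Worked example (illustrative): multiplying factor1 = 2^32 + 3 by
 * factor2 = 2^32 + 5 gives hi1 = hi2 = 1, lo1 = 3 and lo2 = 5.  The partial
 * products are lo1 * lo2 = 15 (low word), hi1 * hi2 = 1 (high word), and
 * the two cross terms 5 << 32 and 3 << 32 added into the low word, yielding
 * product[1] = 1 and product[0] = (8 << 32) + 15 -- which matches the true
 * result 2^64 + 8 * 2^32 + 15.
 */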

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the user credentials of the process that enabled the
 * invoking ECB match the target credentials
 */
static int
dtrace_priv_proc_common_user(dtrace_state_t *state)
{
	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

	/*
	 * We should always have a non-NULL state cred here, since if cred
	 * is null (anonymous tracing), we fast-path bypass this routine.
	 */
	ASSERT(s_cr != NULL);

	if ((cr = CRED()) != NULL &&
	    s_cr->cr_uid == cr->cr_uid &&
	    s_cr->cr_uid == cr->cr_ruid &&
	    s_cr->cr_uid == cr->cr_suid &&
	    s_cr->cr_gid == cr->cr_gid &&
	    s_cr->cr_gid == cr->cr_rgid &&
	    s_cr->cr_gid == cr->cr_sgid)
		return (1);

	return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials
 */
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
#if defined(sun)
	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

	/*
	 * We should always have a non-NULL state cred here, since if cred
	 * is null (anonymous tracing), we fast-path bypass this routine.
	 */
	ASSERT(s_cr != NULL);

	if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
		return (1);

	return (0);
#else
	return (1);
#endif
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not setuid or changed credentials.
 */
static int
dtrace_priv_proc_common_nocd(void)
{
	proc_t *proc;

	if ((proc = ttoproc(curthread)) != NULL &&
	    !(proc->p_flag & SNOCD))
		return (1);

	return (0);
}

static int
dtrace_priv_proc_destructive(dtrace_state_t *state)
{
	int action = state->dts_cred.dcr_action;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
	    dtrace_priv_proc_common_zone(state) == 0)
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
	    dtrace_priv_proc_common_user(state) == 0)
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
	    dtrace_priv_proc_common_nocd() == 0)
		goto bad;

	return (1);

bad:
	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_proc_control(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
		return (1);

	if (dtrace_priv_proc_common_zone(state) &&
	    dtrace_priv_proc_common_user(state) &&
	    dtrace_priv_proc_common_nocd())
		return (1);

	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_proc(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
		return (1);

	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_kernel(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
		return (1);

	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

	return (0);
}

static int
dtrace_priv_kernel_destructive(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
		return (1);

	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

	return (0);
}

/*
 * Determine if the dte_cond of the specified ECB allows for processing of
 * the current probe to continue.  Note that this routine may allow continued
 * processing, but with access(es) stripped from the mstate's dtms_access
 * field.
 */
static int
dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
    dtrace_ecb_t *ecb)
{
	dtrace_probe_t *probe = ecb->dte_probe;
	dtrace_provider_t *prov = probe->dtpr_provider;
	dtrace_pops_t *pops = &prov->dtpv_pops;
	int mode = DTRACE_MODE_NOPRIV_DROP;

	ASSERT(ecb->dte_cond);

#if defined(sun)
	if (pops->dtps_mode != NULL) {
		mode = pops->dtps_mode(prov->dtpv_arg,
		    probe->dtpr_id, probe->dtpr_arg);

		ASSERT((mode & DTRACE_MODE_USER) ||
		    (mode & DTRACE_MODE_KERNEL));
		ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
		    (mode & DTRACE_MODE_NOPRIV_DROP));
	}

	/*
	 * If the dte_cond bits indicate that this consumer is only allowed to
	 * see user-mode firings of this probe, call the provider's dtps_mode()
	 * entry point to check that the probe was fired while in a user
	 * context.  If that's not the case, use the policy specified by the
	 * provider to determine if we drop the probe or merely restrict
	 * operation.
	 */
	if (ecb->dte_cond & DTRACE_COND_USERMODE) {
		ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);

		if (!(mode & DTRACE_MODE_USER)) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

			mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
		}
	}
#endif

	/*
	 * This is more subtle than it looks. We have to be absolutely certain
	 * that CRED() isn't going to change out from under us so it's only
	 * legit to examine that structure if we're in constrained situations.
	 * Currently, the only times we'll do this check are when a non-super-user
	 * has enabled the profile or syscall providers -- providers that
	 * allow visibility of all processes. For the profile case, the check
	 * above will ensure that we're examining a user context.
	 */
	if (ecb->dte_cond & DTRACE_COND_OWNER) {
		cred_t *cr;
		cred_t *s_cr = state->dts_cred.dcr_cred;
		proc_t *proc;

		ASSERT(s_cr != NULL);

		if ((cr = CRED()) == NULL ||
		    s_cr->cr_uid != cr->cr_uid ||
		    s_cr->cr_uid != cr->cr_ruid ||
		    s_cr->cr_uid != cr->cr_suid ||
		    s_cr->cr_gid != cr->cr_gid ||
		    s_cr->cr_gid != cr->cr_rgid ||
		    s_cr->cr_gid != cr->cr_sgid ||
		    (proc = ttoproc(curthread)) == NULL ||
		    (proc->p_flag & SNOCD)) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

#if defined(sun)
			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
#endif
		}
	}

#if defined(sun)
	/*
	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
	 * in our zone, check to see if our mode policy is to restrict rather
	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
	 * and DTRACE_ACCESS_ARGS
	 */
	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
		cred_t *cr;
		cred_t *s_cr = state->dts_cred.dcr_cred;

		ASSERT(s_cr != NULL);

		if ((cr = CRED()) == NULL ||
		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

			mstate->dtms_access &=
			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
		}
	}
#endif

	return (1);
}

/*
 * Note:  not called from probe context.  This function is called
 * asynchronously (and at a regular interval) from outside of probe context to
 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
 * cleaning is explained in detail in <sys/dtrace_impl.h>.
 */
void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
	dtrace_dynvar_t *dirty;
	dtrace_dstate_percpu_t *dcpu;
	int i, work = 0;

	for (i = 0; i < NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];

		ASSERT(dcpu->dtdsc_rinsing == NULL);

		/*
		 * If the dirty list is NULL, there is no dirty work to do.
		 */
		if (dcpu->dtdsc_dirty == NULL)
			continue;

		/*
		 * If the clean list is non-NULL, then we're not going to do
		 * any work for this CPU -- it means that there has not been
		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
		 * since the last time we cleaned house.
		 */
		if (dcpu->dtdsc_clean != NULL)
			continue;

		work = 1;

		/*
		 * Atomically move the dirty list aside.
		 */
		do {
			dirty = dcpu->dtdsc_dirty;

			/*
			 * Before we zap the dirty list, set the rinsing list.
			 * (This allows for a potential assertion in
			 * dtrace_dynvar():  if a free dynamic variable appears
			 * on a hash chain, either the dirty list or the
			 * rinsing list for some CPU must be non-NULL.)
			 */
			dcpu->dtdsc_rinsing = dirty;
			dtrace_membar_producer();
		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
		    dirty, NULL) != dirty);
	}

	if (!work) {
		/*
		 * We have no work to do; we can simply return.
		 */
		return;
	}

	dtrace_sync();

	for (i = 0; i < NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];

		if (dcpu->dtdsc_rinsing == NULL)
			continue;

		/*
		 * We are now guaranteed that no hash chain contains a pointer
		 * into this dirty list; we can make it clean.
		 */
		ASSERT(dcpu->dtdsc_clean == NULL);
		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
		dcpu->dtdsc_rinsing = NULL;
	}

	/*
	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
	 * This prevents a race whereby a CPU incorrectly decides that
	 * the state should be something other than DTRACE_DSTATE_CLEAN
	 * after dtrace_dynvar_clean() has completed.
	 */
	dtrace_sync();

	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}

/*
 * Depending on the value of the op parameter, this function looks up,
1639 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1640 * allocation is requested, this function will return a pointer to a
1641 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1642 * variable can be allocated.  If NULL is returned, the appropriate counter
1643 * will be incremented.
1644 */
1645dtrace_dynvar_t *
1646dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1647    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1648    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1649{
1650	uint64_t hashval = DTRACE_DYNHASH_VALID;
1651	dtrace_dynhash_t *hash = dstate->dtds_hash;
1652	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1653	processorid_t me = curcpu, cpu = me;
1654	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1655	size_t bucket, ksize;
1656	size_t chunksize = dstate->dtds_chunksize;
1657	uintptr_t kdata, lock, nstate;
1658	uint_t i;
1659
1660	ASSERT(nkeys != 0);
1661
1662	/*
1663	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1664	 * algorithm.  For the by-value portions, we perform the algorithm in
1665	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1666	 * bit, and seems to have only a minute effect on distribution.  For
1667	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1668	 * over each referenced byte.  It's painful to do this, but it's much
1669	 * better than pathological hash distribution.  The efficacy of the
1670	 * hashing algorithm (and a comparison with other algorithms) may be
1671	 * found by running the ::dtrace_dynstat MDB dcmd.
1672	 */
1673	for (i = 0; i < nkeys; i++) {
1674		if (key[i].dttk_size == 0) {
1675			uint64_t val = key[i].dttk_value;
1676
1677			hashval += (val >> 48) & 0xffff;
1678			hashval += (hashval << 10);
1679			hashval ^= (hashval >> 6);
1680
1681			hashval += (val >> 32) & 0xffff;
1682			hashval += (hashval << 10);
1683			hashval ^= (hashval >> 6);
1684
1685			hashval += (val >> 16) & 0xffff;
1686			hashval += (hashval << 10);
1687			hashval ^= (hashval >> 6);
1688
1689			hashval += val & 0xffff;
1690			hashval += (hashval << 10);
1691			hashval ^= (hashval >> 6);
1692		} else {
1693			/*
1694			 * This is incredibly painful, but it beats the hell
1695			 * out of the alternative.
1696			 */
1697			uint64_t j, size = key[i].dttk_size;
1698			uintptr_t base = (uintptr_t)key[i].dttk_value;
1699
1700			if (!dtrace_canload(base, size, mstate, vstate))
1701				break;
1702
1703			for (j = 0; j < size; j++) {
1704				hashval += dtrace_load8(base + j);
1705				hashval += (hashval << 10);
1706				hashval ^= (hashval >> 6);
1707			}
1708		}
1709	}
1710
1711	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1712		return (NULL);
1713
1714	hashval += (hashval << 3);
1715	hashval ^= (hashval >> 11);
1716	hashval += (hashval << 15);
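
	/*
	 * In isolation, the computation above is Bob Jenkins'
	 * "One-at-a-time" hash.  As a minimal, self-contained sketch (the
	 * classic byte-at-a-time, 32-bit formulation -- not part of this
	 * file), it reads:
	 *
	 *	uint32_t
	 *	oaat_hash(const uint8_t *data, size_t len)
	 *	{
	 *		uint32_t h = 0;
	 *		size_t i;
	 *
	 *		for (i = 0; i < len; i++) {
	 *			h += data[i];
	 *			h += (h << 10);
	 *			h ^= (h >> 6);
	 *		}
	 *
	 *		h += (h << 3);
	 *		h ^= (h >> 11);
	 *		h += (h << 15);
	 *
	 *		return (h);
	 *	}
	 */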
1717
1718	/*
1719	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1720	 * comes out to be one of our two sentinel hash values.  If this
1721	 * actually happens, we set the hashval to be a value known to be a
1722	 * non-sentinel value.
1723	 */
1724	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1725		hashval = DTRACE_DYNHASH_VALID;
1726
1727	/*
1728	 * Yes, it's painful to do a divide here.  If the cycle count becomes
1729	 * important here, tricks can be pulled to reduce it.  (However, it's
1730	 * critical that hash collisions be kept to an absolute minimum;
1731	 * they're much more painful than a divide.)  It's better to have a
1732	 * solution that generates few collisions and still keeps things
1733	 * relatively simple.
1734	 */
1735	bucket = hashval % dstate->dtds_hashsize;
1736
1737	if (op == DTRACE_DYNVAR_DEALLOC) {
1738		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1739
1740		for (;;) {
1741			while ((lock = *lockp) & 1)
1742				continue;
1743
1744			if (dtrace_casptr((volatile void *)lockp,
1745			    (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
1746				break;
1747		}
1748
1749		dtrace_membar_producer();
1750	}
1751
1752top:
1753	prev = NULL;
1754	lock = hash[bucket].dtdh_lock;
1755
1756	dtrace_membar_consumer();
1757
1758	start = hash[bucket].dtdh_chain;
1759	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1760	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1761	    op != DTRACE_DYNVAR_DEALLOC));
1762
1763	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1764		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1765		dtrace_key_t *dkey = &dtuple->dtt_key[0];
1766
1767		if (dvar->dtdv_hashval != hashval) {
1768			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1769				/*
1770				 * We've reached the sink, and therefore the
1771				 * end of the hash chain; we can kick out of
1772				 * the loop knowing that we have seen a valid
1773				 * snapshot of state.
1774				 */
1775				ASSERT(dvar->dtdv_next == NULL);
1776				ASSERT(dvar == &dtrace_dynhash_sink);
1777				break;
1778			}
1779
1780			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1781				/*
1782				 * We've gone off the rails:  somewhere along
1783				 * the line, one of the members of this hash
1784				 * chain was deleted.  Note that we could also
1785				 * detect this by simply letting this loop run
1786				 * to completion, as we would eventually hit
1787				 * the end of the dirty list.  However, we
1788				 * want to avoid running the length of the
1789				 * dirty list unnecessarily (it might be quite
1790				 * long), so we catch this as early as
1791				 * possible by detecting the hash marker.  In
1792				 * this case, we simply set dvar to NULL and
1793				 * break; the conditional after the loop will
1794				 * send us back to top.
1795				 */
1796				dvar = NULL;
1797				break;
1798			}
1799
1800			goto next;
1801		}
1802
1803		if (dtuple->dtt_nkeys != nkeys)
1804			goto next;
1805
1806		for (i = 0; i < nkeys; i++, dkey++) {
1807			if (dkey->dttk_size != key[i].dttk_size)
1808				goto next; /* size or type mismatch */
1809
1810			if (dkey->dttk_size != 0) {
1811				if (dtrace_bcmp(
1812				    (void *)(uintptr_t)key[i].dttk_value,
1813				    (void *)(uintptr_t)dkey->dttk_value,
1814				    dkey->dttk_size))
1815					goto next;
1816			} else {
1817				if (dkey->dttk_value != key[i].dttk_value)
1818					goto next;
1819			}
1820		}
1821
1822		if (op != DTRACE_DYNVAR_DEALLOC)
1823			return (dvar);
1824
1825		ASSERT(dvar->dtdv_next == NULL ||
1826		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1827
1828		if (prev != NULL) {
1829			ASSERT(hash[bucket].dtdh_chain != dvar);
1830			ASSERT(start != dvar);
1831			ASSERT(prev->dtdv_next == dvar);
1832			prev->dtdv_next = dvar->dtdv_next;
1833		} else {
1834			if (dtrace_casptr(&hash[bucket].dtdh_chain,
1835			    start, dvar->dtdv_next) != start) {
1836				/*
1837				 * We have failed to atomically swing the
1838				 * hash table head pointer, presumably because
1839				 * of a conflicting allocation on another CPU.
1840				 * We need to reread the hash chain and try
1841				 * again.
1842				 */
1843				goto top;
1844			}
1845		}
1846
1847		dtrace_membar_producer();
1848
1849		/*
1850		 * Now set the hash value to indicate that it's free.
1851		 */
1852		ASSERT(hash[bucket].dtdh_chain != dvar);
1853		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1854
1855		dtrace_membar_producer();
1856
1857		/*
1858		 * Set the next pointer to point at the dirty list, and
1859		 * atomically swing the dirty pointer to the newly freed dvar.
1860		 */
1861		do {
1862			next = dcpu->dtdsc_dirty;
1863			dvar->dtdv_next = next;
1864		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1865
1866		/*
1867		 * Finally, unlock this hash bucket.
1868		 */
1869		ASSERT(hash[bucket].dtdh_lock == lock);
1870		ASSERT(lock & 1);
1871		hash[bucket].dtdh_lock++;
1872
1873		return (NULL);
1874next:
1875		prev = dvar;
1876		continue;
1877	}
1878
1879	if (dvar == NULL) {
1880		/*
1881		 * If dvar is NULL, it is because we went off the rails:
1882		 * one of the elements that we traversed in the hash chain
1883		 * was deleted while we were traversing it.  In this case,
1884		 * we assert that we aren't doing a dealloc (deallocs lock
1885		 * the hash bucket to prevent themselves from racing with
1886		 * one another), and retry the hash chain traversal.
1887		 */
1888		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1889		goto top;
1890	}
1891
1892	if (op != DTRACE_DYNVAR_ALLOC) {
1893		/*
1894		 * If we are not to allocate a new variable, we want to
1895		 * return NULL now.  Before we return, check that the value
1896		 * of the lock word hasn't changed.  If it has, we may have
1897		 * seen an inconsistent snapshot.
1898		 */
1899		if (op == DTRACE_DYNVAR_NOALLOC) {
1900			if (hash[bucket].dtdh_lock != lock)
1901				goto top;
1902		} else {
1903			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1904			ASSERT(hash[bucket].dtdh_lock == lock);
1905			ASSERT(lock & 1);
1906			hash[bucket].dtdh_lock++;
1907		}
1908
1909		return (NULL);
1910	}
1911
1912	/*
1913	 * We need to allocate a new dynamic variable.  The size we need is the
1914	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1915	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1916	 * the size of any referred-to data (dsize).  We then round the final
1917	 * size up to the chunksize for allocation.
1918	 */
1919	for (ksize = 0, i = 0; i < nkeys; i++)
1920		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1921
1922	/*
1923	 * This should be pretty much impossible, but could happen if, say,
1924	 * strange DIF specified the tuple.  Ideally, this should be an
1925	 * assertion and not an error condition -- but that requires that the
1926	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1927	 * bullet-proof.  (That is, it must not be able to be fooled by
1928	 * malicious DIF.)  Given the lack of backwards branches in DIF,
1929	 * solving this would presumably not amount to solving the Halting
1930	 * Problem -- but it still seems awfully hard.
1931	 */
1932	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1933	    ksize + dsize > chunksize) {
1934		dcpu->dtdsc_drops++;
1935		return (NULL);
1936	}
1937
1938	nstate = DTRACE_DSTATE_EMPTY;
1939
1940	do {
1941retry:
1942		free = dcpu->dtdsc_free;
1943
1944		if (free == NULL) {
1945			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1946			void *rval;
1947
1948			if (clean == NULL) {
1949				/*
1950				 * We're out of dynamic variable space on
1951				 * this CPU.  Unless we have tried all CPUs,
1952				 * we'll try to allocate from a different
1953				 * CPU.
1954				 */
1955				switch (dstate->dtds_state) {
1956				case DTRACE_DSTATE_CLEAN: {
1957					void *sp = &dstate->dtds_state;
1958
1959					if (++cpu >= NCPU)
1960						cpu = 0;
1961
1962					if (dcpu->dtdsc_dirty != NULL &&
1963					    nstate == DTRACE_DSTATE_EMPTY)
1964						nstate = DTRACE_DSTATE_DIRTY;
1965
1966					if (dcpu->dtdsc_rinsing != NULL)
1967						nstate = DTRACE_DSTATE_RINSING;
1968
1969					dcpu = &dstate->dtds_percpu[cpu];
1970
1971					if (cpu != me)
1972						goto retry;
1973
1974					(void) dtrace_cas32(sp,
1975					    DTRACE_DSTATE_CLEAN, nstate);
1976
1977					/*
1978					 * To increment the correct bean
1979					 * counter, take another lap.
1980					 */
1981					goto retry;
1982				}
1983
1984				case DTRACE_DSTATE_DIRTY:
1985					dcpu->dtdsc_dirty_drops++;
1986					break;
1987
1988				case DTRACE_DSTATE_RINSING:
1989					dcpu->dtdsc_rinsing_drops++;
1990					break;
1991
1992				case DTRACE_DSTATE_EMPTY:
1993					dcpu->dtdsc_drops++;
1994					break;
1995				}
1996
1997				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1998				return (NULL);
1999			}
2000
2001			/*
2002			 * The clean list appears to be non-empty.  We want to
2003			 * move the clean list to the free list; we start by
2004			 * moving the clean pointer aside.
2005			 */
2006			if (dtrace_casptr(&dcpu->dtdsc_clean,
2007			    clean, NULL) != clean) {
2008				/*
2009				 * We are in one of two situations:
2010				 *
2011				 *  (a)	The clean list was switched to the
2012				 *	free list by another CPU.
2013				 *
2014				 *  (b)	The clean list was added to by the
2015				 *	cleansing cyclic.
2016				 *
2017				 * In either of these situations, we can
2018				 * just reattempt the free list allocation.
2019				 */
2020				goto retry;
2021			}
2022
2023			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2024
2025			/*
2026			 * Now we'll move the clean list to the free list.
2027			 * It's impossible for this to fail:  the only way
2028			 * the free list can be updated is through this
2029			 * code path, and only one CPU can own the clean list.
2030			 * Thus, it would only be possible for this to fail if
2031			 * this code were racing with dtrace_dynvar_clean().
2032			 * (That is, if dtrace_dynvar_clean() updated the clean
2033			 * list, and we ended up racing to update the free
2034			 * list.)  This race is prevented by the dtrace_sync()
2035			 * in dtrace_dynvar_clean() -- which flushes the
2036			 * owners of the clean lists out before resetting
2037			 * the clean lists.
2038			 */
2039			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2040			ASSERT(rval == NULL);
2041			goto retry;
2042		}
2043
2044		dvar = free;
2045		new_free = dvar->dtdv_next;
2046	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2047
2048	/*
2049	 * We have now allocated a new chunk.  We copy the tuple keys into the
2050	 * tuple array and copy any referenced key data into the data space
2051	 * following the tuple array.  As we do this, we relocate dttk_value
2052	 * in the final tuple to point to the key data address in the chunk.
2053	 */
2054	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2055	dvar->dtdv_data = (void *)(kdata + ksize);
2056	dvar->dtdv_tuple.dtt_nkeys = nkeys;
2057
2058	for (i = 0; i < nkeys; i++) {
2059		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2060		size_t kesize = key[i].dttk_size;
2061
2062		if (kesize != 0) {
2063			dtrace_bcopy(
2064			    (const void *)(uintptr_t)key[i].dttk_value,
2065			    (void *)kdata, kesize);
2066			dkey->dttk_value = kdata;
2067			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2068		} else {
2069			dkey->dttk_value = key[i].dttk_value;
2070		}
2071
2072		dkey->dttk_size = kesize;
2073	}
2074
2075	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2076	dvar->dtdv_hashval = hashval;
2077	dvar->dtdv_next = start;
2078
2079	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2080		return (dvar);
2081
2082	/*
2083	 * The cas has failed.  Either another CPU is adding an element to
2084	 * this hash chain, or another CPU is deleting an element from this
2085	 * hash chain.  The simplest way to deal with both of these cases
2086	 * (though not necessarily the most efficient) is to free our
2087	 * allocated block and tail-call ourselves.  Note that the free is
2088	 * to the dirty list and _not_ to the free list.  This is to prevent
2089	 * races with allocators, above.
2090	 */
2091	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2092
2093	dtrace_membar_producer();
2094
2095	do {
2096		free = dcpu->dtdsc_dirty;
2097		dvar->dtdv_next = free;
2098	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2099
2100	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2101}
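
/*
 * A hedged sketch of a typical call into dtrace_dynvar() -- the names here
 * (id, thrkey, dstate, mstate, vstate) stand in for whatever the real
 * caller, the DIF emulator, has at hand.  To allocate storage for a
 * thread-local variable, it builds a two-element by-value tuple -- the
 * variable identifier and a key identifying the current thread -- and
 * requests an allocation:
 *
 *	dtrace_key_t key[2];
 *	dtrace_dynvar_t *dvar;
 *
 *	key[0].dttk_value = (uint64_t)id;
 *	key[0].dttk_size = 0;
 *	key[1].dttk_value = thrkey;
 *	key[1].dttk_size = 0;
 *
 *	dvar = dtrace_dynvar(dstate, 2, key, sizeof (uint64_t),
 *	    DTRACE_DYNVAR_ALLOC, mstate, vstate);
 *
 * A NULL return means that no space could be found; per the block comment
 * above dtrace_dynvar(), the appropriate drop counter has already been
 * incremented, so the caller simply abandons the store.
 */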
2102
2103/*ARGSUSED*/
2104static void
2105dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2106{
2107	if ((int64_t)nval < (int64_t)*oval)
2108		*oval = nval;
2109}
2110
2111/*ARGSUSED*/
2112static void
2113dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2114{
2115	if ((int64_t)nval > (int64_t)*oval)
2116		*oval = nval;
2117}
2118
2119static void
2120dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2121{
2122	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2123	int64_t val = (int64_t)nval;
2124
2125	if (val < 0) {
2126		for (i = 0; i < zero; i++) {
2127			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2128				quanta[i] += incr;
2129				return;
2130			}
2131		}
2132	} else {
2133		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2134			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2135				quanta[i - 1] += incr;
2136				return;
2137			}
2138		}
2139
2140		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2141		return;
2142	}
2143
2144	ASSERT(0);
2145}
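
/*
 * A worked example of the bucketing above, assuming the usual power-of-two
 * bucket values yielded by DTRACE_QUANTIZE_BUCKETVAL():  for nval = 7 and
 * incr = 1, the loop stops at the first bucket value that exceeds 7
 * (namely 8) and credits the bucket before it -- the one labelled 4.
 * Negative values walk the buckets below the zero bucket analogously.
 */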
2146
2147static void
2148dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2149{
2150	uint64_t arg = *lquanta++;
2151	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2152	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2153	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2154	int32_t val = (int32_t)nval, level;
2155
2156	ASSERT(step != 0);
2157	ASSERT(levels != 0);
2158
2159	if (val < base) {
2160		/*
2161		 * This is an underflow.
2162		 */
2163		lquanta[0] += incr;
2164		return;
2165	}
2166
2167	level = (val - base) / step;
2168
2169	if (level < levels) {
2170		lquanta[level + 1] += incr;
2171		return;
2172	}
2173
2174	/*
2175	 * This is an overflow.
2176	 */
2177	lquanta[levels + 1] += incr;
2178}
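
/*
 * To make dtrace_aggregate_lquantize() concrete:  with base = 0, step = 10
 * and levels = 10 decoded from the first element, a value of 37 yields
 * level (37 - 0) / 10 = 3 and increments lquanta[4]; any value below 0
 * lands in the underflow bucket lquanta[0], and any value of 100 or more
 * lands in the overflow bucket lquanta[11].
 */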
2179
2180static int
2181dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
2182    uint16_t high, uint16_t nsteps, int64_t value)
2183{
2184	int64_t this = 1, last, next;
2185	int base = 1, order;
2186
2187	ASSERT(factor <= nsteps);
2188	ASSERT(nsteps % factor == 0);
2189
2190	for (order = 0; order < low; order++)
2191		this *= factor;
2192
2193	/*
2194	 * If our value is less than our factor taken to the power of the
2195	 * low order of magnitude, it goes into the zeroth bucket.
2196	 */
2197	if (value < (last = this))
2198		return (0);
2199
2200	for (this *= factor; order <= high; order++) {
2201		int nbuckets = this > nsteps ? nsteps : this;
2202
2203		if ((next = this * factor) < this) {
2204			/*
2205			 * We should not generally get log/linear quantizations
2206			 * with a high magnitude that allows 64 bits to
2207			 * overflow, but we nonetheless protect against this
2208			 * by explicitly checking for overflow, and clamping
2209			 * our value accordingly.
2210			 */
2211			value = this - 1;
2212		}
2213
2214		if (value < this) {
2215			/*
2216			 * If our value lies within this order of magnitude,
2217			 * determine its position by taking the offset within
2218			 * the order of magnitude, dividing by the bucket
2219			 * width, and adding to our (accumulated) base.
2220			 */
2221			return (base + (value - last) / (this / nbuckets));
2222		}
2223
2224		base += nbuckets - (nbuckets / factor);
2225		last = this;
2226		this = next;
2227	}
2228
2229	/*
2230	 * Our value is greater than or equal to our factor taken to the
2231	 * power of one plus the high magnitude -- return the top bucket.
2232	 */
2233	return (base);
2234}
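
/*
 * A worked example of dtrace_aggregate_llquantize_bucket():  with
 * factor = 10, low = 0, high = 2 and nsteps = 10, bucket 0 holds values
 * below 1, buckets 1 through 9 cover [1, 10), and each subsequent decade
 * here contributes nine more linearly-divided buckets.  For value = 42,
 * base has grown to 10 by the time the [10, 100) decade is considered,
 * and (42 - 10) / (100 / 10) = 3, so the function returns bucket
 * 10 + 3 = 13.
 */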
2235
2236static void
2237dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2238{
2239	uint64_t arg = *llquanta++;
2240	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2241	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2242	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2243	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2244
2245	llquanta[dtrace_aggregate_llquantize_bucket(factor,
2246	    low, high, nsteps, nval)] += incr;
2247}
2248
2249/*ARGSUSED*/
2250static void
2251dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2252{
2253	data[0]++;
2254	data[1] += nval;
2255}
2256
2257/*ARGSUSED*/
2258static void
2259dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2260{
2261	int64_t snval = (int64_t)nval;
2262	uint64_t tmp[2];
2263
2264	data[0]++;
2265	data[1] += nval;
2266
2267	/*
2268	 * What we want to say here is:
2269	 *
2270	 * data[2] += nval * nval;
2271	 *
2272	 * But given that nval is 64-bit, we could easily overflow, so
2273	 * we do this as 128-bit arithmetic.
2274	 */
2275	if (snval < 0)
2276		snval = -snval;
2277
2278	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2279	dtrace_add_128(data + 2, tmp, data + 2);
2280}
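
/*
 * The three quantities accumulated above -- the count in data[0], the sum
 * in data[1] and the 128-bit sum of squares in data[2] and data[3] -- are
 * sufficient for the consumer to recover the standard deviation as
 * sqrt(sum-of-squares / count - (sum / count)^2), without probe context
 * ever revisiting previously aggregated values.
 */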
2281
2282/*ARGSUSED*/
2283static void
2284dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2285{
2286	*oval = *oval + 1;
2287}
2288
2289/*ARGSUSED*/
2290static void
2291dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2292{
2293	*oval += nval;
2294}
2295
2296/*
2297 * Aggregate given the tuple in the principal data buffer, and the aggregating
2298 * action denoted by the specified dtrace_aggregation_t.  The aggregation
2299 * buffer is specified as the buf parameter.  This routine does not return
2300 * failure; if there is no space in the aggregation buffer, the data will be
2301 * dropped, and a corresponding counter incremented.
2302 */
2303static void
2304dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2305    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2306{
2307	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2308	uint32_t i, ndx, size, fsize;
2309	uint32_t align = sizeof (uint64_t) - 1;
2310	dtrace_aggbuffer_t *agb;
2311	dtrace_aggkey_t *key;
2312	uint32_t hashval = 0, limit, isstr;
2313	caddr_t tomax, data, kdata;
2314	dtrace_actkind_t action;
2315	dtrace_action_t *act;
2316	uintptr_t offs;
2317
2318	if (buf == NULL)
2319		return;
2320
2321	if (!agg->dtag_hasarg) {
2322		/*
2323		 * Currently, only quantize(), lquantize() and llquantize()
2324		 * take additional arguments, and they have the same
2325		 * semantics:  an increment value that defaults to 1 when not
2326		 * present.  If additional aggregating actions take arguments,
2327		 * the setting of the default argument value will presumably
2328		 * have to become more sophisticated...
2329		 */
2330		arg = 1;
2331	}
2332
2333	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2334	size = rec->dtrd_offset - agg->dtag_base;
2335	fsize = size + rec->dtrd_size;
2336
2337	ASSERT(dbuf->dtb_tomax != NULL);
2338	data = dbuf->dtb_tomax + offset + agg->dtag_base;
2339
2340	if ((tomax = buf->dtb_tomax) == NULL) {
2341		dtrace_buffer_drop(buf);
2342		return;
2343	}
2344
2345	/*
2346	 * The metastructure is always at the bottom of the buffer.
2347	 */
2348	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2349	    sizeof (dtrace_aggbuffer_t));
2350
2351	if (buf->dtb_offset == 0) {
2352		/*
2353		 * We just kludge up approximately 1/8th of the size to be
2354		 * buckets.  If this guess ends up being routinely
2355		 * off-the-mark, we may need to dynamically readjust this
2356		 * based on past performance.
2357		 */
2358		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2359
2360		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2361		    (uintptr_t)tomax || hashsize == 0) {
2362			/*
2363			 * We've been given a ludicrously small buffer;
2364			 * increment our drop count and leave.
2365			 */
2366			dtrace_buffer_drop(buf);
2367			return;
2368		}
2369
2370		/*
2371		 * And now, a pathetic attempt to get an odd (or
2372		 * perchance, a prime) hash size for better hash distribution.
2373		 */
2374		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2375			hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2376
2377		agb->dtagb_hashsize = hashsize;
2378		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2379		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2380		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2381
2382		for (i = 0; i < agb->dtagb_hashsize; i++)
2383			agb->dtagb_hash[i] = NULL;
2384	}
2385
2386	ASSERT(agg->dtag_first != NULL);
2387	ASSERT(agg->dtag_first->dta_intuple);
2388
2389	/*
2390	 * Calculate the hash value based on the key.  Note that we _don't_
2391	 * include the aggid in the hashing (but we will store it as part of
2392	 * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2393	 * algorithm: a simple, quick algorithm that has no known funnels, and
2394	 * gets good distribution in practice.  The efficacy of the hashing
2395	 * algorithm (and a comparison with other algorithms) may be found by
2396	 * running the ::dtrace_aggstat MDB dcmd.
2397	 */
2398	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2399		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2400		limit = i + act->dta_rec.dtrd_size;
2401		ASSERT(limit <= size);
2402		isstr = DTRACEACT_ISSTRING(act);
2403
2404		for (; i < limit; i++) {
2405			hashval += data[i];
2406			hashval += (hashval << 10);
2407			hashval ^= (hashval >> 6);
2408
2409			if (isstr && data[i] == '\0')
2410				break;
2411		}
2412	}
2413
2414	hashval += (hashval << 3);
2415	hashval ^= (hashval >> 11);
2416	hashval += (hashval << 15);
2417
2418	/*
2419	 * Yes, the divide here is expensive -- but it's generally the least
2420	 * of the performance issues given the amount of data that we iterate
2421	 * over to compute hash values, compare data, etc.
2422	 */
2423	ndx = hashval % agb->dtagb_hashsize;
2424
2425	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2426		ASSERT((caddr_t)key >= tomax);
2427		ASSERT((caddr_t)key < tomax + buf->dtb_size);
2428
2429		if (hashval != key->dtak_hashval || key->dtak_size != size)
2430			continue;
2431
2432		kdata = key->dtak_data;
2433		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2434
2435		for (act = agg->dtag_first; act->dta_intuple;
2436		    act = act->dta_next) {
2437			i = act->dta_rec.dtrd_offset - agg->dtag_base;
2438			limit = i + act->dta_rec.dtrd_size;
2439			ASSERT(limit <= size);
2440			isstr = DTRACEACT_ISSTRING(act);
2441
2442			for (; i < limit; i++) {
2443				if (kdata[i] != data[i])
2444					goto next;
2445
2446				if (isstr && data[i] == '\0')
2447					break;
2448			}
2449		}
2450
2451		if (action != key->dtak_action) {
2452			/*
2453			 * We are aggregating on the same value in the same
2454			 * aggregation with two different aggregating actions.
2455			 * (This should have been picked up in the compiler,
2456			 * so we may be dealing with errant or devious DIF.)
2457			 * This is an error condition; we indicate as much,
2458			 * and return.
2459			 */
2460			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2461			return;
2462		}
2463
2464		/*
2465		 * This is a hit:  we need to apply the aggregator to
2466		 * the value at this key.
2467		 */
2468		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2469		return;
2470next:
2471		continue;
2472	}
2473
2474	/*
2475	 * We didn't find it.  We need to allocate some zero-filled space,
2476	 * link it into the hash table appropriately, and apply the aggregator
2477	 * to the (zero-filled) value.
2478	 */
2479	offs = buf->dtb_offset;
2480	while (offs & (align - 1))
2481		offs += sizeof (uint32_t);
2482
2483	/*
2484	 * If we don't have enough room to both allocate a new key _and_
2485	 * its associated data, increment the drop count and return.
2486	 */
2487	if ((uintptr_t)tomax + offs + fsize >
2488	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2489		dtrace_buffer_drop(buf);
2490		return;
2491	}
2492
2493	/*CONSTCOND*/
2494	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2495	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2496	agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2497
2498	key->dtak_data = kdata = tomax + offs;
2499	buf->dtb_offset = offs + fsize;
2500
2501	/*
2502	 * Now copy the data across.
2503	 */
2504	*((dtrace_aggid_t *)kdata) = agg->dtag_id;
2505
2506	for (i = sizeof (dtrace_aggid_t); i < size; i++)
2507		kdata[i] = data[i];
2508
2509	/*
2510	 * Because strings are not zeroed out by default, we need to iterate
2511	 * looking for actions that store strings, and we need to explicitly
2512	 * pad these strings out with zeroes.
2513	 */
2514	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2515		int nul;
2516
2517		if (!DTRACEACT_ISSTRING(act))
2518			continue;
2519
2520		i = act->dta_rec.dtrd_offset - agg->dtag_base;
2521		limit = i + act->dta_rec.dtrd_size;
2522		ASSERT(limit <= size);
2523
2524		for (nul = 0; i < limit; i++) {
2525			if (nul) {
2526				kdata[i] = '\0';
2527				continue;
2528			}
2529
2530			if (data[i] != '\0')
2531				continue;
2532
2533			nul = 1;
2534		}
2535	}
2536
2537	for (i = size; i < fsize; i++)
2538		kdata[i] = 0;
2539
2540	key->dtak_hashval = hashval;
2541	key->dtak_size = size;
2542	key->dtak_action = action;
2543	key->dtak_next = agb->dtagb_hash[ndx];
2544	agb->dtagb_hash[ndx] = key;
2545
2546	/*
2547	 * Finally, apply the aggregator.
2548	 */
2549	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2550	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2551}
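
/*
 * A sketch of the aggregation buffer layout that dtrace_aggregate()
 * maintains (addresses increase to the right):
 *
 *	tomax                                            tomax + dtb_size
 *	+--------------------+~~~~~~~~+------+------------+-------------+
 *	| key data           |  free  | keys | hash table | aggbuffer_t |
 *	| (grows up from     |        | <--- |            | (metadata)  |
 *	|  dtb_offset)       |        |      |            |             |
 *	+--------------------+~~~~~~~~+------+------------+-------------+
 *	                              ^
 *	                           dtagb_free
 *
 * Key data is carved from the front of the buffer at dtb_offset while
 * dtrace_aggkey_t structures are carved downward from dtagb_free; a drop
 * is recorded whenever the two would collide.
 */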
2552
2553/*
2554 * Given consumer state, this routine finds a speculation in the INACTIVE
2555 * state and transitions it into the ACTIVE state.  If there is no speculation
2556 * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2557 * incremented -- it is up to the caller to take appropriate action.
2558 */
2559static int
2560dtrace_speculation(dtrace_state_t *state)
2561{
2562	int i = 0;
2563	dtrace_speculation_state_t current;
2564	uint32_t *stat = &state->dts_speculations_unavail, count;
2565
2566	while (i < state->dts_nspeculations) {
2567		dtrace_speculation_t *spec = &state->dts_speculations[i];
2568
2569		current = spec->dtsp_state;
2570
2571		if (current != DTRACESPEC_INACTIVE) {
2572			if (current == DTRACESPEC_COMMITTINGMANY ||
2573			    current == DTRACESPEC_COMMITTING ||
2574			    current == DTRACESPEC_DISCARDING)
2575				stat = &state->dts_speculations_busy;
2576			i++;
2577			continue;
2578		}
2579
2580		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2581		    current, DTRACESPEC_ACTIVE) == current)
2582			return (i + 1);
2583	}
2584
2585	/*
2586	 * We couldn't find a speculation.  If we found as much as a single
2587	 * busy speculation buffer, we'll attribute this failure as "busy"
2588	 * instead of "unavail".
2589	 */
2590	do {
2591		count = *stat;
2592	} while (dtrace_cas32(stat, count, count + 1) != count);
2593
2594	return (0);
2595}
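
/*
 * Note that the loop above is the standard probe-context idiom for a
 * lock-free counter:  because this code may run concurrently on several
 * CPUs at arbitrary interrupt level, a plain increment could lose
 * updates.  The pattern, in isolation:
 *
 *	uint32_t snap;
 *
 *	do {
 *		snap = *stat;
 *	} while (dtrace_cas32(stat, snap, snap + 1) != snap);
 */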
2596
2597/*
2598 * This routine commits an active speculation.  If the specified speculation
2599 * is not in a valid state to perform a commit(), this routine will silently do
2600 * nothing.  The state of the specified speculation is transitioned according
2601 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
2602 */
2603static void
2604dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2605    dtrace_specid_t which)
2606{
2607	dtrace_speculation_t *spec;
2608	dtrace_buffer_t *src, *dest;
2609	uintptr_t daddr, saddr, dlimit, slimit;
2610	dtrace_speculation_state_t current, new = 0;
2611	intptr_t offs;
2612	uint64_t timestamp;
2613
2614	if (which == 0)
2615		return;
2616
2617	if (which > state->dts_nspeculations) {
2618		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2619		return;
2620	}
2621
2622	spec = &state->dts_speculations[which - 1];
2623	src = &spec->dtsp_buffer[cpu];
2624	dest = &state->dts_buffer[cpu];
2625
2626	do {
2627		current = spec->dtsp_state;
2628
2629		if (current == DTRACESPEC_COMMITTINGMANY)
2630			break;
2631
2632		switch (current) {
2633		case DTRACESPEC_INACTIVE:
2634		case DTRACESPEC_DISCARDING:
2635			return;
2636
2637		case DTRACESPEC_COMMITTING:
2638			/*
2639			 * This is only possible if we are (a) commit()'ing
2640			 * without having done a prior speculate() on this CPU
2641			 * and (b) racing with another commit() on a different
2642			 * CPU.  There's nothing to do -- we just assert that
2643			 * our offset is 0.
2644			 */
2645			ASSERT(src->dtb_offset == 0);
2646			return;
2647
2648		case DTRACESPEC_ACTIVE:
2649			new = DTRACESPEC_COMMITTING;
2650			break;
2651
2652		case DTRACESPEC_ACTIVEONE:
2653			/*
2654			 * This speculation is active on one CPU.  If our
2655			 * buffer offset is non-zero, we know that the one CPU
2656			 * must be us.  Otherwise, we are committing on a
2657			 * different CPU from the speculate(), and we must
2658			 * rely on being asynchronously cleaned.
2659			 */
2660			if (src->dtb_offset != 0) {
2661				new = DTRACESPEC_COMMITTING;
2662				break;
2663			}
2664			/*FALLTHROUGH*/
2665
2666		case DTRACESPEC_ACTIVEMANY:
2667			new = DTRACESPEC_COMMITTINGMANY;
2668			break;
2669
2670		default:
2671			ASSERT(0);
2672		}
2673	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2674	    current, new) != current);
2675
2676	/*
2677	 * We have set the state to indicate that we are committing this
2678	 * speculation.  Now reserve the necessary space in the destination
2679	 * buffer.
2680	 */
2681	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2682	    sizeof (uint64_t), state, NULL)) < 0) {
2683		dtrace_buffer_drop(dest);
2684		goto out;
2685	}
2686
2687	/*
2688	 * We have sufficient space to copy the speculative buffer into the
2689	 * primary buffer.  First, modify the speculative buffer, filling
2690	 * in the timestamp of all entries with the current time.  The data
2691	 * must have the commit() time rather than the time it was traced,
2692	 * so that all entries in the primary buffer are in timestamp order.
2693	 */
2694	timestamp = dtrace_gethrtime();
2695	saddr = (uintptr_t)src->dtb_tomax;
2696	slimit = saddr + src->dtb_offset;
2697	while (saddr < slimit) {
2698		size_t size;
2699		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2700
2701		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2702			saddr += sizeof (dtrace_epid_t);
2703			continue;
2704		}
2705		ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2706		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2707
2708		ASSERT3U(saddr + size, <=, slimit);
2709		ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2710		ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2711
2712		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2713
2714		saddr += size;
2715	}
2716
2717	/*
2718	 * Copy the buffer across.  (Note that this is a
2719	 * highly suboptimal bcopy(); in the unlikely event that this becomes
2720	 * a serious performance issue, a high-performance DTrace-specific
2721	 * bcopy() should obviously be invented.)
2722	 */
2723	daddr = (uintptr_t)dest->dtb_tomax + offs;
2724	dlimit = daddr + src->dtb_offset;
2725	saddr = (uintptr_t)src->dtb_tomax;
2726
2727	/*
2728	 * First, the aligned portion.
2729	 */
2730	while (dlimit - daddr >= sizeof (uint64_t)) {
2731		*((uint64_t *)daddr) = *((uint64_t *)saddr);
2732
2733		daddr += sizeof (uint64_t);
2734		saddr += sizeof (uint64_t);
2735	}
2736
2737	/*
2738	 * Now any left-over bit...
2739	 */
2740	while (dlimit - daddr)
2741		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2742
2743	/*
2744	 * Finally, commit the reserved space in the destination buffer.
2745	 */
2746	dest->dtb_offset = offs + src->dtb_offset;
2747
2748out:
2749	/*
2750	 * If we're lucky enough to be the only active CPU on this speculation
2751	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2752	 */
2753	if (current == DTRACESPEC_ACTIVE ||
2754	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2755		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2756		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2757
2758		ASSERT(rval == DTRACESPEC_COMMITTING);
2759	}
2760
2761	src->dtb_offset = 0;
2762	src->dtb_xamot_drops += src->dtb_drops;
2763	src->dtb_drops = 0;
2764}
2765
2766/*
2767 * This routine discards an active speculation.  If the specified speculation
2768 * is not in a valid state to perform a discard(), this routine will silently
2769 * do nothing.  The state of the specified speculation is transitioned
2770 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
2771 */
2772static void
2773dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2774    dtrace_specid_t which)
2775{
2776	dtrace_speculation_t *spec;
2777	dtrace_speculation_state_t current, new = 0;
2778	dtrace_buffer_t *buf;
2779
2780	if (which == 0)
2781		return;
2782
2783	if (which > state->dts_nspeculations) {
2784		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2785		return;
2786	}
2787
2788	spec = &state->dts_speculations[which - 1];
2789	buf = &spec->dtsp_buffer[cpu];
2790
2791	do {
2792		current = spec->dtsp_state;
2793
2794		switch (current) {
2795		case DTRACESPEC_INACTIVE:
2796		case DTRACESPEC_COMMITTINGMANY:
2797		case DTRACESPEC_COMMITTING:
2798		case DTRACESPEC_DISCARDING:
2799			return;
2800
2801		case DTRACESPEC_ACTIVE:
2802		case DTRACESPEC_ACTIVEMANY:
2803			new = DTRACESPEC_DISCARDING;
2804			break;
2805
2806		case DTRACESPEC_ACTIVEONE:
2807			if (buf->dtb_offset != 0) {
2808				new = DTRACESPEC_INACTIVE;
2809			} else {
2810				new = DTRACESPEC_DISCARDING;
2811			}
2812			break;
2813
2814		default:
2815			ASSERT(0);
2816		}
2817	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2818	    current, new) != current);
2819
2820	buf->dtb_offset = 0;
2821	buf->dtb_drops = 0;
2822}
2823
2824/*
2825 * Note:  not called from probe context.  This function is called
2826 * asynchronously from cross call context to clean any speculations that are
2827 * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2828 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2829 * speculation.
2830 */
2831static void
2832dtrace_speculation_clean_here(dtrace_state_t *state)
2833{
2834	dtrace_icookie_t cookie;
2835	processorid_t cpu = curcpu;
2836	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2837	dtrace_specid_t i;
2838
2839	cookie = dtrace_interrupt_disable();
2840
2841	if (dest->dtb_tomax == NULL) {
2842		dtrace_interrupt_enable(cookie);
2843		return;
2844	}
2845
2846	for (i = 0; i < state->dts_nspeculations; i++) {
2847		dtrace_speculation_t *spec = &state->dts_speculations[i];
2848		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2849
2850		if (src->dtb_tomax == NULL)
2851			continue;
2852
2853		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2854			src->dtb_offset = 0;
2855			continue;
2856		}
2857
2858		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2859			continue;
2860
2861		if (src->dtb_offset == 0)
2862			continue;
2863
2864		dtrace_speculation_commit(state, cpu, i + 1);
2865	}
2866
2867	dtrace_interrupt_enable(cookie);
2868}
2869
2870/*
2871 * Note:  not called from probe context.  This function is called
2872 * asynchronously (and at a regular interval) to clean any speculations that
2873 * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2874 * is work to be done, it cross calls all CPUs to perform that work;
2875 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to
2876 * the
2876 * INACTIVE state until they have been cleaned by all CPUs.
2877 */
2878static void
2879dtrace_speculation_clean(dtrace_state_t *state)
2880{
2881	int work = 0, rv;
2882	dtrace_specid_t i;
2883
2884	for (i = 0; i < state->dts_nspeculations; i++) {
2885		dtrace_speculation_t *spec = &state->dts_speculations[i];
2886
2887		ASSERT(!spec->dtsp_cleaning);
2888
2889		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2890		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2891			continue;
2892
2893		work++;
2894		spec->dtsp_cleaning = 1;
2895	}
2896
2897	if (!work)
2898		return;
2899
2900	dtrace_xcall(DTRACE_CPUALL,
2901	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2902
2903	/*
2904	 * We now know that all CPUs have committed or discarded their
2905	 * speculation buffers, as appropriate.  We can now set the state
2906	 * to inactive.
2907	 */
2908	for (i = 0; i < state->dts_nspeculations; i++) {
2909		dtrace_speculation_t *spec = &state->dts_speculations[i];
2910		dtrace_speculation_state_t current, new;
2911
2912		if (!spec->dtsp_cleaning)
2913			continue;
2914
2915		current = spec->dtsp_state;
2916		ASSERT(current == DTRACESPEC_DISCARDING ||
2917		    current == DTRACESPEC_COMMITTINGMANY);
2918
2919		new = DTRACESPEC_INACTIVE;
2920
2921		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2922		ASSERT(rv == current);
2923		spec->dtsp_cleaning = 0;
2924	}
2925}
2926
2927/*
2928 * Called as part of a speculate() to get the speculative buffer associated
2929 * with a given speculation.  Returns NULL if the specified speculation is not
2930 * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2931 * the active CPU is not the specified CPU -- the speculation will be
2932 * atomically transitioned into the ACTIVEMANY state.
2933 */
2934static dtrace_buffer_t *
2935dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2936    dtrace_specid_t which)
2937{
2938	dtrace_speculation_t *spec;
2939	dtrace_speculation_state_t current, new = 0;
2940	dtrace_buffer_t *buf;
2941
2942	if (which == 0)
2943		return (NULL);
2944
2945	if (which > state->dts_nspeculations) {
2946		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2947		return (NULL);
2948	}
2949
2950	spec = &state->dts_speculations[which - 1];
2951	buf = &spec->dtsp_buffer[cpuid];
2952
2953	do {
2954		current = spec->dtsp_state;
2955
2956		switch (current) {
2957		case DTRACESPEC_INACTIVE:
2958		case DTRACESPEC_COMMITTINGMANY:
2959		case DTRACESPEC_DISCARDING:
2960			return (NULL);
2961
2962		case DTRACESPEC_COMMITTING:
2963			ASSERT(buf->dtb_offset == 0);
2964			return (NULL);
2965
2966		case DTRACESPEC_ACTIVEONE:
2967			/*
2968			 * This speculation is currently active on one CPU.
2969			 * Check the offset in the buffer; if it's non-zero,
2970			 * that CPU must be us (and we leave the state alone).
2971			 * If it's zero, assume that we're starting on a new
2972			 * CPU -- and change the state to indicate that the
2973			 * speculation is active on more than one CPU.
2974			 */
2975			if (buf->dtb_offset != 0)
2976				return (buf);
2977
2978			new = DTRACESPEC_ACTIVEMANY;
2979			break;
2980
2981		case DTRACESPEC_ACTIVEMANY:
2982			return (buf);
2983
2984		case DTRACESPEC_ACTIVE:
2985			new = DTRACESPEC_ACTIVEONE;
2986			break;
2987
2988		default:
2989			ASSERT(0);
2990		}
2991	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2992	    current, new) != current);
2993
2994	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2995	return (buf);
2996}
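
/*
 * Taken together, dtrace_speculation(), dtrace_speculation_buffer(),
 * dtrace_speculation_commit(), dtrace_speculation_discard() and
 * dtrace_speculation_clean() effect the transitions sketched below; see
 * <sys/dtrace_impl.h> for the authoritative state diagram.
 *
 *	INACTIVE -> ACTIVE -> ACTIVEONE -> ACTIVEMANY
 *	                \          |          /
 *	                 v         v         v
 *	            COMMITTING[MANY] / DISCARDING -> INACTIVE
 */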
2997
2998/*
2999 * Return a string.  In the event that the user lacks the privilege to access
3000 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3001 * don't fail access checking.
3002 *
3003 * dtrace_dif_variable() uses this routine as a helper for various
3004 * builtin values such as 'execname' and 'probefunc.'
3005 */
3006uintptr_t
3007dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3008    dtrace_mstate_t *mstate)
3009{
3010	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3011	uintptr_t ret;
3012	size_t strsz;
3013
3014	/*
3015	 * The easy case: this probe is allowed to read all of memory, so
3016	 * we can just return this as a vanilla pointer.
3017	 */
3018	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3019		return (addr);
3020
3021	/*
3022	 * This is the tougher case: we copy the string in question from
3023	 * kernel memory into scratch memory and return it that way: this
3024	 * ensures that we won't trip up when access checking tests the
3025	 * BYREF return value.
3026	 */
3027	strsz = dtrace_strlen((char *)addr, size) + 1;
3028
3029	if (mstate->dtms_scratch_ptr + strsz >
3030	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3031		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3032		return (0);
3033	}
3034
3035	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3036	    strsz);
3037	ret = mstate->dtms_scratch_ptr;
3038	mstate->dtms_scratch_ptr += strsz;
3039	return (ret);
3040}
3041
3042/*
3043 * Return a string from a memory address which is known to hold one or
3044 * more concatenated, individually zero-terminated sub-strings.
3045 * In the event that the user lacks the privilege to access
3046 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3047 * don't fail access checking.
3048 *
3049 * dtrace_dif_variable() uses this routine as a helper for various
3050 * builtin values such as 'execargs'.
3051 */
3052static uintptr_t
3053dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
3054    dtrace_mstate_t *mstate)
3055{
3056	char *p;
3057	size_t i;
3058	uintptr_t ret;
3059
3060	if (mstate->dtms_scratch_ptr + strsz >
3061	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3062		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3063		return (0);
3064	}
3065
3066	dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3067	    strsz);
3068
3069	/* Replace sub-string termination characters with a space. */
3070	for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
3071	    p++, i++)
3072		if (*p == '\0')
3073			*p = ' ';
3074
3075	ret = mstate->dtms_scratch_ptr;
3076	mstate->dtms_scratch_ptr += strsz;
3077	return (ret);
3078}
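
/*
 * For example, given the packed vector "ls\0-l\0/tmp\0" (strsz of 11), the
 * copy that dtrace_dif_varstrz() places in scratch space becomes
 * "ls -l /tmp":  every NUL except the final terminator is rewritten as a
 * space.
 */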
3079
3080/*
3081 * This function implements the DIF emulator's variable lookups.  The emulator
3082 * passes a reserved variable identifier and optional built-in array index.
3083 */
3084static uint64_t
3085dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3086    uint64_t ndx)
3087{
3088	/*
3089	 * If we're accessing one of the uncached arguments, we'll turn this
3090	 * into a reference in the args array.
3091	 */
3092	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3093		ndx = v - DIF_VAR_ARG0;
3094		v = DIF_VAR_ARGS;
3095	}
3096
3097	switch (v) {
3098	case DIF_VAR_ARGS:
3099		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3100		if (ndx >= sizeof (mstate->dtms_arg) /
3101		    sizeof (mstate->dtms_arg[0])) {
3102			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3103			dtrace_provider_t *pv;
3104			uint64_t val;
3105
3106			pv = mstate->dtms_probe->dtpr_provider;
3107			if (pv->dtpv_pops.dtps_getargval != NULL)
3108				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3109				    mstate->dtms_probe->dtpr_id,
3110				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
3111			else
3112				val = dtrace_getarg(ndx, aframes);
3113
3114			/*
3115			 * This is regrettably required to keep the compiler
3116			 * from tail-optimizing the call to dtrace_getarg().
3117			 * The condition always evaluates to true, but the
3118			 * compiler has no way of figuring that out a priori.
3119			 * (None of this would be necessary if the compiler
3120			 * could be relied upon to _always_ tail-optimize
3121			 * the call to dtrace_getarg() -- but it can't.)
3122			 */
3123			if (mstate->dtms_probe != NULL)
3124				return (val);
3125
3126			ASSERT(0);
3127		}
3128
3129		return (mstate->dtms_arg[ndx]);
3130
3131#if defined(sun)
3132	case DIF_VAR_UREGS: {
3133		klwp_t *lwp;
3134
3135		if (!dtrace_priv_proc(state))
3136			return (0);
3137
3138		if ((lwp = curthread->t_lwp) == NULL) {
3139			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3140			cpu_core[curcpu].cpuc_dtrace_illval = 0;
3141			return (0);
3142		}
3143
3144		return (dtrace_getreg(lwp->lwp_regs, ndx));
3146	}
3147#else
3148	case DIF_VAR_UREGS: {
3149		struct trapframe *tframe;
3150
3151		if (!dtrace_priv_proc(state))
3152			return (0);
3153
3154		if ((tframe = curthread->td_frame) == NULL) {
3155			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3156			cpu_core[curcpu].cpuc_dtrace_illval = 0;
3157			return (0);
3158		}
3159
3160		return (dtrace_getreg(tframe, ndx));
3161	}
3162#endif
3163
3164	case DIF_VAR_CURTHREAD:
3165		if (!dtrace_priv_proc(state))
3166			return (0);
3167		return ((uint64_t)(uintptr_t)curthread);
3168
3169	case DIF_VAR_TIMESTAMP:
3170		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3171			mstate->dtms_timestamp = dtrace_gethrtime();
3172			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3173		}
3174		return (mstate->dtms_timestamp);
3175
3176	case DIF_VAR_VTIMESTAMP:
3177		ASSERT(dtrace_vtime_references != 0);
3178		return (curthread->t_dtrace_vtime);
3179
3180	case DIF_VAR_WALLTIMESTAMP:
3181		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3182			mstate->dtms_walltimestamp = dtrace_gethrestime();
3183			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3184		}
3185		return (mstate->dtms_walltimestamp);
3186
3187#if defined(sun)
3188	case DIF_VAR_IPL:
3189		if (!dtrace_priv_kernel(state))
3190			return (0);
3191		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3192			mstate->dtms_ipl = dtrace_getipl();
3193			mstate->dtms_present |= DTRACE_MSTATE_IPL;
3194		}
3195		return (mstate->dtms_ipl);
3196#endif
3197
3198	case DIF_VAR_EPID:
3199		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3200		return (mstate->dtms_epid);
3201
3202	case DIF_VAR_ID:
3203		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3204		return (mstate->dtms_probe->dtpr_id);
3205
3206	case DIF_VAR_STACKDEPTH:
3207		if (!dtrace_priv_kernel(state))
3208			return (0);
3209		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3210			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3211
3212			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3213			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3214		}
3215		return (mstate->dtms_stackdepth);
3216
3217	case DIF_VAR_USTACKDEPTH:
3218		if (!dtrace_priv_proc(state))
3219			return (0);
3220		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3221			/*
3222			 * See comment in DIF_VAR_PID.
3223			 */
3224			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3225			    CPU_ON_INTR(CPU)) {
3226				mstate->dtms_ustackdepth = 0;
3227			} else {
3228				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3229				mstate->dtms_ustackdepth =
3230				    dtrace_getustackdepth();
3231				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3232			}
3233			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3234		}
3235		return (mstate->dtms_ustackdepth);
3236
3237	case DIF_VAR_CALLER:
3238		if (!dtrace_priv_kernel(state))
3239			return (0);
3240		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3241			int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3242
3243			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3244				/*
3245				 * If this is an unanchored probe, we are
3246				 * required to go through the slow path:
3247				 * dtrace_caller() only guarantees correct
3248				 * results for anchored probes.
3249				 */
3250				pc_t caller[2] = {0, 0};
3251
3252				dtrace_getpcstack(caller, 2, aframes,
3253				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3254				mstate->dtms_caller = caller[1];
3255			} else if ((mstate->dtms_caller =
3256			    dtrace_caller(aframes)) == -1) {
3257				/*
3258				 * We have failed to do this the quick way;
3259				 * we must resort to the slower approach of
3260				 * calling dtrace_getpcstack().
3261				 */
3262				pc_t caller = 0;
3263
3264				dtrace_getpcstack(&caller, 1, aframes, NULL);
3265				mstate->dtms_caller = caller;
3266			}
3267
3268			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3269		}
3270		return (mstate->dtms_caller);
3271
3272	case DIF_VAR_UCALLER:
3273		if (!dtrace_priv_proc(state))
3274			return (0);
3275
3276		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3277			uint64_t ustack[3];
3278
3279			/*
3280			 * dtrace_getupcstack() fills in the first uint64_t
3281			 * with the current PID.  The second uint64_t will
3282			 * be the program counter at user-level.  The third
3283			 * uint64_t will contain the caller, which is what
3284			 * we're after.
3285			 */
3286			ustack[2] = 0;
3287			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3288			dtrace_getupcstack(ustack, 3);
3289			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3290			mstate->dtms_ucaller = ustack[2];
3291			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3292		}
3293
3294		return (mstate->dtms_ucaller);
3295
3296	case DIF_VAR_PROBEPROV:
3297		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3298		return (dtrace_dif_varstr(
3299		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3300		    state, mstate));
3301
3302	case DIF_VAR_PROBEMOD:
3303		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3304		return (dtrace_dif_varstr(
3305		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
3306		    state, mstate));
3307
3308	case DIF_VAR_PROBEFUNC:
3309		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3310		return (dtrace_dif_varstr(
3311		    (uintptr_t)mstate->dtms_probe->dtpr_func,
3312		    state, mstate));
3313
3314	case DIF_VAR_PROBENAME:
3315		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3316		return (dtrace_dif_varstr(
3317		    (uintptr_t)mstate->dtms_probe->dtpr_name,
3318		    state, mstate));
3319
3320	case DIF_VAR_PID:
3321		if (!dtrace_priv_proc(state))
3322			return (0);
3323
3324#if defined(sun)
3325		/*
3326		 * Note that we are assuming that an unanchored probe is
3327		 * always due to a high-level interrupt.  (And we're assuming
3328		 * that there is only a single high level interrupt.)
3329		 */
3330		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3331			return (pid0.pid_id);
3332
3333		/*
3334		 * It is always safe to dereference one's own t_procp pointer:
3335		 * it always points to a valid, allocated proc structure.
3336		 * Further, it is always safe to dereference the p_pidp member
3337		 * of one's own proc structure.  (These are truisms becuase
3338		 * threads and processes don't clean up their own state --
3339		 * they leave that task to whomever reaps them.)
3340		 */
3341		return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3342#else
3343		return ((uint64_t)curproc->p_pid);
3344#endif
3345
3346	case DIF_VAR_PPID:
3347		if (!dtrace_priv_proc(state))
3348			return (0);
3349
3350#if defined(sun)
3351		/*
3352		 * See comment in DIF_VAR_PID.
3353		 */
3354		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3355			return (pid0.pid_id);
3356
3357		/*
3358		 * It is always safe to dereference one's own t_procp pointer:
3359		 * it always points to a valid, allocated proc structure.
3360		 * (This is true because threads don't clean up their own
3361		 * state -- they leave that task to whomever reaps them.)
3362		 */
3363		return ((uint64_t)curthread->t_procp->p_ppid);
3364#else
3365		return ((uint64_t)curproc->p_pptr->p_pid);
3366#endif
3367
3368	case DIF_VAR_TID:
3369#if defined(sun)
3370		/*
3371		 * See comment in DIF_VAR_PID.
3372		 */
3373		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3374			return (0);
3375#endif
3376
3377		return ((uint64_t)curthread->t_tid);
3378
3379	case DIF_VAR_EXECARGS: {
3380		struct pargs *p_args = curthread->td_proc->p_args;
3381
3382		if (p_args == NULL)
3383			return (0);
3384
3385		return (dtrace_dif_varstrz(
3386		    (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
3387	}
3388
3389	case DIF_VAR_EXECNAME:
3390#if defined(sun)
3391		if (!dtrace_priv_proc(state))
3392			return (0);
3393
3394		/*
3395		 * See comment in DIF_VAR_PID.
3396		 */
3397		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3398			return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3399
3400		/*
3401		 * It is always safe to dereference one's own t_procp pointer:
3402		 * it always points to a valid, allocated proc structure.
3403		 * (This is true because threads don't clean up their own
3404		 * state -- they leave that task to whomever reaps them.)
3405		 */
3406		return (dtrace_dif_varstr(
3407		    (uintptr_t)curthread->t_procp->p_user.u_comm,
3408		    state, mstate));
3409#else
3410		return (dtrace_dif_varstr(
3411		    (uintptr_t) curthread->td_proc->p_comm, state, mstate));
3412#endif
3413
3414	case DIF_VAR_ZONENAME:
3415#if defined(sun)
3416		if (!dtrace_priv_proc(state))
3417			return (0);
3418
3419		/*
3420		 * See comment in DIF_VAR_PID.
3421		 */
3422		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3423			return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3424
3425		/*
3426		 * It is always safe to dereference one's own t_procp pointer:
3427		 * it always points to a valid, allocated proc structure.
3428		 * (This is true because threads don't clean up their own
3429		 * state -- they leave that task to whomever reaps them.)
3430		 */
3431		return (dtrace_dif_varstr(
3432		    (uintptr_t)curthread->t_procp->p_zone->zone_name,
3433		    state, mstate));
3434#else
3435		return (0);
3436#endif
3437
3438	case DIF_VAR_UID:
3439		if (!dtrace_priv_proc(state))
3440			return (0);
3441
3442#if defined(sun)
3443		/*
3444		 * See comment in DIF_VAR_PID.
3445		 */
3446		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3447			return ((uint64_t)p0.p_cred->cr_uid);
3448#endif
3449
3450		/*
3451		 * It is always safe to dereference one's own t_procp pointer:
3452		 * it always points to a valid, allocated proc structure.
3453		 * (This is true because threads don't clean up their own
3454		 * state -- they leave that task to whomever reaps them.)
3455		 *
3456		 * Additionally, it is safe to dereference one's own process
3457		 * credential, since this is never NULL after process birth.
3458		 */
3459		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3460
3461	case DIF_VAR_GID:
3462		if (!dtrace_priv_proc(state))
3463			return (0);
3464
3465#if defined(sun)
3466		/*
3467		 * See comment in DIF_VAR_PID.
3468		 */
3469		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3470			return ((uint64_t)p0.p_cred->cr_gid);
3471#endif
3472
3473		/*
3474		 * It is always safe to dereference one's own t_procp pointer:
3475		 * it always points to a valid, allocated proc structure.
3476		 * (This is true because threads don't clean up their own
3477		 * state -- they leave that task to whomever reaps them.)
3478		 *
3479		 * Additionally, it is safe to dereference one's own process
3480		 * credential, since this is never NULL after process birth.
3481		 */
3482		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3483
3484	case DIF_VAR_ERRNO: {
3485#if defined(sun)
3486		klwp_t *lwp;
3487		if (!dtrace_priv_proc(state))
3488			return (0);
3489
3490		/*
3491		 * See comment in DIF_VAR_PID.
3492		 */
3493		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3494			return (0);
3495
3496		/*
3497		 * It is always safe to dereference one's own t_lwp pointer in
3498		 * the event that this pointer is non-NULL.  (This is true
3499		 * because threads and lwps don't clean up their own state --
3500		 * they leave that task to whomever reaps them.)
3501		 */
3502		if ((lwp = curthread->t_lwp) == NULL)
3503			return (0);
3504
3505		return ((uint64_t)lwp->lwp_errno);
3506#else
3507		return (curthread->td_errno);
3508#endif
3509	}
3510#if !defined(sun)
3511	case DIF_VAR_CPU: {
3512		return (curcpu);
3513	}
3514#endif
3515	default:
3516		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3517		return (0);
3518	}
3519}
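
/*
 * To make the mapping above concrete:  a D clause that reads the built-in
 * variable execname compiles to a DIF load of the reserved identifier
 * DIF_VAR_EXECNAME, which arrives here and is satisfied by handing the
 * current process's command name to dtrace_dif_varstr() for safe return.
 */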
3520
3522typedef enum dtrace_json_state {
3523	DTRACE_JSON_REST = 1,
3524	DTRACE_JSON_OBJECT,
3525	DTRACE_JSON_STRING,
3526	DTRACE_JSON_STRING_ESCAPE,
3527	DTRACE_JSON_STRING_ESCAPE_UNICODE,
3528	DTRACE_JSON_COLON,
3529	DTRACE_JSON_COMMA,
3530	DTRACE_JSON_VALUE,
3531	DTRACE_JSON_IDENTIFIER,
3532	DTRACE_JSON_NUMBER,
3533	DTRACE_JSON_NUMBER_FRAC,
3534	DTRACE_JSON_NUMBER_EXP,
3535	DTRACE_JSON_COLLECT_OBJECT
3536} dtrace_json_state_t;
3537
3538/*
3539 * This function possesses just enough knowledge about JSON to extract a single
3540 * value from a JSON string and store it in the scratch buffer.  It is able
3541 * to extract nested object values, and members of arrays by index.
3542 *
3543 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3544 * be looked up as we descend into the object tree.  e.g.
3545 *
3546 *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3547 *       with nelems = 5.
3548 *
3549 * The run time of this function must be bounded above by strsize to limit the
3550 * amount of work done in probe context.  As such, it is implemented as a
3551 * simple state machine, reading one character at a time using safe loads
3552 * until we find the requested element, hit a parsing error or run off the
3553 * end of the object or string.
3554 *
3555 * As there is no way for a subroutine to return an error without interrupting
3556 * clause execution, we simply return NULL in the event of a missing key or any
3557 * other error condition.  Each NULL return in this function is commented with
3558 * the error condition it represents -- parsing or otherwise.
3559 *
3560 * The set of states for the state machine closely matches the JSON
3561 * specification (http://json.org/).  Briefly:
3562 *
3563 *   DTRACE_JSON_REST:
3564 *     Skip whitespace until we find either a top-level Object, moving
3565 *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3566 *
3567 *   DTRACE_JSON_OBJECT:
3568 *     Locate the next key String in an Object.  Sets a flag to denote
3569 *     the next String as a key string and moves to DTRACE_JSON_STRING.
3570 *
3571 *   DTRACE_JSON_COLON:
3572 *     Skip whitespace until we find the colon that separates key Strings
3573 *     from their values.  Once found, move to DTRACE_JSON_VALUE.
3574 *
3575 *   DTRACE_JSON_VALUE:
3576 *     Detects the type of the next value (String, Number, Identifier, Object
3577 *     or Array) and routes to the states that process that type.  Here we also
3578 *     deal with the element selector list if we are requested to traverse down
3579 *     into the object tree.
3580 *
3581 *   DTRACE_JSON_COMMA:
3582 *     Skip whitespace until we find the comma that separates key-value pairs
3583 *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3584 *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
3585 *     states return to this state at the end of their value, unless otherwise
3586 *     noted.
3587 *
3588 *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3589 *     Processes a Number literal from the JSON, including any exponent
3590 *     component that may be present.  Numbers are returned as strings, which
3591 *     may be passed to strtoll() if an integer is required.
3592 *
3593 *   DTRACE_JSON_IDENTIFIER:
3594 *     Processes a "true", "false" or "null" literal in the JSON.
3595 *
3596 *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3597 *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
3598 *     Processes a String literal from the JSON, whether the String denotes
3599 *     a key, a value or part of a larger Object.  Handles all escape sequences
3600 *     present in the specification, including four-digit Unicode escapes,
3601 *     but merely includes the escape sequence without converting it to the
3602 *     actual escaped character.  If the String is flagged as a key, we
3603 *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3604 *
3605 *   DTRACE_JSON_COLLECT_OBJECT:
3606 *     This state collects an entire Object (or Array), correctly handling
3607 *     embedded strings.  If the full element selector list matches this nested
3608 *     object, we return the Object in full as a string.  If not, we use this
3609 *     state to skip to the next value at this level and continue processing.
3610 *
3611 * NOTE: This function uses various macros from strtolctype.h to manipulate
3612 * digit values, etc. -- these have all been checked to ensure they make
3613 * no additional function calls.
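 *
 * As a worked example (illustrative only): given the JSON string
 *
 *    {"foo": [true, {"bar": 42}]}
 *
 * and the packed element list "foo" NUL "1" NUL "bar" NUL with nelems = 3,
 * the machine matches the key "foo", selects array index 1, matches the
 * key "bar", and copies the string "42" into dest.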
3614 */
3615static char *
3616dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3617    char *dest)
3618{
3619	dtrace_json_state_t state = DTRACE_JSON_REST;
3620	int64_t array_elem = INT64_MIN;
3621	int64_t array_pos = 0;
3622	uint8_t escape_unicount = 0;
3623	boolean_t string_is_key = B_FALSE;
3624	boolean_t collect_object = B_FALSE;
3625	boolean_t found_key = B_FALSE;
3626	boolean_t in_array = B_FALSE;
3627	uint32_t braces = 0, brackets = 0;
3628	char *elem = elemlist;
3629	char *dd = dest;
3630	uintptr_t cur;
3631
3632	for (cur = json; cur < json + size; cur++) {
3633		char cc = dtrace_load8(cur);
3634		if (cc == '\0')
3635			return (NULL);
3636
3637		switch (state) {
3638		case DTRACE_JSON_REST:
3639			if (isspace(cc))
3640				break;
3641
3642			if (cc == '{') {
3643				state = DTRACE_JSON_OBJECT;
3644				break;
3645			}
3646
3647			if (cc == '[') {
3648				in_array = B_TRUE;
3649				array_pos = 0;
3650				array_elem = dtrace_strtoll(elem, 10, size);
3651				found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3652				state = DTRACE_JSON_VALUE;
3653				break;
3654			}
3655
3656			/*
3657			 * ERROR: expected to find a top-level object or array.
3658			 */
3659			return (NULL);
3660		case DTRACE_JSON_OBJECT:
3661			if (isspace(cc))
3662				break;
3663
3664			if (cc == '"') {
3665				state = DTRACE_JSON_STRING;
3666				string_is_key = B_TRUE;
3667				break;
3668			}
3669
3670			/*
3671			 * ERROR: either the object did not start with a key
3672			 * string, or we've run off the end of the object
3673			 * without finding the requested key.
3674			 */
3675			return (NULL);
3676		case DTRACE_JSON_STRING:
3677			if (cc == '\\') {
3678				*dd++ = '\\';
3679				state = DTRACE_JSON_STRING_ESCAPE;
3680				break;
3681			}
3682
3683			if (cc == '"') {
3684				if (collect_object) {
3685					/*
3686					 * We don't reset the dest here, as
3687					 * the string is part of a larger
3688					 * object being collected.
3689					 */
3690					*dd++ = cc;
3691					collect_object = B_FALSE;
3692					state = DTRACE_JSON_COLLECT_OBJECT;
3693					break;
3694				}
3695				*dd = '\0';
3696				dd = dest; /* reset string buffer */
3697				if (string_is_key) {
3698					if (dtrace_strncmp(dest, elem,
3699					    size) == 0)
3700						found_key = B_TRUE;
3701				} else if (found_key) {
3702					if (nelems > 1) {
3703						/*
3704						 * We expected an object, not
3705						 * this string.
3706						 */
3707						return (NULL);
3708					}
3709					return (dest);
3710				}
3711				state = string_is_key ? DTRACE_JSON_COLON :
3712				    DTRACE_JSON_COMMA;
3713				string_is_key = B_FALSE;
3714				break;
3715			}
3716
3717			*dd++ = cc;
3718			break;
3719		case DTRACE_JSON_STRING_ESCAPE:
3720			*dd++ = cc;
3721			if (cc == 'u') {
3722				escape_unicount = 0;
3723				state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3724			} else {
3725				state = DTRACE_JSON_STRING;
3726			}
3727			break;
3728		case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3729			if (!isxdigit(cc)) {
3730				/*
3731				 * ERROR: invalid Unicode escape; expected
3732				 * four valid hexadecimal digits.
3733				 */
3734				return (NULL);
3735			}
3736
3737			*dd++ = cc;
3738			if (++escape_unicount == 4)
3739				state = DTRACE_JSON_STRING;
3740			break;
3741		case DTRACE_JSON_COLON:
3742			if (isspace(cc))
3743				break;
3744
3745			if (cc == ':') {
3746				state = DTRACE_JSON_VALUE;
3747				break;
3748			}
3749
3750			/*
3751			 * ERROR: expected a colon.
3752			 */
3753			return (NULL);
3754		case DTRACE_JSON_COMMA:
3755			if (isspace(cc))
3756				break;
3757
3758			if (cc == ',') {
3759				if (in_array) {
3760					state = DTRACE_JSON_VALUE;
3761					if (++array_pos == array_elem)
3762						found_key = B_TRUE;
3763				} else {
3764					state = DTRACE_JSON_OBJECT;
3765				}
3766				break;
3767			}
3768
3769			/*
3770			 * ERROR: either we hit an unexpected character, or
3771			 * we reached the end of the object or array without
3772			 * finding the requested key.
3773			 */
3774			return (NULL);
3775		case DTRACE_JSON_IDENTIFIER:
3776			if (islower(cc)) {
3777				*dd++ = cc;
3778				break;
3779			}
3780
3781			*dd = '\0';
3782			dd = dest; /* reset string buffer */
3783
3784			if (dtrace_strncmp(dest, "true", 5) == 0 ||
3785			    dtrace_strncmp(dest, "false", 6) == 0 ||
3786			    dtrace_strncmp(dest, "null", 5) == 0) {
3787				if (found_key) {
3788					if (nelems > 1) {
3789						/*
3790						 * ERROR: We expected an object,
3791						 * not this identifier.
3792						 */
3793						return (NULL);
3794					}
3795					return (dest);
3796				} else {
3797					cur--;
3798					state = DTRACE_JSON_COMMA;
3799					break;
3800				}
3801			}
3802
3803			/*
3804			 * ERROR: we did not recognise the identifier as one
3805			 * of those in the JSON specification.
3806			 */
3807			return (NULL);
3808		case DTRACE_JSON_NUMBER:
3809			if (cc == '.') {
3810				*dd++ = cc;
3811				state = DTRACE_JSON_NUMBER_FRAC;
3812				break;
3813			}
3814
3815			if (cc == 'x' || cc == 'X') {
3816				/*
3817				 * ERROR: the specification explicitly excludes
3818				 * hexadecimal or octal numbers.
3819				 */
3820				return (NULL);
3821			}
3822
3823			/* FALLTHRU */
3824		case DTRACE_JSON_NUMBER_FRAC:
3825			if (cc == 'e' || cc == 'E') {
3826				*dd++ = cc;
3827				state = DTRACE_JSON_NUMBER_EXP;
3828				break;
3829			}
3830
3831			if (cc == '+' || cc == '-') {
3832				/*
3833				 * ERROR: expect sign as part of exponent only.
3834				 */
3835				return (NULL);
3836			}
3837			/* FALLTHRU */
3838		case DTRACE_JSON_NUMBER_EXP:
3839			if (isdigit(cc) || cc == '+' || cc == '-') {
3840				*dd++ = cc;
3841				break;
3842			}
3843
3844			*dd = '\0';
3845			dd = dest; /* reset string buffer */
3846			if (found_key) {
3847				if (nelems > 1) {
3848					/*
3849					 * ERROR: We expected an object, not
3850					 * this number.
3851					 */
3852					return (NULL);
3853				}
3854				return (dest);
3855			}
3856
3857			cur--;
3858			state = DTRACE_JSON_COMMA;
3859			break;
3860		case DTRACE_JSON_VALUE:
3861			if (isspace(cc))
3862				break;
3863
3864			if (cc == '{' || cc == '[') {
3865				if (nelems > 1 && found_key) {
3866					in_array = cc == '[' ? B_TRUE : B_FALSE;
3867					/*
3868					 * If our element selector directs us
3869					 * to descend into this nested object,
3870					 * then move to the next selector
3871					 * element in the list and restart the
3872					 * state machine.
3873					 */
3874					while (*elem != '\0')
3875						elem++;
3876					elem++; /* skip the inter-element NUL */
3877					nelems--;
3878					dd = dest;
3879					if (in_array) {
3880						state = DTRACE_JSON_VALUE;
3881						array_pos = 0;
3882						array_elem = dtrace_strtoll(
3883						    elem, 10, size);
3884						found_key = array_elem == 0 ?
3885						    B_TRUE : B_FALSE;
3886					} else {
3887						found_key = B_FALSE;
3888						state = DTRACE_JSON_OBJECT;
3889					}
3890					break;
3891				}
3892
3893				/*
3894				 * Otherwise, we wish to either skip this
3895				 * nested object or return it in full.
3896				 */
3897				if (cc == '[')
3898					brackets = 1;
3899				else
3900					braces = 1;
3901				*dd++ = cc;
3902				state = DTRACE_JSON_COLLECT_OBJECT;
3903				break;
3904			}
3905
3906			if (cc == '"') {
3907				state = DTRACE_JSON_STRING;
3908				break;
3909			}
3910
3911			if (islower(cc)) {
3912				/*
3913				 * Here we deal with true, false and null.
3914				 */
3915				*dd++ = cc;
3916				state = DTRACE_JSON_IDENTIFIER;
3917				break;
3918			}
3919
3920			if (cc == '-' || isdigit(cc)) {
3921				*dd++ = cc;
3922				state = DTRACE_JSON_NUMBER;
3923				break;
3924			}
3925
3926			/*
3927			 * ERROR: unexpected character at start of value.
3928			 */
3929			return (NULL);
3930		case DTRACE_JSON_COLLECT_OBJECT:
3931			if (cc == '\0')
3932				/*
3933				 * ERROR: unexpected end of input.
3934				 */
3935				return (NULL);
3936
3937			*dd++ = cc;
3938			if (cc == '"') {
3939				collect_object = B_TRUE;
3940				state = DTRACE_JSON_STRING;
3941				break;
3942			}
3943
3944			if (cc == ']') {
3945				if (brackets-- == 0) {
3946					/*
3947					 * ERROR: unbalanced brackets.
3948					 */
3949					return (NULL);
3950				}
3951			} else if (cc == '}') {
3952				if (braces-- == 0) {
3953					/*
3954					 * ERROR: unbalanced braces.
3955					 */
3956					return (NULL);
3957				}
3958			} else if (cc == '{') {
3959				braces++;
3960			} else if (cc == '[') {
3961				brackets++;
3962			}
3963
3964			if (brackets == 0 && braces == 0) {
3965				if (found_key) {
3966					*dd = '\0';
3967					return (dest);
3968				}
3969				dd = dest; /* reset string buffer */
3970				state = DTRACE_JSON_COMMA;
3971			}
3972			break;
3973		}
3974	}
3975	return (NULL);
3976}
3977
3978/*
3979 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
3980 * Notice that we don't bother validating the proper number of arguments or
3981 * their types in the tuple stack.  This isn't needed because all argument
3982 * interpretation is safe because of our load safety -- the worst that can
3983 * happen is that a bogus program can obtain bogus results.
3984 */
3985static void
3986dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3987    dtrace_key_t *tupregs, int nargs,
3988    dtrace_mstate_t *mstate, dtrace_state_t *state)
3989{
3990	volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
3991	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
3992	dtrace_vstate_t *vstate = &state->dts_vstate;
3993
3994#if defined(sun)
3995	union {
3996		mutex_impl_t mi;
3997		uint64_t mx;
3998	} m;
3999
4000	union {
4001		krwlock_t ri;
4002		uintptr_t rw;
4003	} r;
4004#else
4005	struct thread *lowner;
4006	union {
4007		struct lock_object *li;
4008		uintptr_t lx;
4009	} l;
4010#endif
4011
4012	switch (subr) {
4013	case DIF_SUBR_RAND:
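		/*
		 * A linear-congruential step seeded from the current
		 * high-resolution time; this is quick, probe-context-safe
		 * pseudo-randomness, not a cryptographic source.
		 */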
4014		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
4015		break;
4016
4017#if defined(sun)
4018	case DIF_SUBR_MUTEX_OWNED:
4019		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4020		    mstate, vstate)) {
4021			regs[rd] = 0;
4022			break;
4023		}
4024
4025		m.mx = dtrace_load64(tupregs[0].dttk_value);
4026		if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4027			regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4028		else
4029			regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4030		break;
4031
4032	case DIF_SUBR_MUTEX_OWNER:
4033		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4034		    mstate, vstate)) {
4035			regs[rd] = 0;
4036			break;
4037		}
4038
4039		m.mx = dtrace_load64(tupregs[0].dttk_value);
4040		if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4041		    MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4042			regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4043		else
4044			regs[rd] = 0;
4045		break;
4046
4047	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4048		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4049		    mstate, vstate)) {
4050			regs[rd] = 0;
4051			break;
4052		}
4053
4054		m.mx = dtrace_load64(tupregs[0].dttk_value);
4055		regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4056		break;
4057
4058	case DIF_SUBR_MUTEX_TYPE_SPIN:
4059		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4060		    mstate, vstate)) {
4061			regs[rd] = 0;
4062			break;
4063		}
4064
4065		m.mx = dtrace_load64(tupregs[0].dttk_value);
4066		regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4067		break;
4068
4069	case DIF_SUBR_RW_READ_HELD: {
4070		uintptr_t tmp;
4071
4072		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4073		    mstate, vstate)) {
4074			regs[rd] = 0;
4075			break;
4076		}
4077
4078		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4079		regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4080		break;
4081	}
4082
4083	case DIF_SUBR_RW_WRITE_HELD:
4084		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4085		    mstate, vstate)) {
4086			regs[rd] = 0;
4087			break;
4088		}
4089
4090		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4091		regs[rd] = _RW_WRITE_HELD(&r.ri);
4092		break;
4093
4094	case DIF_SUBR_RW_ISWRITER:
4095		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4096		    mstate, vstate)) {
4097			regs[rd] = 0;
4098			break;
4099		}
4100
4101		r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4102		regs[rd] = _RW_ISWRITER(&r.ri);
4103		break;
4104
4105#else
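	/*
	 * On FreeBSD, each of the lock types below embeds a struct
	 * lock_object, so the lock class's lc_owner method can report
	 * ownership uniformly for mutexes, rwlocks and sx locks.
	 */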
4106	case DIF_SUBR_MUTEX_OWNED:
4107		if (!dtrace_canload(tupregs[0].dttk_value,
4108			sizeof (struct lock_object), mstate, vstate)) {
4109			regs[rd] = 0;
4110			break;
4111		}
4112		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4113		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4114		break;
4115
4116	case DIF_SUBR_MUTEX_OWNER:
4117		if (!dtrace_canload(tupregs[0].dttk_value,
4118			sizeof (struct lock_object), mstate, vstate)) {
4119			regs[rd] = 0;
4120			break;
4121		}
4122		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4123		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4124		regs[rd] = (uintptr_t)lowner;
4125		break;
4126
4127	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4128		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4129		    mstate, vstate)) {
4130			regs[rd] = 0;
4131			break;
4132		}
4133		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4134		/* XXX - should be only LC_SLEEPABLE? */
4135		regs[rd] = (LOCK_CLASS(l.li)->lc_flags &
4136		    (LC_SLEEPLOCK | LC_SLEEPABLE)) != 0;
4137		break;
4138
4139	case DIF_SUBR_MUTEX_TYPE_SPIN:
4140		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4141		    mstate, vstate)) {
4142			regs[rd] = 0;
4143			break;
4144		}
4145		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4146		regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
4147		break;
4148
4149	case DIF_SUBR_RW_READ_HELD:
4150	case DIF_SUBR_SX_SHARED_HELD:
4151		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4152		    mstate, vstate)) {
4153			regs[rd] = 0;
4154			break;
4155		}
4156		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4157		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4158		    lowner == NULL;
4159		break;
4160
4161	case DIF_SUBR_RW_WRITE_HELD:
4162	case DIF_SUBR_SX_EXCLUSIVE_HELD:
4163		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4164		    mstate, vstate)) {
4165			regs[rd] = 0;
4166			break;
4167		}
4168		l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4169		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4170		regs[rd] = (lowner == curthread);
4171		break;
4172
4173	case DIF_SUBR_RW_ISWRITER:
4174	case DIF_SUBR_SX_ISEXCLUSIVE:
4175		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4176		    mstate, vstate)) {
4177			regs[rd] = 0;
4178			break;
4179		}
4180		l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4181		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4182		    lowner != NULL;
4183		break;
4184#endif /* ! defined(sun) */
4185
4186	case DIF_SUBR_BCOPY: {
4187		/*
4188		 * We need to be sure that the destination is in the scratch
4189		 * region -- no other region is allowed.
4190		 */
4191		uintptr_t src = tupregs[0].dttk_value;
4192		uintptr_t dest = tupregs[1].dttk_value;
4193		size_t size = tupregs[2].dttk_value;
4194
4195		if (!dtrace_inscratch(dest, size, mstate)) {
4196			*flags |= CPU_DTRACE_BADADDR;
4197			*illval = regs[rd];
4198			break;
4199		}
4200
4201		if (!dtrace_canload(src, size, mstate, vstate)) {
4202			regs[rd] = 0;
4203			break;
4204		}
4205
4206		dtrace_bcopy((void *)src, (void *)dest, size);
4207		break;
4208	}
4209
4210	case DIF_SUBR_ALLOCA:
4211	case DIF_SUBR_COPYIN: {
4212		uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4213		uint64_t size =
4214		    tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4215		size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4216
4217		/*
4218		 * This action doesn't require any credential checks since
4219		 * probes will not activate in user contexts to which the
4220		 * enabling user does not have permissions.
4221		 */
4222
4223		/*
4224		 * Rounding up the user allocation size could have overflowed
4225		 * a large, bogus allocation (like -1ULL) to 0.
4226		 */
4227		if (scratch_size < size ||
4228		    !DTRACE_INSCRATCH(mstate, scratch_size)) {
4229			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4230			regs[rd] = 0;
4231			break;
4232		}
4233
4234		if (subr == DIF_SUBR_COPYIN) {
4235			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4236			dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4237			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4238		}
4239
4240		mstate->dtms_scratch_ptr += scratch_size;
4241		regs[rd] = dest;
4242		break;
4243	}
4244
4245	case DIF_SUBR_COPYINTO: {
4246		uint64_t size = tupregs[1].dttk_value;
4247		uintptr_t dest = tupregs[2].dttk_value;
4248
4249		/*
4250		 * This action doesn't require any credential checks since
4251		 * probes will not activate in user contexts to which the
4252		 * enabling user does not have permissions.
4253		 */
4254		if (!dtrace_inscratch(dest, size, mstate)) {
4255			*flags |= CPU_DTRACE_BADADDR;
4256			*illval = regs[rd];
4257			break;
4258		}
4259
4260		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4261		dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4262		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4263		break;
4264	}
4265
4266	case DIF_SUBR_COPYINSTR: {
4267		uintptr_t dest = mstate->dtms_scratch_ptr;
4268		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4269
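		/*
		 * An optional second argument caps the number of bytes
		 * copied; the extra byte leaves room for the NUL that is
		 * forced into the final position below.
		 */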
4270		if (nargs > 1 && tupregs[1].dttk_value < size)
4271			size = tupregs[1].dttk_value + 1;
4272
4273		/*
4274		 * This action doesn't require any credential checks since
4275		 * probes will not activate in user contexts to which the
4276		 * enabling user does not have permissions.
4277		 */
4278		if (!DTRACE_INSCRATCH(mstate, size)) {
4279			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4280			regs[rd] = 0;
4281			break;
4282		}
4283
4284		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4285		dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4286		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4287
4288		((char *)dest)[size - 1] = '\0';
4289		mstate->dtms_scratch_ptr += size;
4290		regs[rd] = dest;
4291		break;
4292	}
4293
4294#if defined(sun)
4295	case DIF_SUBR_MSGSIZE:
4296	case DIF_SUBR_MSGDSIZE: {
4297		uintptr_t baddr = tupregs[0].dttk_value, daddr;
4298		uintptr_t wptr, rptr;
4299		size_t count = 0;
4300		int cont = 0;
4301
4302		while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
4303
4304			if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
4305			    vstate)) {
4306				regs[rd] = 0;
4307				break;
4308			}
4309
4310			wptr = dtrace_loadptr(baddr +
4311			    offsetof(mblk_t, b_wptr));
4312
4313			rptr = dtrace_loadptr(baddr +
4314			    offsetof(mblk_t, b_rptr));
4315
4316			if (wptr < rptr) {
4317				*flags |= CPU_DTRACE_BADADDR;
4318				*illval = tupregs[0].dttk_value;
4319				break;
4320			}
4321
4322			daddr = dtrace_loadptr(baddr +
4323			    offsetof(mblk_t, b_datap));
4324
4325			baddr = dtrace_loadptr(baddr +
4326			    offsetof(mblk_t, b_cont));
4327
4328			/*
4329			 * We want to protect against denial-of-service here,
4330			 * so we're only going to search the list for
4331			 * dtrace_msgdsize_max mblks.
4332			 */
4333			if (cont++ > dtrace_msgdsize_max) {
4334				*flags |= CPU_DTRACE_ILLOP;
4335				break;
4336			}
4337
4338			if (subr == DIF_SUBR_MSGDSIZE) {
4339				if (dtrace_load8(daddr +
4340				    offsetof(dblk_t, db_type)) != M_DATA)
4341					continue;
4342			}
4343
4344			count += wptr - rptr;
4345		}
4346
4347		if (!(*flags & CPU_DTRACE_FAULT))
4348			regs[rd] = count;
4349
4350		break;
4351	}
4352#endif
4353
4354	case DIF_SUBR_PROGENYOF: {
4355		pid_t pid = tupregs[0].dttk_value;
4356		proc_t *p;
4357		int rval = 0;
4358
4359		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4360
4361		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
4362#if defined(sun)
4363			if (p->p_pidp->pid_id == pid) {
4364#else
4365			if (p->p_pid == pid) {
4366#endif
4367				rval = 1;
4368				break;
4369			}
4370		}
4371
4372		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4373
4374		regs[rd] = rval;
4375		break;
4376	}
4377
4378	case DIF_SUBR_SPECULATION:
4379		regs[rd] = dtrace_speculation(state);
4380		break;
4381
4382	case DIF_SUBR_COPYOUT: {
4383		uintptr_t kaddr = tupregs[0].dttk_value;
4384		uintptr_t uaddr = tupregs[1].dttk_value;
4385		uint64_t size = tupregs[2].dttk_value;
4386
4387		if (!dtrace_destructive_disallow &&
4388		    dtrace_priv_proc_control(state) &&
4389		    !dtrace_istoxic(kaddr, size)) {
4390			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4391			dtrace_copyout(kaddr, uaddr, size, flags);
4392			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4393		}
4394		break;
4395	}
4396
4397	case DIF_SUBR_COPYOUTSTR: {
4398		uintptr_t kaddr = tupregs[0].dttk_value;
4399		uintptr_t uaddr = tupregs[1].dttk_value;
4400		uint64_t size = tupregs[2].dttk_value;
4401
4402		if (!dtrace_destructive_disallow &&
4403		    dtrace_priv_proc_control(state) &&
4404		    !dtrace_istoxic(kaddr, size)) {
4405			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4406			dtrace_copyoutstr(kaddr, uaddr, size, flags);
4407			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4408		}
4409		break;
4410	}
4411
4412	case DIF_SUBR_STRLEN: {
4413		size_t sz;
4414		uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4415		sz = dtrace_strlen((char *)addr,
4416		    state->dts_options[DTRACEOPT_STRSIZE]);
4417
4418		if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
4419			regs[rd] = 0;
4420			break;
4421		}
4422
4423		regs[rd] = sz;
4424
4425		break;
4426	}
4427
4428	case DIF_SUBR_STRCHR:
4429	case DIF_SUBR_STRRCHR: {
4430		/*
4431		 * We're going to iterate over the string looking for the
4432		 * specified character.  We will iterate until we have reached
4433		 * the string length or we have found the character.  If this
4434		 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4435		 * of the specified character instead of the first.
4436		 */
4437		uintptr_t saddr = tupregs[0].dttk_value;
4438		uintptr_t addr = tupregs[0].dttk_value;
4439		uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
4440		char c, target = (char)tupregs[1].dttk_value;
4441
4442		for (regs[rd] = 0; addr < limit; addr++) {
4443			if ((c = dtrace_load8(addr)) == target) {
4444				regs[rd] = addr;
4445
4446				if (subr == DIF_SUBR_STRCHR)
4447					break;
4448			}
4449
4450			if (c == '\0')
4451				break;
4452		}
4453
4454		if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
4455			regs[rd] = 0;
4456			break;
4457		}
4458
4459		break;
4460	}
4461
4462	case DIF_SUBR_STRSTR:
4463	case DIF_SUBR_INDEX:
4464	case DIF_SUBR_RINDEX: {
4465		/*
4466		 * We're going to iterate over the string looking for the
4467		 * specified string.  We will iterate until we have reached
4468		 * the string length or we have found the string.  (Yes, this
4469		 * is done in the most naive way possible -- but considering
4470		 * that the string we're searching for is likely to be
4471		 * relatively short, the complexity of Rabin-Karp or similar
4472		 * hardly seems merited.)
4473		 */
4474		char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4475		char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4476		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4477		size_t len = dtrace_strlen(addr, size);
4478		size_t sublen = dtrace_strlen(substr, size);
4479		char *limit = addr + len, *orig = addr;
4480		int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4481		int inc = 1;
4482
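		/*
		 * e.g., strstr("foobar", "oba") returns a pointer into the
		 * string at "obar", index("foobar", "o") returns 1 and
		 * rindex("foobar", "o") returns 2 (both zero-based).
		 */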
4483		regs[rd] = notfound;
4484
4485		if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4486			regs[rd] = 0;
4487			break;
4488		}
4489
4490		if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4491		    vstate)) {
4492			regs[rd] = 0;
4493			break;
4494		}
4495
4496		/*
4497		 * strstr() and index()/rindex() have similar semantics if
4498		 * both strings are the empty string: strstr() returns a
4499		 * pointer to the (empty) string, and index() and rindex()
4500		 * both return index 0 (regardless of any position argument).
4501		 */
4502		if (sublen == 0 && len == 0) {
4503			if (subr == DIF_SUBR_STRSTR)
4504				regs[rd] = (uintptr_t)addr;
4505			else
4506				regs[rd] = 0;
4507			break;
4508		}
4509
4510		if (subr != DIF_SUBR_STRSTR) {
4511			if (subr == DIF_SUBR_RINDEX) {
4512				limit = orig - 1;
4513				addr += len;
4514				inc = -1;
4515			}
4516
4517			/*
4518			 * Both index() and rindex() take an optional position
4519			 * argument that denotes the starting position.
4520			 */
4521			if (nargs == 3) {
4522				int64_t pos = (int64_t)tupregs[2].dttk_value;
4523
4524				/*
4525				 * If the position argument to index() is
4526				 * negative, Perl implicitly clamps it at
4527				 * zero.  This semantic is a little surprising
4528				 * given the special meaning of negative
4529				 * positions to similar Perl functions like
4530				 * substr(), but it appears to reflect a
4531				 * notion that index() can start from a
4532				 * negative index and increment its way up to
4533				 * the string.  Given this notion, Perl's
4534				 * rindex() is at least self-consistent in
4535				 * that it implicitly clamps positions greater
4536				 * than the string length to be the string
4537				 * length.  Where Perl completely loses
4538				 * coherence, however, is when the specified
4539				 * substring is the empty string ("").  In
4540				 * this case, even if the position is
4541				 * negative, rindex() returns 0 -- and even if
4542				 * the position is greater than the length,
4543				 * index() returns the string length.  These
4544				 * semantics violate the notion that index()
4545				 * should never return a value less than the
4546				 * specified position and that rindex() should
4547				 * never return a value greater than the
4548				 * specified position.  (One assumes that
4549				 * these semantics are artifacts of Perl's
4550				 * implementation and not the results of
4551				 * deliberate design -- it beggars belief that
4552				 * even Larry Wall could desire such oddness.)
4553				 * While in the abstract one would wish for
4554				 * consistent position semantics across
4555				 * substr(), index() and rindex() -- or at the
4556				 * very least self-consistent position
4557				 * semantics for index() and rindex() -- we
4558				 * instead opt to keep with the extant Perl
4559				 * semantics, in all their broken glory.  (Do
4560				 * we have more desire to maintain Perl's
4561				 * semantics than Perl does?  Probably.)
4562				 */
4563				if (subr == DIF_SUBR_RINDEX) {
4564					if (pos < 0) {
4565						if (sublen == 0)
4566							regs[rd] = 0;
4567						break;
4568					}
4569
4570					if (pos > len)
4571						pos = len;
4572				} else {
4573					if (pos < 0)
4574						pos = 0;
4575
4576					if (pos >= len) {
4577						if (sublen == 0)
4578							regs[rd] = len;
4579						break;
4580					}
4581				}
4582
4583				addr = orig + pos;
4584			}
4585		}
4586
4587		for (regs[rd] = notfound; addr != limit; addr += inc) {
4588			if (dtrace_strncmp(addr, substr, sublen) == 0) {
4589				if (subr != DIF_SUBR_STRSTR) {
4590					/*
4591					 * As D index() and rindex() are
4592					 * modeled on Perl (and not on awk),
4593					 * we return a zero-based (and not a
4594					 * one-based) index.  (For you Perl
4595					 * weenies: no, we're not going to add
4596					 * $[ -- and shouldn't you be at a con
4597					 * or something?)
4598					 */
4599					regs[rd] = (uintptr_t)(addr - orig);
4600					break;
4601				}
4602
4603				ASSERT(subr == DIF_SUBR_STRSTR);
4604				regs[rd] = (uintptr_t)addr;
4605				break;
4606			}
4607		}
4608
4609		break;
4610	}
4611
4612	case DIF_SUBR_STRTOK: {
4613		uintptr_t addr = tupregs[0].dttk_value;
4614		uintptr_t tokaddr = tupregs[1].dttk_value;
4615		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4616		uintptr_t limit, toklimit = tokaddr + size;
4617		uint8_t c = 0, tokmap[32];	 /* 256 / 8 */
4618		char *dest = (char *)mstate->dtms_scratch_ptr;
4619		int i;
4620
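		/*
		 * The semantics broadly mirror libc strtok(); e.g., given
		 * the string "/a//b" and the token string "/", the first
		 * call returns "a" and a subsequent call with a NULL string
		 * returns "b".
		 */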
4621		/*
4622		 * Check both the token buffer and (later) the input buffer,
4623		 * since both could be non-scratch addresses.
4624		 */
4625		if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
4626			regs[rd] = 0;
4627			break;
4628		}
4629
4630		if (!DTRACE_INSCRATCH(mstate, size)) {
4631			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4632			regs[rd] = 0;
4633			break;
4634		}
4635
4636		if (addr == 0) {
4637			/*
4638			 * If the address specified is NULL, we use our saved
4639			 * strtok pointer from the mstate.  Note that this
4640			 * means that the saved strtok pointer is _only_
4641			 * valid within multiple enablings of the same probe --
4642			 * it behaves like an implicit clause-local variable.
4643			 */
4644			addr = mstate->dtms_strtok;
4645		} else {
4646			/*
4647			 * If the user-specified address is non-NULL we must
4648			 * access check it.  This is the only time we have
4649			 * a chance to do so, since this address may reside
4650			 * in the string table of this clause -- future calls
4651			 * (when we fetch addr from mstate->dtms_strtok)
4652			 * would fail this access check.
4653			 */
4654			if (!dtrace_strcanload(addr, size, mstate, vstate)) {
4655				regs[rd] = 0;
4656				break;
4657			}
4658		}
4659
4660		/*
4661		 * First, zero the token map, and then process the token
4662		 * string -- setting a bit in the map for every character
4663		 * found in the token string.
4664		 */
4665		for (i = 0; i < sizeof (tokmap); i++)
4666			tokmap[i] = 0;
4667
4668		for (; tokaddr < toklimit; tokaddr++) {
4669			if ((c = dtrace_load8(tokaddr)) == '\0')
4670				break;
4671
4672			ASSERT((c >> 3) < sizeof (tokmap));
4673			tokmap[c >> 3] |= (1 << (c & 0x7));
4674		}
4675
4676		for (limit = addr + size; addr < limit; addr++) {
4677			/*
4678			 * We're looking for a character that is _not_ contained
4679			 * in the token string.
4680			 */
4681			if ((c = dtrace_load8(addr)) == '\0')
4682				break;
4683
4684			if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4685				break;
4686		}
4687
4688		if (c == '\0') {
4689			/*
4690			 * We reached the end of the string without finding
4691			 * any character that was not in the token string.
4692			 * We return NULL in this case, and we set the saved
4693			 * address to NULL as well.
4694			 */
4695			regs[rd] = 0;
4696			mstate->dtms_strtok = 0;
4697			break;
4698		}
4699
4700		/*
4701		 * From here on, we're copying into the destination string.
4702		 */
4703		for (i = 0; addr < limit && i < size - 1; addr++) {
4704			if ((c = dtrace_load8(addr)) == '\0')
4705				break;
4706
4707			if (tokmap[c >> 3] & (1 << (c & 0x7)))
4708				break;
4709
4710			ASSERT(i < size);
4711			dest[i++] = c;
4712		}
4713
4714		ASSERT(i < size);
4715		dest[i] = '\0';
4716		regs[rd] = (uintptr_t)dest;
4717		mstate->dtms_scratch_ptr += size;
4718		mstate->dtms_strtok = addr;
4719		break;
4720	}
4721
4722	case DIF_SUBR_SUBSTR: {
4723		uintptr_t s = tupregs[0].dttk_value;
4724		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4725		char *d = (char *)mstate->dtms_scratch_ptr;
4726		int64_t index = (int64_t)tupregs[1].dttk_value;
4727		int64_t remaining = (int64_t)tupregs[2].dttk_value;
4728		size_t len = dtrace_strlen((char *)s, size);
4729		int64_t i = 0;
4730
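		/*
		 * e.g., substr("hello", 1, 3) yields "ell"; a negative index
		 * counts back from the end of the string, so
		 * substr("hello", -3) yields "llo".
		 */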
4731		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4732			regs[rd] = 0;
4733			break;
4734		}
4735
4736		if (!DTRACE_INSCRATCH(mstate, size)) {
4737			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4738			regs[rd] = 0;
4739			break;
4740		}
4741
4742		if (nargs <= 2)
4743			remaining = (int64_t)size;
4744
4745		if (index < 0) {
4746			index += len;
4747
4748			if (index < 0 && index + remaining > 0) {
4749				remaining += index;
4750				index = 0;
4751			}
4752		}
4753
4754		if (index >= len || index < 0) {
4755			remaining = 0;
4756		} else if (remaining < 0) {
4757			remaining += len - index;
4758		} else if (index + remaining > size) {
4759			remaining = size - index;
4760		}
4761
4762		for (i = 0; i < remaining; i++) {
4763			if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4764				break;
4765		}
4766
4767		d[i] = '\0';
4768
4769		mstate->dtms_scratch_ptr += size;
4770		regs[rd] = (uintptr_t)d;
4771		break;
4772	}
4773
4774	case DIF_SUBR_JSON: {
4775		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4776		uintptr_t json = tupregs[0].dttk_value;
4777		size_t jsonlen = dtrace_strlen((char *)json, size);
4778		uintptr_t elem = tupregs[1].dttk_value;
4779		size_t elemlen = dtrace_strlen((char *)elem, size);
4780
4781		char *dest = (char *)mstate->dtms_scratch_ptr;
4782		char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
4783		char *ee = elemlist;
4784		int nelems = 1;
4785		uintptr_t cur;
4786
4787		if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
4788		    !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
4789			regs[rd] = 0;
4790			break;
4791		}
4792
4793		if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
4794			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4795			regs[rd] = 0;
4796			break;
4797		}
4798
4799		/*
4800		 * Read the element selector and split it up into a packed list
4801		 * of strings.
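		 * e.g., "foo[0].bar" becomes "foo" NUL "0" NUL "bar" NUL,
		 * with nelems = 3.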
4802		 */
4803		for (cur = elem; cur < elem + elemlen; cur++) {
4804			char cc = dtrace_load8(cur);
4805
4806			if (cur == elem && cc == '[') {
4807				/*
4808				 * If the first element selector key is
4809				 * actually an array index then ignore the
4810				 * bracket.
4811				 */
4812				continue;
4813			}
4814
4815			if (cc == ']')
4816				continue;
4817
4818			if (cc == '.' || cc == '[') {
4819				nelems++;
4820				cc = '\0';
4821			}
4822
4823			*ee++ = cc;
4824		}
4825		*ee++ = '\0';
4826
4827		if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
4828		    nelems, dest)) != 0)
4829			mstate->dtms_scratch_ptr += jsonlen + 1;
4830		break;
4831	}
4832
4833	case DIF_SUBR_TOUPPER:
4834	case DIF_SUBR_TOLOWER: {
4835		uintptr_t s = tupregs[0].dttk_value;
4836		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4837		char *dest = (char *)mstate->dtms_scratch_ptr, c;
4838		size_t len = dtrace_strlen((char *)s, size);
4839		char lower, upper, convert;
4840		int64_t i;
4841
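		/*
		 * Only the mapped range differs between the two cases; e.g.,
		 * toupper("ab1") yields "AB1" and tolower("Ab1") yields
		 * "ab1".
		 */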
4842		if (subr == DIF_SUBR_TOUPPER) {
4843			lower = 'a';
4844			upper = 'z';
4845			convert = 'A';
4846		} else {
4847			lower = 'A';
4848			upper = 'Z';
4849			convert = 'a';
4850		}
4851
4852		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4853			regs[rd] = 0;
4854			break;
4855		}
4856
4857		if (!DTRACE_INSCRATCH(mstate, size)) {
4858			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4859			regs[rd] = 0;
4860			break;
4861		}
4862
4863		for (i = 0; i < size - 1; i++) {
4864			if ((c = dtrace_load8(s + i)) == '\0')
4865				break;
4866
4867			if (c >= lower && c <= upper)
4868				c = convert + (c - lower);
4869
4870			dest[i] = c;
4871		}
4872
4873		ASSERT(i < size);
4874		dest[i] = '\0';
4875		regs[rd] = (uintptr_t)dest;
4876		mstate->dtms_scratch_ptr += size;
4877		break;
4878	}
4879
4880#if defined(sun)
4881	case DIF_SUBR_GETMAJOR:
4882#ifdef _LP64
4883		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4884#else
4885		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4886#endif
4887		break;
4888
4889	case DIF_SUBR_GETMINOR:
4890#ifdef _LP64
4891		regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4892#else
4893		regs[rd] = tupregs[0].dttk_value & MAXMIN;
4894#endif
4895		break;
4896
4897	case DIF_SUBR_DDI_PATHNAME: {
4898		/*
4899		 * This one is a galactic mess.  We are going to roughly
4900		 * emulate ddi_pathname(), but it's made more complicated
4901		 * by the fact that we (a) want to include the minor name and
4902		 * (b) must proceed iteratively instead of recursively.
4903		 */
4904		uintptr_t dest = mstate->dtms_scratch_ptr;
4905		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4906		char *start = (char *)dest, *end = start + size - 1;
4907		uintptr_t daddr = tupregs[0].dttk_value;
4908		int64_t minor = (int64_t)tupregs[1].dttk_value;
4909		char *s;
4910		int i, len, depth = 0;
4911
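		/*
		 * The pathname is assembled backwards: each component is
		 * prepended at 'end' as we walk from the leaf devinfo node
		 * up toward the root.
		 */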
4912		/*
4913		 * Due to all the pointer jumping we do and context we must
4914		 * rely upon, we just mandate that the user must have kernel
4915		 * read privileges to use this routine.
4916		 */
4917		if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4918			*flags |= CPU_DTRACE_KPRIV;
4919			*illval = daddr;
4920			regs[rd] = 0;
4921		}
4922
4923		if (!DTRACE_INSCRATCH(mstate, size)) {
4924			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4925			regs[rd] = 0;
4926			break;
4927		}
4928
4929		*end = '\0';
4930
4931		/*
4932		 * We want to have a name for the minor.  In order to do this,
4933		 * we need to walk the minor list from the devinfo.  We want
4934		 * to be sure that we don't infinitely walk a circular list,
4935		 * so we check for circularity by sending a scout pointer
4936		 * ahead two elements for every element that we iterate over;
4937		 * if the list is circular, these will ultimately point to the
4938		 * same element.  You may recognize this little trick as the
4939		 * answer to a stupid interview question -- one that always
4940		 * seems to be asked by those who had to have it laboriously
4941		 * explained to them, and who can't even concisely describe
4942		 * the conditions under which one would be forced to resort to
4943		 * this technique.  Needless to say, those conditions are
4944		 * found here -- and probably only here.  Is this the only use
4945		 * of this infamous trick in shipping, production code?  If it
4946		 * isn't, it probably should be...
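		 * (For the record, the trick in question is two-pointer
		 * "tortoise and hare" cycle detection, commonly attributed
		 * to Floyd: the scout advances two links for each one that
		 * maddr advances.)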
4947		 */
4948		if (minor != -1) {
4949			uintptr_t maddr = dtrace_loadptr(daddr +
4950			    offsetof(struct dev_info, devi_minor));
4951
4952			uintptr_t next = offsetof(struct ddi_minor_data, next);
4953			uintptr_t name = offsetof(struct ddi_minor_data,
4954			    d_minor) + offsetof(struct ddi_minor, name);
4955			uintptr_t dev = offsetof(struct ddi_minor_data,
4956			    d_minor) + offsetof(struct ddi_minor, dev);
4957			uintptr_t scout;
4958
4959			if (maddr != NULL)
4960				scout = dtrace_loadptr(maddr + next);
4961
4962			while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4963				uint64_t m;
4964#ifdef _LP64
4965				m = dtrace_load64(maddr + dev) & MAXMIN64;
4966#else
4967				m = dtrace_load32(maddr + dev) & MAXMIN;
4968#endif
4969				if (m != minor) {
4970					maddr = dtrace_loadptr(maddr + next);
4971
4972					if (scout == NULL)
4973						continue;
4974
4975					scout = dtrace_loadptr(scout + next);
4976
4977					if (scout == NULL)
4978						continue;
4979
4980					scout = dtrace_loadptr(scout + next);
4981
4982					if (scout == NULL)
4983						continue;
4984
4985					if (scout == maddr) {
4986						*flags |= CPU_DTRACE_ILLOP;
4987						break;
4988					}
4989
4990					continue;
4991				}
4992
4993				/*
4994				 * We have the minor data.  Now we need to
4995				 * copy the minor's name into the end of the
4996				 * pathname.
4997				 */
4998				s = (char *)dtrace_loadptr(maddr + name);
4999				len = dtrace_strlen(s, size);
5000
5001				if (*flags & CPU_DTRACE_FAULT)
5002					break;
5003
5004				if (len != 0) {
5005					if ((end -= (len + 1)) < start)
5006						break;
5007
5008					*end = ':';
5009				}
5010
5011				for (i = 1; i <= len; i++)
5012					end[i] = dtrace_load8((uintptr_t)s++);
5013				break;
5014			}
5015		}
5016
5017		while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5018			ddi_node_state_t devi_state;
5019
5020			devi_state = dtrace_load32(daddr +
5021			    offsetof(struct dev_info, devi_node_state));
5022
5023			if (*flags & CPU_DTRACE_FAULT)
5024				break;
5025
5026			if (devi_state >= DS_INITIALIZED) {
5027				s = (char *)dtrace_loadptr(daddr +
5028				    offsetof(struct dev_info, devi_addr));
5029				len = dtrace_strlen(s, size);
5030
5031				if (*flags & CPU_DTRACE_FAULT)
5032					break;
5033
5034				if (len != 0) {
5035					if ((end -= (len + 1)) < start)
5036						break;
5037
5038					*end = '@';
5039				}
5040
5041				for (i = 1; i <= len; i++)
5042					end[i] = dtrace_load8((uintptr_t)s++);
5043			}
5044
5045			/*
5046			 * Now for the node name...
5047			 */
5048			s = (char *)dtrace_loadptr(daddr +
5049			    offsetof(struct dev_info, devi_node_name));
5050
5051			daddr = dtrace_loadptr(daddr +
5052			    offsetof(struct dev_info, devi_parent));
5053
5054			/*
5055			 * If our parent is NULL (that is, if we're the root
5056			 * node), we're going to use the special path
5057			 * "devices".
5058			 */
5059			if (daddr == 0)
5060				s = "devices";
5061
5062			len = dtrace_strlen(s, size);
5063			if (*flags & CPU_DTRACE_FAULT)
5064				break;
5065
5066			if ((end -= (len + 1)) < start)
5067				break;
5068
5069			for (i = 1; i <= len; i++)
5070				end[i] = dtrace_load8((uintptr_t)s++);
5071			*end = '/';
5072
5073			if (depth++ > dtrace_devdepth_max) {
5074				*flags |= CPU_DTRACE_ILLOP;
5075				break;
5076			}
5077		}
5078
5079		if (end < start)
5080			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5081
5082		if (daddr == 0) {
5083			regs[rd] = (uintptr_t)end;
5084			mstate->dtms_scratch_ptr += size;
5085		}
5086
5087		break;
5088	}
5089#endif
5090
5091	case DIF_SUBR_STRJOIN: {
5092		char *d = (char *)mstate->dtms_scratch_ptr;
5093		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5094		uintptr_t s1 = tupregs[0].dttk_value;
5095		uintptr_t s2 = tupregs[1].dttk_value;
5096		int i = 0;
5097
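		/*
		 * e.g., strjoin("foo", "bar") yields "foobar".  If the
		 * joined result will not fit in STRSIZE bytes, we flag
		 * CPU_DTRACE_NOSCRATCH and return NULL.
		 */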
5098		if (!dtrace_strcanload(s1, size, mstate, vstate) ||
5099		    !dtrace_strcanload(s2, size, mstate, vstate)) {
5100			regs[rd] = 0;
5101			break;
5102		}
5103
5104		if (!DTRACE_INSCRATCH(mstate, size)) {
5105			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5106			regs[rd] = 0;
5107			break;
5108		}
5109
5110		for (;;) {
5111			if (i >= size) {
5112				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5113				regs[rd] = 0;
5114				break;
5115			}
5116
5117			if ((d[i++] = dtrace_load8(s1++)) == '\0') {
5118				i--;
5119				break;
5120			}
5121		}
5122
5123		for (;;) {
5124			if (i >= size) {
5125				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5126				regs[rd] = 0;
5127				break;
5128			}
5129
5130			if ((d[i++] = dtrace_load8(s2++)) == '\0')
5131				break;
5132		}
5133
5134		if (i < size) {
5135			mstate->dtms_scratch_ptr += i;
5136			regs[rd] = (uintptr_t)d;
5137		}
5138
5139		break;
5140	}
5141
5142	case DIF_SUBR_STRTOLL: {
5143		uintptr_t s = tupregs[0].dttk_value;
5144		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5145		int base = 10;
5146
5147		if (nargs > 1) {
5148			if ((base = tupregs[1].dttk_value) <= 1 ||
5149			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5150				*flags |= CPU_DTRACE_ILLOP;
5151				break;
5152			}
5153		}
5154
5155		if (!dtrace_strcanload(s, size, mstate, vstate)) {
5156			regs[rd] = INT64_MIN;
5157			break;
5158		}
5159
5160		regs[rd] = dtrace_strtoll((char *)s, base, size);
5161		break;
5162	}
5163
5164	case DIF_SUBR_LLTOSTR: {
5165		int64_t i = (int64_t)tupregs[0].dttk_value;
5166		uint64_t val, digit;
5167		uint64_t size = 65;	/* enough room for 2^64 in binary */
5168		char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
5169		int base = 10;
5170
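		/*
		 * The string is built backwards from the end of the scratch
		 * region; e.g., lltostr(255, 16) yields "0xff" and
		 * lltostr(-10) yields "-10".
		 */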
5171		if (nargs > 1) {
5172			if ((base = tupregs[1].dttk_value) <= 1 ||
5173			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5174				*flags |= CPU_DTRACE_ILLOP;
5175				break;
5176			}
5177		}
5178
5179		val = (base == 10 && i < 0) ? i * -1 : i;
5180
5181		if (!DTRACE_INSCRATCH(mstate, size)) {
5182			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5183			regs[rd] = 0;
5184			break;
5185		}
5186
5187		for (*end-- = '\0'; val; val /= base) {
5188			if ((digit = val % base) <= '9' - '0') {
5189				*end-- = '0' + digit;
5190			} else {
5191				*end-- = 'a' + (digit - ('9' - '0') - 1);
5192			}
5193		}
5194
5195		if (i == 0 && base == 16)
5196			*end-- = '0';
5197
5198		if (base == 16)
5199			*end-- = 'x';
5200
5201		if (i == 0 || base == 8 || base == 16)
5202			*end-- = '0';
5203
5204		if (i < 0 && base == 10)
5205			*end-- = '-';
5206
5207		regs[rd] = (uintptr_t)end + 1;
5208		mstate->dtms_scratch_ptr += size;
5209		break;
5210	}
5211
5212	case DIF_SUBR_HTONS:
5213	case DIF_SUBR_NTOHS:
5214#if BYTE_ORDER == BIG_ENDIAN
5215		regs[rd] = (uint16_t)tupregs[0].dttk_value;
5216#else
5217		regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5218#endif
5219		break;
5220
5221
5222	case DIF_SUBR_HTONL:
5223	case DIF_SUBR_NTOHL:
5224#if BYTE_ORDER == BIG_ENDIAN
5225		regs[rd] = (uint32_t)tupregs[0].dttk_value;
5226#else
5227		regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5228#endif
5229		break;
5230
5231
5232	case DIF_SUBR_HTONLL:
5233	case DIF_SUBR_NTOHLL:
5234#if BYTE_ORDER == BIG_ENDIAN
5235		regs[rd] = (uint64_t)tupregs[0].dttk_value;
5236#else
5237		regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5238#endif
5239		break;
5240
5241
5242	case DIF_SUBR_DIRNAME:
5243	case DIF_SUBR_BASENAME: {
5244		char *dest = (char *)mstate->dtms_scratch_ptr;
5245		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5246		uintptr_t src = tupregs[0].dttk_value;
5247		int i, j, len = dtrace_strlen((char *)src, size);
5248		int lastbase = -1, firstbase = -1, lastdir = -1;
5249		int start, end;
5250
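		/*
		 * As with their POSIX namesakes, trailing slashes are
		 * ignored; e.g., basename("/foo/bar//") yields "bar" and
		 * dirname("/foo/bar//") yields "/foo".
		 */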
5251		if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5252			regs[rd] = 0;
5253			break;
5254		}
5255
5256		if (!DTRACE_INSCRATCH(mstate, size)) {
5257			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5258			regs[rd] = 0;
5259			break;
5260		}
5261
5262		/*
5263		 * The basename and dirname for a zero-length string are
5264		 * defined to be ".".
5265		 */
5266		if (len == 0) {
5267			len = 1;
5268			src = (uintptr_t)".";
5269		}
5270
5271		/*
5272		 * Start from the back of the string, moving back toward the
5273		 * front until we see a character that isn't a slash.  That
5274		 * character is the last character in the basename.
5275		 */
5276		for (i = len - 1; i >= 0; i--) {
5277			if (dtrace_load8(src + i) != '/')
5278				break;
5279		}
5280
5281		if (i >= 0)
5282			lastbase = i;
5283
5284		/*
5285		 * Starting from the last character in the basename, move
5286		 * towards the front until we find a slash.  The character
5287		 * that we processed immediately before that is the first
5288		 * character in the basename.
5289		 */
5290		for (; i >= 0; i--) {
5291			if (dtrace_load8(src + i) == '/')
5292				break;
5293		}
5294
5295		if (i >= 0)
5296			firstbase = i + 1;
5297
5298		/*
5299		 * Now keep going until we find a non-slash character.  That
5300		 * character is the last character in the dirname.
5301		 */
5302		for (; i >= 0; i--) {
5303			if (dtrace_load8(src + i) != '/')
5304				break;
5305		}
5306
5307		if (i >= 0)
5308			lastdir = i;
5309
5310		ASSERT(!(lastbase == -1 && firstbase != -1));
5311		ASSERT(!(firstbase == -1 && lastdir != -1));
5312
5313		if (lastbase == -1) {
5314			/*
5315			 * We didn't find a non-slash character.  We know that
5316			 * the length is non-zero, so the whole string must be
5317			 * slashes.  In either the dirname or the basename
5318			 * case, we return '/'.
5319			 */
5320			ASSERT(firstbase == -1);
5321			firstbase = lastbase = lastdir = 0;
5322		}
5323
5324		if (firstbase == -1) {
5325			/*
5326			 * The entire string consists only of a basename
5327			 * component.  If we're looking for dirname, we need
5328			 * to change our string to be just "."; if we're
5329			 * looking for a basename, we'll just set the first
5330			 * character of the basename to be 0.
5331			 */
5332			if (subr == DIF_SUBR_DIRNAME) {
5333				ASSERT(lastdir == -1);
5334				src = (uintptr_t)".";
5335				lastdir = 0;
5336			} else {
5337				firstbase = 0;
5338			}
5339		}
5340
5341		if (subr == DIF_SUBR_DIRNAME) {
5342			if (lastdir == -1) {
5343				/*
5344				 * We know that we have a slash in the name --
5345				 * or lastdir would be set to 0, above.  And
5346				 * because lastdir is -1, we know that this
5347				 * slash must be the first character.  (That
5348				 * is, the full string must be of the form
5349				 * "/basename".)  In this case, the last
5350				 * character of the directory name is 0.
5351				 */
5352				lastdir = 0;
5353			}
5354
5355			start = 0;
5356			end = lastdir;
5357		} else {
5358			ASSERT(subr == DIF_SUBR_BASENAME);
5359			ASSERT(firstbase != -1 && lastbase != -1);
5360			start = firstbase;
5361			end = lastbase;
5362		}
5363
5364		for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
5365			dest[j] = dtrace_load8(src + i);
5366
5367		dest[j] = '\0';
5368		regs[rd] = (uintptr_t)dest;
5369		mstate->dtms_scratch_ptr += size;
5370		break;
5371	}
5372
5373	case DIF_SUBR_GETF: {
5374		uintptr_t fd = tupregs[0].dttk_value;
5375		struct filedesc *fdp;
5376		file_t *fp;
5377
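		/*
		 * Translate the file descriptor to a file_t in the current
		 * process; the pointer is also cached in dtms_getf for
		 * later consumers.
		 */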
5378		if (!dtrace_priv_proc(state)) {
5379			regs[rd] = 0;
5380			break;
5381		}
5382		fdp = curproc->p_fd;
5383		FILEDESC_SLOCK(fdp);
5384		fp = fget_locked(fdp, fd);
5385		mstate->dtms_getf = fp;
5386		regs[rd] = (uintptr_t)fp;
5387		FILEDESC_SUNLOCK(fdp);
5388		break;
5389	}
5390
5391	case DIF_SUBR_CLEANPATH: {
5392		char *dest = (char *)mstate->dtms_scratch_ptr, c;
5393		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5394		uintptr_t src = tupregs[0].dttk_value;
5395		int i = 0, j = 0;
5396#if defined(sun)
5397		zone_t *z;
5398#endif
5399
5400		if (!dtrace_strcanload(src, size, mstate, vstate)) {
5401			regs[rd] = 0;
5402			break;
5403		}
5404
5405		if (!DTRACE_INSCRATCH(mstate, size)) {
5406			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5407			regs[rd] = 0;
5408			break;
5409		}
5410
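		/*
		 * e.g., "/foo//bar/./baz/../qux" cleans up to
		 * "/foo/bar/qux".
		 */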
5411		/*
5412		 * Move forward, loading each character.
5413		 */
5414		do {
5415			c = dtrace_load8(src + i++);
5416next:
5417			if (j + 5 >= size)	/* 5 = sizeof ("/..c") */
5418				break;
5419
5420			if (c != '/') {
5421				dest[j++] = c;
5422				continue;
5423			}
5424
5425			c = dtrace_load8(src + i++);
5426
5427			if (c == '/') {
5428				/*
5429				 * We have two slashes -- we can just advance
5430				 * to the next character.
5431				 */
5432				goto next;
5433			}
5434
5435			if (c != '.') {
5436				/*
5437				 * This is not "." and it's not ".." -- we can
5438				 * just store the "/" and this character and
5439				 * drive on.
5440				 */
5441				dest[j++] = '/';
5442				dest[j++] = c;
5443				continue;
5444			}
5445
5446			c = dtrace_load8(src + i++);
5447
5448			if (c == '/') {
5449				/*
5450				 * This is a "/./" component.  We're not going
5451				 * to store anything in the destination buffer;
5452				 * we're just going to go to the next component.
5453				 */
5454				goto next;
5455			}
5456
5457			if (c != '.') {
5458				/*
5459				 * This is not ".." -- we can just store the
5460				 * "/." and this character and continue
5461				 * processing.
5462				 */
5463				dest[j++] = '/';
5464				dest[j++] = '.';
5465				dest[j++] = c;
5466				continue;
5467			}
5468
5469			c = dtrace_load8(src + i++);
5470
5471			if (c != '/' && c != '\0') {
5472				/*
5473				 * This is not ".." -- it's "..[mumble]".
5474				 * We'll store the "/.." and this character
5475				 * and continue processing.
5476				 */
5477				dest[j++] = '/';
5478				dest[j++] = '.';
5479				dest[j++] = '.';
5480				dest[j++] = c;
5481				continue;
5482			}
5483
5484			/*
5485			 * This is "/../" or "/..\0".  We need to back up
5486			 * our destination pointer until we find a "/".
5487			 */
5488			i--;
5489			while (j != 0 && dest[--j] != '/')
5490				continue;
5491
5492			if (c == '\0')
5493				dest[++j] = '/';
5494		} while (c != '\0');
5495
5496		dest[j] = '\0';
5497
5498#if defined(sun)
5499		if (mstate->dtms_getf != NULL &&
5500		    !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
5501		    (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
5502			/*
5503			 * If we've done a getf() as a part of this ECB and we
5504			 * don't have kernel access (and we're not in the global
5505			 * zone), check if the path we cleaned up begins with
5506			 * the zone's root path, and trim it off if so.  Note
5507			 * that this is an output cleanliness issue, not a
5508			 * security issue: knowing one's zone root path does
5509			 * not enable privilege escalation.
5510			 */
5511			if (strstr(dest, z->zone_rootpath) == dest)
5512				dest += strlen(z->zone_rootpath) - 1;
5513		}
5514#endif
5515
5516		regs[rd] = (uintptr_t)dest;
5517		mstate->dtms_scratch_ptr += size;
5518		break;
5519	}
5520
5521	case DIF_SUBR_INET_NTOA:
5522	case DIF_SUBR_INET_NTOA6:
5523	case DIF_SUBR_INET_NTOP: {
5524		size_t size;
5525		int af, argi, i;
5526		char *base, *end;
5527
5528		if (subr == DIF_SUBR_INET_NTOP) {
5529			af = (int)tupregs[0].dttk_value;
5530			argi = 1;
5531		} else {
5532			af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5533			argi = 0;
5534		}
5535
5536		if (af == AF_INET) {
5537			ipaddr_t ip4;
5538			uint8_t *ptr8, val;
5539
5540			/*
5541			 * Safely load the IPv4 address.
5542			 */
5543			ip4 = dtrace_load32(tupregs[argi].dttk_value);
5544
5545			/*
5546			 * Check that an IPv4 string will fit in scratch.
5547			 */
5548			size = INET_ADDRSTRLEN;
5549			if (!DTRACE_INSCRATCH(mstate, size)) {
5550				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5551				regs[rd] = 0;
5552				break;
5553			}
5554			base = (char *)mstate->dtms_scratch_ptr;
5555			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5556
5557			/*
5558			 * Stringify as a dotted decimal quad.
5559			 */
5560			*end-- = '\0';
5561			ptr8 = (uint8_t *)&ip4;
5562			for (i = 3; i >= 0; i--) {
5563				val = ptr8[i];
5564
5565				if (val == 0) {
5566					*end-- = '0';
5567				} else {
5568					for (; val; val /= 10) {
5569						*end-- = '0' + (val % 10);
5570					}
5571				}
5572
5573				if (i > 0)
5574					*end-- = '.';
5575			}
5576			ASSERT(end + 1 >= base);
5577
5578		} else if (af == AF_INET6) {
5579			struct in6_addr ip6;
5580			int firstzero, tryzero, numzero, v6end;
5581			uint16_t val;
5582			const char digits[] = "0123456789abcdef";
5583
5584			/*
5585			 * Stringify using RFC 1884 convention 2 -- 16-bit
5586			 * hexadecimal values with zero-run compression and
5587			 * lower-case hexadecimal digits,
5588			 * 	e.g., fe80::214:4fff:fe0b:76c8.
5589			 * The IPv4-embedded form is returned for inet_ntop();
5590			 * just the IPv4 string is returned for inet_ntoa6().
5591			 */
5592
5593			/*
5594			 * Safely load the IPv6 address.
5595			 */
5596			dtrace_bcopy(
5597			    (void *)(uintptr_t)tupregs[argi].dttk_value,
5598			    (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5599
5600			/*
5601			 * Check that an IPv6 string will fit in scratch.
5602			 */
5603			size = INET6_ADDRSTRLEN;
5604			if (!DTRACE_INSCRATCH(mstate, size)) {
5605				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5606				regs[rd] = 0;
5607				break;
5608			}
5609			base = (char *)mstate->dtms_scratch_ptr;
5610			end = (char *)mstate->dtms_scratch_ptr + size - 1;
5611			*end-- = '\0';
5612
5613			/*
5614			 * Find the longest run of 16 bit zero values
5615			 * for the single allowed zero compression - "::".
5616			 */
5617			firstzero = -1;
5618			tryzero = -1;
5619			numzero = 1;
5620			for (i = 0; i < sizeof (struct in6_addr); i++) {
5621#if defined(sun)
5622				if (ip6._S6_un._S6_u8[i] == 0 &&
5623#else
5624				if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5625#endif
5626				    tryzero == -1 && i % 2 == 0) {
5627					tryzero = i;
5628					continue;
5629				}
5630
5631				if (tryzero != -1 &&
5632#if defined(sun)
5633				    (ip6._S6_un._S6_u8[i] != 0 ||
5634#else
5635				    (ip6.__u6_addr.__u6_addr8[i] != 0 ||
5636#endif
5637				    i == sizeof (struct in6_addr) - 1)) {
5638
5639					if (i - tryzero <= numzero) {
5640						tryzero = -1;
5641						continue;
5642					}
5643
5644					firstzero = tryzero;
5645					numzero = i - i % 2 - tryzero;
5646					tryzero = -1;
5647
5648#if defined(sun)
5649					if (ip6._S6_un._S6_u8[i] == 0 &&
5650#else
5651					if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5652#endif
5653					    i == sizeof (struct in6_addr) - 1)
5654						numzero += 2;
5655				}
5656			}
5657			ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
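			/*
			 * For example, given fe80:0:0:0:214:4fff:fe0b:76c8,
			 * the scan above finds the run of zero bytes starting
			 * at offset 2 (firstzero = 2, numzero = 6), which the
			 * loop below renders as the single "::" in
			 * "fe80::214:4fff:fe0b:76c8".
			 */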
5658
5659			/*
5660			 * Check for an IPv4 embedded address.
5661			 */
5662			v6end = sizeof (struct in6_addr) - 2;
5663			if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5664			    IN6_IS_ADDR_V4COMPAT(&ip6)) {
5665				for (i = sizeof (struct in6_addr) - 1;
5666				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
5667					ASSERT(end >= base);
5668
5669#if defined(sun)
5670					val = ip6._S6_un._S6_u8[i];
5671#else
5672					val = ip6.__u6_addr.__u6_addr8[i];
5673#endif
5674
5675					if (val == 0) {
5676						*end-- = '0';
5677					} else {
5678						for (; val; val /= 10) {
5679							*end-- = '0' + val % 10;
5680						}
5681					}
5682
5683					if (i > DTRACE_V4MAPPED_OFFSET)
5684						*end-- = '.';
5685				}
5686
5687				if (subr == DIF_SUBR_INET_NTOA6)
5688					goto inetout;
5689
5690				/*
5691				 * Set v6end to skip the IPv4 address that
5692				 * we have already stringified.
5693				 */
5694				v6end = 10;
5695			}
5696
5697			/*
5698			 * Build the IPv6 string by working through the
5699			 * address in reverse.
5700			 */
5701			for (i = v6end; i >= 0; i -= 2) {
5702				ASSERT(end >= base);
5703
5704				if (i == firstzero + numzero - 2) {
5705					*end-- = ':';
5706					*end-- = ':';
5707					i -= numzero - 2;
5708					continue;
5709				}
5710
5711				if (i < 14 && i != firstzero - 2)
5712					*end-- = ':';
5713
5714#if defined(sun)
5715				val = (ip6._S6_un._S6_u8[i] << 8) +
5716				    ip6._S6_un._S6_u8[i + 1];
5717#else
5718				val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
5719				    ip6.__u6_addr.__u6_addr8[i + 1];
5720#endif
5721
5722				if (val == 0) {
5723					*end-- = '0';
5724				} else {
5725					for (; val; val /= 16) {
5726						*end-- = digits[val % 16];
5727					}
5728				}
5729			}
5730			ASSERT(end + 1 >= base);
5731
5732		} else {
5733			/*
5734			 * The user didn't use AF_INET or AF_INET6.
5735			 */
5736			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5737			regs[rd] = 0;
5738			break;
5739		}
5740
5741inetout:	regs[rd] = (uintptr_t)end + 1;
5742		mstate->dtms_scratch_ptr += size;
5743		break;
5744	}
5745
5746	case DIF_SUBR_MEMREF: {
5747		uintptr_t size = 2 * sizeof(uintptr_t);
5748		uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5749		size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size;
5750
		/* Check if there is enough scratch space. */
		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			regs[rd] = 0;
			break;
		}

5751		/* address and length */
5752		memref[0] = tupregs[0].dttk_value;
5753		memref[1] = tupregs[1].dttk_value;
5754
5755		regs[rd] = (uintptr_t) memref;
5756		mstate->dtms_scratch_ptr += scratch_size;
5757		break;
5758	}
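	/*
	 * The { address, length } pair built above is consumed by the
	 * printm() action (DTRACEACT_PRINTM in dtrace_probe()), which
	 * stores the length and then copies the referenced memory into
	 * the principal buffer.
	 */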
5759
5760	case DIF_SUBR_TYPEREF: {
5761		uintptr_t size = 4 * sizeof(uintptr_t);
5762		uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5763		size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size;
5764
		/* Check if there is enough scratch space. */
		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			regs[rd] = 0;
			break;
		}

5765		/* address, num_elements, type_str, type_len */
5766		typeref[0] = tupregs[0].dttk_value;
5767		typeref[1] = tupregs[1].dttk_value;
5768		typeref[2] = tupregs[2].dttk_value;
5769		typeref[3] = tupregs[3].dttk_value;
5770
5771		regs[rd] = (uintptr_t) typeref;
5772		mstate->dtms_scratch_ptr += scratch_size;
5773		break;
5774	}
5775	}
5776}
5777
5778/*
5779 * Emulate the execution of DTrace IR instructions specified by the given
5780 * DIF object.  This function is deliberately void of assertions as all of
5781 * the necessary checks are handled by a call to dtrace_difo_validate().
5782 */
5783static uint64_t
5784dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5785    dtrace_vstate_t *vstate, dtrace_state_t *state)
5786{
5787	const dif_instr_t *text = difo->dtdo_buf;
5788	const uint_t textlen = difo->dtdo_len;
5789	const char *strtab = difo->dtdo_strtab;
5790	const uint64_t *inttab = difo->dtdo_inttab;
5791
5792	uint64_t rval = 0;
5793	dtrace_statvar_t *svar;
5794	dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5795	dtrace_difv_t *v;
5796	volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
5797	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
5798
5799	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5800	uint64_t regs[DIF_DIR_NREGS];
5801	uint64_t *tmp;
5802
5803	uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5804	int64_t cc_r;
5805	uint_t pc = 0, id, opc = 0;
5806	uint8_t ttop = 0;
5807	dif_instr_t instr;
5808	uint_t r1, r2, rd;
5809
5810	/*
5811	 * We stash the current DIF object into the machine state: we need it
5812	 * for subsequent access checking.
5813	 */
5814	mstate->dtms_difo = difo;
5815
5816	regs[DIF_REG_R0] = 0; 		/* %r0 is fixed at zero */
5817
5818	while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5819		opc = pc;
5820
5821		instr = text[pc++];
5822		r1 = DIF_INSTR_R1(instr);
5823		r2 = DIF_INSTR_R2(instr);
5824		rd = DIF_INSTR_RD(instr);
5825
5826		switch (DIF_INSTR_OP(instr)) {
5827		case DIF_OP_OR:
5828			regs[rd] = regs[r1] | regs[r2];
5829			break;
5830		case DIF_OP_XOR:
5831			regs[rd] = regs[r1] ^ regs[r2];
5832			break;
5833		case DIF_OP_AND:
5834			regs[rd] = regs[r1] & regs[r2];
5835			break;
5836		case DIF_OP_SLL:
5837			regs[rd] = regs[r1] << regs[r2];
5838			break;
5839		case DIF_OP_SRL:
5840			regs[rd] = regs[r1] >> regs[r2];
5841			break;
5842		case DIF_OP_SUB:
5843			regs[rd] = regs[r1] - regs[r2];
5844			break;
5845		case DIF_OP_ADD:
5846			regs[rd] = regs[r1] + regs[r2];
5847			break;
5848		case DIF_OP_MUL:
5849			regs[rd] = regs[r1] * regs[r2];
5850			break;
5851		case DIF_OP_SDIV:
5852			if (regs[r2] == 0) {
5853				regs[rd] = 0;
5854				*flags |= CPU_DTRACE_DIVZERO;
5855			} else {
5856				regs[rd] = (int64_t)regs[r1] /
5857				    (int64_t)regs[r2];
5858			}
5859			break;
5860
5861		case DIF_OP_UDIV:
5862			if (regs[r2] == 0) {
5863				regs[rd] = 0;
5864				*flags |= CPU_DTRACE_DIVZERO;
5865			} else {
5866				regs[rd] = regs[r1] / regs[r2];
5867			}
5868			break;
5869
5870		case DIF_OP_SREM:
5871			if (regs[r2] == 0) {
5872				regs[rd] = 0;
5873				*flags |= CPU_DTRACE_DIVZERO;
5874			} else {
5875				regs[rd] = (int64_t)regs[r1] %
5876				    (int64_t)regs[r2];
5877			}
5878			break;
5879
5880		case DIF_OP_UREM:
5881			if (regs[r2] == 0) {
5882				regs[rd] = 0;
5883				*flags |= CPU_DTRACE_DIVZERO;
5884			} else {
5885				regs[rd] = regs[r1] % regs[r2];
5886			}
5887			break;
5888
5889		case DIF_OP_NOT:
5890			regs[rd] = ~regs[r1];
5891			break;
5892		case DIF_OP_MOV:
5893			regs[rd] = regs[r1];
5894			break;
5895		case DIF_OP_CMP:
5896			cc_r = regs[r1] - regs[r2];
5897			cc_n = cc_r < 0;
5898			cc_z = cc_r == 0;
5899			cc_v = 0;
5900			cc_c = regs[r1] < regs[r2];
5901			break;
5902		case DIF_OP_TST:
5903			cc_n = cc_v = cc_c = 0;
5904			cc_z = regs[r1] == 0;
5905			break;
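		/*
		 * The conditional branches below decode cc_n/cc_z/cc_v/cc_c
		 * just as a conventional CPU would: e.g. BG (signed >) is
		 * taken when !(Z | (N ^ V)) and BGU (unsigned >) when
		 * !(C | Z).  Since the emulation above always leaves cc_v
		 * at zero, the signed tests effectively reduce to tests of
		 * cc_n and cc_z.
		 */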
5906		case DIF_OP_BA:
5907			pc = DIF_INSTR_LABEL(instr);
5908			break;
5909		case DIF_OP_BE:
5910			if (cc_z)
5911				pc = DIF_INSTR_LABEL(instr);
5912			break;
5913		case DIF_OP_BNE:
5914			if (cc_z == 0)
5915				pc = DIF_INSTR_LABEL(instr);
5916			break;
5917		case DIF_OP_BG:
5918			if ((cc_z | (cc_n ^ cc_v)) == 0)
5919				pc = DIF_INSTR_LABEL(instr);
5920			break;
5921		case DIF_OP_BGU:
5922			if ((cc_c | cc_z) == 0)
5923				pc = DIF_INSTR_LABEL(instr);
5924			break;
5925		case DIF_OP_BGE:
5926			if ((cc_n ^ cc_v) == 0)
5927				pc = DIF_INSTR_LABEL(instr);
5928			break;
5929		case DIF_OP_BGEU:
5930			if (cc_c == 0)
5931				pc = DIF_INSTR_LABEL(instr);
5932			break;
5933		case DIF_OP_BL:
5934			if (cc_n ^ cc_v)
5935				pc = DIF_INSTR_LABEL(instr);
5936			break;
5937		case DIF_OP_BLU:
5938			if (cc_c)
5939				pc = DIF_INSTR_LABEL(instr);
5940			break;
5941		case DIF_OP_BLE:
5942			if (cc_z | (cc_n ^ cc_v))
5943				pc = DIF_INSTR_LABEL(instr);
5944			break;
5945		case DIF_OP_BLEU:
5946			if (cc_c | cc_z)
5947				pc = DIF_INSTR_LABEL(instr);
5948			break;
5949		case DIF_OP_RLDSB:
5950			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
5951				break;
5952			/*FALLTHROUGH*/
5953		case DIF_OP_LDSB:
5954			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5955			break;
5956		case DIF_OP_RLDSH:
5957			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
5958				break;
5959			/*FALLTHROUGH*/
5960		case DIF_OP_LDSH:
5961			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5962			break;
5963		case DIF_OP_RLDSW:
5964			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
5965				break;
5966			/*FALLTHROUGH*/
5967		case DIF_OP_LDSW:
5968			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5969			break;
5970		case DIF_OP_RLDUB:
5971			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
5972				break;
5973			/*FALLTHROUGH*/
5974		case DIF_OP_LDUB:
5975			regs[rd] = dtrace_load8(regs[r1]);
5976			break;
5977		case DIF_OP_RLDUH:
5978			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
5979				break;
5980			/*FALLTHROUGH*/
5981		case DIF_OP_LDUH:
5982			regs[rd] = dtrace_load16(regs[r1]);
5983			break;
5984		case DIF_OP_RLDUW:
5985			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
5986				break;
5987			/*FALLTHROUGH*/
5988		case DIF_OP_LDUW:
5989			regs[rd] = dtrace_load32(regs[r1]);
5990			break;
5991		case DIF_OP_RLDX:
5992			if (!dtrace_canload(regs[r1], 8, mstate, vstate))
5993				break;
5994			/*FALLTHROUGH*/
5995		case DIF_OP_LDX:
5996			regs[rd] = dtrace_load64(regs[r1]);
5997			break;
5998		case DIF_OP_ULDSB:
5999			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6000			regs[rd] = (int8_t)
6001			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6002			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6003			break;
6004		case DIF_OP_ULDSH:
6005			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6006			regs[rd] = (int16_t)
6007			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6008			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6009			break;
6010		case DIF_OP_ULDSW:
6011			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6012			regs[rd] = (int32_t)
6013			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6014			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6015			break;
6016		case DIF_OP_ULDUB:
6017			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6018			regs[rd] =
6019			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6020			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6021			break;
6022		case DIF_OP_ULDUH:
6023			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6024			regs[rd] =
6025			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6026			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6027			break;
6028		case DIF_OP_ULDUW:
6029			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6030			regs[rd] =
6031			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6032			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6033			break;
6034		case DIF_OP_ULDX:
6035			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6036			regs[rd] =
6037			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
6038			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6039			break;
6040		case DIF_OP_RET:
6041			rval = regs[rd];
6042			pc = textlen;
6043			break;
6044		case DIF_OP_NOP:
6045			break;
6046		case DIF_OP_SETX:
6047			regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6048			break;
6049		case DIF_OP_SETS:
6050			regs[rd] = (uint64_t)(uintptr_t)
6051			    (strtab + DIF_INSTR_STRING(instr));
6052			break;
6053		case DIF_OP_SCMP: {
6054			size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6055			uintptr_t s1 = regs[r1];
6056			uintptr_t s2 = regs[r2];
6057
6058			if (s1 != 0 &&
6059			    !dtrace_strcanload(s1, sz, mstate, vstate))
6060				break;
6061			if (s2 != 0 &&
6062			    !dtrace_strcanload(s2, sz, mstate, vstate))
6063				break;
6064
6065			cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
6066
6067			cc_n = cc_r < 0;
6068			cc_z = cc_r == 0;
6069			cc_v = cc_c = 0;
6070			break;
6071		}
6072		case DIF_OP_LDGA:
6073			regs[rd] = dtrace_dif_variable(mstate, state,
6074			    r1, regs[r2]);
6075			break;
6076		case DIF_OP_LDGS:
6077			id = DIF_INSTR_VAR(instr);
6078
6079			if (id >= DIF_VAR_OTHER_UBASE) {
6080				uintptr_t a;
6081
6082				id -= DIF_VAR_OTHER_UBASE;
6083				svar = vstate->dtvs_globals[id];
6084				ASSERT(svar != NULL);
6085				v = &svar->dtsv_var;
6086
6087				if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6088					regs[rd] = svar->dtsv_data;
6089					break;
6090				}
6091
6092				a = (uintptr_t)svar->dtsv_data;
6093
6094				if (*(uint8_t *)a == UINT8_MAX) {
6095					/*
6096					 * If the 0th byte is set to UINT8_MAX
6097					 * then this is to be treated as a
6098					 * reference to a NULL variable.
6099					 */
6100					regs[rd] = 0;
6101				} else {
6102					regs[rd] = a + sizeof (uint64_t);
6103				}
6104
6105				break;
6106			}
6107
6108			regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6109			break;
6110
6111		case DIF_OP_STGS:
6112			id = DIF_INSTR_VAR(instr);
6113
6114			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6115			id -= DIF_VAR_OTHER_UBASE;
6116
6117			svar = vstate->dtvs_globals[id];
6118			ASSERT(svar != NULL);
6119			v = &svar->dtsv_var;
6120
6121			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6122				uintptr_t a = (uintptr_t)svar->dtsv_data;
6123
6124				ASSERT(a != 0);
6125				ASSERT(svar->dtsv_size != 0);
6126
6127				if (regs[rd] == 0) {
6128					*(uint8_t *)a = UINT8_MAX;
6129					break;
6130				} else {
6131					*(uint8_t *)a = 0;
6132					a += sizeof (uint64_t);
6133				}
6134				if (!dtrace_vcanload(
6135				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6136				    mstate, vstate))
6137					break;
6138
6139				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6140				    (void *)a, &v->dtdv_type);
6141				break;
6142			}
6143
6144			svar->dtsv_data = regs[rd];
6145			break;
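		/*
		 * For both the load and the store of by-reference statics
		 * above, the backing storage is laid out as a one-byte NULL
		 * flag (UINT8_MAX denotes a stored NULL) padded out to a
		 * uint64_t, followed by the payload -- hence the
		 * "a + sizeof (uint64_t)" adjustments.
		 */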
6146
6147		case DIF_OP_LDTA:
6148			/*
6149			 * There are no DTrace built-in thread-local arrays at
6150			 * present.  This opcode is saved for future work.
6151			 */
6152			*flags |= CPU_DTRACE_ILLOP;
6153			regs[rd] = 0;
6154			break;
6155
6156		case DIF_OP_LDLS:
6157			id = DIF_INSTR_VAR(instr);
6158
6159			if (id < DIF_VAR_OTHER_UBASE) {
6160				/*
6161				 * For now, this has no meaning.
6162				 */
6163				regs[rd] = 0;
6164				break;
6165			}
6166
6167			id -= DIF_VAR_OTHER_UBASE;
6168
6169			ASSERT(id < vstate->dtvs_nlocals);
6170			ASSERT(vstate->dtvs_locals != NULL);
6171
6172			svar = vstate->dtvs_locals[id];
6173			ASSERT(svar != NULL);
6174			v = &svar->dtsv_var;
6175
6176			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6177				uintptr_t a = (uintptr_t)svar->dtsv_data;
6178				size_t sz = v->dtdv_type.dtdt_size;
6179
6180				sz += sizeof (uint64_t);
6181				ASSERT(svar->dtsv_size == NCPU * sz);
6182				a += curcpu * sz;
6183
6184				if (*(uint8_t *)a == UINT8_MAX) {
6185					/*
6186					 * If the 0th byte is set to UINT8_MAX
6187					 * then this is to be treated as a
6188					 * reference to a NULL variable.
6189					 */
6190					regs[rd] = 0;
6191				} else {
6192					regs[rd] = a + sizeof (uint64_t);
6193				}
6194
6195				break;
6196			}
6197
6198			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6199			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6200			regs[rd] = tmp[curcpu];
6201			break;
6202
6203		case DIF_OP_STLS:
6204			id = DIF_INSTR_VAR(instr);
6205
6206			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6207			id -= DIF_VAR_OTHER_UBASE;
6208			ASSERT(id < vstate->dtvs_nlocals);
6209
6210			ASSERT(vstate->dtvs_locals != NULL);
6211			svar = vstate->dtvs_locals[id];
6212			ASSERT(svar != NULL);
6213			v = &svar->dtsv_var;
6214
6215			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6216				uintptr_t a = (uintptr_t)svar->dtsv_data;
6217				size_t sz = v->dtdv_type.dtdt_size;
6218
6219				sz += sizeof (uint64_t);
6220				ASSERT(svar->dtsv_size == NCPU * sz);
6221				a += curcpu * sz;
6222
6223				if (regs[rd] == 0) {
6224					*(uint8_t *)a = UINT8_MAX;
6225					break;
6226				} else {
6227					*(uint8_t *)a = 0;
6228					a += sizeof (uint64_t);
6229				}
6230
6231				if (!dtrace_vcanload(
6232				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6233				    mstate, vstate))
6234					break;
6235
6236				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6237				    (void *)a, &v->dtdv_type);
6238				break;
6239			}
6240
6241			ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6242			tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6243			tmp[curcpu] = regs[rd];
6244			break;
6245
6246		case DIF_OP_LDTS: {
6247			dtrace_dynvar_t *dvar;
6248			dtrace_key_t *key;
6249
6250			id = DIF_INSTR_VAR(instr);
6251			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6252			id -= DIF_VAR_OTHER_UBASE;
6253			v = &vstate->dtvs_tlocals[id];
6254
6255			key = &tupregs[DIF_DTR_NREGS];
6256			key[0].dttk_value = (uint64_t)id;
6257			key[0].dttk_size = 0;
6258			DTRACE_TLS_THRKEY(key[1].dttk_value);
6259			key[1].dttk_size = 0;
6260
6261			dvar = dtrace_dynvar(dstate, 2, key,
6262			    sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6263			    mstate, vstate);
6264
6265			if (dvar == NULL) {
6266				regs[rd] = 0;
6267				break;
6268			}
6269
6270			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6271				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6272			} else {
6273				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6274			}
6275
6276			break;
6277		}
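		/*
		 * Thread-local variables are thus just dynamic variables
		 * keyed on the (variable id, thread key) tuple built above.
		 * Because the lookup passes DTRACE_DYNVAR_NOALLOC, loading
		 * a thread-local that was never stored simply yields 0
		 * rather than allocating backing storage.
		 */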
6278
6279		case DIF_OP_STTS: {
6280			dtrace_dynvar_t *dvar;
6281			dtrace_key_t *key;
6282
6283			id = DIF_INSTR_VAR(instr);
6284			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6285			id -= DIF_VAR_OTHER_UBASE;
6286
6287			key = &tupregs[DIF_DTR_NREGS];
6288			key[0].dttk_value = (uint64_t)id;
6289			key[0].dttk_size = 0;
6290			DTRACE_TLS_THRKEY(key[1].dttk_value);
6291			key[1].dttk_size = 0;
6292			v = &vstate->dtvs_tlocals[id];
6293
6294			dvar = dtrace_dynvar(dstate, 2, key,
6295			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6296			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6297			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6298			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6299
6300			/*
6301			 * Given that we're storing to thread-local data,
6302			 * we need to flush our predicate cache.
6303			 */
6304			curthread->t_predcache = 0;
6305
6306			if (dvar == NULL)
6307				break;
6308
6309			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6310				if (!dtrace_vcanload(
6311				    (void *)(uintptr_t)regs[rd],
6312				    &v->dtdv_type, mstate, vstate))
6313					break;
6314
6315				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6316				    dvar->dtdv_data, &v->dtdv_type);
6317			} else {
6318				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6319			}
6320
6321			break;
6322		}
6323
6324		case DIF_OP_SRA:
6325			regs[rd] = (int64_t)regs[r1] >> regs[r2];
6326			break;
6327
6328		case DIF_OP_CALL:
6329			dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6330			    regs, tupregs, ttop, mstate, state);
6331			break;
6332
6333		case DIF_OP_PUSHTR:
6334			if (ttop == DIF_DTR_NREGS) {
6335				*flags |= CPU_DTRACE_TUPOFLOW;
6336				break;
6337			}
6338
6339			if (r1 == DIF_TYPE_STRING) {
6340				/*
6341				 * If this is a string type and the size is 0,
6342				 * we'll use the system-wide default string
6343				 * size.  Note that we are _not_ looking at
6344				 * the value of the DTRACEOPT_STRSIZE option;
6345				 * had this been set, we would expect to have
6346				 * a non-zero size value in the "pushtr".
6347				 */
6348				tupregs[ttop].dttk_size =
6349				    dtrace_strlen((char *)(uintptr_t)regs[rd],
6350				    regs[r2] ? regs[r2] :
6351				    dtrace_strsize_default) + 1;
6352			} else {
6353				tupregs[ttop].dttk_size = regs[r2];
6354			}
6355
6356			tupregs[ttop++].dttk_value = regs[rd];
6357			break;
6358
6359		case DIF_OP_PUSHTV:
6360			if (ttop == DIF_DTR_NREGS) {
6361				*flags |= CPU_DTRACE_TUPOFLOW;
6362				break;
6363			}
6364
6365			tupregs[ttop].dttk_value = regs[rd];
6366			tupregs[ttop++].dttk_size = 0;
6367			break;
6368
6369		case DIF_OP_POPTS:
6370			if (ttop != 0)
6371				ttop--;
6372			break;
6373
6374		case DIF_OP_FLUSHTS:
6375			ttop = 0;
6376			break;
6377
6378		case DIF_OP_LDGAA:
6379		case DIF_OP_LDTAA: {
6380			dtrace_dynvar_t *dvar;
6381			dtrace_key_t *key = tupregs;
6382			uint_t nkeys = ttop;
6383
6384			id = DIF_INSTR_VAR(instr);
6385			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6386			id -= DIF_VAR_OTHER_UBASE;
6387
6388			key[nkeys].dttk_value = (uint64_t)id;
6389			key[nkeys++].dttk_size = 0;
6390
6391			if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6392				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6393				key[nkeys++].dttk_size = 0;
6394				v = &vstate->dtvs_tlocals[id];
6395			} else {
6396				v = &vstate->dtvs_globals[id]->dtsv_var;
6397			}
6398
6399			dvar = dtrace_dynvar(dstate, nkeys, key,
6400			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6401			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6402			    DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6403
6404			if (dvar == NULL) {
6405				regs[rd] = 0;
6406				break;
6407			}
6408
6409			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6410				regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6411			} else {
6412				regs[rd] = *((uint64_t *)dvar->dtdv_data);
6413			}
6414
6415			break;
6416		}
6417
6418		case DIF_OP_STGAA:
6419		case DIF_OP_STTAA: {
6420			dtrace_dynvar_t *dvar;
6421			dtrace_key_t *key = tupregs;
6422			uint_t nkeys = ttop;
6423
6424			id = DIF_INSTR_VAR(instr);
6425			ASSERT(id >= DIF_VAR_OTHER_UBASE);
6426			id -= DIF_VAR_OTHER_UBASE;
6427
6428			key[nkeys].dttk_value = (uint64_t)id;
6429			key[nkeys++].dttk_size = 0;
6430
6431			if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6432				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6433				key[nkeys++].dttk_size = 0;
6434				v = &vstate->dtvs_tlocals[id];
6435			} else {
6436				v = &vstate->dtvs_globals[id]->dtsv_var;
6437			}
6438
6439			dvar = dtrace_dynvar(dstate, nkeys, key,
6440			    v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6441			    v->dtdv_type.dtdt_size : sizeof (uint64_t),
6442			    regs[rd] ? DTRACE_DYNVAR_ALLOC :
6443			    DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6444
6445			if (dvar == NULL)
6446				break;
6447
6448			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6449				if (!dtrace_vcanload(
6450				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6451				    mstate, vstate))
6452					break;
6453
6454				dtrace_vcopy((void *)(uintptr_t)regs[rd],
6455				    dvar->dtdv_data, &v->dtdv_type);
6456			} else {
6457				*((uint64_t *)dvar->dtdv_data) = regs[rd];
6458			}
6459
6460			break;
6461		}
6462
6463		case DIF_OP_ALLOCS: {
6464			uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6465			size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6466
6467			/*
6468			 * Rounding up the user allocation size could overflow
6469			 * a large, bogus allocation (like -1ULL) to 0; the
6470			 * size comparison below catches exactly this case.
6471			 */
6472			if (size < regs[r1] ||
6473			    !DTRACE_INSCRATCH(mstate, size)) {
6474				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6475				regs[rd] = 0;
6476				break;
6477			}
6478
6479			dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6480			mstate->dtms_scratch_ptr += size;
6481			regs[rd] = ptr;
6482			break;
6483		}
6484
6485		case DIF_OP_COPYS:
6486			if (!dtrace_canstore(regs[rd], regs[r2],
6487			    mstate, vstate)) {
6488				*flags |= CPU_DTRACE_BADADDR;
6489				*illval = regs[rd];
6490				break;
6491			}
6492
6493			if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6494				break;
6495
6496			dtrace_bcopy((void *)(uintptr_t)regs[r1],
6497			    (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6498			break;
6499
6500		case DIF_OP_STB:
6501			if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6502				*flags |= CPU_DTRACE_BADADDR;
6503				*illval = regs[rd];
6504				break;
6505			}
6506			*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6507			break;
6508
6509		case DIF_OP_STH:
6510			if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6511				*flags |= CPU_DTRACE_BADADDR;
6512				*illval = regs[rd];
6513				break;
6514			}
6515			if (regs[rd] & 1) {
6516				*flags |= CPU_DTRACE_BADALIGN;
6517				*illval = regs[rd];
6518				break;
6519			}
6520			*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6521			break;
6522
6523		case DIF_OP_STW:
6524			if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6525				*flags |= CPU_DTRACE_BADADDR;
6526				*illval = regs[rd];
6527				break;
6528			}
6529			if (regs[rd] & 3) {
6530				*flags |= CPU_DTRACE_BADALIGN;
6531				*illval = regs[rd];
6532				break;
6533			}
6534			*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6535			break;
6536
6537		case DIF_OP_STX:
6538			if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6539				*flags |= CPU_DTRACE_BADADDR;
6540				*illval = regs[rd];
6541				break;
6542			}
6543			if (regs[rd] & 7) {
6544				*flags |= CPU_DTRACE_BADALIGN;
6545				*illval = regs[rd];
6546				break;
6547			}
6548			*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6549			break;
6550		}
6551	}
6552
6553	if (!(*flags & CPU_DTRACE_FAULT))
6554		return (rval);
6555
6556	mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6557	mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6558
6559	return (0);
6560}
6561
6562static void
6563dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6564{
6565	dtrace_probe_t *probe = ecb->dte_probe;
6566	dtrace_provider_t *prov = probe->dtpr_provider;
6567	char c[DTRACE_FULLNAMELEN + 80], *str;
6568	char *msg = "dtrace: breakpoint action at probe ";
6569	char *ecbmsg = " (ecb ";
6570	uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6571	uintptr_t val = (uintptr_t)ecb;
6572	int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6573
6574	if (dtrace_destructive_disallow)
6575		return;
6576
6577	/*
6578	 * It's impossible to be taking action on the NULL probe.
6579	 */
6580	ASSERT(probe != NULL);
6581
6582	/*
6583	 * This is a poor man's (destitute man's?) sprintf():  we want to
6584	 * print the provider name, module name, function name and name of
6585	 * the probe, along with the hex address of the ECB with the breakpoint
6586	 * action -- all of which we must place in the character buffer by
6587	 * hand.
6588	 */
6589	while (*msg != '\0')
6590		c[i++] = *msg++;
6591
6592	for (str = prov->dtpv_name; *str != '\0'; str++)
6593		c[i++] = *str;
6594	c[i++] = ':';
6595
6596	for (str = probe->dtpr_mod; *str != '\0'; str++)
6597		c[i++] = *str;
6598	c[i++] = ':';
6599
6600	for (str = probe->dtpr_func; *str != '\0'; str++)
6601		c[i++] = *str;
6602	c[i++] = ':';
6603
6604	for (str = probe->dtpr_name; *str != '\0'; str++)
6605		c[i++] = *str;
6606
6607	while (*ecbmsg != '\0')
6608		c[i++] = *ecbmsg++;
6609
6610	while (shift >= 0) {
6611		mask = (uintptr_t)0xf << shift;
6612
6613		if (val >= ((uintptr_t)1 << shift))
6614			c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6615		shift -= 4;
6616	}
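	/*
	 * The loop above rendered the ECB address in hexadecimal,
	 * suppressing leading zeroes: the "val >= 1 << shift" test emits
	 * nothing until the first significant nibble is reached.
	 */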
6617
6618	c[i++] = ')';
6619	c[i] = '\0';
6620
6621#if defined(sun)
6622	debug_enter(c);
6623#else
6624	kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
6625#endif
6626}
6627
6628static void
6629dtrace_action_panic(dtrace_ecb_t *ecb)
6630{
6631	dtrace_probe_t *probe = ecb->dte_probe;
6632
6633	/*
6634	 * It's impossible to be taking action on the NULL probe.
6635	 */
6636	ASSERT(probe != NULL);
6637
6638	if (dtrace_destructive_disallow)
6639		return;
6640
6641	if (dtrace_panicked != NULL)
6642		return;
6643
6644	if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
6645		return;
6646
6647	/*
6648	 * We won the right to panic.  (We want to be sure that only one
6649	 * thread calls panic() from dtrace_probe(), and that panic() is
6650	 * called exactly once.)
6651	 */
6652	dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6653	    probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6654	    probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6655}
6656
6657static void
6658dtrace_action_raise(uint64_t sig)
6659{
6660	if (dtrace_destructive_disallow)
6661		return;
6662
6663	if (sig >= NSIG) {
6664		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6665		return;
6666	}
6667
6668#if defined(sun)
6669	/*
6670	 * raise() has a queue depth of 1 -- we ignore all subsequent
6671	 * invocations of the raise() action.
6672	 */
6673	if (curthread->t_dtrace_sig == 0)
6674		curthread->t_dtrace_sig = (uint8_t)sig;
6675
6676	curthread->t_sig_check = 1;
6677	aston(curthread);
6678#else
6679	struct proc *p = curproc;
6680	PROC_LOCK(p);
6681	kern_psignal(p, sig);
6682	PROC_UNLOCK(p);
6683#endif
6684}
6685
6686static void
6687dtrace_action_stop(void)
6688{
6689	if (dtrace_destructive_disallow)
6690		return;
6691
6692#if defined(sun)
6693	if (!curthread->t_dtrace_stop) {
6694		curthread->t_dtrace_stop = 1;
6695		curthread->t_sig_check = 1;
6696		aston(curthread);
6697	}
6698#else
6699	struct proc *p = curproc;
6700	PROC_LOCK(p);
6701	kern_psignal(p, SIGSTOP);
6702	PROC_UNLOCK(p);
6703#endif
6704}
6705
6706static void
6707dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6708{
6709	hrtime_t now;
6710	volatile uint16_t *flags;
6711#if defined(sun)
6712	cpu_t *cpu = CPU;
6713#else
6714	cpu_t *cpu = &solaris_cpu[curcpu];
6715#endif
6716
6717	if (dtrace_destructive_disallow)
6718		return;
6719
6720	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6721
6722	now = dtrace_gethrtime();
6723
6724	if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6725		/*
6726		 * We need to advance the mark to the current time.
6727		 */
6728		cpu->cpu_dtrace_chillmark = now;
6729		cpu->cpu_dtrace_chilled = 0;
6730	}
6731
6732	/*
6733	 * Now check to see if the requested chill time would take us over
6734	 * the maximum amount of time allowed in the chill interval.  (Or
6735	 * worse, if the calculation itself induces overflow.)
6736	 */
6737	if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6738	    cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6739		*flags |= CPU_DTRACE_ILLOP;
6740		return;
6741	}
6742
6743	while (dtrace_gethrtime() - now < val)
6744		continue;
6745
6746	/*
6747	 * Normally, we assure that the value of the variable "timestamp" does
6748	 * not change within an ECB.  The presence of chill() represents an
6749	 * exception to this rule, however.
6750	 */
6751	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6752	cpu->cpu_dtrace_chilled += val;
6753}
6754
6755static void
6756dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6757    uint64_t *buf, uint64_t arg)
6758{
6759	int nframes = DTRACE_USTACK_NFRAMES(arg);
6760	int strsize = DTRACE_USTACK_STRSIZE(arg);
6761	uint64_t *pcs = &buf[1], *fps;
6762	char *str = (char *)&pcs[nframes];
6763	int size, offs = 0, i, j;
6764	uintptr_t old = mstate->dtms_scratch_ptr, saved;
6765	uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
6766	char *sym;
6767
6768	/*
6769	 * Should be taking a faster path if string space has not been
6770	 * allocated.
6771	 */
6772	ASSERT(strsize != 0);
6773
6774	/*
6775	 * We will first allocate some temporary space for the frame pointers.
6776	 */
6777	fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6778	size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6779	    (nframes * sizeof (uint64_t));
6780
6781	if (!DTRACE_INSCRATCH(mstate, size)) {
6782		/*
6783		 * Not enough room for our frame pointers -- need to indicate
6784		 * that we ran out of scratch space.
6785		 */
6786		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6787		return;
6788	}
6789
6790	mstate->dtms_scratch_ptr += size;
6791	saved = mstate->dtms_scratch_ptr;
6792
6793	/*
6794	 * Now get a stack with both program counters and frame pointers.
6795	 */
6796	DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6797	dtrace_getufpstack(buf, fps, nframes + 1);
6798	DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6799
6800	/*
6801	 * If that faulted, we're cooked.
6802	 */
6803	if (*flags & CPU_DTRACE_FAULT)
6804		goto out;
6805
6806	/*
6807	 * Now we want to walk up the stack, calling the USTACK helper.  For
6808	 * each iteration, we restore the scratch pointer.
6809	 */
6810	for (i = 0; i < nframes; i++) {
6811		mstate->dtms_scratch_ptr = saved;
6812
6813		if (offs >= strsize)
6814			break;
6815
6816		sym = (char *)(uintptr_t)dtrace_helper(
6817		    DTRACE_HELPER_ACTION_USTACK,
6818		    mstate, state, pcs[i], fps[i]);
6819
6820		/*
6821		 * If we faulted while running the helper, we're going to
6822		 * clear the fault and null out the corresponding string.
6823		 */
6824		if (*flags & CPU_DTRACE_FAULT) {
6825			*flags &= ~CPU_DTRACE_FAULT;
6826			str[offs++] = '\0';
6827			continue;
6828		}
6829
6830		if (sym == NULL) {
6831			str[offs++] = '\0';
6832			continue;
6833		}
6834
6835		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6836
6837		/*
6838		 * Now copy in the string that the helper returned to us.
6839		 */
6840		for (j = 0; offs + j < strsize; j++) {
6841			if ((str[offs + j] = sym[j]) == '\0')
6842				break;
6843		}
6844
6845		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6846
6847		offs += j + 1;
6848	}
6849
6850	if (offs >= strsize) {
6851		/*
6852		 * If we didn't have room for all of the strings, we don't
6853		 * abort processing -- this needn't be a fatal error -- but we
6854		 * still want to increment a counter (dts_stkstroverflows) to
6855		 * allow this condition to be warned about.  (If this is from
6856		 * a jstack() action, it is easily tuned via jstackstrsize.)
6857		 */
6858		dtrace_error(&state->dts_stkstroverflows);
6859	}
6860
6861	while (offs < strsize)
6862		str[offs++] = '\0';
6863
6864out:
6865	mstate->dtms_scratch_ptr = old;
6866}
6867
6868static void
6869dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6870    size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6871{
6872	volatile uint16_t *flags;
6873	uint64_t val = *valp;
6874	size_t valoffs = *valoffsp;
6875
6876	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6877	ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6878
6879	/*
6880	 * If this is a string, we're going to only load until we find the zero
6881	 * byte -- after which we'll store zero bytes.
6882	 */
6883	if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6884		char c = '\0' + 1;
6885		size_t s;
6886
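		/*
		 * c is primed to an arbitrary non-NUL value so that the
		 * first iteration of the loop performs a load before any
		 * terminating zero bytes are stored.
		 */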
6887		for (s = 0; s < size; s++) {
6888			if (c != '\0' && dtkind == DIF_TF_BYREF) {
6889				c = dtrace_load8(val++);
6890			} else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6891				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6892				c = dtrace_fuword8((void *)(uintptr_t)val++);
6893				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6894				if (*flags & CPU_DTRACE_FAULT)
6895					break;
6896			}
6897
6898			DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6899
6900			if (c == '\0' && intuple)
6901				break;
6902		}
6903	} else {
6904		uint8_t c;
6905		while (valoffs < end) {
6906			if (dtkind == DIF_TF_BYREF) {
6907				c = dtrace_load8(val++);
6908			} else if (dtkind == DIF_TF_BYUREF) {
6909				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6910				c = dtrace_fuword8((void *)(uintptr_t)val++);
6911				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6912				if (*flags & CPU_DTRACE_FAULT)
6913					break;
6914			}
6915
6916			DTRACE_STORE(uint8_t, tomax,
6917			    valoffs++, c);
6918		}
6919	}
6920
6921	*valp = val;
6922	*valoffsp = valoffs;
6923}
6924
6925/*
6926 * If you're looking for the epicenter of DTrace, you just found it.  This
6927 * is the function called by the provider to fire a probe -- from which all
6928 * subsequent probe-context DTrace activity emanates.
6929 */
6930void
6931dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
6932    uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
6933{
6934	processorid_t cpuid;
6935	dtrace_icookie_t cookie;
6936	dtrace_probe_t *probe;
6937	dtrace_mstate_t mstate;
6938	dtrace_ecb_t *ecb;
6939	dtrace_action_t *act;
6940	intptr_t offs;
6941	size_t size;
6942	int vtime, onintr;
6943	volatile uint16_t *flags;
6944	hrtime_t now;
6945
6946	if (panicstr != NULL)
6947		return;
6948
6949#if defined(sun)
6950	/*
6951	 * Kick out immediately if this CPU is still being born (in which case
6952	 * curthread will be set to -1) or the current thread can't allow
6953	 * probes in its current context.
6954	 */
6955	if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
6956		return;
6957#endif
6958
6959	cookie = dtrace_interrupt_disable();
6960	probe = dtrace_probes[id - 1];
6961	cpuid = curcpu;
6962	onintr = CPU_ON_INTR(CPU);
6963
6964	if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6965	    probe->dtpr_predcache == curthread->t_predcache) {
6966		/*
6967		 * We have hit in the predicate cache; we know that
6968		 * this predicate would evaluate to be false.
6969		 */
6970		dtrace_interrupt_enable(cookie);
6971		return;
6972	}
6973
6974#if defined(sun)
6975	if (panic_quiesce) {
6976#else
6977	if (panicstr != NULL) {
6978#endif
6979		/*
6980		 * We don't trace anything if we're panicking.
6981		 */
6982		dtrace_interrupt_enable(cookie);
6983		return;
6984	}
6985
6986	now = dtrace_gethrtime();
6987	vtime = dtrace_vtime_references != 0;
6988
6989	if (vtime && curthread->t_dtrace_start)
6990		curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
6991
6992	mstate.dtms_difo = NULL;
6993	mstate.dtms_probe = probe;
6994	mstate.dtms_strtok = 0;
6995	mstate.dtms_arg[0] = arg0;
6996	mstate.dtms_arg[1] = arg1;
6997	mstate.dtms_arg[2] = arg2;
6998	mstate.dtms_arg[3] = arg3;
6999	mstate.dtms_arg[4] = arg4;
7000
7001	flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7002
7003	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7004		dtrace_predicate_t *pred = ecb->dte_predicate;
7005		dtrace_state_t *state = ecb->dte_state;
7006		dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7007		dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7008		dtrace_vstate_t *vstate = &state->dts_vstate;
7009		dtrace_provider_t *prov = probe->dtpr_provider;
7010		uint64_t tracememsize = 0;
7011		int committed = 0;
7012		caddr_t tomax;
7013
7014		/*
7015		 * A little subtlety with the following (seemingly innocuous)
7016		 * declaration of the automatic 'val':  by looking at the
7017		 * code, you might think that it could be declared in the
7018		 * action processing loop, below.  (That is, it's only used in
7019		 * the action processing loop.)  However, it must be declared
7020		 * out of that scope because in the case of DIF expression
7021		 * arguments to aggregating actions, one iteration of the
7022		 * action loop will use the last iteration's value.
7023		 */
7024		uint64_t val = 0;
7025
7026		mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7027		mstate.dtms_getf = NULL;
7028
7029		*flags &= ~CPU_DTRACE_ERROR;
7030
7031		if (prov == dtrace_provider) {
7032			/*
7033			 * If dtrace itself is the provider of this probe,
7034			 * we're only going to continue processing the ECB if
7035			 * arg0 (the dtrace_state_t) is equal to the ECB's
7036			 * creating state.  (This prevents disjoint consumers
7037			 * from seeing one another's metaprobes.)
7038			 */
7039			if (arg0 != (uint64_t)(uintptr_t)state)
7040				continue;
7041		}
7042
7043		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7044			/*
7045			 * We're not currently active.  If our provider isn't
7046			 * the dtrace pseudo provider, we're not interested.
7047			 */
7048			if (prov != dtrace_provider)
7049				continue;
7050
7051			/*
7052			 * Now we must further check if we are in the BEGIN
7053			 * probe.  If we are, we will only continue processing
7054			 * if we're still in WARMUP -- if one BEGIN enabling
7055			 * has invoked the exit() action, we don't want to
7056			 * evaluate subsequent BEGIN enablings.
7057			 */
7058			if (probe->dtpr_id == dtrace_probeid_begin &&
7059			    state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7060				ASSERT(state->dts_activity ==
7061				    DTRACE_ACTIVITY_DRAINING);
7062				continue;
7063			}
7064		}
7065
7066		if (ecb->dte_cond) {
7067			/*
7068			 * If the dte_cond bits indicate that this
7069			 * consumer is only allowed to see user-mode firings
7070			 * of this probe, call the provider's dtps_usermode()
7071			 * entry point to check that the probe was fired
7072			 * while in a user context. Skip this ECB if that's
7073			 * not the case.
7074			 */
7075			if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7076			    prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7077			    probe->dtpr_id, probe->dtpr_arg) == 0)
7078				continue;
7079
7080#if defined(sun)
7081			/*
7082			 * This is more subtle than it looks. We have to be
7083			 * absolutely certain that CRED() isn't going to
7084			 * change out from under us so it's only legit to
7085			 * examine that structure if we're in constrained
7086			 * situations. Currently, the only time we'll make this
7087			 * check is if a non-super-user has enabled the
7088			 * profile or syscall providers -- providers that
7089			 * allow visibility of all processes. For the
7090			 * profile case, the check above will ensure that
7091			 * we're examining a user context.
7092			 */
7093			if (ecb->dte_cond & DTRACE_COND_OWNER) {
7094				cred_t *cr;
7095				cred_t *s_cr =
7096				    ecb->dte_state->dts_cred.dcr_cred;
7097				proc_t *proc;
7098
7099				ASSERT(s_cr != NULL);
7100
7101				if ((cr = CRED()) == NULL ||
7102				    s_cr->cr_uid != cr->cr_uid ||
7103				    s_cr->cr_uid != cr->cr_ruid ||
7104				    s_cr->cr_uid != cr->cr_suid ||
7105				    s_cr->cr_gid != cr->cr_gid ||
7106				    s_cr->cr_gid != cr->cr_rgid ||
7107				    s_cr->cr_gid != cr->cr_sgid ||
7108				    (proc = ttoproc(curthread)) == NULL ||
7109				    (proc->p_flag & SNOCD))
7110					continue;
7111			}
7112
7113			if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7114				cred_t *cr;
7115				cred_t *s_cr =
7116				    ecb->dte_state->dts_cred.dcr_cred;
7117
7118				ASSERT(s_cr != NULL);
7119
7120				if ((cr = CRED()) == NULL ||
7121				    s_cr->cr_zone->zone_id !=
7122				    cr->cr_zone->zone_id)
7123					continue;
7124			}
7125#endif
7126		}
7127
7128		if (now - state->dts_alive > dtrace_deadman_timeout) {
7129			/*
7130			 * We seem to be dead.  Unless we (a) have kernel
7131			 * destructive permissions (b) have explicitly enabled
7132			 * destructive actions and (c) destructive actions have
7133			 * not been disabled, we're going to transition into
7134			 * the KILLED state, from which no further processing
7135			 * on this state will be performed.
7136			 */
7137			if (!dtrace_priv_kernel_destructive(state) ||
7138			    !state->dts_cred.dcr_destructive ||
7139			    dtrace_destructive_disallow) {
7140				void *activity = &state->dts_activity;
7141				dtrace_activity_t current;
7142
7143				do {
7144					current = state->dts_activity;
7145				} while (dtrace_cas32(activity, current,
7146				    DTRACE_ACTIVITY_KILLED) != current);
7147
7148				continue;
7149			}
7150		}
7151
7152		if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7153		    ecb->dte_alignment, state, &mstate)) < 0)
7154			continue;
7155
7156		tomax = buf->dtb_tomax;
7157		ASSERT(tomax != NULL);
7158
7159		if (ecb->dte_size != 0) {
7160			dtrace_rechdr_t dtrh;
7161			if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7162				mstate.dtms_timestamp = dtrace_gethrtime();
7163				mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7164			}
7165			ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
7166			dtrh.dtrh_epid = ecb->dte_epid;
7167			DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
7168			    mstate.dtms_timestamp);
7169			*((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
7170		}
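		/*
		 * Each record in the principal buffer is thus prefixed by a
		 * dtrace_rechdr_t giving the enabled-probe ID (EPID) and the
		 * probe firing time; the consumer can use the EPID to decode
		 * the record and the timestamp to order records.
		 */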
7171
7172		mstate.dtms_epid = ecb->dte_epid;
7173		mstate.dtms_present |= DTRACE_MSTATE_EPID;
7174
7175		if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7176			mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7177		else
7178			mstate.dtms_access = 0;
7179
7180		if (pred != NULL) {
7181			dtrace_difo_t *dp = pred->dtp_difo;
7182			int rval;
7183
7184			rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7185
7186			if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7187				dtrace_cacheid_t cid = probe->dtpr_predcache;
7188
7189				if (cid != DTRACE_CACHEIDNONE && !onintr) {
7190					/*
7191					 * Update the predicate cache...
7192					 */
7193					ASSERT(cid == pred->dtp_cacheid);
7194					curthread->t_predcache = cid;
7195				}
7196
7197				continue;
7198			}
7199		}
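		/*
		 * Note the asymmetry with the predicate-cache check at the
		 * top of this function: only a predicate that evaluated to
		 * false is recorded in t_predcache, allowing a subsequent
		 * firing by the same thread to skip DIF evaluation -- and
		 * indeed this entire ECB -- altogether.
		 */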
7200
7201		for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7202		    act != NULL; act = act->dta_next) {
7203			size_t valoffs;
7204			dtrace_difo_t *dp;
7205			dtrace_recdesc_t *rec = &act->dta_rec;
7206
7207			size = rec->dtrd_size;
7208			valoffs = offs + rec->dtrd_offset;
7209
7210			if (DTRACEACT_ISAGG(act->dta_kind)) {
7211				uint64_t v = 0xbad;
7212				dtrace_aggregation_t *agg;
7213
7214				agg = (dtrace_aggregation_t *)act;
7215
7216				if ((dp = act->dta_difo) != NULL)
7217					v = dtrace_dif_emulate(dp,
7218					    &mstate, vstate, state);
7219
7220				if (*flags & CPU_DTRACE_ERROR)
7221					continue;
7222
7223				/*
7224				 * Note that we always pass the expression
7225				 * value from the previous iteration of the
7226				 * action loop.  This value will only be used
7227				 * if there is an expression argument to the
7228				 * aggregating action, denoted by the
7229				 * dtag_hasarg field.
7230				 */
7231				dtrace_aggregate(agg, buf,
7232				    offs, aggbuf, v, val);
7233				continue;
7234			}
7235
7236			switch (act->dta_kind) {
7237			case DTRACEACT_STOP:
7238				if (dtrace_priv_proc_destructive(state))
7239					dtrace_action_stop();
7240				continue;
7241
7242			case DTRACEACT_BREAKPOINT:
7243				if (dtrace_priv_kernel_destructive(state))
7244					dtrace_action_breakpoint(ecb);
7245				continue;
7246
7247			case DTRACEACT_PANIC:
7248				if (dtrace_priv_kernel_destructive(state))
7249					dtrace_action_panic(ecb);
7250				continue;
7251
7252			case DTRACEACT_STACK:
7253				if (!dtrace_priv_kernel(state))
7254					continue;
7255
7256				dtrace_getpcstack((pc_t *)(tomax + valoffs),
7257				    size / sizeof (pc_t), probe->dtpr_aframes,
7258				    DTRACE_ANCHORED(probe) ? NULL :
7259				    (uint32_t *)arg0);
7260				continue;
7261
7262			case DTRACEACT_JSTACK:
7263			case DTRACEACT_USTACK:
7264				if (!dtrace_priv_proc(state))
7265					continue;
7266
7267				/*
7268				 * See comment in DIF_VAR_PID.
7269				 */
7270				if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7271				    CPU_ON_INTR(CPU)) {
7272					int depth = DTRACE_USTACK_NFRAMES(
7273					    rec->dtrd_arg) + 1;
7274
7275					dtrace_bzero((void *)(tomax + valoffs),
7276					    DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7277					    + depth * sizeof (uint64_t));
7278
7279					continue;
7280				}
7281
7282				if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7283				    curproc->p_dtrace_helpers != NULL) {
7284					/*
7285					 * This is the slow path -- we have
7286					 * allocated string space, and we're
7287					 * getting the stack of a process that
7288					 * has helpers.  Call into a separate
7289					 * routine to perform this processing.
7290					 */
7291					dtrace_action_ustack(&mstate, state,
7292					    (uint64_t *)(tomax + valoffs),
7293					    rec->dtrd_arg);
7294					continue;
7295				}
7296
7297				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7298				dtrace_getupcstack((uint64_t *)
7299				    (tomax + valoffs),
7300				    DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7301				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7302				continue;
7303
7304			default:
7305				break;
7306			}
7307
7308			dp = act->dta_difo;
7309			ASSERT(dp != NULL);
7310
7311			val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7312
7313			if (*flags & CPU_DTRACE_ERROR)
7314				continue;
7315
7316			switch (act->dta_kind) {
7317			case DTRACEACT_SPECULATE: {
7318				dtrace_rechdr_t *dtrh;
7319
7320				ASSERT(buf == &state->dts_buffer[cpuid]);
7321				buf = dtrace_speculation_buffer(state,
7322				    cpuid, val);
7323
7324				if (buf == NULL) {
7325					*flags |= CPU_DTRACE_DROP;
7326					continue;
7327				}
7328
7329				offs = dtrace_buffer_reserve(buf,
7330				    ecb->dte_needed, ecb->dte_alignment,
7331				    state, NULL);
7332
7333				if (offs < 0) {
7334					*flags |= CPU_DTRACE_DROP;
7335					continue;
7336				}
7337
7338				tomax = buf->dtb_tomax;
7339				ASSERT(tomax != NULL);
7340
7341				if (ecb->dte_size == 0)
7342					continue;
7343
7344				ASSERT3U(ecb->dte_size, >=,
7345				    sizeof (dtrace_rechdr_t));
7346				dtrh = ((void *)(tomax + offs));
7347				dtrh->dtrh_epid = ecb->dte_epid;
7348				/*
7349				 * When the speculation is committed, all of
7350				 * the records in the speculative buffer will
7351				 * have their timestamps set to the commit
7352				 * time.  Until then, it is set to a sentinel
7353				 * value, for debuggability.
7354				 */
7355				DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7356				continue;
7357			}
7358
7359			case DTRACEACT_PRINTM: {
7360				/* The DIF returns a 'memref'. */
7361				uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
7362
7363				/* Get the size from the memref. */
7364				size = memref[1];
7365
7366				/*
7367				 * Check if the size exceeds the allocated
7368				 * buffer size.
7369				 */
7370				if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7371					/* Flag a drop! */
7372					*flags |= CPU_DTRACE_DROP;
7373					continue;
7374				}
7375
7376				/* Store the size in the buffer first. */
7377				DTRACE_STORE(uintptr_t, tomax,
7378				    valoffs, size);
7379
7380				/*
7381				 * Offset the buffer address to the start
7382				 * of the data.
7383				 */
7384				valoffs += sizeof(uintptr_t);
7385
7386				/*
7387				 * Reset to the memory address rather than
7388				 * the memref array, then let the BYREF
7389				 * code below do the work to store the
7390				 * memory data in the buffer.
7391				 */
7392				val = memref[0];
7393				break;
7394			}
7395
7396			case DTRACEACT_PRINTT: {
7397				/* The DIF returns a 'typeref'. */
7398				uintptr_t *typeref = (uintptr_t *)(uintptr_t) val;
7399				char c = '\0' + 1;
7400				size_t s;
7401
7402				/*
7403				 * Get the type string length and round it
7404				 * up so that the data that follows is
7405				 * aligned for easy access.
7406				 */
7407				size_t typs = strlen((char *) typeref[2]) + 1;
7408				typs = roundup(typs, sizeof(uintptr_t));
7409
7410				/*
7411				 * Get the size from the typeref using the
7412				 * number of elements and the type size.
7413				 */
7414				size = typeref[1] * typeref[3];
7415
7416				/*
7417				 * Check if the size exceeds the allocated
7418				 * buffer size.
7419				 */
7420				if (size + typs + 2 * sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7421					/* Flag a drop! */
7422					*flags |= CPU_DTRACE_DROP;
7423					continue;
7424				}
7425
7426				/* Store the size in the buffer first. */
7427				DTRACE_STORE(uintptr_t, tomax,
7428				    valoffs, size);
7429				valoffs += sizeof(uintptr_t);
7430
7431				/* Store the type size in the buffer. */
7432				DTRACE_STORE(uintptr_t, tomax,
7433				    valoffs, typeref[3]);
7434				valoffs += sizeof(uintptr_t);
7435
7436				val = typeref[2];
7437
7438				for (s = 0; s < typs; s++) {
7439					if (c != '\0')
7440						c = dtrace_load8(val++);
7441
7442					DTRACE_STORE(uint8_t, tomax,
7443					    valoffs++, c);
7444				}
7445
7446				/*
7447				 * Reset to the memory address rather than
7448				 * the typeref array, then let the BYREF
7449				 * code below do the work to store the
7450				 * memory data in the buffer.
7451				 */
7452				val = typeref[0];
7453				break;
7454			}
7455
7456			case DTRACEACT_CHILL:
7457				if (dtrace_priv_kernel_destructive(state))
7458					dtrace_action_chill(&mstate, val);
7459				continue;
7460
7461			case DTRACEACT_RAISE:
7462				if (dtrace_priv_proc_destructive(state))
7463					dtrace_action_raise(val);
7464				continue;
7465
7466			case DTRACEACT_COMMIT:
7467				ASSERT(!committed);
7468
7469				/*
7470				 * We need to commit our buffer state.
7471				 */
7472				if (ecb->dte_size)
7473					buf->dtb_offset = offs + ecb->dte_size;
7474				buf = &state->dts_buffer[cpuid];
7475				dtrace_speculation_commit(state, cpuid, val);
7476				committed = 1;
7477				continue;
7478
7479			case DTRACEACT_DISCARD:
7480				dtrace_speculation_discard(state, cpuid, val);
7481				continue;
7482
7483			case DTRACEACT_DIFEXPR:
7484			case DTRACEACT_LIBACT:
7485			case DTRACEACT_PRINTF:
7486			case DTRACEACT_PRINTA:
7487			case DTRACEACT_SYSTEM:
7488			case DTRACEACT_FREOPEN:
7489			case DTRACEACT_TRACEMEM:
7490				break;
7491
7492			case DTRACEACT_TRACEMEM_DYNSIZE:
7493				tracememsize = val;
7494				break;
7495
7496			case DTRACEACT_SYM:
7497			case DTRACEACT_MOD:
7498				if (!dtrace_priv_kernel(state))
7499					continue;
7500				break;
7501
7502			case DTRACEACT_USYM:
7503			case DTRACEACT_UMOD:
7504			case DTRACEACT_UADDR: {
7505#if defined(sun)
7506				struct pid *pid = curthread->t_procp->p_pidp;
7507#endif
7508
7509				if (!dtrace_priv_proc(state))
7510					continue;
7511
7512				DTRACE_STORE(uint64_t, tomax,
7513#if defined(sun)
7514				    valoffs, (uint64_t)pid->pid_id);
7515#else
7516				    valoffs, (uint64_t) curproc->p_pid);
7517#endif
7518				DTRACE_STORE(uint64_t, tomax,
7519				    valoffs + sizeof (uint64_t), val);
7520
7521				continue;
7522			}
7523
7524			case DTRACEACT_EXIT: {
7525				/*
7526				 * For the exit action, we are going to attempt
7527				 * to atomically set our activity to be
7528				 * draining.  If this fails (either because
7529				 * another CPU has beat us to the exit action,
7530				 * or because our current activity is something
7531				 * other than ACTIVE or WARMUP), we will
7532				 * continue.  This assures that the exit action
7533				 * can be successfully recorded at most once
7534				 * when we're in the ACTIVE state.  If we're
7535				 * encountering the exit() action while in
7536				 * COOLDOWN, however, we want to honor the new
7537				 * status code.  (We know that we're the only
7538				 * thread in COOLDOWN, so there is no race.)
7539				 */
7540				void *activity = &state->dts_activity;
7541				dtrace_activity_t current = state->dts_activity;
7542
7543				if (current == DTRACE_ACTIVITY_COOLDOWN)
7544					break;
7545
7546				if (current != DTRACE_ACTIVITY_WARMUP)
7547					current = DTRACE_ACTIVITY_ACTIVE;
7548
7549				if (dtrace_cas32(activity, current,
7550				    DTRACE_ACTIVITY_DRAINING) != current) {
7551					*flags |= CPU_DTRACE_DROP;
7552					continue;
7553				}
7554
7555				break;
7556			}
7557
7558			default:
7559				ASSERT(0);
7560			}
7561
7562			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
7563			    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
7564				uintptr_t end = valoffs + size;
7565
7566				if (tracememsize != 0 &&
7567				    valoffs + tracememsize < end) {
7568					end = valoffs + tracememsize;
7569					tracememsize = 0;
7570				}
7571
7572				if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7573				    !dtrace_vcanload((void *)(uintptr_t)val,
7574				    &dp->dtdo_rtype, &mstate, vstate))
7575					continue;
7576
7577				dtrace_store_by_ref(dp, tomax, size, &valoffs,
7578				    &val, end, act->dta_intuple,
7579				    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7580				    DIF_TF_BYREF: DIF_TF_BYUREF);
7581				continue;
7582			}
7583
7584			switch (size) {
7585			case 0:
7586				break;
7587
7588			case sizeof (uint8_t):
7589				DTRACE_STORE(uint8_t, tomax, valoffs, val);
7590				break;
7591			case sizeof (uint16_t):
7592				DTRACE_STORE(uint16_t, tomax, valoffs, val);
7593				break;
7594			case sizeof (uint32_t):
7595				DTRACE_STORE(uint32_t, tomax, valoffs, val);
7596				break;
7597			case sizeof (uint64_t):
7598				DTRACE_STORE(uint64_t, tomax, valoffs, val);
7599				break;
7600			default:
7601				/*
7602				 * Any other size should have been returned by
7603				 * reference, not by value.
7604				 */
7605				ASSERT(0);
7606				break;
7607			}
7608		}
7609
7610		if (*flags & CPU_DTRACE_DROP)
7611			continue;
7612
7613		if (*flags & CPU_DTRACE_FAULT) {
7614			int ndx;
7615			dtrace_action_t *err;
7616
7617			buf->dtb_errors++;
7618
7619			if (probe->dtpr_id == dtrace_probeid_error) {
7620				/*
7621				 * There's nothing we can do -- we had an
7622				 * error on the error probe.  We bump an
7623				 * error counter to at least indicate that
7624				 * this condition happened.
7625				 */
7626				dtrace_error(&state->dts_dblerrors);
7627				continue;
7628			}
7629
7630			if (vtime) {
7631				/*
7632				 * Before recursing on dtrace_probe(), we
7633				 * need to explicitly clear out our start
7634				 * time to prevent it from being accumulated
7635				 * into t_dtrace_vtime.
7636				 */
7637				curthread->t_dtrace_start = 0;
7638			}
7639
7640			/*
7641			 * Iterate over the actions to figure out which action
7642			 * we were processing when we experienced the error.
7643			 * Note that act points _past_ the faulting action; if
7644			 * act is ecb->dte_action, the fault was in the
7645			 * predicate; if it's ecb->dte_action->dta_next it's
7646			 * in action #1, and so on.
7647			 */
7648			for (err = ecb->dte_action, ndx = 0;
7649			    err != act; err = err->dta_next, ndx++)
7650				continue;
7651
7652			dtrace_probe_error(state, ecb->dte_epid, ndx,
7653			    (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7654			    mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7655			    cpu_core[cpuid].cpuc_dtrace_illval);
7656
7657			continue;
7658		}
7659
7660		if (!committed)
7661			buf->dtb_offset = offs + ecb->dte_size;
7662	}
7663
7664	if (vtime)
7665		curthread->t_dtrace_start = dtrace_gethrtime();
7666
7667	dtrace_interrupt_enable(cookie);
7668}
7669
7670/*
7671 * DTrace Probe Hashing Functions
7672 *
7673 * The functions in this section (and indeed, the functions in remaining
7674 * sections) are not _called_ from probe context.  (Any exceptions to this are
7675 * marked with a "Note:".)  Rather, they are called from elsewhere in the
7676 * DTrace framework to look up probes in, add probes to, and remove probes from
7677 * the DTrace probe hashes.  (Each probe is hashed by each element of the
7678 * probe tuple -- allowing for fast lookups, regardless of what was
7679 * specified.)
7680 */
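/*
 * Hash a probe-tuple string using the classic PJW/ELF shift-and-fold
 * scheme: whenever the high nibble fills up, it is XORed back into the
 * lower bits, so long strings with common prefixes still disperse well.
 */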
7681static uint_t
7682dtrace_hash_str(const char *p)
7683{
7684	unsigned int g;
7685	uint_t hval = 0;
7686
7687	while (*p) {
7688		hval = (hval << 4) + *p++;
7689		if ((g = (hval & 0xf0000000)) != 0)
7690			hval ^= g >> 24;
7691		hval &= ~g;
7692	}
7693	return (hval);
7694}
7695
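/*
 * Create a probe hash table.  The table is generic across the elements
 * of the probe tuple: stroffs is the offset of the key string within a
 * dtrace_probe_t, and nextoffs/prevoffs locate the intrusive links
 * used to chain together probes that share a key.  The table begins
 * with a one-entry bucket array and grows via dtrace_hash_resize().
 */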
7696static dtrace_hash_t *
7697dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
7698{
7699	dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7700
7701	hash->dth_stroffs = stroffs;
7702	hash->dth_nextoffs = nextoffs;
7703	hash->dth_prevoffs = prevoffs;
7704
7705	hash->dth_size = 1;
7706	hash->dth_mask = hash->dth_size - 1;
7707
7708	hash->dth_tab = kmem_zalloc(hash->dth_size *
7709	    sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7710
7711	return (hash);
7712}
7713
7714static void
7715dtrace_hash_destroy(dtrace_hash_t *hash)
7716{
7717#ifdef DEBUG
7718	int i;
7719
7720	for (i = 0; i < hash->dth_size; i++)
7721		ASSERT(hash->dth_tab[i] == NULL);
7722#endif
7723
7724	kmem_free(hash->dth_tab,
7725	    hash->dth_size * sizeof (dtrace_hashbucket_t *));
7726	kmem_free(hash, sizeof (dtrace_hash_t));
7727}
7728
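/*
 * Double the bucket array.  Because each bucket carries its entire
 * chain of same-keyed probes, only the chain head need be rehashed;
 * the bucket is then relinked whole into the new array.
 */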
7729static void
7730dtrace_hash_resize(dtrace_hash_t *hash)
7731{
7732	int size = hash->dth_size, i, ndx;
7733	int new_size = hash->dth_size << 1;
7734	int new_mask = new_size - 1;
7735	dtrace_hashbucket_t **new_tab, *bucket, *next;
7736
7737	ASSERT((new_size & new_mask) == 0);
7738
7739	new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7740
7741	for (i = 0; i < size; i++) {
7742		for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7743			dtrace_probe_t *probe = bucket->dthb_chain;
7744
7745			ASSERT(probe != NULL);
7746			ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
7747
7748			next = bucket->dthb_next;
7749			bucket->dthb_next = new_tab[ndx];
7750			new_tab[ndx] = bucket;
7751		}
7752	}
7753
7754	kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7755	hash->dth_tab = new_tab;
7756	hash->dth_size = new_size;
7757	hash->dth_mask = new_mask;
7758}
7759
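/*
 * Add a probe to the hash.  Probes with identical keys share a single
 * bucket, chained through their intrusive next/prev pointers, so
 * dth_nbuckets counts distinct keys; the table is grown once the
 * number of keys exceeds twice its size.
 */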
7760static void
7761dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
7762{
7763	int hashval = DTRACE_HASHSTR(hash, new);
7764	int ndx = hashval & hash->dth_mask;
7765	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7766	dtrace_probe_t **nextp, **prevp;
7767
7768	for (; bucket != NULL; bucket = bucket->dthb_next) {
7769		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7770			goto add;
7771	}
7772
7773	if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7774		dtrace_hash_resize(hash);
7775		dtrace_hash_add(hash, new);
7776		return;
7777	}
7778
7779	bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7780	bucket->dthb_next = hash->dth_tab[ndx];
7781	hash->dth_tab[ndx] = bucket;
7782	hash->dth_nbuckets++;
7783
7784add:
7785	nextp = DTRACE_HASHNEXT(hash, new);
7786	ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7787	*nextp = bucket->dthb_chain;
7788
7789	if (bucket->dthb_chain != NULL) {
7790		prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7791		ASSERT(*prevp == NULL);
7792		*prevp = new;
7793	}
7794
7795	bucket->dthb_chain = new;
7796	bucket->dthb_len++;
7797}
7798
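/*
 * Return the head of the chain of probes whose key matches the
 * template's (callers walk DTRACE_HASHNEXT() for the rest), or NULL if
 * no probe has that key.
 */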
7799static dtrace_probe_t *
7800dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
7801{
7802	int hashval = DTRACE_HASHSTR(hash, template);
7803	int ndx = hashval & hash->dth_mask;
7804	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7805
7806	for (; bucket != NULL; bucket = bucket->dthb_next) {
7807		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7808			return (bucket->dthb_chain);
7809	}
7810
7811	return (NULL);
7812}
7813
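/*
 * Return the number of probes sharing the template's key, or zero if
 * the key is absent.  dtrace_match() uses this count to select the
 * most discriminating hash for a given probe description.
 */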
7814static int
7815dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
7816{
7817	int hashval = DTRACE_HASHSTR(hash, template);
7818	int ndx = hashval & hash->dth_mask;
7819	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7820
7821	for (; bucket != NULL; bucket = bucket->dthb_next) {
7822		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7823			return (bucket->dthb_len);
7824	}
7825
7826	return (0);
7827}
7828
7829static void
7830dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
7831{
7832	int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
7833	dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7834
7835	dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
7836	dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
7837
7838	/*
7839	 * Find the bucket that we're removing this probe from.
7840	 */
7841	for (; bucket != NULL; bucket = bucket->dthb_next) {
7842		if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
7843			break;
7844	}
7845
7846	ASSERT(bucket != NULL);
7847
7848	if (*prevp == NULL) {
7849		if (*nextp == NULL) {
7850			/*
7851			 * The removed probe was the only probe on this
7852			 * bucket; we need to remove the bucket.
7853			 */
7854			dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7855
7856			ASSERT(bucket->dthb_chain == probe);
7857			ASSERT(b != NULL);
7858
7859			if (b == bucket) {
7860				hash->dth_tab[ndx] = bucket->dthb_next;
7861			} else {
7862				while (b->dthb_next != bucket)
7863					b = b->dthb_next;
7864				b->dthb_next = bucket->dthb_next;
7865			}
7866
7867			ASSERT(hash->dth_nbuckets > 0);
7868			hash->dth_nbuckets--;
7869			kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7870			return;
7871		}
7872
7873		bucket->dthb_chain = *nextp;
7874	} else {
7875		*(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7876	}
7877
7878	if (*nextp != NULL)
7879		*(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7880}
7881
7882/*
7883 * DTrace Utility Functions
7884 *
7885 * These are random utility functions that are _not_ called from probe context.
7886 */
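/*
 * Return non-zero if any member of the specified attribute triple lies
 * outside the legal stability or dependency-class range.
 */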
7887static int
7888dtrace_badattr(const dtrace_attribute_t *a)
7889{
7890	return (a->dtat_name > DTRACE_STABILITY_MAX ||
7891	    a->dtat_data > DTRACE_STABILITY_MAX ||
7892	    a->dtat_class > DTRACE_CLASS_MAX);
7893}
7894
7895/*
7896 * Return a duplicate of the specified string.  If the string is NULL,
7897 * this function returns a zero-length string.
7898 */
7899static char *
7900dtrace_strdup(const char *str)
7901{
7902	char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
7903
7904	if (str != NULL)
7905		(void) strcpy(new, str);
7906
7907	return (new);
7908}
7909
7910#define	DTRACE_ISALPHA(c)	\
7911	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
7912
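/*
 * Return non-zero if the specified string is not a legal component
 * name: the first character must be alphabetic or one of '-', '_' and
 * '.', and each subsequent character must be alphanumeric or one of
 * '-', '_', '.' and '`'.  (A NULL or empty name is not considered
 * bad.)
 */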
7913static int
7914dtrace_badname(const char *s)
7915{
7916	char c;
7917
7918	if (s == NULL || (c = *s++) == '\0')
7919		return (0);
7920
7921	if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
7922		return (1);
7923
7924	while ((c = *s++) != '\0') {
7925		if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
7926		    c != '-' && c != '_' && c != '.' && c != '`')
7927			return (1);
7928	}
7929
7930	return (0);
7931}
7932
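/*
 * Derive a DTRACE_PRIV_* mask -- and, where it matters, the uid and
 * zoneid -- from the specified credential.  Where the Solaris
 * privilege framework is absent, DTRACE_PRIV_ALL is simply granted.
 */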
7933static void
7934dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7935{
7936	uint32_t priv;
7937
7938#if defined(sun)
7939	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7940		/*
7941		 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
7942		 */
7943		priv = DTRACE_PRIV_ALL;
7944	} else {
7945		*uidp = crgetuid(cr);
7946		*zoneidp = crgetzoneid(cr);
7947
7948		priv = 0;
7949		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7950			priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7951		else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
7952			priv |= DTRACE_PRIV_USER;
7953		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
7954			priv |= DTRACE_PRIV_PROC;
7955		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
7956			priv |= DTRACE_PRIV_OWNER;
7957		if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
7958			priv |= DTRACE_PRIV_ZONEOWNER;
7959	}
7960#else
7961	priv = DTRACE_PRIV_ALL;
7962#endif
7963
7964	*privp = priv;
7965}
7966
7967#ifdef DTRACE_ERRDEBUG
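/*
 * Record an error message in a small open-addressed hash (with linear
 * probing) so that repeated errors are counted rather than merely
 * noted.  If the table fills, we panic: DTRACE_ERRHASHSZ must be
 * sized to accommodate every distinct message.
 */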
7968static void
7969dtrace_errdebug(const char *str)
7970{
7971	int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
7972	int occupied = 0;
7973
7974	mutex_enter(&dtrace_errlock);
7975	dtrace_errlast = str;
7976	dtrace_errthread = curthread;
7977
7978	while (occupied++ < DTRACE_ERRHASHSZ) {
7979		if (dtrace_errhash[hval].dter_msg == str) {
7980			dtrace_errhash[hval].dter_count++;
7981			goto out;
7982		}
7983
7984		if (dtrace_errhash[hval].dter_msg != NULL) {
7985			hval = (hval + 1) % DTRACE_ERRHASHSZ;
7986			continue;
7987		}
7988
7989		dtrace_errhash[hval].dter_msg = str;
7990		dtrace_errhash[hval].dter_count = 1;
7991		goto out;
7992	}
7993
7994	panic("dtrace: undersized error hash");
7995out:
7996	mutex_exit(&dtrace_errlock);
7997}
7998#endif
7999
8000/*
8001 * DTrace Matching Functions
8002 *
8003 * These functions are used to match groups of probes, given some elements of
8004 * a probe tuple, or some globbed expressions for elements of a probe tuple.
8005 */
8006static int
8007dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8008    zoneid_t zoneid)
8009{
8010	if (priv != DTRACE_PRIV_ALL) {
8011		uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8012		uint32_t match = priv & ppriv;
8013
8014		/*
8015		 * No PRIV_DTRACE_* privileges...
8016		 */
8017		if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8018		    DTRACE_PRIV_KERNEL)) == 0)
8019			return (0);
8020
8021		/*
8022		 * No matching bits, but there were bits to match...
8023		 */
8024		if (match == 0 && ppriv != 0)
8025			return (0);
8026
8027		/*
8028		 * Need to have permissions to the process, but don't...
8029		 */
8030		if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8031		    uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8032			return (0);
8033		}
8034
8035		/*
8036		 * Need to be in the same zone unless we possess the
8037		 * privilege to examine all zones.
8038		 */
8039		if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8040		    zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8041			return (0);
8042		}
8043	}
8044
8045	return (1);
8046}
8047
8048/*
8049 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8050 * consists of input pattern strings and an ops-vector to evaluate them.
8051 * This function returns >0 for match, 0 for no match, and <0 for error.
8052 */
8053static int
8054dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8055    uint32_t priv, uid_t uid, zoneid_t zoneid)
8056{
8057	dtrace_provider_t *pvp = prp->dtpr_provider;
8058	int rv;
8059
8060	if (pvp->dtpv_defunct)
8061		return (0);
8062
8063	if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8064		return (rv);
8065
8066	if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8067		return (rv);
8068
8069	if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8070		return (rv);
8071
8072	if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8073		return (rv);
8074
8075	if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8076		return (0);
8077
8078	return (rv);
8079}
8080
8081/*
8082 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8083 * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
8084 * libc's version, the kernel version only applies to 8-bit ASCII strings.
8085 * In addition, all of the recursion cases except for '*' matching have been
8086 * unwound.  For '*', we still implement recursive evaluation, but a depth
8087 * counter is maintained and matching is aborted if we recurse too deep.
8088 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8089 */
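/*
 * For example, dtrace_match_glob("syscall", "sys*", 0) and
 * dtrace_match_glob("entry", "e?tr[xy]", 0) each return 1, while
 * dtrace_match_glob("read", "[a-c]*", 0) returns 0.
 */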
8090static int
8091dtrace_match_glob(const char *s, const char *p, int depth)
8092{
8093	const char *olds;
8094	char s1, c;
8095	int gs;
8096
8097	if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8098		return (-1);
8099
8100	if (s == NULL)
8101		s = ""; /* treat NULL as empty string */
8102
8103top:
8104	olds = s;
8105	s1 = *s++;
8106
8107	if (p == NULL)
8108		return (0);
8109
8110	if ((c = *p++) == '\0')
8111		return (s1 == '\0');
8112
8113	switch (c) {
8114	case '[': {
8115		int ok = 0, notflag = 0;
8116		char lc = '\0';
8117
8118		if (s1 == '\0')
8119			return (0);
8120
8121		if (*p == '!') {
8122			notflag = 1;
8123			p++;
8124		}
8125
8126		if ((c = *p++) == '\0')
8127			return (0);
8128
8129		do {
8130			if (c == '-' && lc != '\0' && *p != ']') {
8131				if ((c = *p++) == '\0')
8132					return (0);
8133				if (c == '\\' && (c = *p++) == '\0')
8134					return (0);
8135
8136				if (notflag) {
8137					if (s1 < lc || s1 > c)
8138						ok++;
8139					else
8140						return (0);
8141				} else if (lc <= s1 && s1 <= c)
8142					ok++;
8143
8144			} else if (c == '\\' && (c = *p++) == '\0')
8145				return (0);
8146
8147			lc = c; /* save left-hand 'c' for next iteration */
8148
8149			if (notflag) {
8150				if (s1 != c)
8151					ok++;
8152				else
8153					return (0);
8154			} else if (s1 == c)
8155				ok++;
8156
8157			if ((c = *p++) == '\0')
8158				return (0);
8159
8160		} while (c != ']');
8161
8162		if (ok)
8163			goto top;
8164
8165		return (0);
8166	}
8167
8168	case '\\':
8169		if ((c = *p++) == '\0')
8170			return (0);
8171		/*FALLTHRU*/
8172
8173	default:
8174		if (c != s1)
8175			return (0);
8176		/*FALLTHRU*/
8177
8178	case '?':
8179		if (s1 != '\0')
8180			goto top;
8181		return (0);
8182
8183	case '*':
8184		while (*p == '*')
8185			p++; /* consecutive *'s are identical to a single one */
8186
8187		if (*p == '\0')
8188			return (1);
8189
8190		for (s = olds; *s != '\0'; s++) {
8191			if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
8192				return (gs);
8193		}
8194
8195		return (0);
8196	}
8197}
8198
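/*
 * The fixed (non-glob) match functions: exact string comparison for
 * literal patterns, the always-true match for NULL or empty patterns,
 * and the non-empty-string match used for anchored null keys (see
 * dtrace_probekey()).
 */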
8199/*ARGSUSED*/
8200static int
8201dtrace_match_string(const char *s, const char *p, int depth)
8202{
8203	return (s != NULL && strcmp(s, p) == 0);
8204}
8205
8206/*ARGSUSED*/
8207static int
8208dtrace_match_nul(const char *s, const char *p, int depth)
8209{
8210	return (1); /* always match the empty pattern */
8211}
8212
8213/*ARGSUSED*/
8214static int
8215dtrace_match_nonzero(const char *s, const char *p, int depth)
8216{
8217	return (s != NULL && s[0] != '\0');
8218}
8219
8220static int
8221dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8222    zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
8223{
8224	dtrace_probe_t template, *probe;
8225	dtrace_hash_t *hash = NULL;
8226	int len, best = INT_MAX, nmatched = 0;
8227	dtrace_id_t i;
8228
8229	ASSERT(MUTEX_HELD(&dtrace_lock));
8230
8231	/*
8232	 * If the probe ID is specified in the key, just lookup by ID and
8233	 * invoke the match callback once if a matching probe is found.
8234	 */
8235	if (pkp->dtpk_id != DTRACE_IDNONE) {
8236		if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8237		    dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8238			(void) (*matched)(probe, arg);
8239			nmatched++;
8240		}
8241		return (nmatched);
8242	}
8243
8244	template.dtpr_mod = (char *)pkp->dtpk_mod;
8245	template.dtpr_func = (char *)pkp->dtpk_func;
8246	template.dtpr_name = (char *)pkp->dtpk_name;
8247
8248	/*
8249	 * We want to find the most distinct of the module name, function
8250	 * name, and name.  So for each one that is not a glob pattern or
8251	 * empty string, we perform a lookup in the corresponding hash and
8252	 * use the hash table with the fewest collisions to do our search.
8253	 */
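	/*
	 * For "::read:entry", for instance, both the function and name
	 * hashes are literal; "read" will typically collide with far
	 * fewer probes than "entry", so dtrace_byfunc would be selected.
	 */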
8254	if (pkp->dtpk_mmatch == &dtrace_match_string &&
8255	    (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8256		best = len;
8257		hash = dtrace_bymod;
8258	}
8259
8260	if (pkp->dtpk_fmatch == &dtrace_match_string &&
8261	    (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8262		best = len;
8263		hash = dtrace_byfunc;
8264	}
8265
8266	if (pkp->dtpk_nmatch == &dtrace_match_string &&
8267	    (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8268		best = len;
8269		hash = dtrace_byname;
8270	}
8271
8272	/*
8273	 * If we did not select a hash table, iterate over every probe and
8274	 * invoke our callback for each one that matches our input probe key.
8275	 */
8276	if (hash == NULL) {
8277		for (i = 0; i < dtrace_nprobes; i++) {
8278			if ((probe = dtrace_probes[i]) == NULL ||
8279			    dtrace_match_probe(probe, pkp, priv, uid,
8280			    zoneid) <= 0)
8281				continue;
8282
8283			nmatched++;
8284
8285			if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8286				break;
8287		}
8288
8289		return (nmatched);
8290	}
8291
8292	/*
8293	 * If we selected a hash table, iterate over each probe of the same key
8294	 * name and invoke the callback for every probe that matches the other
8295	 * attributes of our input probe key.
8296	 */
8297	for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8298	    probe = *(DTRACE_HASHNEXT(hash, probe))) {
8299
8300		if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8301			continue;
8302
8303		nmatched++;
8304
8305		if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8306			break;
8307	}
8308
8309	return (nmatched);
8310}
8311
8312/*
8313 * Return the function pointer dtrace_match_probe() should use to compare the
8314 * specified pattern with a string.  For NULL or empty patterns, we select
8315 * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
8316 * For non-empty non-glob strings, we use dtrace_match_string().
8317 */
8318static dtrace_probekey_f *
8319dtrace_probekey_func(const char *p)
8320{
8321	char c;
8322
8323	if (p == NULL || *p == '\0')
8324		return (&dtrace_match_nul);
8325
8326	while ((c = *p++) != '\0') {
8327		if (c == '[' || c == '?' || c == '*' || c == '\\')
8328			return (&dtrace_match_glob);
8329	}
8330
8331	return (&dtrace_match_string);
8332}
8333
8334/*
8335 * Build a probe comparison key for use with dtrace_match_probe() from the
8336 * given probe description.  By convention, a null key only matches anchored
8337 * probes: if each field is the empty string, reset dtpk_fmatch to
8338 * dtrace_match_nonzero().
8339 */
8340static void
8341dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8342{
8343	pkp->dtpk_prov = pdp->dtpd_provider;
8344	pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8345
8346	pkp->dtpk_mod = pdp->dtpd_mod;
8347	pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
8348
8349	pkp->dtpk_func = pdp->dtpd_func;
8350	pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8351
8352	pkp->dtpk_name = pdp->dtpd_name;
8353	pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8354
8355	pkp->dtpk_id = pdp->dtpd_id;
8356
8357	if (pkp->dtpk_id == DTRACE_IDNONE &&
8358	    pkp->dtpk_pmatch == &dtrace_match_nul &&
8359	    pkp->dtpk_mmatch == &dtrace_match_nul &&
8360	    pkp->dtpk_fmatch == &dtrace_match_nul &&
8361	    pkp->dtpk_nmatch == &dtrace_match_nul)
8362		pkp->dtpk_fmatch = &dtrace_match_nonzero;
8363}
8364
8365/*
8366 * DTrace Provider-to-Framework API Functions
8367 *
8368 * These functions implement much of the Provider-to-Framework API, as
8369 * described in <sys/dtrace.h>.  The parts of the API not in this section are
8370 * the functions in the API for probe management (found below), and
8371 * dtrace_probe() itself (found above).
8372 */
8373
8374/*
8375 * Register the calling provider with the DTrace framework.  This should
8376 * generally be called by DTrace providers in their attach(9E) entry point.
8377 */
8378int
8379dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8380    cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8381{
8382	dtrace_provider_t *provider;
8383
8384	if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8385		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8386		    "arguments", name ? name : "<NULL>");
8387		return (EINVAL);
8388	}
8389
8390	if (name[0] == '\0' || dtrace_badname(name)) {
8391		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8392		    "provider name", name);
8393		return (EINVAL);
8394	}
8395
8396	if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8397	    pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8398	    pops->dtps_destroy == NULL ||
8399	    ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8400		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8401		    "provider ops", name);
8402		return (EINVAL);
8403	}
8404
8405	if (dtrace_badattr(&pap->dtpa_provider) ||
8406	    dtrace_badattr(&pap->dtpa_mod) ||
8407	    dtrace_badattr(&pap->dtpa_func) ||
8408	    dtrace_badattr(&pap->dtpa_name) ||
8409	    dtrace_badattr(&pap->dtpa_args)) {
8410		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8411		    "provider attributes", name);
8412		return (EINVAL);
8413	}
8414
8415	if (priv & ~DTRACE_PRIV_ALL) {
8416		cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8417		    "privilege attributes", name);
8418		return (EINVAL);
8419	}
8420
8421	if ((priv & DTRACE_PRIV_KERNEL) &&
8422	    (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8423	    pops->dtps_usermode == NULL) {
8424		cmn_err(CE_WARN, "failed to register provider '%s': need "
8425		    "dtps_usermode() op for given privilege attributes", name);
8426		return (EINVAL);
8427	}
8428
8429	provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8430	provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8431	(void) strcpy(provider->dtpv_name, name);
8432
8433	provider->dtpv_attr = *pap;
8434	provider->dtpv_priv.dtpp_flags = priv;
8435	if (cr != NULL) {
8436		provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8437		provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8438	}
8439	provider->dtpv_pops = *pops;
8440
8441	if (pops->dtps_provide == NULL) {
8442		ASSERT(pops->dtps_provide_module != NULL);
8443		provider->dtpv_pops.dtps_provide =
8444		    (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
8445	}
8446
8447	if (pops->dtps_provide_module == NULL) {
8448		ASSERT(pops->dtps_provide != NULL);
8449		provider->dtpv_pops.dtps_provide_module =
8450		    (void (*)(void *, modctl_t *))dtrace_nullop;
8451	}
8452
8453	if (pops->dtps_suspend == NULL) {
8454		ASSERT(pops->dtps_resume == NULL);
8455		provider->dtpv_pops.dtps_suspend =
8456		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8457		provider->dtpv_pops.dtps_resume =
8458		    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8459	}
8460
8461	provider->dtpv_arg = arg;
8462	*idp = (dtrace_provider_id_t)provider;
8463
8464	if (pops == &dtrace_provider_ops) {
8465		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8466		ASSERT(MUTEX_HELD(&dtrace_lock));
8467		ASSERT(dtrace_anon.dta_enabling == NULL);
8468
8469		/*
8470		 * We make sure that the DTrace provider is at the head of
8471		 * the provider chain.
8472		 */
8473		provider->dtpv_next = dtrace_provider;
8474		dtrace_provider = provider;
8475		return (0);
8476	}
8477
8478	mutex_enter(&dtrace_provider_lock);
8479	mutex_enter(&dtrace_lock);
8480
8481	/*
8482	 * If there is at least one provider registered, we'll add this
8483	 * provider after the first provider.
8484	 */
8485	if (dtrace_provider != NULL) {
8486		provider->dtpv_next = dtrace_provider->dtpv_next;
8487		dtrace_provider->dtpv_next = provider;
8488	} else {
8489		dtrace_provider = provider;
8490	}
8491
8492	if (dtrace_retained != NULL) {
8493		dtrace_enabling_provide(provider);
8494
8495		/*
8496		 * Now we need to call dtrace_enabling_matchall() -- which
8497		 * will acquire cpu_lock and dtrace_lock.  We therefore need
8498		 * to drop all of our locks before calling into it...
8499		 */
8500		mutex_exit(&dtrace_lock);
8501		mutex_exit(&dtrace_provider_lock);
8502		dtrace_enabling_matchall();
8503
8504		return (0);
8505	}
8506
8507	mutex_exit(&dtrace_lock);
8508	mutex_exit(&dtrace_provider_lock);
8509
8510	return (0);
8511}
8512
8513/*
8514 * Unregister the specified provider from the DTrace framework.  This should
8515 * generally be called by DTrace providers in their detach(9E) entry point.
8516 */
8517int
8518dtrace_unregister(dtrace_provider_id_t id)
8519{
8520	dtrace_provider_t *old = (dtrace_provider_t *)id;
8521	dtrace_provider_t *prev = NULL;
8522	int i, self = 0, noreap = 0;
8523	dtrace_probe_t *probe, *first = NULL;
8524
8525	if (old->dtpv_pops.dtps_enable ==
8526	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
8527		/*
8528		 * If DTrace itself is the provider, we're called with locks
8529		 * already held.
8530		 */
8531		ASSERT(old == dtrace_provider);
8532#if defined(sun)
8533		ASSERT(dtrace_devi != NULL);
8534#endif
8535		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8536		ASSERT(MUTEX_HELD(&dtrace_lock));
8537		self = 1;
8538
8539		if (dtrace_provider->dtpv_next != NULL) {
8540			/*
8541			 * There's another provider here; return failure.
8542			 */
8543			return (EBUSY);
8544		}
8545	} else {
8546		mutex_enter(&dtrace_provider_lock);
8547#if defined(sun)
8548		mutex_enter(&mod_lock);
8549#endif
8550		mutex_enter(&dtrace_lock);
8551	}
8552
8553	/*
8554	 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8555	 * probes, we refuse to let providers slither away, unless this
8556	 * provider has already been explicitly invalidated.
8557	 */
8558	if (!old->dtpv_defunct &&
8559	    (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8560	    dtrace_anon.dta_state->dts_necbs > 0))) {
8561		if (!self) {
8562			mutex_exit(&dtrace_lock);
8563#if defined(sun)
8564			mutex_exit(&mod_lock);
8565#endif
8566			mutex_exit(&dtrace_provider_lock);
8567		}
8568		return (EBUSY);
8569	}
8570
8571	/*
8572	 * Attempt to destroy the probes associated with this provider.
8573	 */
8574	for (i = 0; i < dtrace_nprobes; i++) {
8575		if ((probe = dtrace_probes[i]) == NULL)
8576			continue;
8577
8578		if (probe->dtpr_provider != old)
8579			continue;
8580
8581		if (probe->dtpr_ecb == NULL)
8582			continue;
8583
8584		/*
8585		 * If we are trying to unregister a defunct provider, and the
8586		 * provider was made defunct within the interval dictated by
8587		 * dtrace_unregister_defunct_reap, we'll (asynchronously)
8588		 * attempt to reap our enablings.  To denote that the provider
8589		 * should reattempt to unregister itself at some point in the
8590		 * future, we will return a differentiable error code (EAGAIN
8591		 * instead of EBUSY) in this case.
8592		 */
8593		if (dtrace_gethrtime() - old->dtpv_defunct >
8594		    dtrace_unregister_defunct_reap)
8595			noreap = 1;
8596
8597		if (!self) {
8598			mutex_exit(&dtrace_lock);
8599#if defined(sun)
8600			mutex_exit(&mod_lock);
8601#endif
8602			mutex_exit(&dtrace_provider_lock);
8603		}
8604
8605		if (noreap)
8606			return (EBUSY);
8607
8608		(void) taskq_dispatch(dtrace_taskq,
8609		    (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
8610
8611		return (EAGAIN);
8612	}
8613
8614	/*
8615	 * All of the probes for this provider are disabled; we can safely
8616	 * remove all of them from their hash chains and from the probe array.
8617	 */
8618	for (i = 0; i < dtrace_nprobes; i++) {
8619		if ((probe = dtrace_probes[i]) == NULL)
8620			continue;
8621
8622		if (probe->dtpr_provider != old)
8623			continue;
8624
8625		dtrace_probes[i] = NULL;
8626
8627		dtrace_hash_remove(dtrace_bymod, probe);
8628		dtrace_hash_remove(dtrace_byfunc, probe);
8629		dtrace_hash_remove(dtrace_byname, probe);
8630
8631		if (first == NULL) {
8632			first = probe;
8633			probe->dtpr_nextmod = NULL;
8634		} else {
8635			probe->dtpr_nextmod = first;
8636			first = probe;
8637		}
8638	}
8639
8640	/*
8641	 * The provider's probes have been removed from the hash chains and
8642	 * from the probe array.  Now issue a dtrace_sync() to be sure that
8643	 * everyone has cleared out from any probe array processing.
8644	 */
8645	dtrace_sync();
8646
8647	for (probe = first; probe != NULL; probe = first) {
8648		first = probe->dtpr_nextmod;
8649
8650		old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8651		    probe->dtpr_arg);
8652		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8653		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8654		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8655#if defined(sun)
8656		vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8657#else
8658		free_unr(dtrace_arena, probe->dtpr_id);
8659#endif
8660		kmem_free(probe, sizeof (dtrace_probe_t));
8661	}
8662
8663	if ((prev = dtrace_provider) == old) {
8664#if defined(sun)
8665		ASSERT(self || dtrace_devi == NULL);
8666		ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8667#endif
8668		dtrace_provider = old->dtpv_next;
8669	} else {
8670		while (prev != NULL && prev->dtpv_next != old)
8671			prev = prev->dtpv_next;
8672
8673		if (prev == NULL) {
8674			panic("attempt to unregister non-existent "
8675			    "dtrace provider %p\n", (void *)id);
8676		}
8677
8678		prev->dtpv_next = old->dtpv_next;
8679	}
8680
8681	if (!self) {
8682		mutex_exit(&dtrace_lock);
8683#if defined(sun)
8684		mutex_exit(&mod_lock);
8685#endif
8686		mutex_exit(&dtrace_provider_lock);
8687	}
8688
8689	kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
8690	kmem_free(old, sizeof (dtrace_provider_t));
8691
8692	return (0);
8693}
8694
8695/*
8696 * Invalidate the specified provider.  All subsequent probe lookups for the
8697 * specified provider will fail, but its probes will not be removed.
8698 */
8699void
8700dtrace_invalidate(dtrace_provider_id_t id)
8701{
8702	dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8703
8704	ASSERT(pvp->dtpv_pops.dtps_enable !=
8705	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8706
8707	mutex_enter(&dtrace_provider_lock);
8708	mutex_enter(&dtrace_lock);
8709
8710	pvp->dtpv_defunct = dtrace_gethrtime();
8711
8712	mutex_exit(&dtrace_lock);
8713	mutex_exit(&dtrace_provider_lock);
8714}
8715
8716/*
8717 * Indicate whether or not DTrace has attached.
8718 */
8719int
8720dtrace_attached(void)
8721{
8722	/*
8723	 * dtrace_provider will be non-NULL iff the DTrace driver has
8724	 * attached.  (It's non-NULL because DTrace is always itself a
8725	 * provider.)
8726	 */
8727	return (dtrace_provider != NULL);
8728}
8729
8730/*
8731 * Remove all the unenabled probes for the given provider.  This function is
8732 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8733 * -- just as many of its associated probes as it can.
8734 */
8735int
8736dtrace_condense(dtrace_provider_id_t id)
8737{
8738	dtrace_provider_t *prov = (dtrace_provider_t *)id;
8739	int i;
8740	dtrace_probe_t *probe;
8741
8742	/*
8743	 * Make sure this isn't the dtrace provider itself.
8744	 */
8745	ASSERT(prov->dtpv_pops.dtps_enable !=
8746	    (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8747
8748	mutex_enter(&dtrace_provider_lock);
8749	mutex_enter(&dtrace_lock);
8750
8751	/*
8752	 * Attempt to destroy the probes associated with this provider.
8753	 */
8754	for (i = 0; i < dtrace_nprobes; i++) {
8755		if ((probe = dtrace_probes[i]) == NULL)
8756			continue;
8757
8758		if (probe->dtpr_provider != prov)
8759			continue;
8760
8761		if (probe->dtpr_ecb != NULL)
8762			continue;
8763
8764		dtrace_probes[i] = NULL;
8765
8766		dtrace_hash_remove(dtrace_bymod, probe);
8767		dtrace_hash_remove(dtrace_byfunc, probe);
8768		dtrace_hash_remove(dtrace_byname, probe);
8769
8770		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
8771		    probe->dtpr_arg);
8772		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8773		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8774		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8775		kmem_free(probe, sizeof (dtrace_probe_t));
8776#if defined(sun)
8777		vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
8778#else
8779		free_unr(dtrace_arena, i + 1);
8780#endif
8781	}
8782
8783	mutex_exit(&dtrace_lock);
8784	mutex_exit(&dtrace_provider_lock);
8785
8786	return (0);
8787}
8788
8789/*
8790 * DTrace Probe Management Functions
8791 *
8792 * The functions in this section perform the DTrace probe management,
8793 * including functions to create probes, look-up probes, and call into the
8794 * providers to request that probes be provided.  Some of these functions are
8795 * in the Provider-to-Framework API; these functions can be identified by the
8796 * fact that they are not declared "static".
8797 */
8798
8799/*
8800 * Create a probe with the specified module name, function name, and name.
8801 */
8802dtrace_id_t
8803dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8804    const char *func, const char *name, int aframes, void *arg)
8805{
8806	dtrace_probe_t *probe, **probes;
8807	dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8808	dtrace_id_t id;
8809
8810	if (provider == dtrace_provider) {
8811		ASSERT(MUTEX_HELD(&dtrace_lock));
8812	} else {
8813		mutex_enter(&dtrace_lock);
8814	}
8815
8816#if defined(sun)
8817	id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8818	    VM_BESTFIT | VM_SLEEP);
8819#else
8820	id = alloc_unr(dtrace_arena);
8821#endif
8822	probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
8823
8824	probe->dtpr_id = id;
8825	probe->dtpr_gen = dtrace_probegen++;
8826	probe->dtpr_mod = dtrace_strdup(mod);
8827	probe->dtpr_func = dtrace_strdup(func);
8828	probe->dtpr_name = dtrace_strdup(name);
8829	probe->dtpr_arg = arg;
8830	probe->dtpr_aframes = aframes;
8831	probe->dtpr_provider = provider;
8832
8833	dtrace_hash_add(dtrace_bymod, probe);
8834	dtrace_hash_add(dtrace_byfunc, probe);
8835	dtrace_hash_add(dtrace_byname, probe);
8836
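	/*
	 * Grow the probes array by doubling.  Probe context traverses
	 * dtrace_probes without locks, so the new array is published with
	 * dtrace_membar_producer() and the old one freed only after
	 * dtrace_sync() assures that no CPU still references it.
	 */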
8837	if (id - 1 >= dtrace_nprobes) {
8838		size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8839		size_t nsize = osize << 1;
8840
8841		if (nsize == 0) {
8842			ASSERT(osize == 0);
8843			ASSERT(dtrace_probes == NULL);
8844			nsize = sizeof (dtrace_probe_t *);
8845		}
8846
8847		probes = kmem_zalloc(nsize, KM_SLEEP);
8848
8849		if (dtrace_probes == NULL) {
8850			ASSERT(osize == 0);
8851			dtrace_probes = probes;
8852			dtrace_nprobes = 1;
8853		} else {
8854			dtrace_probe_t **oprobes = dtrace_probes;
8855
8856			bcopy(oprobes, probes, osize);
8857			dtrace_membar_producer();
8858			dtrace_probes = probes;
8859
8860			dtrace_sync();
8861
8862			/*
8863			 * All CPUs are now seeing the new probes array; we can
8864			 * safely free the old array.
8865			 */
8866			kmem_free(oprobes, osize);
8867			dtrace_nprobes <<= 1;
8868		}
8869
8870		ASSERT(id - 1 < dtrace_nprobes);
8871	}
8872
8873	ASSERT(dtrace_probes[id - 1] == NULL);
8874	dtrace_probes[id - 1] = probe;
8875
8876	if (provider != dtrace_provider)
8877		mutex_exit(&dtrace_lock);
8878
8879	return (id);
8880}
8881
8882static dtrace_probe_t *
8883dtrace_probe_lookup_id(dtrace_id_t id)
8884{
8885	ASSERT(MUTEX_HELD(&dtrace_lock));
8886
8887	if (id == 0 || id > dtrace_nprobes)
8888		return (NULL);
8889
8890	return (dtrace_probes[id - 1]);
8891}
8892
8893static int
8894dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
8895{
8896	*((dtrace_id_t *)arg) = probe->dtpr_id;
8897
8898	return (DTRACE_MATCH_DONE);
8899}
8900
8901/*
8902 * Look up a probe based on provider and one or more of module name, function
8903 * name and probe name.
8904 */
8905dtrace_id_t
8906dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
8907    char *func, char *name)
8908{
8909	dtrace_probekey_t pkey;
8910	dtrace_id_t id;
8911	int match;
8912
8913	pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
8914	pkey.dtpk_pmatch = &dtrace_match_string;
8915	pkey.dtpk_mod = mod;
8916	pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
8917	pkey.dtpk_func = func;
8918	pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
8919	pkey.dtpk_name = name;
8920	pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
8921	pkey.dtpk_id = DTRACE_IDNONE;
8922
8923	mutex_enter(&dtrace_lock);
8924	match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
8925	    dtrace_probe_lookup_match, &id);
8926	mutex_exit(&dtrace_lock);
8927
8928	ASSERT(match == 1 || match == 0);
8929	return (match ? id : 0);
8930}
8931
8932/*
8933 * Returns the probe argument associated with the specified probe.
8934 */
8935void *
8936dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
8937{
8938	dtrace_probe_t *probe;
8939	void *rval = NULL;
8940
8941	mutex_enter(&dtrace_lock);
8942
8943	if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
8944	    probe->dtpr_provider == (dtrace_provider_t *)id)
8945		rval = probe->dtpr_arg;
8946
8947	mutex_exit(&dtrace_lock);
8948
8949	return (rval);
8950}
8951
8952/*
8953 * Copy a probe into a probe description.
8954 */
8955static void
8956dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
8957{
8958	bzero(pdp, sizeof (dtrace_probedesc_t));
8959	pdp->dtpd_id = prp->dtpr_id;
8960
8961	(void) strncpy(pdp->dtpd_provider,
8962	    prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
8963
8964	(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
8965	(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
8966	(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
8967}
8968
8969/*
8970 * Called to indicate that a probe -- or probes -- should be provided by a
8971 * specified provider.  If the specified description is NULL, the provider will
8972 * be told to provide all of its probes.  (This is done whenever a new
8973 * consumer comes along, or whenever a retained enabling is to be matched.) If
8974 * the specified description is non-NULL, the provider is given the
8975 * opportunity to dynamically provide the specified probe, allowing providers
8976 * to support the creation of probes on-the-fly.  (So-called _autocreated_
8977 * probes.)  If the provider is NULL, the operations will be applied to all
8978 * providers; if the provider is non-NULL the operations will only be applied
8979 * to the specified provider.  The dtrace_provider_lock must be held, and the
8980 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
8981 * will need to grab the dtrace_lock when it reenters the framework through
8982 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
8983 */
8984static void
8985dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
8986{
8987#if defined(sun)
8988	modctl_t *ctl;
8989#endif
8990	int all = 0;
8991
8992	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8993
8994	if (prv == NULL) {
8995		all = 1;
8996		prv = dtrace_provider;
8997	}
8998
8999	do {
9000		/*
9001		 * First, call the blanket provide operation.
9002		 */
9003		prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9004
9005#if defined(sun)
9006		/*
9007		 * Now call the per-module provide operation.  We will grab
9008		 * mod_lock to prevent the list from being modified.  Note
9009		 * that this also prevents the mod_busy bits from changing.
9010		 * (mod_busy can only be changed with mod_lock held.)
9011		 */
9012		mutex_enter(&mod_lock);
9013
9014		ctl = &modules;
9015		do {
9016			if (ctl->mod_busy || ctl->mod_mp == NULL)
9017				continue;
9018
9019			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9020
9021		} while ((ctl = ctl->mod_next) != &modules);
9022
9023		mutex_exit(&mod_lock);
9024#endif
9025	} while (all && (prv = prv->dtpv_next) != NULL);
9026}
9027
9028#if defined(sun)
9029/*
9030 * Iterate over each probe, and call the Framework-to-Provider API function
9031 * denoted by offs.
9032 */
9033static void
9034dtrace_probe_foreach(uintptr_t offs)
9035{
9036	dtrace_provider_t *prov;
9037	void (*func)(void *, dtrace_id_t, void *);
9038	dtrace_probe_t *probe;
9039	dtrace_icookie_t cookie;
9040	int i;
9041
9042	/*
9043	 * We disable interrupts to walk through the probe array.  This is
9044	 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
9045	 * won't see stale data.
9046	 */
9047	cookie = dtrace_interrupt_disable();
9048
9049	for (i = 0; i < dtrace_nprobes; i++) {
9050		if ((probe = dtrace_probes[i]) == NULL)
9051			continue;
9052
9053		if (probe->dtpr_ecb == NULL) {
9054			/*
9055			 * This probe isn't enabled -- don't call the function.
9056			 */
9057			continue;
9058		}
9059
9060		prov = probe->dtpr_provider;
9061		func = *((void(**)(void *, dtrace_id_t, void *))
9062		    ((uintptr_t)&prov->dtpv_pops + offs));
9063
9064		func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
9065	}
9066
9067	dtrace_interrupt_enable(cookie);
9068}
9069#endif
9070
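/*
 * Enable all probes matching the specified description: construct a
 * probe key from it, derive the consumer's privileges from its
 * credential, and invoke dtrace_ecb_create_enable() on each match.  A
 * NULL description instead creates a single ECB with a NULL probe.
 */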
9071static int
9072dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
9073{
9074	dtrace_probekey_t pkey;
9075	uint32_t priv;
9076	uid_t uid;
9077	zoneid_t zoneid;
9078
9079	ASSERT(MUTEX_HELD(&dtrace_lock));
9080	dtrace_ecb_create_cache = NULL;
9081
9082	if (desc == NULL) {
9083		/*
9084		 * If we're passed a NULL description, we're being asked to
9085		 * create an ECB with a NULL probe.
9086		 */
9087		(void) dtrace_ecb_create_enable(NULL, enab);
9088		return (0);
9089	}
9090
9091	dtrace_probekey(desc, &pkey);
9092	dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9093	    &priv, &uid, &zoneid);
9094
9095	return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
9096	    enab));
9097}
9098
9099/*
9100 * DTrace Helper Provider Functions
9101 */
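/*
 * Unpack a DOF-encoded attribute word into its name, data and class
 * stability components.
 */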
9102static void
9103dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9104{
9105	attr->dtat_name = DOF_ATTR_NAME(dofattr);
9106	attr->dtat_data = DOF_ATTR_DATA(dofattr);
9107	attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9108}
9109
9110static void
9111dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9112    const dof_provider_t *dofprov, char *strtab)
9113{
9114	hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9115	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9116	    dofprov->dofpv_provattr);
9117	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9118	    dofprov->dofpv_modattr);
9119	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9120	    dofprov->dofpv_funcattr);
9121	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9122	    dofprov->dofpv_nameattr);
9123	dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9124	    dofprov->dofpv_argsattr);
9125}
9126
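/*
 * Given a single DOF_SECT_PROVIDER section, resolve the string table
 * and the probe, argument and offset sections to which it refers, and
 * ask the registered meta-provider (typically fasttrap, on behalf of
 * the pid provider) to create the helper provider and each of its
 * probes for the indicated process.
 */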
9127static void
9128dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9129{
9130	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9131	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9132	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9133	dof_provider_t *provider;
9134	dof_probe_t *probe;
9135	uint32_t *off, *enoff;
9136	uint8_t *arg;
9137	char *strtab;
9138	uint_t i, nprobes;
9139	dtrace_helper_provdesc_t dhpv;
9140	dtrace_helper_probedesc_t dhpb;
9141	dtrace_meta_t *meta = dtrace_meta_pid;
9142	dtrace_mops_t *mops = &meta->dtm_mops;
9143	void *parg;
9144
9145	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9146	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9147	    provider->dofpv_strtab * dof->dofh_secsize);
9148	prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9149	    provider->dofpv_probes * dof->dofh_secsize);
9150	arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9151	    provider->dofpv_prargs * dof->dofh_secsize);
9152	off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9153	    provider->dofpv_proffs * dof->dofh_secsize);
9154
9155	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9156	off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9157	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9158	enoff = NULL;
9159
9160	/*
9161	 * See dtrace_helper_provider_validate().
9162	 */
9163	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9164	    provider->dofpv_prenoffs != DOF_SECT_NONE) {
9165		enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9166		    provider->dofpv_prenoffs * dof->dofh_secsize);
9167		enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9168	}
9169
9170	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9171
9172	/*
9173	 * Create the provider.
9174	 */
9175	dtrace_dofprov2hprov(&dhpv, provider, strtab);
9176
9177	if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
9178		return;
9179
9180	meta->dtm_count++;
9181
9182	/*
9183	 * Create the probes.
9184	 */
9185	for (i = 0; i < nprobes; i++) {
9186		probe = (dof_probe_t *)(uintptr_t)(daddr +
9187		    prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9188
9189		dhpb.dthpb_mod = dhp->dofhp_mod;
9190		dhpb.dthpb_func = strtab + probe->dofpr_func;
9191		dhpb.dthpb_name = strtab + probe->dofpr_name;
9192		dhpb.dthpb_base = probe->dofpr_addr;
9193		dhpb.dthpb_offs = off + probe->dofpr_offidx;
9194		dhpb.dthpb_noffs = probe->dofpr_noffs;
9195		if (enoff != NULL) {
9196			dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
9197			dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9198		} else {
9199			dhpb.dthpb_enoffs = NULL;
9200			dhpb.dthpb_nenoffs = 0;
9201		}
9202		dhpb.dthpb_args = arg + probe->dofpr_argidx;
9203		dhpb.dthpb_nargc = probe->dofpr_nargc;
9204		dhpb.dthpb_xargc = probe->dofpr_xargc;
9205		dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9206		dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9207
9208		mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9209	}
9210}
9211
9212static void
9213dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
9214{
9215	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9216	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9217	int i;
9218
9219	ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9220
9221	for (i = 0; i < dof->dofh_secnum; i++) {
9222		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9223		    dof->dofh_secoff + i * dof->dofh_secsize);
9224
9225		if (sec->dofs_type != DOF_SECT_PROVIDER)
9226			continue;
9227
9228		dtrace_helper_provide_one(dhp, sec, pid);
9229	}
9230
9231	/*
9232	 * We may have just created probes, so we must now rematch against
9233	 * any retained enablings.  Note that this call will acquire both
9234	 * cpu_lock and dtrace_lock; the fact that we are holding
9235	 * dtrace_meta_lock now is what defines the ordering with respect to
9236	 * these three locks.
9237	 */
9238	dtrace_enabling_matchall();
9239}
9240
9241static void
9242dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9243{
9244	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9245	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9246	dof_sec_t *str_sec;
9247	dof_provider_t *provider;
9248	char *strtab;
9249	dtrace_helper_provdesc_t dhpv;
9250	dtrace_meta_t *meta = dtrace_meta_pid;
9251	dtrace_mops_t *mops = &meta->dtm_mops;
9252
9253	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9254	str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9255	    provider->dofpv_strtab * dof->dofh_secsize);
9256
9257	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9258
9259	/*
9260	 * Create the provider.
9261	 */
9262	dtrace_dofprov2hprov(&dhpv, provider, strtab);
9263
9264	mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
9265
9266	meta->dtm_count--;
9267}
9268
9269static void
9270dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
9271{
9272	uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9273	dof_hdr_t *dof = (dof_hdr_t *)daddr;
9274	int i;
9275
9276	ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9277
9278	for (i = 0; i < dof->dofh_secnum; i++) {
9279		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9280		    dof->dofh_secoff + i * dof->dofh_secsize);
9281
9282		if (sec->dofs_type != DOF_SECT_PROVIDER)
9283			continue;
9284
9285		dtrace_helper_provider_remove_one(dhp, sec, pid);
9286	}
9287}
9288
9289/*
9290 * DTrace Meta Provider-to-Framework API Functions
9291 *
9292 * These functions implement the Meta Provider-to-Framework API, as described
9293 * in <sys/dtrace.h>.
9294 */
9295int
9296dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9297    dtrace_meta_provider_id_t *idp)
9298{
9299	dtrace_meta_t *meta;
9300	dtrace_helpers_t *help, *next;
9301	int i;
9302
9303	*idp = DTRACE_METAPROVNONE;
9304
9305	/*
9306	 * We strictly don't need the name, but we hold onto it for
9307	 * debuggability. All hail error queues!
9308	 */
9309	if (name == NULL) {
9310		cmn_err(CE_WARN, "failed to register meta-provider: "
9311		    "invalid name");
9312		return (EINVAL);
9313	}
9314
9315	if (mops == NULL ||
9316	    mops->dtms_create_probe == NULL ||
9317	    mops->dtms_provide_pid == NULL ||
9318	    mops->dtms_remove_pid == NULL) {
9319		cmn_err(CE_WARN, "failed to register meta-provider %s: "
9320		    "invalid ops", name);
9321		return (EINVAL);
9322	}
9323
9324	meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9325	meta->dtm_mops = *mops;
9326	meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
9327	(void) strcpy(meta->dtm_name, name);
9328	meta->dtm_arg = arg;
9329
9330	mutex_enter(&dtrace_meta_lock);
9331	mutex_enter(&dtrace_lock);
9332
9333	if (dtrace_meta_pid != NULL) {
9334		mutex_exit(&dtrace_lock);
9335		mutex_exit(&dtrace_meta_lock);
9336		cmn_err(CE_WARN, "failed to register meta-provider %s: "
9337		    "user-land meta-provider exists", name);
9338		kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
9339		kmem_free(meta, sizeof (dtrace_meta_t));
9340		return (EINVAL);
9341	}
9342
9343	dtrace_meta_pid = meta;
9344	*idp = (dtrace_meta_provider_id_t)meta;
9345
9346	/*
9347	 * If there are providers and probes ready to go, pass them
9348	 * off to the new meta provider now.
9349	 */
9350
9351	help = dtrace_deferred_pid;
9352	dtrace_deferred_pid = NULL;
9353
9354	mutex_exit(&dtrace_lock);
9355
9356	while (help != NULL) {
9357		for (i = 0; i < help->dthps_nprovs; i++) {
9358			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9359			    help->dthps_pid);
9360		}
9361
9362		next = help->dthps_next;
9363		help->dthps_next = NULL;
9364		help->dthps_prev = NULL;
9365		help->dthps_deferred = 0;
9366		help = next;
9367	}
9368
9369	mutex_exit(&dtrace_meta_lock);
9370
9371	return (0);
9372}
9373
9374int
9375dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9376{
9377	dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9378
9379	mutex_enter(&dtrace_meta_lock);
9380	mutex_enter(&dtrace_lock);
9381
9382	if (old == dtrace_meta_pid) {
9383		pp = &dtrace_meta_pid;
9384	} else {
9385		panic("attempt to unregister non-existent "
9386		    "dtrace meta-provider %p\n", (void *)old);
9387	}
9388
9389	if (old->dtm_count != 0) {
9390		mutex_exit(&dtrace_lock);
9391		mutex_exit(&dtrace_meta_lock);
9392		return (EBUSY);
9393	}
9394
9395	*pp = NULL;
9396
9397	mutex_exit(&dtrace_lock);
9398	mutex_exit(&dtrace_meta_lock);
9399
9400	kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
9401	kmem_free(old, sizeof (dtrace_meta_t));
9402
9403	return (0);
9404}
9405
9406
9407/*
9408 * DTrace DIF Object Functions
9409 */
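/*
 * Report a DIF validation error at the specified pc, echoing it to the
 * console when dtrace_err_verbose is set and recording it in the error
 * hash under DTRACE_ERRDEBUG.  The return value of 1 allows callers to
 * accumulate an error count.
 */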
9410static int
9411dtrace_difo_err(uint_t pc, const char *format, ...)
9412{
9413	if (dtrace_err_verbose) {
9414		va_list alist;
9415
9416		(void) uprintf("dtrace DIF object error: [%u]: ", pc);
9417		va_start(alist, format);
9418		(void) vuprintf(format, alist);
9419		va_end(alist);
9420	}
9421
9422#ifdef DTRACE_ERRDEBUG
9423	dtrace_errdebug(format);
9424#endif
9425	return (1);
9426}
9427
9428/*
9429 * Validate a DTrace DIF object by checking the IR instructions.  The following
9430 * rules are currently enforced by dtrace_difo_validate():
9431 *
9432 * 1. Each instruction must have a valid opcode
9433 * 2. Each register, string, variable, or subroutine reference must be valid
9434 * 3. No instruction can modify register %r0 (must be zero)
9435 * 4. All instruction reserved bits must be set to zero
9436 * 5. The last instruction must be a "ret" instruction
9437 * 6. All branch targets must reference a valid instruction _after_ the branch
9438 */
9439static int
9440dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9441    cred_t *cr)
9442{
9443	int err = 0, i;
9444	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9445	int kcheckload;
9446	uint_t pc;
9447
9448	kcheckload = cr == NULL ||
9449	    (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9450
9451	dp->dtdo_destructive = 0;
9452
9453	for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9454		dif_instr_t instr = dp->dtdo_buf[pc];
9455
9456		uint_t r1 = DIF_INSTR_R1(instr);
9457		uint_t r2 = DIF_INSTR_R2(instr);
9458		uint_t rd = DIF_INSTR_RD(instr);
9459		uint_t rs = DIF_INSTR_RS(instr);
9460		uint_t label = DIF_INSTR_LABEL(instr);
9461		uint_t v = DIF_INSTR_VAR(instr);
9462		uint_t subr = DIF_INSTR_SUBR(instr);
9463		uint_t type = DIF_INSTR_TYPE(instr);
9464		uint_t op = DIF_INSTR_OP(instr);
9465
9466		switch (op) {
9467		case DIF_OP_OR:
9468		case DIF_OP_XOR:
9469		case DIF_OP_AND:
9470		case DIF_OP_SLL:
9471		case DIF_OP_SRL:
9472		case DIF_OP_SRA:
9473		case DIF_OP_SUB:
9474		case DIF_OP_ADD:
9475		case DIF_OP_MUL:
9476		case DIF_OP_SDIV:
9477		case DIF_OP_UDIV:
9478		case DIF_OP_SREM:
9479		case DIF_OP_UREM:
9480		case DIF_OP_COPYS:
9481			if (r1 >= nregs)
9482				err += efunc(pc, "invalid register %u\n", r1);
9483			if (r2 >= nregs)
9484				err += efunc(pc, "invalid register %u\n", r2);
9485			if (rd >= nregs)
9486				err += efunc(pc, "invalid register %u\n", rd);
9487			if (rd == 0)
9488				err += efunc(pc, "cannot write to %r0\n");
9489			break;
9490		case DIF_OP_NOT:
9491		case DIF_OP_MOV:
9492		case DIF_OP_ALLOCS:
9493			if (r1 >= nregs)
9494				err += efunc(pc, "invalid register %u\n", r1);
9495			if (r2 != 0)
9496				err += efunc(pc, "non-zero reserved bits\n");
9497			if (rd >= nregs)
9498				err += efunc(pc, "invalid register %u\n", rd);
9499			if (rd == 0)
9500				err += efunc(pc, "cannot write to %r0\n");
9501			break;
9502		case DIF_OP_LDSB:
9503		case DIF_OP_LDSH:
9504		case DIF_OP_LDSW:
9505		case DIF_OP_LDUB:
9506		case DIF_OP_LDUH:
9507		case DIF_OP_LDUW:
9508		case DIF_OP_LDX:
9509			if (r1 >= nregs)
9510				err += efunc(pc, "invalid register %u\n", r1);
9511			if (r2 != 0)
9512				err += efunc(pc, "non-zero reserved bits\n");
9513			if (rd >= nregs)
9514				err += efunc(pc, "invalid register %u\n", rd);
9515			if (rd == 0)
9516				err += efunc(pc, "cannot write to %r0\n");
9517			if (kcheckload)
9518				dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9519				    DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9520			break;
9521		case DIF_OP_RLDSB:
9522		case DIF_OP_RLDSH:
9523		case DIF_OP_RLDSW:
9524		case DIF_OP_RLDUB:
9525		case DIF_OP_RLDUH:
9526		case DIF_OP_RLDUW:
9527		case DIF_OP_RLDX:
9528			if (r1 >= nregs)
9529				err += efunc(pc, "invalid register %u\n", r1);
9530			if (r2 != 0)
9531				err += efunc(pc, "non-zero reserved bits\n");
9532			if (rd >= nregs)
9533				err += efunc(pc, "invalid register %u\n", rd);
9534			if (rd == 0)
9535				err += efunc(pc, "cannot write to %r0\n");
9536			break;
9537		case DIF_OP_ULDSB:
9538		case DIF_OP_ULDSH:
9539		case DIF_OP_ULDSW:
9540		case DIF_OP_ULDUB:
9541		case DIF_OP_ULDUH:
9542		case DIF_OP_ULDUW:
9543		case DIF_OP_ULDX:
9544			if (r1 >= nregs)
9545				err += efunc(pc, "invalid register %u\n", r1);
9546			if (r2 != 0)
9547				err += efunc(pc, "non-zero reserved bits\n");
9548			if (rd >= nregs)
9549				err += efunc(pc, "invalid register %u\n", rd);
9550			if (rd == 0)
9551				err += efunc(pc, "cannot write to %r0\n");
9552			break;
9553		case DIF_OP_STB:
9554		case DIF_OP_STH:
9555		case DIF_OP_STW:
9556		case DIF_OP_STX:
9557			if (r1 >= nregs)
9558				err += efunc(pc, "invalid register %u\n", r1);
9559			if (r2 != 0)
9560				err += efunc(pc, "non-zero reserved bits\n");
9561			if (rd >= nregs)
9562				err += efunc(pc, "invalid register %u\n", rd);
9563			if (rd == 0)
9564				err += efunc(pc, "cannot write to 0 address\n");
9565			break;
9566		case DIF_OP_CMP:
9567		case DIF_OP_SCMP:
9568			if (r1 >= nregs)
9569				err += efunc(pc, "invalid register %u\n", r1);
9570			if (r2 >= nregs)
9571				err += efunc(pc, "invalid register %u\n", r2);
9572			if (rd != 0)
9573				err += efunc(pc, "non-zero reserved bits\n");
9574			break;
9575		case DIF_OP_TST:
9576			if (r1 >= nregs)
9577				err += efunc(pc, "invalid register %u\n", r1);
9578			if (r2 != 0 || rd != 0)
9579				err += efunc(pc, "non-zero reserved bits\n");
9580			break;
9581		case DIF_OP_BA:
9582		case DIF_OP_BE:
9583		case DIF_OP_BNE:
9584		case DIF_OP_BG:
9585		case DIF_OP_BGU:
9586		case DIF_OP_BGE:
9587		case DIF_OP_BGEU:
9588		case DIF_OP_BL:
9589		case DIF_OP_BLU:
9590		case DIF_OP_BLE:
9591		case DIF_OP_BLEU:
9592			if (label >= dp->dtdo_len) {
9593				err += efunc(pc, "invalid branch target %u\n",
9594				    label);
9595			}
9596			if (label <= pc) {
9597				err += efunc(pc, "backward branch to %u\n",
9598				    label);
9599			}
9600			break;
9601		case DIF_OP_RET:
9602			if (r1 != 0 || r2 != 0)
9603				err += efunc(pc, "non-zero reserved bits\n");
9604			if (rd >= nregs)
9605				err += efunc(pc, "invalid register %u\n", rd);
9606			break;
9607		case DIF_OP_NOP:
9608		case DIF_OP_POPTS:
9609		case DIF_OP_FLUSHTS:
9610			if (r1 != 0 || r2 != 0 || rd != 0)
9611				err += efunc(pc, "non-zero reserved bits\n");
9612			break;
9613		case DIF_OP_SETX:
9614			if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9615				err += efunc(pc, "invalid integer ref %u\n",
9616				    DIF_INSTR_INTEGER(instr));
9617			}
9618			if (rd >= nregs)
9619				err += efunc(pc, "invalid register %u\n", rd);
9620			if (rd == 0)
9621				err += efunc(pc, "cannot write to %r0\n");
9622			break;
9623		case DIF_OP_SETS:
9624			if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9625				err += efunc(pc, "invalid string ref %u\n",
9626				    DIF_INSTR_STRING(instr));
9627			}
9628			if (rd >= nregs)
9629				err += efunc(pc, "invalid register %u\n", rd);
9630			if (rd == 0)
9631				err += efunc(pc, "cannot write to %r0\n");
9632			break;
9633		case DIF_OP_LDGA:
9634		case DIF_OP_LDTA:
9635			if (r1 > DIF_VAR_ARRAY_MAX)
9636				err += efunc(pc, "invalid array %u\n", r1);
9637			if (r2 >= nregs)
9638				err += efunc(pc, "invalid register %u\n", r2);
9639			if (rd >= nregs)
9640				err += efunc(pc, "invalid register %u\n", rd);
9641			if (rd == 0)
9642				err += efunc(pc, "cannot write to %r0\n");
9643			break;
9644		case DIF_OP_LDGS:
9645		case DIF_OP_LDTS:
9646		case DIF_OP_LDLS:
9647		case DIF_OP_LDGAA:
9648		case DIF_OP_LDTAA:
9649			if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9650				err += efunc(pc, "invalid variable %u\n", v);
9651			if (rd >= nregs)
9652				err += efunc(pc, "invalid register %u\n", rd);
9653			if (rd == 0)
9654				err += efunc(pc, "cannot write to %r0\n");
9655			break;
9656		case DIF_OP_STGS:
9657		case DIF_OP_STTS:
9658		case DIF_OP_STLS:
9659		case DIF_OP_STGAA:
9660		case DIF_OP_STTAA:
9661			if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9662				err += efunc(pc, "invalid variable %u\n", v);
9663			if (rs >= nregs)
9664				err += efunc(pc, "invalid register %u\n", rs);
9665			break;
9666		case DIF_OP_CALL:
9667			if (subr > DIF_SUBR_MAX)
9668				err += efunc(pc, "invalid subr %u\n", subr);
9669			if (rd >= nregs)
9670				err += efunc(pc, "invalid register %u\n", rd);
9671			if (rd == 0)
9672				err += efunc(pc, "cannot write to %r0\n");
9673
9674			if (subr == DIF_SUBR_COPYOUT ||
9675			    subr == DIF_SUBR_COPYOUTSTR) {
9676				dp->dtdo_destructive = 1;
9677			}
9678
9679			if (subr == DIF_SUBR_GETF) {
9680				/*
9681				 * If we have a getf() we need to record that
9682				 * in our state.  Note that our state can be
9683				 * NULL if this is a helper -- but in that
9684				 * case, the call to getf() is itself illegal,
9685				 * and will be caught (slightly later) when
9686				 * the helper is validated.
9687				 */
9688				if (vstate->dtvs_state != NULL)
9689					vstate->dtvs_state->dts_getf++;
9690			}
9691
9692			break;
9693		case DIF_OP_PUSHTR:
9694			if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9695				err += efunc(pc, "invalid ref type %u\n", type);
9696			if (r2 >= nregs)
9697				err += efunc(pc, "invalid register %u\n", r2);
9698			if (rs >= nregs)
9699				err += efunc(pc, "invalid register %u\n", rs);
9700			break;
9701		case DIF_OP_PUSHTV:
9702			if (type != DIF_TYPE_CTF)
9703				err += efunc(pc, "invalid val type %u\n", type);
9704			if (r2 >= nregs)
9705				err += efunc(pc, "invalid register %u\n", r2);
9706			if (rs >= nregs)
9707				err += efunc(pc, "invalid register %u\n", rs);
9708			break;
9709		default:
9710			err += efunc(pc, "invalid opcode %u\n",
9711			    DIF_INSTR_OP(instr));
9712		}
9713	}
9714
9715	if (dp->dtdo_len != 0 &&
9716	    DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9717		err += efunc(dp->dtdo_len - 1,
9718		    "expected 'ret' as last DIF instruction\n");
9719	}
9720
9721	if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9722		/*
9723		 * If we're not returning by reference, the size must be either
9724		 * 0 or the size of one of the base types.
9725		 */
9726		switch (dp->dtdo_rtype.dtdt_size) {
9727		case 0:
9728		case sizeof (uint8_t):
9729		case sizeof (uint16_t):
9730		case sizeof (uint32_t):
9731		case sizeof (uint64_t):
9732			break;
9733
9734		default:
9735			err += efunc(dp->dtdo_len - 1, "bad return size\n");
9736		}
9737	}
9738
9739	for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9740		dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9741		dtrace_diftype_t *vt, *et;
9742		uint_t id, ndx;
9743
9744		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9745		    v->dtdv_scope != DIFV_SCOPE_THREAD &&
9746		    v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9747			err += efunc(i, "unrecognized variable scope %d\n",
9748			    v->dtdv_scope);
9749			break;
9750		}
9751
9752		if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9753		    v->dtdv_kind != DIFV_KIND_SCALAR) {
9754			err += efunc(i, "unrecognized variable type %d\n",
9755			    v->dtdv_kind);
9756			break;
9757		}
9758
9759		if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9760			err += efunc(i, "%d exceeds variable id limit\n", id);
9761			break;
9762		}
9763
9764		if (id < DIF_VAR_OTHER_UBASE)
9765			continue;
9766
9767		/*
9768		 * For user-defined variables, we need to check that this
9769		 * definition is identical to any previous definition that we
9770		 * encountered.
9771		 */
9772		ndx = id - DIF_VAR_OTHER_UBASE;
9773
9774		switch (v->dtdv_scope) {
9775		case DIFV_SCOPE_GLOBAL:
9776			if (ndx < vstate->dtvs_nglobals) {
9777				dtrace_statvar_t *svar;
9778
9779				if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9780					existing = &svar->dtsv_var;
9781			}
9782
9783			break;
9784
9785		case DIFV_SCOPE_THREAD:
9786			if (ndx < vstate->dtvs_ntlocals)
9787				existing = &vstate->dtvs_tlocals[ndx];
9788			break;
9789
9790		case DIFV_SCOPE_LOCAL:
9791			if (ndx < vstate->dtvs_nlocals) {
9792				dtrace_statvar_t *svar;
9793
9794				if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9795					existing = &svar->dtsv_var;
9796			}
9797
9798			break;
9799		}
9800
9801		vt = &v->dtdv_type;
9802
9803		if (vt->dtdt_flags & DIF_TF_BYREF) {
9804			if (vt->dtdt_size == 0) {
9805				err += efunc(i, "zero-sized variable\n");
9806				break;
9807			}
9808
9809			if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
9810			    vt->dtdt_size > dtrace_global_maxsize) {
9811				err += efunc(i, "oversized by-ref global\n");
9812				break;
9813			}
9814		}
9815
9816		if (existing == NULL || existing->dtdv_id == 0)
9817			continue;
9818
9819		ASSERT(existing->dtdv_id == v->dtdv_id);
9820		ASSERT(existing->dtdv_scope == v->dtdv_scope);
9821
9822		if (existing->dtdv_kind != v->dtdv_kind)
9823			err += efunc(i, "%d changed variable kind\n", id);
9824
9825		et = &existing->dtdv_type;
9826
9827		if (vt->dtdt_flags != et->dtdt_flags) {
9828			err += efunc(i, "%d changed variable type flags\n", id);
9829			break;
9830		}
9831
9832		if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9833			err += efunc(i, "%d changed variable type size\n", id);
9834			break;
9835		}
9836	}
9837
9838	return (err);
9839}
9840
9841/*
9842 * Validate a DTrace DIF object that is to be used as a helper.  Helpers
9843 * are much more constrained than normal DIFOs.  Specifically, they may
9844 * not:
9845 *
9846 * 1. Make calls to subroutines other than copyin(), copyinstr() or
9847 *    miscellaneous string routines.
9848 * 2. Access DTrace variables other than the args[] array and the curthread,
9849 *    pid, ppid, tid, execargs, execname, zonename, uid and gid variables.
9850 * 3. Have thread-local variables.
9851 * 4. Have dynamic variables.
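 *
 * For example, "ldgs DIF_VAR_PID, %r1" is legal in a helper, but
 * "ldgs DIF_VAR_TIMESTAMP, %r1" is rejected with an "illegal variable"
 * error, as timestamp is not among the variables enumerated in (2).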
9852 */
9853static int
9854dtrace_difo_validate_helper(dtrace_difo_t *dp)
9855{
9856	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9857	int err = 0;
9858	uint_t pc;
9859
9860	for (pc = 0; pc < dp->dtdo_len; pc++) {
9861		dif_instr_t instr = dp->dtdo_buf[pc];
9862
9863		uint_t v = DIF_INSTR_VAR(instr);
9864		uint_t subr = DIF_INSTR_SUBR(instr);
9865		uint_t op = DIF_INSTR_OP(instr);
9866
9867		switch (op) {
9868		case DIF_OP_OR:
9869		case DIF_OP_XOR:
9870		case DIF_OP_AND:
9871		case DIF_OP_SLL:
9872		case DIF_OP_SRL:
9873		case DIF_OP_SRA:
9874		case DIF_OP_SUB:
9875		case DIF_OP_ADD:
9876		case DIF_OP_MUL:
9877		case DIF_OP_SDIV:
9878		case DIF_OP_UDIV:
9879		case DIF_OP_SREM:
9880		case DIF_OP_UREM:
9881		case DIF_OP_COPYS:
9882		case DIF_OP_NOT:
9883		case DIF_OP_MOV:
9884		case DIF_OP_RLDSB:
9885		case DIF_OP_RLDSH:
9886		case DIF_OP_RLDSW:
9887		case DIF_OP_RLDUB:
9888		case DIF_OP_RLDUH:
9889		case DIF_OP_RLDUW:
9890		case DIF_OP_RLDX:
9891		case DIF_OP_ULDSB:
9892		case DIF_OP_ULDSH:
9893		case DIF_OP_ULDSW:
9894		case DIF_OP_ULDUB:
9895		case DIF_OP_ULDUH:
9896		case DIF_OP_ULDUW:
9897		case DIF_OP_ULDX:
9898		case DIF_OP_STB:
9899		case DIF_OP_STH:
9900		case DIF_OP_STW:
9901		case DIF_OP_STX:
9902		case DIF_OP_ALLOCS:
9903		case DIF_OP_CMP:
9904		case DIF_OP_SCMP:
9905		case DIF_OP_TST:
9906		case DIF_OP_BA:
9907		case DIF_OP_BE:
9908		case DIF_OP_BNE:
9909		case DIF_OP_BG:
9910		case DIF_OP_BGU:
9911		case DIF_OP_BGE:
9912		case DIF_OP_BGEU:
9913		case DIF_OP_BL:
9914		case DIF_OP_BLU:
9915		case DIF_OP_BLE:
9916		case DIF_OP_BLEU:
9917		case DIF_OP_RET:
9918		case DIF_OP_NOP:
9919		case DIF_OP_POPTS:
9920		case DIF_OP_FLUSHTS:
9921		case DIF_OP_SETX:
9922		case DIF_OP_SETS:
9923		case DIF_OP_LDGA:
9924		case DIF_OP_LDLS:
9925		case DIF_OP_STGS:
9926		case DIF_OP_STLS:
9927		case DIF_OP_PUSHTR:
9928		case DIF_OP_PUSHTV:
9929			break;
9930
9931		case DIF_OP_LDGS:
9932			if (v >= DIF_VAR_OTHER_UBASE)
9933				break;
9934
9935			if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
9936				break;
9937
9938			if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
9939			    v == DIF_VAR_PPID || v == DIF_VAR_TID ||
9940			    v == DIF_VAR_EXECARGS ||
9941			    v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
9942			    v == DIF_VAR_UID || v == DIF_VAR_GID)
9943				break;
9944
9945			err += efunc(pc, "illegal variable %u\n", v);
9946			break;
9947
9948		case DIF_OP_LDTA:
9949		case DIF_OP_LDTS:
9950		case DIF_OP_LDGAA:
9951		case DIF_OP_LDTAA:
9952			err += efunc(pc, "illegal dynamic variable load\n");
9953			break;
9954
9955		case DIF_OP_STTS:
9956		case DIF_OP_STGAA:
9957		case DIF_OP_STTAA:
9958			err += efunc(pc, "illegal dynamic variable store\n");
9959			break;
9960
9961		case DIF_OP_CALL:
9962			if (subr == DIF_SUBR_ALLOCA ||
9963			    subr == DIF_SUBR_BCOPY ||
9964			    subr == DIF_SUBR_COPYIN ||
9965			    subr == DIF_SUBR_COPYINTO ||
9966			    subr == DIF_SUBR_COPYINSTR ||
9967			    subr == DIF_SUBR_INDEX ||
9968			    subr == DIF_SUBR_INET_NTOA ||
9969			    subr == DIF_SUBR_INET_NTOA6 ||
9970			    subr == DIF_SUBR_INET_NTOP ||
9971			    subr == DIF_SUBR_JSON ||
9972			    subr == DIF_SUBR_LLTOSTR ||
9973			    subr == DIF_SUBR_STRTOLL ||
9974			    subr == DIF_SUBR_RINDEX ||
9975			    subr == DIF_SUBR_STRCHR ||
9976			    subr == DIF_SUBR_STRJOIN ||
9977			    subr == DIF_SUBR_STRRCHR ||
9978			    subr == DIF_SUBR_STRSTR ||
9979			    subr == DIF_SUBR_HTONS ||
9980			    subr == DIF_SUBR_HTONL ||
9981			    subr == DIF_SUBR_HTONLL ||
9982			    subr == DIF_SUBR_NTOHS ||
9983			    subr == DIF_SUBR_NTOHL ||
9984			    subr == DIF_SUBR_NTOHLL ||
9985			    subr == DIF_SUBR_MEMREF ||
9986			    subr == DIF_SUBR_TYPEREF)
9987				break;
9988
9989			err += efunc(pc, "invalid subr %u\n", subr);
9990			break;
9991
9992		default:
9993			err += efunc(pc, "invalid opcode %u\n",
9994			    DIF_INSTR_OP(instr));
9995		}
9996	}
9997
9998	return (err);
9999}
10000
10001/*
10002 * Returns 1 if the expression in the DIF object can be cached on a per-thread
10003 * basis; 0 if not.
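 *
 * For example, DIF generated for the predicate /pid == 1/ references only
 * DIF_VAR_PID and performs no memory loads, so it is cacheable; any DIFO
 * that references timestamp or that loads through a pointer is not.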
10004 */
10005static int
10006dtrace_difo_cacheable(dtrace_difo_t *dp)
10007{
10008	int i;
10009
10010	if (dp == NULL)
10011		return (0);
10012
10013	for (i = 0; i < dp->dtdo_varlen; i++) {
10014		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10015
10016		if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10017			continue;
10018
10019		switch (v->dtdv_id) {
10020		case DIF_VAR_CURTHREAD:
10021		case DIF_VAR_PID:
10022		case DIF_VAR_TID:
10023		case DIF_VAR_EXECARGS:
10024		case DIF_VAR_EXECNAME:
10025		case DIF_VAR_ZONENAME:
10026			break;
10027
10028		default:
10029			return (0);
10030		}
10031	}
10032
10033	/*
10034	 * This DIF object may be cacheable.  Now we need to look for any
10035	 * array loading instructions, any memory loading instructions, or
10036	 * any stores to thread-local variables.
10037	 */
10038	for (i = 0; i < dp->dtdo_len; i++) {
10039		uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10040
10041		if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10042		    (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10043		    (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10044		    op == DIF_OP_LDGA || op == DIF_OP_STTS)
10045			return (0);
10046	}
10047
10048	return (1);
10049}
10050
10051static void
10052dtrace_difo_hold(dtrace_difo_t *dp)
10053{
10054	int i;
10055
10056	ASSERT(MUTEX_HELD(&dtrace_lock));
10057
10058	dp->dtdo_refcnt++;
10059	ASSERT(dp->dtdo_refcnt != 0);
10060
10061	/*
10062	 * We need to check this DIF object for references to the variable
10063	 * DIF_VAR_VTIMESTAMP.
10064	 */
10065	for (i = 0; i < dp->dtdo_varlen; i++) {
10066		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10067
10068		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10069			continue;
10070
10071		if (dtrace_vtime_references++ == 0)
10072			dtrace_vtime_enable();
10073	}
10074}
10075
10076/*
10077 * This routine calculates the dynamic variable chunksize for a given DIF
10078 * object.  The calculation is not fool-proof, and can probably be tricked by
10079 * malicious DIF -- but it works for all compiler-generated DIF.  Because this
10080 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10081 * if a dynamic variable size exceeds the chunksize.
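 *
 * As an illustrative example, a store keyed by a two-element tuple whose
 * keys are 8 bytes each, with an 8-byte value, is estimated below as
 *
 *	sizeof (dtrace_dynvar_t)		(variable header)
 *	  + 1 * sizeof (dtrace_key_t)		(keys beyond the first)
 *	  + 16					(rounded-up key data)
 *	  + 8					(stored data)
 *
 * rounded up to a multiple of sizeof (uint64_t); the chunksize becomes the
 * largest such estimate over all dynamic variable stores in the DIFO.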
10082 */
10083static void
10084dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10085{
10086	uint64_t sval = 0;
10087	dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10088	const dif_instr_t *text = dp->dtdo_buf;
10089	uint_t pc, srd = 0;
10090	uint_t ttop = 0;
10091	size_t size, ksize;
10092	uint_t id, i;
10093
10094	for (pc = 0; pc < dp->dtdo_len; pc++) {
10095		dif_instr_t instr = text[pc];
10096		uint_t op = DIF_INSTR_OP(instr);
10097		uint_t rd = DIF_INSTR_RD(instr);
10098		uint_t r1 = DIF_INSTR_R1(instr);
10099		uint_t nkeys = 0;
10100		uchar_t scope = 0;
10101
10102		dtrace_key_t *key = tupregs;
10103
10104		switch (op) {
10105		case DIF_OP_SETX:
10106			sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10107			srd = rd;
10108			continue;
10109
10110		case DIF_OP_STTS:
10111			key = &tupregs[DIF_DTR_NREGS];
10112			key[0].dttk_size = 0;
10113			key[1].dttk_size = 0;
10114			nkeys = 2;
10115			scope = DIFV_SCOPE_THREAD;
10116			break;
10117
10118		case DIF_OP_STGAA:
10119		case DIF_OP_STTAA:
10120			nkeys = ttop;
10121
10122			if (op == DIF_OP_STTAA)
10123				key[nkeys++].dttk_size = 0;
10124
10125			key[nkeys++].dttk_size = 0;
10126
10127			if (op == DIF_OP_STTAA) {
10128				scope = DIFV_SCOPE_THREAD;
10129			} else {
10130				scope = DIFV_SCOPE_GLOBAL;
10131			}
10132
10133			break;
10134
10135		case DIF_OP_PUSHTR:
10136			if (ttop == DIF_DTR_NREGS)
10137				return;
10138
10139			if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10140				/*
10141				 * If the register for the size of the "pushtr"
10142				 * is %r0 (or the value is 0) and the type is
10143				 * a string, we'll use the system-wide default
10144				 * string size.
10145				 */
10146				tupregs[ttop++].dttk_size =
10147				    dtrace_strsize_default;
10148			} else {
10149				if (srd == 0)
10150					return;
10151
10152				tupregs[ttop++].dttk_size = sval;
10153			}
10154
10155			break;
10156
10157		case DIF_OP_PUSHTV:
10158			if (ttop == DIF_DTR_NREGS)
10159				return;
10160
10161			tupregs[ttop++].dttk_size = 0;
10162			break;
10163
10164		case DIF_OP_FLUSHTS:
10165			ttop = 0;
10166			break;
10167
10168		case DIF_OP_POPTS:
10169			if (ttop != 0)
10170				ttop--;
10171			break;
10172		}
10173
10174		sval = 0;
10175		srd = 0;
10176
10177		if (nkeys == 0)
10178			continue;
10179
10180		/*
10181		 * We have a dynamic variable allocation; calculate its size.
10182		 */
10183		for (ksize = 0, i = 0; i < nkeys; i++)
10184			ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10185
10186		size = sizeof (dtrace_dynvar_t);
10187		size += sizeof (dtrace_key_t) * (nkeys - 1);
10188		size += ksize;
10189
10190		/*
10191		 * Now we need to determine the size of the stored data.
10192		 */
10193		id = DIF_INSTR_VAR(instr);
10194
10195		for (i = 0; i < dp->dtdo_varlen; i++) {
10196			dtrace_difv_t *v = &dp->dtdo_vartab[i];
10197
10198			if (v->dtdv_id == id && v->dtdv_scope == scope) {
10199				size += v->dtdv_type.dtdt_size;
10200				break;
10201			}
10202		}
10203
10204		if (i == dp->dtdo_varlen)
10205			return;
10206
10207		/*
10208		 * We have the size.  If this is larger than the chunk size
10209		 * for our dynamic variable state, reset the chunk size.
10210		 */
10211		size = P2ROUNDUP(size, sizeof (uint64_t));
10212
10213		if (size > vstate->dtvs_dynvars.dtds_chunksize)
10214			vstate->dtvs_dynvars.dtds_chunksize = size;
10215	}
10216}
10217
10218static void
10219dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10220{
10221	int i, oldsvars, osz, nsz, otlocals, ntlocals;
10222	uint_t id;
10223
10224	ASSERT(MUTEX_HELD(&dtrace_lock));
10225	ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10226
10227	for (i = 0; i < dp->dtdo_varlen; i++) {
10228		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10229		dtrace_statvar_t *svar, ***svarp = NULL;
10230		size_t dsize = 0;
10231		uint8_t scope = v->dtdv_scope;
10232		int *np = NULL;
10233
10234		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10235			continue;
10236
10237		id -= DIF_VAR_OTHER_UBASE;
10238
10239		switch (scope) {
10240		case DIFV_SCOPE_THREAD:
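			/*
			 * Grow the thread-local variable table by doubling
			 * (starting from a single slot) until the variable
			 * id fits.
			 */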
10241			while (id >= (otlocals = vstate->dtvs_ntlocals)) {
10242				dtrace_difv_t *tlocals;
10243
10244				if ((ntlocals = (otlocals << 1)) == 0)
10245					ntlocals = 1;
10246
10247				osz = otlocals * sizeof (dtrace_difv_t);
10248				nsz = ntlocals * sizeof (dtrace_difv_t);
10249
10250				tlocals = kmem_zalloc(nsz, KM_SLEEP);
10251
10252				if (osz != 0) {
10253					bcopy(vstate->dtvs_tlocals,
10254					    tlocals, osz);
10255					kmem_free(vstate->dtvs_tlocals, osz);
10256				}
10257
10258				vstate->dtvs_tlocals = tlocals;
10259				vstate->dtvs_ntlocals = ntlocals;
10260			}
10261
10262			vstate->dtvs_tlocals[id] = *v;
10263			continue;
10264
10265		case DIFV_SCOPE_LOCAL:
10266			np = &vstate->dtvs_nlocals;
10267			svarp = &vstate->dtvs_locals;
10268
10269			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10270				dsize = NCPU * (v->dtdv_type.dtdt_size +
10271				    sizeof (uint64_t));
10272			else
10273				dsize = NCPU * sizeof (uint64_t);
10274
10275			break;
10276
10277		case DIFV_SCOPE_GLOBAL:
10278			np = &vstate->dtvs_nglobals;
10279			svarp = &vstate->dtvs_globals;
10280
10281			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10282				dsize = v->dtdv_type.dtdt_size +
10283				    sizeof (uint64_t);
10284
10285			break;
10286
10287		default:
10288			ASSERT(0);
10289		}
10290
10291		while (id >= (oldsvars = *np)) {
10292			dtrace_statvar_t **statics;
10293			int newsvars, oldsize, newsize;
10294
10295			if ((newsvars = (oldsvars << 1)) == 0)
10296				newsvars = 1;
10297
10298			oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10299			newsize = newsvars * sizeof (dtrace_statvar_t *);
10300
10301			statics = kmem_zalloc(newsize, KM_SLEEP);
10302
10303			if (oldsize != 0) {
10304				bcopy(*svarp, statics, oldsize);
10305				kmem_free(*svarp, oldsize);
10306			}
10307
10308			*svarp = statics;
10309			*np = newsvars;
10310		}
10311
10312		if ((svar = (*svarp)[id]) == NULL) {
10313			svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10314			svar->dtsv_var = *v;
10315
10316			if ((svar->dtsv_size = dsize) != 0) {
10317				svar->dtsv_data = (uint64_t)(uintptr_t)
10318				    kmem_zalloc(dsize, KM_SLEEP);
10319			}
10320
10321			(*svarp)[id] = svar;
10322		}
10323
10324		svar->dtsv_refcnt++;
10325	}
10326
10327	dtrace_difo_chunksize(dp, vstate);
10328	dtrace_difo_hold(dp);
10329}
10330
10331static dtrace_difo_t *
10332dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10333{
10334	dtrace_difo_t *new;
10335	size_t sz;
10336
10337	ASSERT(dp->dtdo_buf != NULL);
10338	ASSERT(dp->dtdo_refcnt != 0);
10339
10340	new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10341
10342	ASSERT(dp->dtdo_buf != NULL);
10343	sz = dp->dtdo_len * sizeof (dif_instr_t);
10344	new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10345	bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10346	new->dtdo_len = dp->dtdo_len;
10347
10348	if (dp->dtdo_strtab != NULL) {
10349		ASSERT(dp->dtdo_strlen != 0);
10350		new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10351		bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10352		new->dtdo_strlen = dp->dtdo_strlen;
10353	}
10354
10355	if (dp->dtdo_inttab != NULL) {
10356		ASSERT(dp->dtdo_intlen != 0);
10357		sz = dp->dtdo_intlen * sizeof (uint64_t);
10358		new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10359		bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10360		new->dtdo_intlen = dp->dtdo_intlen;
10361	}
10362
10363	if (dp->dtdo_vartab != NULL) {
10364		ASSERT(dp->dtdo_varlen != 0);
10365		sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10366		new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10367		bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10368		new->dtdo_varlen = dp->dtdo_varlen;
10369	}
10370
10371	dtrace_difo_init(new, vstate);
10372	return (new);
10373}
10374
10375static void
10376dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10377{
10378	int i;
10379
10380	ASSERT(dp->dtdo_refcnt == 0);
10381
10382	for (i = 0; i < dp->dtdo_varlen; i++) {
10383		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10384		dtrace_statvar_t *svar, **svarp = NULL;
10385		uint_t id;
10386		uint8_t scope = v->dtdv_scope;
10387		int *np = NULL;
10388
10389		switch (scope) {
10390		case DIFV_SCOPE_THREAD:
10391			continue;
10392
10393		case DIFV_SCOPE_LOCAL:
10394			np = &vstate->dtvs_nlocals;
10395			svarp = vstate->dtvs_locals;
10396			break;
10397
10398		case DIFV_SCOPE_GLOBAL:
10399			np = &vstate->dtvs_nglobals;
10400			svarp = vstate->dtvs_globals;
10401			break;
10402
10403		default:
10404			ASSERT(0);
10405		}
10406
10407		if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10408			continue;
10409
10410		id -= DIF_VAR_OTHER_UBASE;
10411		ASSERT(id < *np);
10412
10413		svar = svarp[id];
10414		ASSERT(svar != NULL);
10415		ASSERT(svar->dtsv_refcnt > 0);
10416
10417		if (--svar->dtsv_refcnt > 0)
10418			continue;
10419
10420		if (svar->dtsv_size != 0) {
10421			ASSERT(svar->dtsv_data != 0);
10422			kmem_free((void *)(uintptr_t)svar->dtsv_data,
10423			    svar->dtsv_size);
10424		}
10425
10426		kmem_free(svar, sizeof (dtrace_statvar_t));
10427		svarp[id] = NULL;
10428	}
10429
10430	if (dp->dtdo_buf != NULL)
10431		kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10432	if (dp->dtdo_inttab != NULL)
10433		kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10434	if (dp->dtdo_strtab != NULL)
10435		kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10436	if (dp->dtdo_vartab != NULL)
10437		kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10438
10439	kmem_free(dp, sizeof (dtrace_difo_t));
10440}
10441
10442static void
10443dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10444{
10445	int i;
10446
10447	ASSERT(MUTEX_HELD(&dtrace_lock));
10448	ASSERT(dp->dtdo_refcnt != 0);
10449
10450	for (i = 0; i < dp->dtdo_varlen; i++) {
10451		dtrace_difv_t *v = &dp->dtdo_vartab[i];
10452
10453		if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10454			continue;
10455
10456		ASSERT(dtrace_vtime_references > 0);
10457		if (--dtrace_vtime_references == 0)
10458			dtrace_vtime_disable();
10459	}
10460
10461	if (--dp->dtdo_refcnt == 0)
10462		dtrace_difo_destroy(dp, vstate);
10463}
10464
10465/*
10466 * DTrace Format Functions
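 *
 * Format strings are kept in a per-state array; the identifiers handed out
 * below are 1-based (index + 1), with 0 reserved to mean "no format".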
10467 */
10468static uint16_t
10469dtrace_format_add(dtrace_state_t *state, char *str)
10470{
10471	char *fmt, **new;
10472	uint16_t ndx, len = strlen(str) + 1;
10473
10474	fmt = kmem_zalloc(len, KM_SLEEP);
10475	bcopy(str, fmt, len);
10476
10477	for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10478		if (state->dts_formats[ndx] == NULL) {
10479			state->dts_formats[ndx] = fmt;
10480			return (ndx + 1);
10481		}
10482	}
10483
10484	if (state->dts_nformats == USHRT_MAX) {
10485		/*
10486		 * This is only likely if a denial-of-service attack is being
10487		 * attempted.  As such, it's okay to fail silently here.
10488		 */
10489		kmem_free(fmt, len);
10490		return (0);
10491	}
10492
10493	/*
10494	 * For simplicity, we always resize the formats array to be exactly the
10495	 * number of formats.
10496	 */
10497	ndx = state->dts_nformats++;
10498	new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10499
10500	if (state->dts_formats != NULL) {
10501		ASSERT(ndx != 0);
10502		bcopy(state->dts_formats, new, ndx * sizeof (char *));
10503		kmem_free(state->dts_formats, ndx * sizeof (char *));
10504	}
10505
10506	state->dts_formats = new;
10507	state->dts_formats[ndx] = fmt;
10508
10509	return (ndx + 1);
10510}
10511
10512static void
10513dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10514{
10515	char *fmt;
10516
10517	ASSERT(state->dts_formats != NULL);
10518	ASSERT(format <= state->dts_nformats);
10519	ASSERT(state->dts_formats[format - 1] != NULL);
10520
10521	fmt = state->dts_formats[format - 1];
10522	kmem_free(fmt, strlen(fmt) + 1);
10523	state->dts_formats[format - 1] = NULL;
10524}
10525
10526static void
10527dtrace_format_destroy(dtrace_state_t *state)
10528{
10529	int i;
10530
10531	if (state->dts_nformats == 0) {
10532		ASSERT(state->dts_formats == NULL);
10533		return;
10534	}
10535
10536	ASSERT(state->dts_formats != NULL);
10537
10538	for (i = 0; i < state->dts_nformats; i++) {
10539		char *fmt = state->dts_formats[i];
10540
10541		if (fmt == NULL)
10542			continue;
10543
10544		kmem_free(fmt, strlen(fmt) + 1);
10545	}
10546
10547	kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10548	state->dts_nformats = 0;
10549	state->dts_formats = NULL;
10550}
10551
10552/*
10553 * DTrace Predicate Functions
10554 */
10555static dtrace_predicate_t *
10556dtrace_predicate_create(dtrace_difo_t *dp)
10557{
10558	dtrace_predicate_t *pred;
10559
10560	ASSERT(MUTEX_HELD(&dtrace_lock));
10561	ASSERT(dp->dtdo_refcnt != 0);
10562
10563	pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10564	pred->dtp_difo = dp;
10565	pred->dtp_refcnt = 1;
10566
10567	if (!dtrace_difo_cacheable(dp))
10568		return (pred);
10569
10570	if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10571		/*
10572		 * This is only theoretically possible -- we have had 2^32
10573		 * cacheable predicates on this machine.  We cannot allow any
10574		 * more predicates to become cacheable:  as unlikely as it is,
10575		 * there may be a thread caching a (now stale) predicate cache
10576		 * ID. (N.B.: the temptation is being successfully resisted to
10577		 * have this cmn_err() "Holy shit -- we executed this code!")
10578		 */
10579		return (pred);
10580	}
10581
10582	pred->dtp_cacheid = dtrace_predcache_id++;
10583
10584	return (pred);
10585}
10586
10587static void
10588dtrace_predicate_hold(dtrace_predicate_t *pred)
10589{
10590	ASSERT(MUTEX_HELD(&dtrace_lock));
10591	ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10592	ASSERT(pred->dtp_refcnt > 0);
10593
10594	pred->dtp_refcnt++;
10595}
10596
10597static void
10598dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10599{
10600	dtrace_difo_t *dp = pred->dtp_difo;
10601
10602	ASSERT(MUTEX_HELD(&dtrace_lock));
10603	ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10604	ASSERT(pred->dtp_refcnt > 0);
10605
10606	if (--pred->dtp_refcnt == 0) {
10607		dtrace_difo_release(pred->dtp_difo, vstate);
10608		kmem_free(pred, sizeof (dtrace_predicate_t));
10609	}
10610}
10611
10612/*
10613 * DTrace Action Description Functions
10614 */
10615static dtrace_actdesc_t *
10616dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10617    uint64_t uarg, uint64_t arg)
10618{
10619	dtrace_actdesc_t *act;
10620
10621#if defined(sun)
10622	ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
10623	    arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
10624#endif
10625
10626	act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10627	act->dtad_kind = kind;
10628	act->dtad_ntuple = ntuple;
10629	act->dtad_uarg = uarg;
10630	act->dtad_arg = arg;
10631	act->dtad_refcnt = 1;
10632
10633	return (act);
10634}
10635
10636static void
10637dtrace_actdesc_hold(dtrace_actdesc_t *act)
10638{
10639	ASSERT(act->dtad_refcnt >= 1);
10640	act->dtad_refcnt++;
10641}
10642
10643static void
10644dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10645{
10646	dtrace_actkind_t kind = act->dtad_kind;
10647	dtrace_difo_t *dp;
10648
10649	ASSERT(act->dtad_refcnt >= 1);
10650
10651	if (--act->dtad_refcnt != 0)
10652		return;
10653
10654	if ((dp = act->dtad_difo) != NULL)
10655		dtrace_difo_release(dp, vstate);
10656
10657	if (DTRACEACT_ISPRINTFLIKE(kind)) {
10658		char *str = (char *)(uintptr_t)act->dtad_arg;
10659
10660#if defined(sun)
10661		ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10662		    (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10663#endif
10664
10665		if (str != NULL)
10666			kmem_free(str, strlen(str) + 1);
10667	}
10668
10669	kmem_free(act, sizeof (dtrace_actdesc_t));
10670}
10671
10672/*
10673 * DTrace ECB Functions
10674 */
10675static dtrace_ecb_t *
10676dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10677{
10678	dtrace_ecb_t *ecb;
10679	dtrace_epid_t epid;
10680
10681	ASSERT(MUTEX_HELD(&dtrace_lock));
10682
10683	ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10684	ecb->dte_predicate = NULL;
10685	ecb->dte_probe = probe;
10686
10687	/*
10688	 * The default size is the size of the default action: recording
10689	 * the header.
10690	 */
10691	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10692	ecb->dte_alignment = sizeof (dtrace_epid_t);
10693
10694	epid = state->dts_epid++;
10695
10696	if (epid - 1 >= state->dts_necbs) {
10697		dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10698		int necbs = state->dts_necbs << 1;
10699
10700		ASSERT(epid == state->dts_necbs + 1);
10701
10702		if (necbs == 0) {
10703			ASSERT(oecbs == NULL);
10704			necbs = 1;
10705		}
10706
10707		ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10708
10709		if (oecbs != NULL)
10710			bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10711
10712		dtrace_membar_producer();
10713		state->dts_ecbs = ecbs;
10714
10715		if (oecbs != NULL) {
10716			/*
10717			 * If this state is active, we must dtrace_sync()
10718			 * before we can free the old dts_ecbs array:  we're
10719			 * coming in hot, and there may be active ring
10720			 * buffer processing (which indexes into the dts_ecbs
10721			 * array) on another CPU.
10722			 */
10723			if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10724				dtrace_sync();
10725
10726			kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10727		}
10728
10729		dtrace_membar_producer();
10730		state->dts_necbs = necbs;
10731	}
10732
10733	ecb->dte_state = state;
10734
10735	ASSERT(state->dts_ecbs[epid - 1] == NULL);
10736	dtrace_membar_producer();
10737	state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10738
10739	return (ecb);
10740}
10741
10742static void
10743dtrace_ecb_enable(dtrace_ecb_t *ecb)
10744{
10745	dtrace_probe_t *probe = ecb->dte_probe;
10746
10747	ASSERT(MUTEX_HELD(&cpu_lock));
10748	ASSERT(MUTEX_HELD(&dtrace_lock));
10749	ASSERT(ecb->dte_next == NULL);
10750
10751	if (probe == NULL) {
10752		/*
10753		 * This is the NULL probe -- there's nothing to do.
10754		 */
10755		return;
10756	}
10757
10758	if (probe->dtpr_ecb == NULL) {
10759		dtrace_provider_t *prov = probe->dtpr_provider;
10760
10761		/*
10762		 * We're the first ECB on this probe.
10763		 */
10764		probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10765
10766		if (ecb->dte_predicate != NULL)
10767			probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10768
10769		prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10770		    probe->dtpr_id, probe->dtpr_arg);
10771	} else {
10772		/*
10773		 * This probe is already active.  Swing the last pointer to
10774		 * point to the new ECB, and issue a dtrace_sync() to assure
10775		 * that all CPUs have seen the change.
10776		 */
10777		ASSERT(probe->dtpr_ecb_last != NULL);
10778		probe->dtpr_ecb_last->dte_next = ecb;
10779		probe->dtpr_ecb_last = ecb;
10780		probe->dtpr_predcache = 0;
10781
10782		dtrace_sync();
10783	}
10784}
10785
10786static void
10787dtrace_ecb_resize(dtrace_ecb_t *ecb)
10788{
10789	dtrace_action_t *act;
10790	uint32_t curneeded = UINT32_MAX;
10791	uint32_t aggbase = UINT32_MAX;
10792
10793	/*
10794	 * If we record anything, we always record the dtrace_rechdr_t.  (And
10795	 * we always record it first.)
10796	 */
10797	ecb->dte_size = sizeof (dtrace_rechdr_t);
10798	ecb->dte_alignment = sizeof (dtrace_epid_t);
10799
10800	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10801		dtrace_recdesc_t *rec = &act->dta_rec;
10802		ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10803
10804		ecb->dte_alignment = MAX(ecb->dte_alignment,
10805		    rec->dtrd_alignment);
10806
10807		if (DTRACEACT_ISAGG(act->dta_kind)) {
10808			dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10809
10810			ASSERT(rec->dtrd_size != 0);
10811			ASSERT(agg->dtag_first != NULL);
10812			ASSERT(act->dta_prev->dta_intuple);
10813			ASSERT(aggbase != UINT32_MAX);
10814			ASSERT(curneeded != UINT32_MAX);
10815
10816			agg->dtag_base = aggbase;
10817
10818			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10819			rec->dtrd_offset = curneeded;
10820			curneeded += rec->dtrd_size;
10821			ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
10822
10823			aggbase = UINT32_MAX;
10824			curneeded = UINT32_MAX;
10825		} else if (act->dta_intuple) {
10826			if (curneeded == UINT32_MAX) {
10827				/*
10828				 * This is the first record in a tuple.  Align
10829				 * curneeded to be at offset 4 in an 8-byte
10830				 * aligned block.
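				 *
				 * For example, a dte_size of 5 through 12
				 * yields a curneeded of 12 (the next value
				 * that is 4 modulo 8), leaving the 4-byte
				 * aggregation ID 8-byte aligned at aggbase 8.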
10831				 */
10832				ASSERT(act->dta_prev == NULL ||
10833				    !act->dta_prev->dta_intuple);
10834				ASSERT3U(aggbase, ==, UINT32_MAX);
10835				curneeded = P2PHASEUP(ecb->dte_size,
10836				    sizeof (uint64_t), sizeof (dtrace_aggid_t));
10837
10838				aggbase = curneeded - sizeof (dtrace_aggid_t);
10839				ASSERT(IS_P2ALIGNED(aggbase,
10840				    sizeof (uint64_t)));
10841			}
10842			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10843			rec->dtrd_offset = curneeded;
10844			curneeded += rec->dtrd_size;
10845		} else {
10846			/* tuples must be followed by an aggregation */
10847			ASSERT(act->dta_prev == NULL ||
10848			    !act->dta_prev->dta_intuple);
10849
10850			ecb->dte_size = P2ROUNDUP(ecb->dte_size,
10851			    rec->dtrd_alignment);
10852			rec->dtrd_offset = ecb->dte_size;
10853			ecb->dte_size += rec->dtrd_size;
10854			ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
10855		}
10856	}
10857
10858	if ((act = ecb->dte_action) != NULL &&
10859	    !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
10860	    ecb->dte_size == sizeof (dtrace_rechdr_t)) {
10861		/*
10862		 * If the size is still sizeof (dtrace_rechdr_t), then all
10863		 * actions store no data; set the size to 0.
10864		 */
10865		ecb->dte_size = 0;
10866	}
10867
10868	ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
10869	ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
10870	ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
10871	    ecb->dte_needed);
10872}
10873
10874static dtrace_action_t *
10875dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10876{
10877	dtrace_aggregation_t *agg;
10878	size_t size = sizeof (uint64_t);
10879	int ntuple = desc->dtad_ntuple;
10880	dtrace_action_t *act;
10881	dtrace_recdesc_t *frec;
10882	dtrace_aggid_t aggid;
10883	dtrace_state_t *state = ecb->dte_state;
10884
10885	agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10886	agg->dtag_ecb = ecb;
10887
10888	ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10889
10890	switch (desc->dtad_kind) {
10891	case DTRACEAGG_MIN:
10892		agg->dtag_initial = INT64_MAX;
10893		agg->dtag_aggregate = dtrace_aggregate_min;
10894		break;
10895
10896	case DTRACEAGG_MAX:
10897		agg->dtag_initial = INT64_MIN;
10898		agg->dtag_aggregate = dtrace_aggregate_max;
10899		break;
10900
10901	case DTRACEAGG_COUNT:
10902		agg->dtag_aggregate = dtrace_aggregate_count;
10903		break;
10904
10905	case DTRACEAGG_QUANTIZE:
10906		agg->dtag_aggregate = dtrace_aggregate_quantize;
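		/*
		 * A power-of-two quantization keeps one bucket per possible
		 * magnitude in each direction plus one for zero:  with
		 * 64-bit values, ((64 - 1) * 2) + 1 = 127 buckets of
		 * sizeof (uint64_t) each.
		 */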
10907		size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
10908		    sizeof (uint64_t);
10909		break;
10910
10911	case DTRACEAGG_LQUANTIZE: {
10912		uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
10913		uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
10914
10915		agg->dtag_initial = desc->dtad_arg;
10916		agg->dtag_aggregate = dtrace_aggregate_lquantize;
10917
10918		if (step == 0 || levels == 0)
10919			goto err;
10920
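		/*
		 * Beyond the "levels" linear buckets, three slots are
		 * needed:  one for the encoded lquantize() argument and one
		 * each for the underflow and overflow buckets.
		 */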
10921		size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
10922		break;
10923	}
10924
10925	case DTRACEAGG_LLQUANTIZE: {
10926		uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
10927		uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
10928		uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
10929		uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
10930		int64_t v;
10931
10932		agg->dtag_initial = desc->dtad_arg;
10933		agg->dtag_aggregate = dtrace_aggregate_llquantize;
10934
10935		if (factor < 2 || low >= high || nsteps < factor)
10936			goto err;
10937
10938		/*
10939		 * Now check that the number of steps evenly divides a power
10940		 * of the factor.  (This assures both integer bucket size and
10941		 * linearity within each magnitude.)
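		 *
		 * For example, factor 10 with 20 steps passes:  v reaches 100,
		 * and 100 % 20 == 0 and 20 % 10 == 0.  Factor 10 with 15 steps
		 * fails, since 100 % 15 != 0 and buckets within a magnitude
		 * would not be integral.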
10942		 */
10943		for (v = factor; v < nsteps; v *= factor)
10944			continue;
10945
10946		if ((v % nsteps) || (nsteps % factor))
10947			goto err;
10948
10949		size = (dtrace_aggregate_llquantize_bucket(factor,
10950		    low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
10951		break;
10952	}
10953
10954	case DTRACEAGG_AVG:
10955		agg->dtag_aggregate = dtrace_aggregate_avg;
10956		size = sizeof (uint64_t) * 2;
10957		break;
10958
10959	case DTRACEAGG_STDDEV:
10960		agg->dtag_aggregate = dtrace_aggregate_stddev;
10961		size = sizeof (uint64_t) * 4;
10962		break;
10963
10964	case DTRACEAGG_SUM:
10965		agg->dtag_aggregate = dtrace_aggregate_sum;
10966		break;
10967
10968	default:
10969		goto err;
10970	}
10971
10972	agg->dtag_action.dta_rec.dtrd_size = size;
10973
10974	if (ntuple == 0)
10975		goto err;
10976
10977	/*
10978	 * We must make sure that we have enough actions for the n-tuple.
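	 *
	 * For example, @a[x, y] = count() is compiled as two key-expression
	 * actions followed by the aggregating action with dtad_ntuple == 2;
	 * walking backwards from the last action locates the first element
	 * of the tuple.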
10979	 */
10980	for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
10981		if (DTRACEACT_ISAGG(act->dta_kind))
10982			break;
10983
10984		if (--ntuple == 0) {
10985			/*
10986			 * This is the action with which our n-tuple begins.
10987			 */
10988			agg->dtag_first = act;
10989			goto success;
10990		}
10991	}
10992
10993	/*
10994	 * This n-tuple is short by ntuple elements.  Return failure.
10995	 */
10996	ASSERT(ntuple != 0);
10997err:
10998	kmem_free(agg, sizeof (dtrace_aggregation_t));
10999	return (NULL);
11000
11001success:
11002	/*
11003	 * If the last action in the tuple has a size of zero, it's actually
11004	 * an expression argument for the aggregating action.
11005	 */
11006	ASSERT(ecb->dte_action_last != NULL);
11007	act = ecb->dte_action_last;
11008
11009	if (act->dta_kind == DTRACEACT_DIFEXPR) {
11010		ASSERT(act->dta_difo != NULL);
11011
11012		if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11013			agg->dtag_hasarg = 1;
11014	}
11015
11016	/*
11017	 * We need to allocate an id for this aggregation.
11018	 */
11019#if defined(sun)
11020	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11021	    VM_BESTFIT | VM_SLEEP);
11022#else
11023	aggid = alloc_unr(state->dts_aggid_arena);
11024#endif
11025
11026	if (aggid - 1 >= state->dts_naggregations) {
11027		dtrace_aggregation_t **oaggs = state->dts_aggregations;
11028		dtrace_aggregation_t **aggs;
11029		int naggs = state->dts_naggregations << 1;
11030		int onaggs = state->dts_naggregations;
11031
11032		ASSERT(aggid == state->dts_naggregations + 1);
11033
11034		if (naggs == 0) {
11035			ASSERT(oaggs == NULL);
11036			naggs = 1;
11037		}
11038
11039		aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11040
11041		if (oaggs != NULL) {
11042			bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11043			kmem_free(oaggs, onaggs * sizeof (*aggs));
11044		}
11045
11046		state->dts_aggregations = aggs;
11047		state->dts_naggregations = naggs;
11048	}
11049
11050	ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11051	state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11052
11053	frec = &agg->dtag_first->dta_rec;
11054	if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11055		frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11056
11057	for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11058		ASSERT(!act->dta_intuple);
11059		act->dta_intuple = 1;
11060	}
11061
11062	return (&agg->dtag_action);
11063}
11064
11065static void
11066dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11067{
11068	dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11069	dtrace_state_t *state = ecb->dte_state;
11070	dtrace_aggid_t aggid = agg->dtag_id;
11071
11072	ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11073#if defined(sun)
11074	vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11075#else
11076	free_unr(state->dts_aggid_arena, aggid);
11077#endif
11078
11079	ASSERT(state->dts_aggregations[aggid - 1] == agg);
11080	state->dts_aggregations[aggid - 1] = NULL;
11081
11082	kmem_free(agg, sizeof (dtrace_aggregation_t));
11083}
11084
11085static int
11086dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11087{
11088	dtrace_action_t *action, *last;
11089	dtrace_difo_t *dp = desc->dtad_difo;
11090	uint32_t size = 0, align = sizeof (uint8_t), mask;
11091	uint16_t format = 0;
11092	dtrace_recdesc_t *rec;
11093	dtrace_state_t *state = ecb->dte_state;
11094	dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
11095	uint64_t arg = desc->dtad_arg;
11096
11097	ASSERT(MUTEX_HELD(&dtrace_lock));
11098	ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11099
11100	if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11101		/*
11102		 * If this is an aggregating action, there must be neither
11103		 * a speculate nor a commit on the action chain.
11104		 */
11105		dtrace_action_t *act;
11106
11107		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11108			if (act->dta_kind == DTRACEACT_COMMIT)
11109				return (EINVAL);
11110
11111			if (act->dta_kind == DTRACEACT_SPECULATE)
11112				return (EINVAL);
11113		}
11114
11115		action = dtrace_ecb_aggregation_create(ecb, desc);
11116
11117		if (action == NULL)
11118			return (EINVAL);
11119	} else {
11120		if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11121		    (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11122		    dp != NULL && dp->dtdo_destructive)) {
11123			state->dts_destructive = 1;
11124		}
11125
11126		switch (desc->dtad_kind) {
11127		case DTRACEACT_PRINTF:
11128		case DTRACEACT_PRINTA:
11129		case DTRACEACT_SYSTEM:
11130		case DTRACEACT_FREOPEN:
11131		case DTRACEACT_DIFEXPR:
11132			/*
11133			 * We know that our arg is a string -- turn it into a
11134			 * format.
11135			 */
11136			if (arg == 0) {
11137				ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11138				    desc->dtad_kind == DTRACEACT_DIFEXPR);
11139				format = 0;
11140			} else {
11141				ASSERT(arg != 0);
11142#if defined(sun)
11143				ASSERT(arg > KERNELBASE);
11144#endif
11145				format = dtrace_format_add(state,
11146				    (char *)(uintptr_t)arg);
11147			}
11148
11149			/*FALLTHROUGH*/
11150		case DTRACEACT_LIBACT:
11151		case DTRACEACT_TRACEMEM:
11152		case DTRACEACT_TRACEMEM_DYNSIZE:
11153			if (dp == NULL)
11154				return (EINVAL);
11155
11156			if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11157				break;
11158
11159			if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11160				if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11161					return (EINVAL);
11162
11163				size = opt[DTRACEOPT_STRSIZE];
11164			}
11165
11166			break;
11167
11168		case DTRACEACT_STACK:
11169			if ((nframes = arg) == 0) {
11170				nframes = opt[DTRACEOPT_STACKFRAMES];
11171				ASSERT(nframes > 0);
11172				arg = nframes;
11173			}
11174
11175			size = nframes * sizeof (pc_t);
11176			break;
11177
11178		case DTRACEACT_JSTACK:
11179			if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11180				strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11181
11182			if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11183				nframes = opt[DTRACEOPT_JSTACKFRAMES];
11184
11185			arg = DTRACE_USTACK_ARG(nframes, strsize);
11186
11187			/*FALLTHROUGH*/
11188		case DTRACEACT_USTACK:
11189			if (desc->dtad_kind != DTRACEACT_JSTACK &&
11190			    (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11191				strsize = DTRACE_USTACK_STRSIZE(arg);
11192				nframes = opt[DTRACEOPT_USTACKFRAMES];
11193				ASSERT(nframes > 0);
11194				arg = DTRACE_USTACK_ARG(nframes, strsize);
11195			}
11196
11197			/*
11198			 * Save a slot for the pid.
11199			 */
11200			size = (nframes + 1) * sizeof (uint64_t);
11201			size += DTRACE_USTACK_STRSIZE(arg);
11202			size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11203
11204			break;
11205
11206		case DTRACEACT_SYM:
11207		case DTRACEACT_MOD:
11208			if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11209			    sizeof (uint64_t)) ||
11210			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11211				return (EINVAL);
11212			break;
11213
11214		case DTRACEACT_USYM:
11215		case DTRACEACT_UMOD:
11216		case DTRACEACT_UADDR:
11217			if (dp == NULL ||
11218			    (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11219			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11220				return (EINVAL);
11221
11222			/*
11223			 * We have a slot for the pid, plus a slot for the
11224			 * argument.  To keep things simple (aligned with
11225			 * bitness-neutral sizing), we store each as a 64-bit
11226			 * quantity.
11227			 */
11228			size = 2 * sizeof (uint64_t);
11229			break;
11230
11231		case DTRACEACT_STOP:
11232		case DTRACEACT_BREAKPOINT:
11233		case DTRACEACT_PANIC:
11234			break;
11235
11236		case DTRACEACT_CHILL:
11237		case DTRACEACT_DISCARD:
11238		case DTRACEACT_RAISE:
11239			if (dp == NULL)
11240				return (EINVAL);
11241			break;
11242
11243		case DTRACEACT_EXIT:
11244			if (dp == NULL ||
11245			    (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11246			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11247				return (EINVAL);
11248			break;
11249
11250		case DTRACEACT_SPECULATE:
11251			if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11252				return (EINVAL);
11253
11254			if (dp == NULL)
11255				return (EINVAL);
11256
11257			state->dts_speculates = 1;
11258			break;
11259
11260		case DTRACEACT_PRINTM:
11261		case DTRACEACT_PRINTT:
11262			if (dp == NULL)
11263				return (EINVAL);
11264
11265			size = dp->dtdo_rtype.dtdt_size;
11266			break;
11267
11268		case DTRACEACT_COMMIT: {
11269			dtrace_action_t *act = ecb->dte_action;
11270
11271			for (; act != NULL; act = act->dta_next) {
11272				if (act->dta_kind == DTRACEACT_COMMIT)
11273					return (EINVAL);
11274			}
11275
11276			if (dp == NULL)
11277				return (EINVAL);
11278			break;
11279		}
11280
11281		default:
11282			return (EINVAL);
11283		}
11284
11285		if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11286			/*
11287			 * If this is a data-storing action or a speculate,
11288			 * we must be sure that there isn't a commit on the
11289			 * action chain.
11290			 */
11291			dtrace_action_t *act = ecb->dte_action;
11292
11293			for (; act != NULL; act = act->dta_next) {
11294				if (act->dta_kind == DTRACEACT_COMMIT)
11295					return (EINVAL);
11296			}
11297		}
11298
11299		action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11300		action->dta_rec.dtrd_size = size;
11301	}
11302
11303	action->dta_refcnt = 1;
11304	rec = &action->dta_rec;
11305	size = rec->dtrd_size;
11306
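	/*
	 * Derive the record alignment from its size:  the largest power of
	 * two (up to eight bytes) that evenly divides the size.  A 12-byte
	 * record, for example, ends up 4-byte aligned (12 & 3 == 0), while
	 * a 16-byte record ends up 8-byte aligned.
	 */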
11307	for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11308		if (!(size & mask)) {
11309			align = mask + 1;
11310			break;
11311		}
11312	}
11313
11314	action->dta_kind = desc->dtad_kind;
11315
11316	if ((action->dta_difo = dp) != NULL)
11317		dtrace_difo_hold(dp);
11318
11319	rec->dtrd_action = action->dta_kind;
11320	rec->dtrd_arg = arg;
11321	rec->dtrd_uarg = desc->dtad_uarg;
11322	rec->dtrd_alignment = (uint16_t)align;
11323	rec->dtrd_format = format;
11324
11325	if ((last = ecb->dte_action_last) != NULL) {
11326		ASSERT(ecb->dte_action != NULL);
11327		action->dta_prev = last;
11328		last->dta_next = action;
11329	} else {
11330		ASSERT(ecb->dte_action == NULL);
11331		ecb->dte_action = action;
11332	}
11333
11334	ecb->dte_action_last = action;
11335
11336	return (0);
11337}
11338
11339static void
11340dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11341{
11342	dtrace_action_t *act = ecb->dte_action, *next;
11343	dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11344	dtrace_difo_t *dp;
11345	uint16_t format;
11346
11347	if (act != NULL && act->dta_refcnt > 1) {
11348		ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11349		act->dta_refcnt--;
11350	} else {
11351		for (; act != NULL; act = next) {
11352			next = act->dta_next;
11353			ASSERT(next != NULL || act == ecb->dte_action_last);
11354			ASSERT(act->dta_refcnt == 1);
11355
11356			if ((format = act->dta_rec.dtrd_format) != 0)
11357				dtrace_format_remove(ecb->dte_state, format);
11358
11359			if ((dp = act->dta_difo) != NULL)
11360				dtrace_difo_release(dp, vstate);
11361
11362			if (DTRACEACT_ISAGG(act->dta_kind)) {
11363				dtrace_ecb_aggregation_destroy(ecb, act);
11364			} else {
11365				kmem_free(act, sizeof (dtrace_action_t));
11366			}
11367		}
11368	}
11369
11370	ecb->dte_action = NULL;
11371	ecb->dte_action_last = NULL;
11372	ecb->dte_size = 0;
11373}
11374
11375static void
11376dtrace_ecb_disable(dtrace_ecb_t *ecb)
11377{
11378	/*
11379	 * We disable the ECB by removing it from its probe.
11380	 */
11381	dtrace_ecb_t *pecb, *prev = NULL;
11382	dtrace_probe_t *probe = ecb->dte_probe;
11383
11384	ASSERT(MUTEX_HELD(&dtrace_lock));
11385
11386	if (probe == NULL) {
11387		/*
11388		 * This is the NULL probe; there is nothing to disable.
11389		 */
11390		return;
11391	}
11392
11393	for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11394		if (pecb == ecb)
11395			break;
11396		prev = pecb;
11397	}
11398
11399	ASSERT(pecb != NULL);
11400
11401	if (prev == NULL) {
11402		probe->dtpr_ecb = ecb->dte_next;
11403	} else {
11404		prev->dte_next = ecb->dte_next;
11405	}
11406
11407	if (ecb == probe->dtpr_ecb_last) {
11408		ASSERT(ecb->dte_next == NULL);
11409		probe->dtpr_ecb_last = prev;
11410	}
11411
11412	/*
11413	 * The ECB has been disconnected from the probe; now sync to assure
11414	 * that all CPUs have seen the change before returning.
11415	 */
11416	dtrace_sync();
11417
11418	if (probe->dtpr_ecb == NULL) {
11419		/*
11420		 * That was the last ECB on the probe; clear the predicate
11421		 * cache ID for the probe, disable it and sync one more time
11422		 * to assure that we'll never hit it again.
11423		 */
11424		dtrace_provider_t *prov = probe->dtpr_provider;
11425
11426		ASSERT(ecb->dte_next == NULL);
11427		ASSERT(probe->dtpr_ecb_last == NULL);
11428		probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11429		prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11430		    probe->dtpr_id, probe->dtpr_arg);
11431		dtrace_sync();
11432	} else {
11433		/*
11434		 * There is at least one ECB remaining on the probe.  If there
11435		 * is _exactly_ one, set the probe's predicate cache ID to be
11436		 * the predicate cache ID of the remaining ECB.
11437		 */
11438		ASSERT(probe->dtpr_ecb_last != NULL);
11439		ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11440
11441		if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11442			dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11443
11444			ASSERT(probe->dtpr_ecb->dte_next == NULL);
11445
11446			if (p != NULL)
11447				probe->dtpr_predcache = p->dtp_cacheid;
11448		}
11449
11450		ecb->dte_next = NULL;
11451	}
11452}
11453
11454static void
11455dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11456{
11457	dtrace_state_t *state = ecb->dte_state;
11458	dtrace_vstate_t *vstate = &state->dts_vstate;
11459	dtrace_predicate_t *pred;
11460	dtrace_epid_t epid = ecb->dte_epid;
11461
11462	ASSERT(MUTEX_HELD(&dtrace_lock));
11463	ASSERT(ecb->dte_next == NULL);
11464	ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11465
11466	if ((pred = ecb->dte_predicate) != NULL)
11467		dtrace_predicate_release(pred, vstate);
11468
11469	dtrace_ecb_action_remove(ecb);
11470
11471	ASSERT(state->dts_ecbs[epid - 1] == ecb);
11472	state->dts_ecbs[epid - 1] = NULL;
11473
11474	kmem_free(ecb, sizeof (dtrace_ecb_t));
11475}
11476
11477static dtrace_ecb_t *
11478dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11479    dtrace_enabling_t *enab)
11480{
11481	dtrace_ecb_t *ecb;
11482	dtrace_predicate_t *pred;
11483	dtrace_actdesc_t *act;
11484	dtrace_provider_t *prov;
11485	dtrace_ecbdesc_t *desc = enab->dten_current;
11486
11487	ASSERT(MUTEX_HELD(&dtrace_lock));
11488	ASSERT(state != NULL);
11489
11490	ecb = dtrace_ecb_add(state, probe);
11491	ecb->dte_uarg = desc->dted_uarg;
11492
11493	if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11494		dtrace_predicate_hold(pred);
11495		ecb->dte_predicate = pred;
11496	}
11497
11498	if (probe != NULL) {
11499		/*
11500		 * If the provider shows more leg than the consumer is old
11501		 * enough to see, we need to enable the appropriate implicit
11502		 * predicate bits to prevent the ecb from activating at
11503		 * revealing times.
11504		 *
11505		 * Providers specifying DTRACE_PRIV_USER at register time
11506		 * are stating that they need the /proc-style privilege
11507		 * model to be enforced, and this is what DTRACE_COND_OWNER
11508		 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11509		 */
11510		prov = probe->dtpr_provider;
11511		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11512		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11513			ecb->dte_cond |= DTRACE_COND_OWNER;
11514
11515		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11516		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11517			ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11518
11519		/*
11520		 * If the provider shows us kernel innards and the user
11521		 * is lacking sufficient privilege, enable the
11522		 * DTRACE_COND_USERMODE implicit predicate.
11523		 */
11524		if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11525		    (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11526			ecb->dte_cond |= DTRACE_COND_USERMODE;
11527	}
11528
11529	if (dtrace_ecb_create_cache != NULL) {
11530		/*
11531		 * If we have a cached ecb, we'll use its action list instead
11532		 * of creating our own (saving both time and space).
11533		 */
11534		dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11535		dtrace_action_t *act = cached->dte_action;
11536
11537		if (act != NULL) {
11538			ASSERT(act->dta_refcnt > 0);
11539			act->dta_refcnt++;
11540			ecb->dte_action = act;
11541			ecb->dte_action_last = cached->dte_action_last;
11542			ecb->dte_needed = cached->dte_needed;
11543			ecb->dte_size = cached->dte_size;
11544			ecb->dte_alignment = cached->dte_alignment;
11545		}
11546
11547		return (ecb);
11548	}
11549
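	/*
	 * No cached ECB:  build the action list from the ECB description.
	 * (The shared action list taken in the cached case above is
	 * reference-counted; the reference is dropped via
	 * dtrace_ecb_action_remove() when the ECB is destroyed.)
	 */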
11550	for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11551		if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11552			dtrace_ecb_destroy(ecb);
11553			return (NULL);
11554		}
11555	}
11556
11557	dtrace_ecb_resize(ecb);
11558
11559	return (dtrace_ecb_create_cache = ecb);
11560}
11561
11562static int
11563dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
11564{
11565	dtrace_ecb_t *ecb;
11566	dtrace_enabling_t *enab = arg;
11567	dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11568
11569	ASSERT(state != NULL);
11570
11571	if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
11572		/*
11573		 * This probe was created in a generation for which this
11574		 * enabling has previously created ECBs; we don't want to
11575		 * enable it again, so just kick out.
11576		 */
11577		return (DTRACE_MATCH_NEXT);
11578	}
11579
11580	if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11581		return (DTRACE_MATCH_DONE);
11582
11583	dtrace_ecb_enable(ecb);
11584	return (DTRACE_MATCH_NEXT);
11585}
11586
11587static dtrace_ecb_t *
11588dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11589{
11590	dtrace_ecb_t *ecb;
11591
11592	ASSERT(MUTEX_HELD(&dtrace_lock));
11593
11594	if (id == 0 || id > state->dts_necbs)
11595		return (NULL);
11596
11597	ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11598	ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11599
11600	return (state->dts_ecbs[id - 1]);
11601}
11602
11603static dtrace_aggregation_t *
11604dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11605{
11606	dtrace_aggregation_t *agg;
11607
11608	ASSERT(MUTEX_HELD(&dtrace_lock));
11609
11610	if (id == 0 || id > state->dts_naggregations)
11611		return (NULL);
11612
11613	ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11614	ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11615	    agg->dtag_id == id);
11616
11617	return (state->dts_aggregations[id - 1]);
11618}
11619
11620/*
11621 * DTrace Buffer Functions
11622 *
11623 * The following functions manipulate DTrace buffers.  Most of these functions
11624 * are called in the context of establishing or processing consumer state;
11625 * exceptions are explicitly noted.
11626 */
11627
11628/*
11629 * Note:  called from cross call context.  This function switches the two
11630 * buffers on a given CPU.  The atomicity of this operation is assured by
11631 * disabling interrupts while the actual switch takes place; the disabling of
11632 * interrupts serializes the execution with any execution of dtrace_probe() on
11633 * the same CPU.
11634 */
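/*
 * For example, if a CPU's buffer pair is (tomax = A, xamot = B) before the
 * cross call, it is (tomax = B, xamot = A) afterwards:  the offset, drop and
 * error counts accumulated in A are published in the dtb_xamot_* fields for
 * the consumer to read, and the counters for the newly-active buffer are
 * reset.
 */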
11635static void
11636dtrace_buffer_switch(dtrace_buffer_t *buf)
11637{
11638	caddr_t tomax = buf->dtb_tomax;
11639	caddr_t xamot = buf->dtb_xamot;
11640	dtrace_icookie_t cookie;
11641	hrtime_t now;
11642
11643	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11644	ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11645
11646	cookie = dtrace_interrupt_disable();
11647	now = dtrace_gethrtime();
11648	buf->dtb_tomax = xamot;
11649	buf->dtb_xamot = tomax;
11650	buf->dtb_xamot_drops = buf->dtb_drops;
11651	buf->dtb_xamot_offset = buf->dtb_offset;
11652	buf->dtb_xamot_errors = buf->dtb_errors;
11653	buf->dtb_xamot_flags = buf->dtb_flags;
11654	buf->dtb_offset = 0;
11655	buf->dtb_drops = 0;
11656	buf->dtb_errors = 0;
11657	buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11658	buf->dtb_interval = now - buf->dtb_switched;
11659	buf->dtb_switched = now;
11660	dtrace_interrupt_enable(cookie);
11661}
11662
11663/*
11664 * Note:  called from cross call context.  This function activates a buffer
11665 * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
11666 * is guaranteed by the disabling of interrupts.
11667 */
11668static void
11669dtrace_buffer_activate(dtrace_state_t *state)
11670{
11671	dtrace_buffer_t *buf;
11672	dtrace_icookie_t cookie = dtrace_interrupt_disable();
11673
11674	buf = &state->dts_buffer[curcpu];
11675
11676	if (buf->dtb_tomax != NULL) {
11677		/*
11678		 * We might like to assert that the buffer is marked inactive,
11679		 * but this isn't necessarily true:  the buffer for the CPU
11680		 * that processes the BEGIN probe has its buffer activated
11681		 * manually.  In this case, we take the (harmless) action
11682		 * manually.  In this case, we take the (harmless) action
11683		 * of re-clearing the INACTIVE bit.
11684		buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11685	}
11686
11687	dtrace_interrupt_enable(cookie);
11688}
11689
11690static int
11691dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
11692    processorid_t cpu, int *factor)
11693{
11694#if defined(sun)
11695	cpu_t *cp;
11696#endif
11697	dtrace_buffer_t *buf;
11698	int allocated = 0, desired = 0;
11699
11700#if defined(sun)
11701	ASSERT(MUTEX_HELD(&cpu_lock));
11702	ASSERT(MUTEX_HELD(&dtrace_lock));
11703
11704	*factor = 1;
11705
11706	if (size > dtrace_nonroot_maxsize &&
11707	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11708		return (EFBIG);
11709
11710	cp = cpu_list;
11711
11712	do {
11713		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11714			continue;
11715
11716		buf = &bufs[cp->cpu_id];
11717
11718		/*
11719		 * If there is already a buffer allocated for this CPU, it
11720		 * is only possible that this is a DR event.  In this case,
		 * the buffer size must match our specified size.
11721		 */
11722		if (buf->dtb_tomax != NULL) {
11723			ASSERT(buf->dtb_size == size);
11724			continue;
11725		}
11726
11727		ASSERT(buf->dtb_xamot == NULL);
11728
11729		if ((buf->dtb_tomax = kmem_zalloc(size,
11730		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11731			goto err;
11732
11733		buf->dtb_size = size;
11734		buf->dtb_flags = flags;
11735		buf->dtb_offset = 0;
11736		buf->dtb_drops = 0;
11737
11738		if (flags & DTRACEBUF_NOSWITCH)
11739			continue;
11740
11741		if ((buf->dtb_xamot = kmem_zalloc(size,
11742		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11743			goto err;
11744	} while ((cp = cp->cpu_next) != cpu_list);
11745
11746	return (0);
11747
11748err:
11749	cp = cpu_list;
11750
11751	do {
11752		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11753			continue;
11754
11755		buf = &bufs[cp->cpu_id];
11756		desired += 2;
11757
11758		if (buf->dtb_xamot != NULL) {
11759			ASSERT(buf->dtb_tomax != NULL);
11760			ASSERT(buf->dtb_size == size);
11761			kmem_free(buf->dtb_xamot, size);
11762			allocated++;
11763		}
11764
11765		if (buf->dtb_tomax != NULL) {
11766			ASSERT(buf->dtb_size == size);
11767			kmem_free(buf->dtb_tomax, size);
11768			allocated++;
11769		}
11770
11771		buf->dtb_tomax = NULL;
11772		buf->dtb_xamot = NULL;
11773		buf->dtb_size = 0;
11774	} while ((cp = cp->cpu_next) != cpu_list);
11775#else
11776	int i;
11777
11778	*factor = 1;
11779#if defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
11780	/*
11781	 * FreeBSD isn't good at limiting the amount of memory we
11782	 * ask to malloc, so let's place a limit here before trying
11783	 * to do something that might well end in tears at bedtime.
11784	 */
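	/*
	 * For example, with 4GB of physical memory and 8 CPUs
	 * (mp_maxid + 1 == 8), this caps each per-CPU buffer at
	 * 4GB / (128 * 8) = 4MB.
	 */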
11785	if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
11786		return (ENOMEM);
11787#endif
11788
11789	ASSERT(MUTEX_HELD(&dtrace_lock));
11790	CPU_FOREACH(i) {
11791		if (cpu != DTRACE_CPUALL && cpu != i)
11792			continue;
11793
11794		buf = &bufs[i];
11795
11796		/*
11797		 * If there is already a buffer allocated for this CPU, it
11798		 * is only possible that this is a DR event.  In this case,
11799		 * the buffer size must match our specified size.
11800		 */
11801		if (buf->dtb_tomax != NULL) {
11802			ASSERT(buf->dtb_size == size);
11803			continue;
11804		}
11805
11806		ASSERT(buf->dtb_xamot == NULL);
11807
11808		if ((buf->dtb_tomax = kmem_zalloc(size,
11809		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11810			goto err;
11811
11812		buf->dtb_size = size;
11813		buf->dtb_flags = flags;
11814		buf->dtb_offset = 0;
11815		buf->dtb_drops = 0;
11816
11817		if (flags & DTRACEBUF_NOSWITCH)
11818			continue;
11819
11820		if ((buf->dtb_xamot = kmem_zalloc(size,
11821		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11822			goto err;
11823	}
11824
11825	return (0);
11826
11827err:
11828	/*
11829	 * Error allocating memory, so free the buffers that were
11830	 * allocated before the failed allocation.
11831	 */
11832	CPU_FOREACH(i) {
11833		if (cpu != DTRACE_CPUALL && cpu != i)
11834			continue;
11835
11836		buf = &bufs[i];
11837		desired += 2;
11838
11839		if (buf->dtb_xamot != NULL) {
11840			ASSERT(buf->dtb_tomax != NULL);
11841			ASSERT(buf->dtb_size == size);
11842			kmem_free(buf->dtb_xamot, size);
11843			allocated++;
11844		}
11845
11846		if (buf->dtb_tomax != NULL) {
11847			ASSERT(buf->dtb_size == size);
11848			kmem_free(buf->dtb_tomax, size);
11849			allocated++;
11850		}
11851
11852		buf->dtb_tomax = NULL;
11853		buf->dtb_xamot = NULL;
11854		buf->dtb_size = 0;
11855
11856	}
11857#endif
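	/*
	 * Report (roughly) how much larger the request was than what we
	 * could actually allocate; the caller may use this factor to scale
	 * the requested buffer size down before retrying.
	 */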
11858	*factor = desired / (allocated > 0 ? allocated : 1);
11859
11860	return (ENOMEM);
11861}
11862
11863/*
11864 * Note:  called from probe context.  This function just increments the drop
11865 * count on a buffer.  It has been made a function to allow for the
11866 * possibility of understanding the source of mysterious drop counts.  (A
11867 * problem for which one may be particularly disappointed that DTrace cannot
11868 * be used to understand DTrace.)
11869 */
11870static void
11871dtrace_buffer_drop(dtrace_buffer_t *buf)
11872{
11873	buf->dtb_drops++;
11874}
11875
11876/*
11877 * Note:  called from probe context.  This function is called to reserve space
11878 * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
11879 * mstate.  Returns the new offset in the buffer, or a negative value if an
11880 * error has occurred.
11881 */
11882static intptr_t
11883dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11884    dtrace_state_t *state, dtrace_mstate_t *mstate)
11885{
11886	intptr_t offs = buf->dtb_offset, soffs;
11887	intptr_t woffs;
11888	caddr_t tomax;
11889	size_t total;
11890
11891	if (buf->dtb_flags & DTRACEBUF_INACTIVE)
11892		return (-1);
11893
11894	if ((tomax = buf->dtb_tomax) == NULL) {
11895		dtrace_buffer_drop(buf);
11896		return (-1);
11897	}
11898
11899	if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
11900		while (offs & (align - 1)) {
11901			/*
11902			 * Assert that our alignment is off by a number which
11903			 * is itself sizeof (uint32_t) aligned.
11904			 */
11905			ASSERT(!((align - (offs & (align - 1))) &
11906			    (sizeof (uint32_t) - 1)));
11907			DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11908			offs += sizeof (uint32_t);
11909		}
11910
11911		if ((soffs = offs + needed) > buf->dtb_size) {
11912			dtrace_buffer_drop(buf);
11913			return (-1);
11914		}
11915
11916		if (mstate == NULL)
11917			return (offs);
11918
11919		mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
11920		mstate->dtms_scratch_size = buf->dtb_size - soffs;
11921		mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11922
11923		return (offs);
11924	}
11925
11926	if (buf->dtb_flags & DTRACEBUF_FILL) {
11927		if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
11928		    (buf->dtb_flags & DTRACEBUF_FULL))
11929			return (-1);
11930		goto out;
11931	}
11932
11933	total = needed + (offs & (align - 1));
11934
11935	/*
11936	 * For a ring buffer, life is quite a bit more complicated.  Before
11937	 * we can store any padding, we need to adjust our wrapping offset.
11938	 * (If we've never before wrapped or we're not about to, no adjustment
11939	 * is required.)
11940	 */
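	/*
	 * As a concrete sketch:  in a 64-byte buffer that has not yet
	 * wrapped, with offs == 56 and a 16-byte record to store, the record
	 * cannot fit at the end.  Offsets 56..63 are zeroed, offs is reset
	 * to 0 and DTRACEBUF_WRAPPED is set; the loop further below then
	 * advances woffs over the oldest record(s) until it is at least 16
	 * bytes past offs, reclaiming their space.
	 */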
11941	if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
11942	    offs + total > buf->dtb_size) {
11943		woffs = buf->dtb_xamot_offset;
11944
11945		if (offs + total > buf->dtb_size) {
11946			/*
11947			 * We can't fit in the end of the buffer.  First, a
11948			 * sanity check that we can fit in the buffer at all.
11949			 */
11950			if (total > buf->dtb_size) {
11951				dtrace_buffer_drop(buf);
11952				return (-1);
11953			}
11954
11955			/*
11956			 * We're going to be storing at the top of the buffer,
11957			 * so now we need to deal with the wrapped offset.  We
11958			 * only reset our wrapped offset to 0 if it is
11959			 * currently greater than the current offset.  If it
11960			 * is less than the current offset, it is because a
11961			 * previous allocation induced a wrap -- but the
11962			 * allocation didn't subsequently take the space due
11963			 * to an error or false predicate evaluation.  In this
11964			 * case, we'll just leave the wrapped offset alone: if
11965			 * the wrapped offset hasn't been advanced far enough
11966			 * for this allocation, it will be adjusted in the
11967			 * lower loop.
11968			 */
11969			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
11970				if (woffs >= offs)
11971					woffs = 0;
11972			} else {
11973				woffs = 0;
11974			}
11975
11976			/*
11977			 * Now we know that we're going to be storing to the
11978			 * top of the buffer and that there is room for us
11979			 * there.  We need to clear the buffer from the current
11980			 * offset to the end (there may be old gunk there).
11981			 */
11982			while (offs < buf->dtb_size)
11983				tomax[offs++] = 0;
11984
11985			/*
11986			 * We need to set our offset to zero.  And because we
11987			 * are wrapping, we need to set the bit indicating as
11988			 * much.  We can also adjust our needed space back
11989			 * down to the space required by the ECB -- we know
11990			 * that the top of the buffer is aligned.
11991			 */
11992			offs = 0;
11993			total = needed;
11994			buf->dtb_flags |= DTRACEBUF_WRAPPED;
11995		} else {
11996			/*
11997			 * There is room for us in the buffer, so we simply
11998			 * need to check the wrapped offset.
11999			 */
12000			if (woffs < offs) {
12001				/*
12002				 * The wrapped offset is less than the offset.
12003				 * This can happen if we allocated buffer space
12004				 * that induced a wrap, but then we didn't
12005				 * subsequently take the space due to an error
12006				 * or false predicate evaluation.  This is
12007				 * okay; we know that _this_ allocation isn't
12008				 * going to induce a wrap.  We still can't
12009				 * reset the wrapped offset to be zero,
12010				 * however: the space may have been trashed in
12011				 * the previous failed probe attempt.  But at
12012				 * least the wrapped offset doesn't need to
12013				 * be adjusted at all...
12014				 */
12015				goto out;
12016			}
12017		}
12018
12019		while (offs + total > woffs) {
12020			dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12021			size_t size;
12022
12023			if (epid == DTRACE_EPIDNONE) {
12024				size = sizeof (uint32_t);
12025			} else {
12026				ASSERT3U(epid, <=, state->dts_necbs);
12027				ASSERT(state->dts_ecbs[epid - 1] != NULL);
12028
12029				size = state->dts_ecbs[epid - 1]->dte_size;
12030			}
12031
12032			ASSERT(woffs + size <= buf->dtb_size);
12033			ASSERT(size != 0);
12034
12035			if (woffs + size == buf->dtb_size) {
12036				/*
12037				 * We've reached the end of the buffer; we want
12038				 * to set the wrapped offset to 0 and break
12039				 * out.  However, if the offs is 0, then we're
12040				 * in a strange edge-condition:  the amount of
12041				 * space that we want to reserve plus the size
12042				 * of the record that we're overwriting is
12043				 * greater than the size of the buffer.  This
12044				 * is problematic because if we reserve the
12045				 * space but subsequently don't consume it (due
12046				 * to a failed predicate or error) the wrapped
12047				 * offset will be 0 -- yet the EPID at offset 0
12048				 * will not be committed.  This situation is
12049				 * relatively easy to deal with:  if we're in
12050				 * this case, the buffer is indistinguishable
12051				 * from one that hasn't wrapped; we need only
12052				 * finish the job by clearing the wrapped bit,
12053				 * explicitly setting the offset to be 0, and
12054				 * zero'ing out the old data in the buffer.
12055				 */
12056				if (offs == 0) {
12057					buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12058					buf->dtb_offset = 0;
12059					woffs = total;
12060
12061					while (woffs < buf->dtb_size)
12062						tomax[woffs++] = 0;
12063				}
12064
12065				woffs = 0;
12066				break;
12067			}
12068
12069			woffs += size;
12070		}
12071
12072		/*
12073		 * We have a wrapped offset.  It may be that the wrapped offset
12074		 * has become zero -- that's okay.
12075		 */
12076		buf->dtb_xamot_offset = woffs;
12077	}
12078
12079out:
12080	/*
12081	 * Now we can plow the buffer with any necessary padding.
12082	 */
12083	while (offs & (align - 1)) {
12084		/*
12085		 * Assert that our alignment is off by a number which
12086		 * is itself sizeof (uint32_t) aligned.
12087		 */
12088		ASSERT(!((align - (offs & (align - 1))) &
12089		    (sizeof (uint32_t) - 1)));
12090		DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12091		offs += sizeof (uint32_t);
12092	}
12093
12094	if (buf->dtb_flags & DTRACEBUF_FILL) {
12095		if (offs + needed > buf->dtb_size - state->dts_reserve) {
12096			buf->dtb_flags |= DTRACEBUF_FULL;
12097			return (-1);
12098		}
12099	}
12100
12101	if (mstate == NULL)
12102		return (offs);
12103
12104	/*
12105	 * For ring buffers and fill buffers, the scratch space is always
12106	 * the inactive buffer.
12107	 */
12108	mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12109	mstate->dtms_scratch_size = buf->dtb_size;
12110	mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12111
12112	return (offs);
12113}
12114
12115static void
12116dtrace_buffer_polish(dtrace_buffer_t *buf)
12117{
12118	ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12119	ASSERT(MUTEX_HELD(&dtrace_lock));
12120
12121	if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12122		return;
12123
12124	/*
12125	 * We need to polish the ring buffer.  There are three cases:
12126	 *
12127	 * - The first (and presumably most common) is that there is no gap
12128	 *   between the buffer offset and the wrapped offset.  In this case,
12129	 *   there is nothing in the buffer that isn't valid data; we can
12130	 *   mark the buffer as polished and return.
12131	 *
12132	 * - The second (less common than the first but still more common
12133	 *   than the third) is that there is a gap between the buffer offset
12134	 *   and the wrapped offset, and the wrapped offset is larger than the
12135	 *   buffer offset.  This can happen because of an alignment issue, or
12136	 *   can happen because of a call to dtrace_buffer_reserve() that
12137	 *   didn't subsequently consume the buffer space.  In this case,
12138	 *   we need to zero the data from the buffer offset to the wrapped
12139	 *   offset.
12140	 *
12141	 * - The third (and least common) is that there is a gap between the
12142	 *   buffer offset and the wrapped offset, but the wrapped offset is
12143	 *   _less_ than the buffer offset.  This can only happen because a
12144	 *   call to dtrace_buffer_reserve() induced a wrap, but the space
12145	 *   was not subsequently consumed.  In this case, we need to zero the
12146	 *   space from the offset to the end of the buffer _and_ from the
12147	 *   top of the buffer to the wrapped offset.
12148	 */
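	/*
	 * Pictorially, with `o' the buffer offset, `w' the wrapped offset
	 * and `x' marking bytes to be zeroed:
	 *
	 *   second case:  [ data  o xxxxxx w  data ]
	 *   third case:   [ xxxxx w  data  o xxxxx ]
	 */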
12149	if (buf->dtb_offset < buf->dtb_xamot_offset) {
12150		bzero(buf->dtb_tomax + buf->dtb_offset,
12151		    buf->dtb_xamot_offset - buf->dtb_offset);
12152	}
12153
12154	if (buf->dtb_offset > buf->dtb_xamot_offset) {
12155		bzero(buf->dtb_tomax + buf->dtb_offset,
12156		    buf->dtb_size - buf->dtb_offset);
12157		bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12158	}
12159}
12160
12161/*
12162 * This routine determines if data generated at the specified time has likely
12163 * been entirely consumed at user-level.  This routine is called to determine
12164 * if an ECB on a defunct probe (but for an active enabling) can be safely
12165 * disabled and destroyed.
12166 */
12167static int
12168dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
12169{
12170	int i;
12171
12172	for (i = 0; i < NCPU; i++) {
12173		dtrace_buffer_t *buf = &bufs[i];
12174
12175		if (buf->dtb_size == 0)
12176			continue;
12177
12178		if (buf->dtb_flags & DTRACEBUF_RING)
12179			return (0);
12180
12181		if (!buf->dtb_switched && buf->dtb_offset != 0)
12182			return (0);
12183
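		/*
		 * dtb_switched is the time of the most recent switch, and
		 * dtb_interval is the time between the two most recent
		 * switches; dtb_switched - dtb_interval is therefore the
		 * time of the switch before that.  If it predates `when',
		 * the buffer has not been switched (and hence consumed)
		 * twice since the data was generated.
		 */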
12184		if (buf->dtb_switched - buf->dtb_interval < when)
12185			return (0);
12186	}
12187
12188	return (1);
12189}
12190
12191static void
12192dtrace_buffer_free(dtrace_buffer_t *bufs)
12193{
12194	int i;
12195
12196	for (i = 0; i < NCPU; i++) {
12197		dtrace_buffer_t *buf = &bufs[i];
12198
12199		if (buf->dtb_tomax == NULL) {
12200			ASSERT(buf->dtb_xamot == NULL);
12201			ASSERT(buf->dtb_size == 0);
12202			continue;
12203		}
12204
12205		if (buf->dtb_xamot != NULL) {
12206			ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12207			kmem_free(buf->dtb_xamot, buf->dtb_size);
12208		}
12209
12210		kmem_free(buf->dtb_tomax, buf->dtb_size);
12211		buf->dtb_size = 0;
12212		buf->dtb_tomax = NULL;
12213		buf->dtb_xamot = NULL;
12214	}
12215}
12216
12217/*
12218 * DTrace Enabling Functions
12219 */
12220static dtrace_enabling_t *
12221dtrace_enabling_create(dtrace_vstate_t *vstate)
12222{
12223	dtrace_enabling_t *enab;
12224
12225	enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12226	enab->dten_vstate = vstate;
12227
12228	return (enab);
12229}
12230
12231static void
12232dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12233{
12234	dtrace_ecbdesc_t **ndesc;
12235	size_t osize, nsize;
12236
12237	/*
12238	 * We can't add to enablings after we've enabled them, or after we've
12239	 * retained them.
12240	 */
12241	ASSERT(enab->dten_probegen == 0);
12242	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12243
12244	if (enab->dten_ndesc < enab->dten_maxdesc) {
12245		enab->dten_desc[enab->dten_ndesc++] = ecb;
12246		return;
12247	}
12248
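	/*
	 * The description array is full; grow it geometrically (1, 2, 4,
	 * ...) so that adding n descriptions costs O(n) copying overall.
	 */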
12249	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12250
12251	if (enab->dten_maxdesc == 0) {
12252		enab->dten_maxdesc = 1;
12253	} else {
12254		enab->dten_maxdesc <<= 1;
12255	}
12256
12257	ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12258
12259	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12260	ndesc = kmem_zalloc(nsize, KM_SLEEP);
12261	bcopy(enab->dten_desc, ndesc, osize);
12262	if (enab->dten_desc != NULL)
12263		kmem_free(enab->dten_desc, osize);
12264
12265	enab->dten_desc = ndesc;
12266	enab->dten_desc[enab->dten_ndesc++] = ecb;
12267}
12268
12269static void
12270dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12271    dtrace_probedesc_t *pd)
12272{
12273	dtrace_ecbdesc_t *new;
12274	dtrace_predicate_t *pred;
12275	dtrace_actdesc_t *act;
12276
12277	/*
12278	 * We're going to create a new ECB description that matches the
12279	 * specified ECB in every way, but has the specified probe description.
12280	 */
12281	new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12282
12283	if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12284		dtrace_predicate_hold(pred);
12285
12286	for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12287		dtrace_actdesc_hold(act);
12288
12289	new->dted_action = ecb->dted_action;
12290	new->dted_pred = ecb->dted_pred;
12291	new->dted_probe = *pd;
12292	new->dted_uarg = ecb->dted_uarg;
12293
12294	dtrace_enabling_add(enab, new);
12295}
12296
12297static void
12298dtrace_enabling_dump(dtrace_enabling_t *enab)
12299{
12300	int i;
12301
12302	for (i = 0; i < enab->dten_ndesc; i++) {
12303		dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12304
12305		cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12306		    desc->dtpd_provider, desc->dtpd_mod,
12307		    desc->dtpd_func, desc->dtpd_name);
12308	}
12309}
12310
12311static void
12312dtrace_enabling_destroy(dtrace_enabling_t *enab)
12313{
12314	int i;
12315	dtrace_ecbdesc_t *ep;
12316	dtrace_vstate_t *vstate = enab->dten_vstate;
12317
12318	ASSERT(MUTEX_HELD(&dtrace_lock));
12319
12320	for (i = 0; i < enab->dten_ndesc; i++) {
12321		dtrace_actdesc_t *act, *next;
12322		dtrace_predicate_t *pred;
12323
12324		ep = enab->dten_desc[i];
12325
12326		if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12327			dtrace_predicate_release(pred, vstate);
12328
12329		for (act = ep->dted_action; act != NULL; act = next) {
12330			next = act->dtad_next;
12331			dtrace_actdesc_release(act, vstate);
12332		}
12333
12334		kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12335	}
12336
12337	if (enab->dten_desc != NULL)
12338		kmem_free(enab->dten_desc,
12339		    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
12340
12341	/*
12342	 * If this was a retained enabling, decrement the dts_nretained count
12343	 * and take it off of the dtrace_retained list.
12344	 */
12345	if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12346	    dtrace_retained == enab) {
12347		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12348		ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12349		enab->dten_vstate->dtvs_state->dts_nretained--;
12350		dtrace_retained_gen++;
12351	}
12352
12353	if (enab->dten_prev == NULL) {
12354		if (dtrace_retained == enab) {
12355			dtrace_retained = enab->dten_next;
12356
12357			if (dtrace_retained != NULL)
12358				dtrace_retained->dten_prev = NULL;
12359		}
12360	} else {
12361		ASSERT(enab != dtrace_retained);
12362		ASSERT(dtrace_retained != NULL);
12363		enab->dten_prev->dten_next = enab->dten_next;
12364	}
12365
12366	if (enab->dten_next != NULL) {
12367		ASSERT(dtrace_retained != NULL);
12368		enab->dten_next->dten_prev = enab->dten_prev;
12369	}
12370
12371	kmem_free(enab, sizeof (dtrace_enabling_t));
12372}
12373
12374static int
12375dtrace_enabling_retain(dtrace_enabling_t *enab)
12376{
12377	dtrace_state_t *state;
12378
12379	ASSERT(MUTEX_HELD(&dtrace_lock));
12380	ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12381	ASSERT(enab->dten_vstate != NULL);
12382
12383	state = enab->dten_vstate->dtvs_state;
12384	ASSERT(state != NULL);
12385
12386	/*
12387	 * We only allow each state to retain dtrace_retain_max enablings.
12388	 */
12389	if (state->dts_nretained >= dtrace_retain_max)
12390		return (ENOSPC);
12391
12392	state->dts_nretained++;
12393	dtrace_retained_gen++;
12394
12395	if (dtrace_retained == NULL) {
12396		dtrace_retained = enab;
12397		return (0);
12398	}
12399
12400	enab->dten_next = dtrace_retained;
12401	dtrace_retained->dten_prev = enab;
12402	dtrace_retained = enab;
12403
12404	return (0);
12405}
12406
12407static int
12408dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12409    dtrace_probedesc_t *create)
12410{
12411	dtrace_enabling_t *new, *enab;
12412	int found = 0, err = ENOENT;
12413
12414	ASSERT(MUTEX_HELD(&dtrace_lock));
12415	ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12416	ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12417	ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12418	ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12419
12420	new = dtrace_enabling_create(&state->dts_vstate);
12421
12422	/*
12423	 * Iterate over all retained enablings, looking for enablings that
12424	 * match the specified state.
12425	 */
12426	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12427		int i;
12428
12429		/*
12430		 * dtvs_state can only be NULL for helper enablings -- and
12431		 * helper enablings can't be retained.
12432		 */
12433		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12434
12435		if (enab->dten_vstate->dtvs_state != state)
12436			continue;
12437
12438		/*
12439		 * Now iterate over each probe description; we're looking for
12440		 * an exact match to the specified probe description.
12441		 */
12442		for (i = 0; i < enab->dten_ndesc; i++) {
12443			dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12444			dtrace_probedesc_t *pd = &ep->dted_probe;
12445
12446			if (strcmp(pd->dtpd_provider, match->dtpd_provider))
12447				continue;
12448
12449			if (strcmp(pd->dtpd_mod, match->dtpd_mod))
12450				continue;
12451
12452			if (strcmp(pd->dtpd_func, match->dtpd_func))
12453				continue;
12454
12455			if (strcmp(pd->dtpd_name, match->dtpd_name))
12456				continue;
12457
12458			/*
12459			 * We have a winning probe!  Add it to our growing
12460			 * enabling.
12461			 */
12462			found = 1;
12463			dtrace_enabling_addlike(new, ep, create);
12464		}
12465	}
12466
12467	if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12468		dtrace_enabling_destroy(new);
12469		return (err);
12470	}
12471
12472	return (0);
12473}
12474
12475static void
12476dtrace_enabling_retract(dtrace_state_t *state)
12477{
12478	dtrace_enabling_t *enab, *next;
12479
12480	ASSERT(MUTEX_HELD(&dtrace_lock));
12481
12482	/*
12483	 * Iterate over all retained enablings, destroy the enablings retained
12484	 * for the specified state.
12485	 */
12486	for (enab = dtrace_retained; enab != NULL; enab = next) {
12487		next = enab->dten_next;
12488
12489		/*
12490		 * dtvs_state can only be NULL for helper enablings -- and
12491		 * helper enablings can't be retained.
12492		 */
12493		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12494
12495		if (enab->dten_vstate->dtvs_state == state) {
12496			ASSERT(state->dts_nretained > 0);
12497			dtrace_enabling_destroy(enab);
12498		}
12499	}
12500
12501	ASSERT(state->dts_nretained == 0);
12502}
12503
12504static int
12505dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
12506{
12507	int i = 0;
12508	int matched = 0;
12509
12510	ASSERT(MUTEX_HELD(&cpu_lock));
12511	ASSERT(MUTEX_HELD(&dtrace_lock));
12512
12513	for (i = 0; i < enab->dten_ndesc; i++) {
12514		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12515
12516		enab->dten_current = ep;
12517		enab->dten_error = 0;
12518
12519		matched += dtrace_probe_enable(&ep->dted_probe, enab);
12520
12521		if (enab->dten_error != 0) {
12522			/*
12523			 * If we get an error half-way through enabling the
12524			 * probes, we kick out -- perhaps with some number of
12525			 * them enabled.  Leaving enabled probes enabled may
12526			 * be slightly confusing for user-level, but we expect
12527			 * that no one will attempt to actually drive on in
12528			 * the face of such errors.  If this is an anonymous
12529			 * enabling (indicated with a NULL nmatched pointer),
12530			 * we cmn_err() a message.  We aren't expecting to
12531			 * get such an error -- to the extent that such an error
12532			 * can exist at all, it would be the result of corrupted
12533			 * DOF in the driver properties.
12534			 */
12535			if (nmatched == NULL) {
12536				cmn_err(CE_WARN, "dtrace_enabling_match() "
12537				    "error on %p: %d", (void *)ep,
12538				    enab->dten_error);
12539			}
12540
12541			return (enab->dten_error);
12542		}
12543	}
12544
12545	enab->dten_probegen = dtrace_probegen;
12546	if (nmatched != NULL)
12547		*nmatched = matched;
12548
12549	return (0);
12550}
12551
12552static void
12553dtrace_enabling_matchall(void)
12554{
12555	dtrace_enabling_t *enab;
12556
12557	mutex_enter(&cpu_lock);
12558	mutex_enter(&dtrace_lock);
12559
12560	/*
12561	 * Iterate over all retained enablings to see if any probes match
12562	 * against them.  We only perform this operation on enablings for which
12563	 * we have sufficient permissions by virtue of being in the global zone
12564	 * or in the same zone as the DTrace client.  Because we can be called
12565	 * after dtrace_detach() has been called, we cannot assert that there
12566	 * are retained enablings.  We can safely load from dtrace_retained,
12567	 * however:  the taskq_destroy() at the end of dtrace_detach() will
12568	 * block pending our completion.
12569	 */
12570	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12571#if defined(sun)
12572		cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
12573
12574		if (INGLOBALZONE(curproc) ||
12575		    (cr != NULL && getzoneid() == crgetzoneid(cr)))
12576#endif
12577			(void) dtrace_enabling_match(enab, NULL);
12578	}
12579
12580	mutex_exit(&dtrace_lock);
12581	mutex_exit(&cpu_lock);
12582}
12583
12584/*
12585 * If an enabling is to be enabled without having matched probes (that is, if
12586 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12587 * enabling must be _primed_ by creating an ECB for every ECB description.
12588 * This must be done to assure that we know the number of speculations, the
12589 * number of aggregations, the minimum buffer size needed, etc. before we
12590 * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
12591 * enabling any probes, we create ECBs for every ECB decription, but with a
12592 * enabling any probes, we create ECBs for every ECB description, but with a
12593 */
12594static void
12595dtrace_enabling_prime(dtrace_state_t *state)
12596{
12597	dtrace_enabling_t *enab;
12598	int i;
12599
12600	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12601		ASSERT(enab->dten_vstate->dtvs_state != NULL);
12602
12603		if (enab->dten_vstate->dtvs_state != state)
12604			continue;
12605
12606		/*
12607		 * We don't want to prime an enabling more than once, lest
12608		 * we allow a malicious user to induce resource exhaustion.
12609		 * (The ECBs that result from priming an enabling aren't
12610		 * leaked -- but they also aren't deallocated until the
12611		 * consumer state is destroyed.)
12612		 */
12613		if (enab->dten_primed)
12614			continue;
12615
12616		for (i = 0; i < enab->dten_ndesc; i++) {
12617			enab->dten_current = enab->dten_desc[i];
12618			(void) dtrace_probe_enable(NULL, enab);
12619		}
12620
12621		enab->dten_primed = 1;
12622	}
12623}
12624
12625/*
12626 * Called to indicate that probes should be provided due to retained
12627 * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
12628 * must take an initial lap through the retained enablings, calling the
12629 * dtps_provide() entry point explicitly to allow for autocreated probes.
12630 */
12631static void
12632dtrace_enabling_provide(dtrace_provider_t *prv)
12633{
12634	int i, all = 0;
12635	dtrace_probedesc_t desc;
12636	dtrace_genid_t gen;
12637
12638	ASSERT(MUTEX_HELD(&dtrace_lock));
12639	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
12640
12641	if (prv == NULL) {
12642		all = 1;
12643		prv = dtrace_provider;
12644	}
12645
12646	do {
12647		dtrace_enabling_t *enab;
12648		void *parg = prv->dtpv_arg;
12649
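		/*
		 * Note that dtrace_lock is dropped around each
		 * dtps_provide() call below:  providers generally respond
		 * by creating probes, and probe creation must itself
		 * acquire dtrace_lock.
		 */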
12650retry:
12651		gen = dtrace_retained_gen;
12652		for (enab = dtrace_retained; enab != NULL;
12653		    enab = enab->dten_next) {
12654			for (i = 0; i < enab->dten_ndesc; i++) {
12655				desc = enab->dten_desc[i]->dted_probe;
12656				mutex_exit(&dtrace_lock);
12657				prv->dtpv_pops.dtps_provide(parg, &desc);
12658				mutex_enter(&dtrace_lock);
12659				/*
12660				 * Process the retained enablings again if
12661				 * they have changed while we weren't holding
12662				 * dtrace_lock.
12663				 */
12664				if (gen != dtrace_retained_gen)
12665					goto retry;
12666			}
12667		}
12668	} while (all && (prv = prv->dtpv_next) != NULL);
12669
12670	mutex_exit(&dtrace_lock);
12671	dtrace_probe_provide(NULL, all ? NULL : prv);
12672	mutex_enter(&dtrace_lock);
12673}
12674
12675/*
12676 * Called to reap ECBs that are attached to probes from defunct providers.
12677 */
12678static void
12679dtrace_enabling_reap(void)
12680{
12681	dtrace_provider_t *prov;
12682	dtrace_probe_t *probe;
12683	dtrace_ecb_t *ecb;
12684	hrtime_t when;
12685	int i;
12686
12687	mutex_enter(&cpu_lock);
12688	mutex_enter(&dtrace_lock);
12689
12690	for (i = 0; i < dtrace_nprobes; i++) {
12691		if ((probe = dtrace_probes[i]) == NULL)
12692			continue;
12693
12694		if (probe->dtpr_ecb == NULL)
12695			continue;
12696
12697		prov = probe->dtpr_provider;
12698
12699		if ((when = prov->dtpv_defunct) == 0)
12700			continue;
12701
12702		/*
12703		 * We have ECBs on a defunct provider:  we want to reap these
12704		 * ECBs to allow the provider to unregister.  The destruction
12705		 * of these ECBs must be done carefully:  if we destroy the ECB
12706		 * and the consumer later wishes to consume an EPID that
12707		 * corresponds to the destroyed ECB (and if the EPID metadata
12708		 * has not been previously consumed), the consumer will abort
12709		 * processing on the unknown EPID.  To reduce (but not, sadly,
12710		 * eliminate) the possibility of this, we will only destroy an
12711		 * ECB for a defunct provider if, for the state that
12712		 * corresponds to the ECB:
12713		 *
12714		 *  (a)	There is no speculative tracing (which can effectively
12715		 *	cache an EPID for an arbitrary amount of time).
12716		 *
12717		 *  (b)	The principal buffers have been switched twice since the
12718		 *	provider became defunct.
12719		 *
12720		 *  (c)	The aggregation buffers are of zero size or have been
12721		 *	switched twice since the provider became defunct.
12722		 *
12723		 * We use dts_speculates to determine (a) and call a function
12724		 * (dtrace_buffer_consumed()) to determine (b) and (c).  Note
12725		 * that as soon as we've been unable to destroy one of the ECBs
12726		 * associated with the probe, we quit trying -- reaping is only
12727		 * fruitful in as much as we can destroy all ECBs associated
12728		 * with the defunct provider's probes.
12729		 */
12730		while ((ecb = probe->dtpr_ecb) != NULL) {
12731			dtrace_state_t *state = ecb->dte_state;
12732			dtrace_buffer_t *buf = state->dts_buffer;
12733			dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
12734
12735			if (state->dts_speculates)
12736				break;
12737
12738			if (!dtrace_buffer_consumed(buf, when))
12739				break;
12740
12741			if (!dtrace_buffer_consumed(aggbuf, when))
12742				break;
12743
12744			dtrace_ecb_disable(ecb);
12745			ASSERT(probe->dtpr_ecb != ecb);
12746			dtrace_ecb_destroy(ecb);
12747		}
12748	}
12749
12750	mutex_exit(&dtrace_lock);
12751	mutex_exit(&cpu_lock);
12752}
12753
12754/*
12755 * DTrace DOF Functions
12756 */
12757/*ARGSUSED*/
12758static void
12759dtrace_dof_error(dof_hdr_t *dof, const char *str)
12760{
12761	if (dtrace_err_verbose)
12762		cmn_err(CE_WARN, "failed to process DOF: %s", str);
12763
12764#ifdef DTRACE_ERRDEBUG
12765	dtrace_errdebug(str);
12766#endif
12767}
12768
12769/*
12770 * Create DOF out of a currently enabled state.  Right now, we only create
12771 * DOF containing the run-time options -- but this could be expanded to create
12772 * complete DOF representing the enabled state.
12773 */
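/*
 * The generated object is laid out contiguously:  a dof_hdr_t, followed by
 * a single dof_sec_t describing a DOF_SECT_OPTDESC section, followed (at
 * the next uint64_t-aligned offset) by DTRACEOPT_MAX dof_optdesc_t entries
 * holding the option values.
 */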
12774static dof_hdr_t *
12775dtrace_dof_create(dtrace_state_t *state)
12776{
12777	dof_hdr_t *dof;
12778	dof_sec_t *sec;
12779	dof_optdesc_t *opt;
12780	int i, len = sizeof (dof_hdr_t) +
12781	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12782	    sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12783
12784	ASSERT(MUTEX_HELD(&dtrace_lock));
12785
12786	dof = kmem_zalloc(len, KM_SLEEP);
12787	dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12788	dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12789	dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12790	dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12791
12792	dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12793	dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12794	dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12795	dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12796	dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12797	dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12798
12799	dof->dofh_flags = 0;
12800	dof->dofh_hdrsize = sizeof (dof_hdr_t);
12801	dof->dofh_secsize = sizeof (dof_sec_t);
12802	dof->dofh_secnum = 1;	/* only DOF_SECT_OPTDESC */
12803	dof->dofh_secoff = sizeof (dof_hdr_t);
12804	dof->dofh_loadsz = len;
12805	dof->dofh_filesz = len;
12806	dof->dofh_pad = 0;
12807
12808	/*
12809	 * Fill in the option section header...
12810	 */
12811	sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12812	sec->dofs_type = DOF_SECT_OPTDESC;
12813	sec->dofs_align = sizeof (uint64_t);
12814	sec->dofs_flags = DOF_SECF_LOAD;
12815	sec->dofs_entsize = sizeof (dof_optdesc_t);
12816
12817	opt = (dof_optdesc_t *)((uintptr_t)sec +
12818	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12819
12820	sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12821	sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12822
12823	for (i = 0; i < DTRACEOPT_MAX; i++) {
12824		opt[i].dofo_option = i;
12825		opt[i].dofo_strtab = DOF_SECIDX_NONE;
12826		opt[i].dofo_value = state->dts_options[i];
12827	}
12828
12829	return (dof);
12830}
12831
12832static dof_hdr_t *
12833dtrace_dof_copyin(uintptr_t uarg, int *errp)
12834{
12835	dof_hdr_t hdr, *dof;
12836
12837	ASSERT(!MUTEX_HELD(&dtrace_lock));
12838
12839	/*
12840	 * First, we're going to copyin() just the header, to learn the load size.
12841	 */
12842	if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
12843		dtrace_dof_error(NULL, "failed to copyin DOF header");
12844		*errp = EFAULT;
12845		return (NULL);
12846	}
12847
12848	/*
12849	 * Now we'll allocate the entire DOF and copy it in -- provided
12850	 * that the length isn't outrageous.
12851	 */
12852	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
12853		dtrace_dof_error(&hdr, "load size exceeds maximum");
12854		*errp = E2BIG;
12855		return (NULL);
12856	}
12857
12858	if (hdr.dofh_loadsz < sizeof (hdr)) {
12859		dtrace_dof_error(&hdr, "invalid load size");
12860		*errp = EINVAL;
12861		return (NULL);
12862	}
12863
12864	dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
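	/*
	 * The header lives in user memory and may have changed between the
	 * two copyin() calls; dofh_loadsz is therefore checked again below
	 * against the value that sized the allocation.
	 */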
12865
12866	if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
12867	    dof->dofh_loadsz != hdr.dofh_loadsz) {
12868		kmem_free(dof, hdr.dofh_loadsz);
12869		*errp = EFAULT;
12870		return (NULL);
12871	}
12872
12873	return (dof);
12874}
12875
12876#if !defined(sun)
12877static __inline uchar_t
12878dtrace_dof_char(char c) {
12879	switch (c) {
12880	case '0':
12881	case '1':
12882	case '2':
12883	case '3':
12884	case '4':
12885	case '5':
12886	case '6':
12887	case '7':
12888	case '8':
12889	case '9':
12890		return (c - '0');
12891	case 'A':
12892	case 'B':
12893	case 'C':
12894	case 'D':
12895	case 'E':
12896	case 'F':
12897		return (c - 'A' + 10);
12898	case 'a':
12899	case 'b':
12900	case 'c':
12901	case 'd':
12902	case 'e':
12903	case 'f':
12904		return (c - 'a' + 10);
12905	}
12906	/* Should not reach here. */
12907	return (0);
12908}
12909#endif
12910
12911static dof_hdr_t *
12912dtrace_dof_property(const char *name)
12913{
12914	uchar_t *buf;
12915	uint64_t loadsz;
12916	unsigned int len, i;
12917	dof_hdr_t *dof;
12918
12919#if defined(sun)
12920	/*
12921	 * Unfortunately, arrays of values in .conf files are always (and
12922	 * only) interpreted to be integer arrays.  We must read our DOF
12923	 * as an integer array, and then squeeze it into a byte array.
12924	 */
12925	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
12926	    (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
12927		return (NULL);
12928
12929	for (i = 0; i < len; i++)
12930		buf[i] = (uchar_t)(((int *)buf)[i]);
12931
12932	if (len < sizeof (dof_hdr_t)) {
12933		ddi_prop_free(buf);
12934		dtrace_dof_error(NULL, "truncated header");
12935		return (NULL);
12936	}
12937
12938	if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
12939		ddi_prop_free(buf);
12940		dtrace_dof_error(NULL, "truncated DOF");
12941		return (NULL);
12942	}
12943
12944	if (loadsz >= dtrace_dof_maxsize) {
12945		ddi_prop_free(buf);
12946		dtrace_dof_error(NULL, "oversized DOF");
12947		return (NULL);
12948	}
12949
12950	dof = kmem_alloc(loadsz, KM_SLEEP);
12951	bcopy(buf, dof, loadsz);
12952	ddi_prop_free(buf);
12953#else
12954	char *p;
12955	char *p_env;
12956
12957	if ((p_env = getenv(name)) == NULL)
12958		return (NULL);
12959
12960	len = strlen(p_env) / 2;
12961
12962	buf = kmem_alloc(len, KM_SLEEP);
12963
12964	dof = (dof_hdr_t *) buf;
12965
12966	p = p_env;
12967
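	/*
	 * Decode each pair of hex digits into one byte; e.g. the two
	 * characters "7f" yield the single byte 0x7f.
	 */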
12968	for (i = 0; i < len; i++) {
12969		buf[i] = (dtrace_dof_char(p[0]) << 4) |
12970		     dtrace_dof_char(p[1]);
12971		p += 2;
12972	}
12973
12974	freeenv(p_env);
12975
12976	if (len < sizeof (dof_hdr_t)) {
12977		kmem_free(buf, 0);
12978		dtrace_dof_error(NULL, "truncated header");
12979		return (NULL);
12980	}
12981
12982	if (len < (loadsz = dof->dofh_loadsz)) {
12983		kmem_free(buf, 0);
12984		dtrace_dof_error(NULL, "truncated DOF");
12985		return (NULL);
12986	}
12987
12988	if (loadsz >= dtrace_dof_maxsize) {
12989		kmem_free(buf, 0);
12990		dtrace_dof_error(NULL, "oversized DOF");
12991		return (NULL);
12992	}
12993#endif
12994
12995	return (dof);
12996}
12997
12998static void
12999dtrace_dof_destroy(dof_hdr_t *dof)
13000{
13001	kmem_free(dof, dof->dofh_loadsz);
13002}
13003
13004/*
13005 * Return the dof_sec_t pointer corresponding to a given section index.  If the
13006 * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
13007 * a type other than DOF_SECT_NONE is specified, the header is checked against
13008 * this type and NULL is returned if the types do not match.
13009 */
13010static dof_sec_t *
13011dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13012{
13013	dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13014	    ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13015
13016	if (i >= dof->dofh_secnum) {
13017		dtrace_dof_error(dof, "referenced section index is invalid");
13018		return (NULL);
13019	}
13020
13021	if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13022		dtrace_dof_error(dof, "referenced section is not loadable");
13023		return (NULL);
13024	}
13025
13026	if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13027		dtrace_dof_error(dof, "referenced section is the wrong type");
13028		return (NULL);
13029	}
13030
13031	return (sec);
13032}
13033
13034static dtrace_probedesc_t *
13035dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13036{
13037	dof_probedesc_t *probe;
13038	dof_sec_t *strtab;
13039	uintptr_t daddr = (uintptr_t)dof;
13040	uintptr_t str;
13041	size_t size;
13042
13043	if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13044		dtrace_dof_error(dof, "invalid probe section");
13045		return (NULL);
13046	}
13047
13048	if (sec->dofs_align != sizeof (dof_secidx_t)) {
13049		dtrace_dof_error(dof, "bad alignment in probe description");
13050		return (NULL);
13051	}
13052
13053	if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13054		dtrace_dof_error(dof, "truncated probe description");
13055		return (NULL);
13056	}
13057
13058	probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13059	strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13060
13061	if (strtab == NULL)
13062		return (NULL);
13063
13064	str = daddr + strtab->dofs_offset;
13065	size = strtab->dofs_size;
13066
13067	if (probe->dofp_provider >= strtab->dofs_size) {
13068		dtrace_dof_error(dof, "corrupt probe provider");
13069		return (NULL);
13070	}
13071
13072	(void) strncpy(desc->dtpd_provider,
13073	    (char *)(str + probe->dofp_provider),
13074	    MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13075
13076	if (probe->dofp_mod >= strtab->dofs_size) {
13077		dtrace_dof_error(dof, "corrupt probe module");
13078		return (NULL);
13079	}
13080
13081	(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13082	    MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13083
13084	if (probe->dofp_func >= strtab->dofs_size) {
13085		dtrace_dof_error(dof, "corrupt probe function");
13086		return (NULL);
13087	}
13088
13089	(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13090	    MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13091
13092	if (probe->dofp_name >= strtab->dofs_size) {
13093		dtrace_dof_error(dof, "corrupt probe name");
13094		return (NULL);
13095	}
13096
13097	(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13098	    MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13099
13100	return (desc);
13101}
13102
13103static dtrace_difo_t *
13104dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13105    cred_t *cr)
13106{
13107	dtrace_difo_t *dp;
13108	size_t ttl = 0;
13109	dof_difohdr_t *dofd;
13110	uintptr_t daddr = (uintptr_t)dof;
13111	size_t max = dtrace_difo_maxsize;
13112	int i, l, n;
13113
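	/*
	 * Each entry below describes one DIFO sub-section type:  where its
	 * buffer pointer and length field live within dtrace_difo_t, the
	 * expected entry size and alignment, and the error to report should
	 * the section appear more than once.
	 */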
13114	static const struct {
13115		int section;
13116		int bufoffs;
13117		int lenoffs;
13118		int entsize;
13119		int align;
13120		const char *msg;
13121	} difo[] = {
13122		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
13123		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
13124		sizeof (dif_instr_t), "multiple DIF sections" },
13125
13126		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
13127		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
13128		sizeof (uint64_t), "multiple integer tables" },
13129
13130		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
13131		offsetof(dtrace_difo_t, dtdo_strlen), 0,
13132		sizeof (char), "multiple string tables" },
13133
13134		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
13135		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
13136		sizeof (uint_t), "multiple variable tables" },
13137
13138		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
13139	};
13140
13141	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
13142		dtrace_dof_error(dof, "invalid DIFO header section");
13143		return (NULL);
13144	}
13145
13146	if (sec->dofs_align != sizeof (dof_secidx_t)) {
13147		dtrace_dof_error(dof, "bad alignment in DIFO header");
13148		return (NULL);
13149	}
13150
13151	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
13152	    sec->dofs_size % sizeof (dof_secidx_t)) {
13153		dtrace_dof_error(dof, "bad size in DIFO header");
13154		return (NULL);
13155	}
13156
13157	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13158	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
13159
13160	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
13161	dp->dtdo_rtype = dofd->dofd_rtype;
13162
13163	for (l = 0; l < n; l++) {
13164		dof_sec_t *subsec;
13165		void **bufp;
13166		uint32_t *lenp;
13167
13168		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
13169		    dofd->dofd_links[l])) == NULL)
13170			goto err; /* invalid section link */
13171
13172		if (ttl + subsec->dofs_size > max) {
13173			dtrace_dof_error(dof, "exceeds maximum size");
13174			goto err;
13175		}
13176
13177		ttl += subsec->dofs_size;
13178
13179		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
13180			if (subsec->dofs_type != difo[i].section)
13181				continue;
13182
13183			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
13184				dtrace_dof_error(dof, "section not loaded");
13185				goto err;
13186			}
13187
13188			if (subsec->dofs_align != difo[i].align) {
13189				dtrace_dof_error(dof, "bad alignment");
13190				goto err;
13191			}
13192
13193			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
13194			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
13195
13196			if (*bufp != NULL) {
13197				dtrace_dof_error(dof, difo[i].msg);
13198				goto err;
13199			}
13200
13201			if (difo[i].entsize != subsec->dofs_entsize) {
13202				dtrace_dof_error(dof, "entry size mismatch");
13203				goto err;
13204			}
13205
13206			if (subsec->dofs_entsize != 0 &&
13207			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
13208				dtrace_dof_error(dof, "corrupt entry size");
13209				goto err;
13210			}
13211
13212			*lenp = subsec->dofs_size;
13213			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
13214			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
13215			    *bufp, subsec->dofs_size);
13216
13217			if (subsec->dofs_entsize != 0)
13218				*lenp /= subsec->dofs_entsize;
13219
13220			break;
13221		}
13222
13223		/*
13224		 * If we encounter a loadable DIFO sub-section that is not
13225		 * known to us, assume this is a broken program and fail.
13226		 */
13227		if (difo[i].section == DOF_SECT_NONE &&
13228		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
13229			dtrace_dof_error(dof, "unrecognized DIFO subsection");
13230			goto err;
13231		}
13232	}
13233
13234	if (dp->dtdo_buf == NULL) {
13235		/*
13236		 * We can't have a DIF object without DIF text.
13237		 */
13238		dtrace_dof_error(dof, "missing DIF text");
13239		goto err;
13240	}
13241
13242	/*
13243	 * Before we validate the DIF object, run through the variable table
13244	 * looking for string variables -- if any has a size of zero, we'll set
13245	 * its size to the system-wide default string size.  Note that
13246	 * this should _not_ happen if the "strsize" option has been set --
13247	 * in this case, the compiler should have set the size to reflect the
13248	 * setting of the option.
13249	 */
13250	for (i = 0; i < dp->dtdo_varlen; i++) {
13251		dtrace_difv_t *v = &dp->dtdo_vartab[i];
13252		dtrace_diftype_t *t = &v->dtdv_type;
13253
13254		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
13255			continue;
13256
13257		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
13258			t->dtdt_size = dtrace_strsize_default;
13259	}
13260
13261	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
13262		goto err;
13263
13264	dtrace_difo_init(dp, vstate);
13265	return (dp);
13266
13267err:
13268	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
13269	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
13270	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
13271	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
13272
13273	kmem_free(dp, sizeof (dtrace_difo_t));
13274	return (NULL);
13275}
13276
13277static dtrace_predicate_t *
13278dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13279    cred_t *cr)
13280{
13281	dtrace_difo_t *dp;
13282
13283	if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13284		return (NULL);
13285
13286	return (dtrace_predicate_create(dp));
13287}
13288
13289static dtrace_actdesc_t *
13290dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13291    cred_t *cr)
13292{
13293	dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13294	dof_actdesc_t *desc;
13295	dof_sec_t *difosec;
13296	size_t offs;
13297	uintptr_t daddr = (uintptr_t)dof;
13298	uint64_t arg;
13299	dtrace_actkind_t kind;
13300
13301	if (sec->dofs_type != DOF_SECT_ACTDESC) {
13302		dtrace_dof_error(dof, "invalid action section");
13303		return (NULL);
13304	}
13305
13306	if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13307		dtrace_dof_error(dof, "truncated action description");
13308		return (NULL);
13309	}
13310
13311	if (sec->dofs_align != sizeof (uint64_t)) {
13312		dtrace_dof_error(dof, "bad alignment in action description");
13313		return (NULL);
13314	}
13315
13316	if (sec->dofs_size < sec->dofs_entsize) {
13317		dtrace_dof_error(dof, "section entry size exceeds total size");
13318		return (NULL);
13319	}
13320
13321	if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13322		dtrace_dof_error(dof, "bad entry size in action description");
13323		return (NULL);
13324	}
13325
13326	if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13327		dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13328		return (NULL);
13329	}
13330
13331	for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13332		desc = (dof_actdesc_t *)(daddr +
13333		    (uintptr_t)sec->dofs_offset + offs);
13334		kind = (dtrace_actkind_t)desc->dofa_kind;
13335
13336		if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13337		    (kind != DTRACEACT_PRINTA ||
13338		    desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13339		    (kind == DTRACEACT_DIFEXPR &&
13340		    desc->dofa_strtab != DOF_SECIDX_NONE)) {
13341			dof_sec_t *strtab;
13342			char *str, *fmt;
13343			uint64_t i;
13344
13345			/*
13346			 * The argument to these actions is an index into the
13347			 * DOF string table.  For printf()-like actions, this
13348			 * is the format string.  For print(), this is the
13349			 * CTF type of the expression result.
13350			 */
13351			if ((strtab = dtrace_dof_sect(dof,
13352			    DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13353				goto err;
13354
13355			str = (char *)((uintptr_t)dof +
13356			    (uintptr_t)strtab->dofs_offset);
13357
13358			for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13359				if (str[i] == '\0')
13360					break;
13361			}
13362
13363			if (i >= strtab->dofs_size) {
13364				dtrace_dof_error(dof, "bogus format string");
13365				goto err;
13366			}
13367
13368			if (i == desc->dofa_arg) {
13369				dtrace_dof_error(dof, "empty format string");
13370				goto err;
13371			}
13372
13373			i -= desc->dofa_arg;
13374			fmt = kmem_alloc(i + 1, KM_SLEEP);
13375			bcopy(&str[desc->dofa_arg], fmt, i + 1);
13376			arg = (uint64_t)(uintptr_t)fmt;
13377		} else {
13378			if (kind == DTRACEACT_PRINTA) {
13379				ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13380				arg = 0;
13381			} else {
13382				arg = desc->dofa_arg;
13383			}
13384		}
13385
13386		act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13387		    desc->dofa_uarg, arg);
13388
13389		if (last != NULL) {
13390			last->dtad_next = act;
13391		} else {
13392			first = act;
13393		}
13394
13395		last = act;
13396
13397		if (desc->dofa_difo == DOF_SECIDX_NONE)
13398			continue;
13399
13400		if ((difosec = dtrace_dof_sect(dof,
13401		    DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13402			goto err;
13403
13404		act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13405
13406		if (act->dtad_difo == NULL)
13407			goto err;
13408	}
13409
13410	ASSERT(first != NULL);
13411	return (first);
13412
13413err:
13414	for (act = first; act != NULL; act = next) {
13415		next = act->dtad_next;
13416		dtrace_actdesc_release(act, vstate);
13417	}
13418
13419	return (NULL);
13420}
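
/*
 * An illustrative walk through the format-string extraction above, with
 * hypothetical values: given a string table holding "fmt\0%d bytes\0" and
 * an action with dofa_arg = 4, the scan finds the NUL at offset 12, so
 * i becomes 12 - 4 = 8 and the bcopy() hands the action a private,
 * NUL-terminated copy of "%d bytes" as its format string.
 */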
13421
13422static dtrace_ecbdesc_t *
13423dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13424    cred_t *cr)
13425{
13426	dtrace_ecbdesc_t *ep;
13427	dof_ecbdesc_t *ecb;
13428	dtrace_probedesc_t *desc;
13429	dtrace_predicate_t *pred = NULL;
13430
13431	if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13432		dtrace_dof_error(dof, "truncated ECB description");
13433		return (NULL);
13434	}
13435
13436	if (sec->dofs_align != sizeof (uint64_t)) {
13437		dtrace_dof_error(dof, "bad alignment in ECB description");
13438		return (NULL);
13439	}
13440
13441	ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13442	sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13443
13444	if (sec == NULL)
13445		return (NULL);
13446
13447	ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13448	ep->dted_uarg = ecb->dofe_uarg;
13449	desc = &ep->dted_probe;
13450
13451	if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13452		goto err;
13453
13454	if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13455		if ((sec = dtrace_dof_sect(dof,
13456		    DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13457			goto err;
13458
13459		if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13460			goto err;
13461
13462		ep->dted_pred.dtpdd_predicate = pred;
13463	}
13464
13465	if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13466		if ((sec = dtrace_dof_sect(dof,
13467		    DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13468			goto err;
13469
13470		ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13471
13472		if (ep->dted_action == NULL)
13473			goto err;
13474	}
13475
13476	return (ep);
13477
13478err:
13479	if (pred != NULL)
13480		dtrace_predicate_release(pred, vstate);
13481	kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13482	return (NULL);
13483}
13484
13485/*
13486 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
13487 * specified DOF.  At present, this amounts to simply adding 'ubase' to the
13488 * site of any user SETX relocations to account for the load object's base.
13489 * In the future, if we need other relocations, this function can be extended.
13490 */
13491static int
13492dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
13493{
13494	uintptr_t daddr = (uintptr_t)dof;
13495	dof_relohdr_t *dofr =
13496	    (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13497	dof_sec_t *ss, *rs, *ts;
13498	dof_relodesc_t *r;
13499	uint_t i, n;
13500
13501	if (sec->dofs_size < sizeof (dof_relohdr_t) ||
13502	    sec->dofs_align != sizeof (dof_secidx_t)) {
13503		dtrace_dof_error(dof, "invalid relocation header");
13504		return (-1);
13505	}
13506
13507	ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
13508	rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
13509	ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
13510
13511	if (ss == NULL || rs == NULL || ts == NULL)
13512		return (-1); /* dtrace_dof_error() has been called already */
13513
13514	if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
13515	    rs->dofs_align != sizeof (uint64_t)) {
13516		dtrace_dof_error(dof, "invalid relocation section");
13517		return (-1);
13518	}
13519
13520	r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
13521	n = rs->dofs_size / rs->dofs_entsize;
13522
13523	for (i = 0; i < n; i++) {
13524		uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
13525
13526		switch (r->dofr_type) {
13527		case DOF_RELO_NONE:
13528			break;
13529		case DOF_RELO_SETX:
13530			if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
13531			    sizeof (uint64_t) > ts->dofs_size) {
13532				dtrace_dof_error(dof, "bad relocation offset");
13533				return (-1);
13534			}
13535
13536			if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
13537				dtrace_dof_error(dof, "misaligned setx relo");
13538				return (-1);
13539			}
13540
13541			*(uint64_t *)taddr += ubase;
13542			break;
13543		default:
13544			dtrace_dof_error(dof, "invalid relocation type");
13545			return (-1);
13546		}
13547
13548		r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
13549	}
13550
13551	return (0);
13552}
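
/*
 * To make the SETX processing above concrete (example values only): for a
 * relocation with dofr_offset = 0x10 against a target section at
 * dofs_offset = 0x200, and a load object based at ubase = 0x800000000, the
 * loop computes taddr = daddr + 0x200 + 0x10 and then performs, in effect:
 *
 *	*(uint64_t *)taddr += 0x800000000;
 *
 * turning the link-time constant at that site into a run-time address.
 */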
13553
13554/*
13555 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13556 * header:  it should be at the front of a memory region that is at least
13557 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13558 * size.  It need not be validated in any other way.
13559 */
13560static int
13561dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13562    dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13563{
13564	uint64_t len = dof->dofh_loadsz, seclen;
13565	uintptr_t daddr = (uintptr_t)dof;
13566	dtrace_ecbdesc_t *ep;
13567	dtrace_enabling_t *enab;
13568	uint_t i;
13569
13570	ASSERT(MUTEX_HELD(&dtrace_lock));
13571	ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13572
13573	/*
13574	 * Check the DOF header identification bytes.  In addition to checking
13575	 * valid settings, we also verify that unused bits/bytes are zeroed so
13576	 * we can use them later without fear of regressing existing binaries.
13577	 */
13578	if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13579	    DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13580		dtrace_dof_error(dof, "DOF magic string mismatch");
13581		return (-1);
13582	}
13583
13584	if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13585	    dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13586		dtrace_dof_error(dof, "DOF has invalid data model");
13587		return (-1);
13588	}
13589
13590	if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13591		dtrace_dof_error(dof, "DOF encoding mismatch");
13592		return (-1);
13593	}
13594
13595	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13596	    dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
13597		dtrace_dof_error(dof, "DOF version mismatch");
13598		return (-1);
13599	}
13600
13601	if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13602		dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13603		return (-1);
13604	}
13605
13606	if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13607		dtrace_dof_error(dof, "DOF uses too many integer registers");
13608		return (-1);
13609	}
13610
13611	if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13612		dtrace_dof_error(dof, "DOF uses too many tuple registers");
13613		return (-1);
13614	}
13615
13616	for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13617		if (dof->dofh_ident[i] != 0) {
13618			dtrace_dof_error(dof, "DOF has invalid ident byte set");
13619			return (-1);
13620		}
13621	}
13622
13623	if (dof->dofh_flags & ~DOF_FL_VALID) {
13624		dtrace_dof_error(dof, "DOF has invalid flag bits set");
13625		return (-1);
13626	}
13627
13628	if (dof->dofh_secsize == 0) {
13629		dtrace_dof_error(dof, "zero section header size");
13630		return (-1);
13631	}
13632
13633	/*
13634	 * Check that the section headers don't exceed the amount of DOF
13635	 * data.  Note that we cast the section size and number of sections
13636	 * to uint64_t's to prevent possible overflow in the multiplication.
13637	 */
13638	seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13639
13640	if (dof->dofh_secoff > len || seclen > len ||
13641	    dof->dofh_secoff + seclen > len) {
13642		dtrace_dof_error(dof, "truncated section headers");
13643		return (-1);
13644	}
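
	/*
	 * Worked example of the overflow concern (illustrative values):
	 * in 32-bit arithmetic, dofh_secnum = 0x04000000 and dofh_secsize =
	 * 0x40 would multiply to 0x100000000 and truncate to zero, sailing
	 * past the checks above.  The uint64_t casts preserve the full
	 * product, so such a header is rejected as truncated.
	 */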
13645
13646	if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13647		dtrace_dof_error(dof, "misaligned section headers");
13648		return (-1);
13649	}
13650
13651	if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13652		dtrace_dof_error(dof, "misaligned section size");
13653		return (-1);
13654	}
13655
13656	/*
13657	 * Take an initial pass through the section headers to be sure that
13658	 * the headers don't have stray offsets.  If the 'noprobes' flag is
13659	 * set, do not permit sections relating to providers, probes, or args.
13660	 */
13661	for (i = 0; i < dof->dofh_secnum; i++) {
13662		dof_sec_t *sec = (dof_sec_t *)(daddr +
13663		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13664
13665		if (noprobes) {
13666			switch (sec->dofs_type) {
13667			case DOF_SECT_PROVIDER:
13668			case DOF_SECT_PROBES:
13669			case DOF_SECT_PRARGS:
13670			case DOF_SECT_PROFFS:
13671				dtrace_dof_error(dof, "illegal sections "
13672				    "for enabling");
13673				return (-1);
13674			}
13675		}
13676
13677		if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
13678		    !(sec->dofs_flags & DOF_SECF_LOAD)) {
13679			dtrace_dof_error(dof, "loadable section with load "
13680			    "flag unset");
13681			return (-1);
13682		}
13683
13684		if (!(sec->dofs_flags & DOF_SECF_LOAD))
13685			continue; /* just ignore non-loadable sections */
13686
13687		if (sec->dofs_align & (sec->dofs_align - 1)) {
13688			dtrace_dof_error(dof, "bad section alignment");
13689			return (-1);
13690		}
13691
13692		if (sec->dofs_offset & (sec->dofs_align - 1)) {
13693			dtrace_dof_error(dof, "misaligned section");
13694			return (-1);
13695		}
13696
13697		if (sec->dofs_offset > len || sec->dofs_size > len ||
13698		    sec->dofs_offset + sec->dofs_size > len) {
13699			dtrace_dof_error(dof, "corrupt section header");
13700			return (-1);
13701		}
13702
13703		if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13704		    sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13705			dtrace_dof_error(dof, "non-terminating string table");
13706			return (-1);
13707		}
13708	}
13709
13710	/*
13711	 * Take a second pass through the sections and locate and perform any
13712	 * relocations that are present.  We do this after the first pass to
13713	 * be sure that all sections have had their headers validated.
13714	 */
13715	for (i = 0; i < dof->dofh_secnum; i++) {
13716		dof_sec_t *sec = (dof_sec_t *)(daddr +
13717		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13718
13719		if (!(sec->dofs_flags & DOF_SECF_LOAD))
13720			continue; /* skip sections that are not loadable */
13721
13722		switch (sec->dofs_type) {
13723		case DOF_SECT_URELHDR:
13724			if (dtrace_dof_relocate(dof, sec, ubase) != 0)
13725				return (-1);
13726			break;
13727		}
13728	}
13729
13730	if ((enab = *enabp) == NULL)
13731		enab = *enabp = dtrace_enabling_create(vstate);
13732
13733	for (i = 0; i < dof->dofh_secnum; i++) {
13734		dof_sec_t *sec = (dof_sec_t *)(daddr +
13735		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13736
13737		if (sec->dofs_type != DOF_SECT_ECBDESC)
13738			continue;
13739
13740		if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
13741			dtrace_enabling_destroy(enab);
13742			*enabp = NULL;
13743			return (-1);
13744		}
13745
13746		dtrace_enabling_add(enab, ep);
13747	}
13748
13749	return (0);
13750}
13751
13752/*
13753 * Process DOF for any options.  This routine assumes that the DOF has been
13754 * at least processed by dtrace_dof_slurp().
13755 */
13756static int
13757dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13758{
13759	int i, rval;
13760	uint32_t entsize;
13761	size_t offs;
13762	dof_optdesc_t *desc;
13763
13764	for (i = 0; i < dof->dofh_secnum; i++) {
13765		dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13766		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13767
13768		if (sec->dofs_type != DOF_SECT_OPTDESC)
13769			continue;
13770
13771		if (sec->dofs_align != sizeof (uint64_t)) {
13772			dtrace_dof_error(dof, "bad alignment in "
13773			    "option description");
13774			return (EINVAL);
13775		}
13776
13777		if ((entsize = sec->dofs_entsize) == 0) {
13778			dtrace_dof_error(dof, "zeroed option entry size");
13779			return (EINVAL);
13780		}
13781
13782		if (entsize < sizeof (dof_optdesc_t)) {
13783			dtrace_dof_error(dof, "bad option entry size");
13784			return (EINVAL);
13785		}
13786
13787		for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13788			desc = (dof_optdesc_t *)((uintptr_t)dof +
13789			    (uintptr_t)sec->dofs_offset + offs);
13790
13791			if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13792				dtrace_dof_error(dof, "non-zero option string");
13793				return (EINVAL);
13794			}
13795
13796			if (desc->dofo_value == DTRACEOPT_UNSET) {
13797				dtrace_dof_error(dof, "unset option");
13798				return (EINVAL);
13799			}
13800
13801			if ((rval = dtrace_state_option(state,
13802			    desc->dofo_option, desc->dofo_value)) != 0) {
13803				dtrace_dof_error(dof, "rejected option");
13804				return (rval);
13805			}
13806		}
13807	}
13808
13809	return (0);
13810}
13811
13812/*
13813 * DTrace Consumer State Functions
13814 */
13815static int
13816dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13817{
13818	size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
13819	void *base;
13820	uintptr_t limit;
13821	dtrace_dynvar_t *dvar, *next, *start;
13822	int i;
13823
13824	ASSERT(MUTEX_HELD(&dtrace_lock));
13825	ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13826
13827	bzero(dstate, sizeof (dtrace_dstate_t));
13828
13829	if ((dstate->dtds_chunksize = chunksize) == 0)
13830		dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13831
13832	if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13833		size = min;
13834
13835	if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
13836		return (ENOMEM);
13837
13838	dstate->dtds_size = size;
13839	dstate->dtds_base = base;
13840	dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13841	bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
13842
13843	hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13844
13845	if (hashsize != 1 && (hashsize & 1))
13846		hashsize--;
13847
13848	dstate->dtds_hashsize = hashsize;
13849	dstate->dtds_hash = dstate->dtds_base;
13850
13851	/*
13852	 * Set all of our hash buckets to point to the single sink, and (if
13853	 * it hasn't already been set) set the sink's hash value to be the
13854	 * sink sentinel value.  The sink is needed for dynamic variable
13855	 * lookups to know that they have iterated over an entire, valid hash
13856	 * chain.
13857	 */
13858	for (i = 0; i < hashsize; i++)
13859		dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13860
13861	if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13862		dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13863
13864	/*
13865	 * Determine number of active CPUs.  Divide free list evenly among
13866	 * active CPUs.
13867	 */
13868	start = (dtrace_dynvar_t *)
13869	    ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13870	limit = (uintptr_t)base + size;
13871
13872	maxper = (limit - (uintptr_t)start) / NCPU;
13873	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13874
13875#if !defined(sun)
13876	CPU_FOREACH(i) {
13877#else
13878	for (i = 0; i < NCPU; i++) {
13879#endif
13880		dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13881
13882		/*
13883		 * If we don't even have enough chunks to make it once through
13884		 * NCPUs, we're just going to allocate everything to the first
13885		 * CPU.  And if we're on the last CPU, we're going to allocate
13886		 * whatever is left over.  In either case, we set the limit to
13887		 * be the limit of the dynamic variable space.
13888		 */
13889		if (maxper == 0 || i == NCPU - 1) {
13890			limit = (uintptr_t)base + size;
13891			start = NULL;
13892		} else {
13893			limit = (uintptr_t)start + maxper;
13894			start = (dtrace_dynvar_t *)limit;
13895		}
13896
13897		ASSERT(limit <= (uintptr_t)base + size);
13898
13899		for (;;) {
13900			next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13901			    dstate->dtds_chunksize);
13902
13903			if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13904				break;
13905
13906			dvar->dtdv_next = next;
13907			dvar = next;
13908		}
13909
13910		if (maxper == 0)
13911			break;
13912	}
13913
13914	return (0);
13915}
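
/*
 * A sketch of the layout that dtrace_dstate_init() carves out of its
 * allocation (sizes hypothetical):
 *
 *	base                                               base + size
 *	+-------------------+--------------+--------------+-----+
 *	|    dtds_hash[]    |  CPU 0 free  |  CPU 1 free  | ... |
 *	|  hashsize buckets |  list chunks |  list chunks |     |
 *	+-------------------+--------------+--------------+-----+
 *
 * Each per-CPU region is a chain of dtds_chunksize chunks; the last CPU
 * (or the first, if maxper is zero) absorbs whatever space remains.
 */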
13916
13917static void
13918dtrace_dstate_fini(dtrace_dstate_t *dstate)
13919{
13920	ASSERT(MUTEX_HELD(&cpu_lock));
13921
13922	if (dstate->dtds_base == NULL)
13923		return;
13924
13925	kmem_free(dstate->dtds_base, dstate->dtds_size);
13926	kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13927}
13928
13929static void
13930dtrace_vstate_fini(dtrace_vstate_t *vstate)
13931{
13932	/*
13933	 * Logical XOR, where are you?
13934	 */
13935	ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13936
13937	if (vstate->dtvs_nglobals > 0) {
13938		kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13939		    sizeof (dtrace_statvar_t *));
13940	}
13941
13942	if (vstate->dtvs_ntlocals > 0) {
13943		kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13944		    sizeof (dtrace_difv_t));
13945	}
13946
13947	ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13948
13949	if (vstate->dtvs_nlocals > 0) {
13950		kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13951		    sizeof (dtrace_statvar_t *));
13952	}
13953}
13954
13955#if defined(sun)
13956static void
13957dtrace_state_clean(dtrace_state_t *state)
13958{
13959	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13960		return;
13961
13962	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13963	dtrace_speculation_clean(state);
13964}
13965
13966static void
13967dtrace_state_deadman(dtrace_state_t *state)
13968{
13969	hrtime_t now;
13970
13971	dtrace_sync();
13972
13973	now = dtrace_gethrtime();
13974
13975	if (state != dtrace_anon.dta_state &&
13976	    now - state->dts_laststatus >= dtrace_deadman_user)
13977		return;
13978
13979	/*
13980	 * We must be sure that dts_alive never appears to be less than the
13981	 * value upon entry to dtrace_state_deadman(), and because we lack a
13982	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
13983	 * store INT64_MAX to it, followed by a memory barrier, followed by
13984	 * the new value.  This assures that dts_alive never appears to be
13985	 * less than its true value, regardless of the order in which the
13986	 * stores to the underlying storage are issued.
13987	 */
13988	state->dts_alive = INT64_MAX;
13989	dtrace_membar_producer();
13990	state->dts_alive = now;
13991}
13992#else
13993static void
13994dtrace_state_clean(void *arg)
13995{
13996	dtrace_state_t *state = arg;
13997	dtrace_optval_t *opt = state->dts_options;
13998
13999	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14000		return;
14001
14002	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14003	dtrace_speculation_clean(state);
14004
14005	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14006	    dtrace_state_clean, state);
14007}
14008
14009static void
14010dtrace_state_deadman(void *arg)
14011{
14012	dtrace_state_t *state = arg;
14013	hrtime_t now;
14014
14015	dtrace_sync();
14016
14017	dtrace_debug_output();
14018
14019	now = dtrace_gethrtime();
14020
14021	if (state != dtrace_anon.dta_state &&
14022	    now - state->dts_laststatus >= dtrace_deadman_user)
14023		return;
14024
14025	/*
14026	 * We must be sure that dts_alive never appears to be less than the
14027	 * value upon entry to dtrace_state_deadman(), and because we lack a
14028	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
14029	 * store INT64_MAX to it, followed by a memory barrier, followed by
14030	 * the new value.  This assures that dts_alive never appears to be
14031	 * less than its true value, regardless of the order in which the
14032	 * stores to the underlying storage are issued.
14033	 */
14034	state->dts_alive = INT64_MAX;
14035	dtrace_membar_producer();
14036	state->dts_alive = now;
14037
14038	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14039	    dtrace_state_deadman, state);
14040}
14041#endif
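
/*
 * A sketch of the consumer side of the dts_alive protocol above (assumed
 * reader logic, shown for illustration only): a probe-context deadman
 * check of the form
 *
 *	if (now - state->dts_alive > dtrace_deadman_timeout)
 *		(kill the state)
 *
 * can never misfire on a half-updated value: dts_alive reads as either
 * its old value, INT64_MAX (making the difference negative), or the new
 * timestamp -- never something smaller than the truth.
 */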
14042
14043static dtrace_state_t *
14044#if defined(sun)
14045dtrace_state_create(dev_t *devp, cred_t *cr)
14046#else
14047dtrace_state_create(struct cdev *dev)
14048#endif
14049{
14050#if defined(sun)
14051	minor_t minor;
14052	major_t major;
14053#else
14054	cred_t *cr = NULL;
14055	int m = 0;
14056#endif
14057	char c[30];
14058	dtrace_state_t *state;
14059	dtrace_optval_t *opt;
14060	int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
14061
14062	ASSERT(MUTEX_HELD(&dtrace_lock));
14063	ASSERT(MUTEX_HELD(&cpu_lock));
14064
14065#if defined(sun)
14066	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
14067	    VM_BESTFIT | VM_SLEEP);
14068
14069	if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
14070		vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14071		return (NULL);
14072	}
14073
14074	state = ddi_get_soft_state(dtrace_softstate, minor);
14075#else
14076	if (dev != NULL) {
14077		cr = dev->si_cred;
14078		m = dev2unit(dev);
14079	}
14080
14081	/* Allocate memory for the state. */
14082	state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
14083#endif
14084
14085	state->dts_epid = DTRACE_EPIDNONE + 1;
14086
14087	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
14088#if defined(sun)
14089	state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
14090	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14091
14092	if (devp != NULL) {
14093		major = getemajor(*devp);
14094	} else {
14095		major = ddi_driver_major(dtrace_devi);
14096	}
14097
14098	state->dts_dev = makedevice(major, minor);
14099
14100	if (devp != NULL)
14101		*devp = state->dts_dev;
14102#else
14103	state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
14104	state->dts_dev = dev;
14105#endif
14106
14107	/*
14108	 * We allocate NCPU buffers.  On the one hand, this can be quite
14109	 * a bit of memory per instance (nearly 36K on a Starcat).  On the
14110	 * other hand, it saves an additional memory reference in the probe
14111	 * path.
14112	 */
14113	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14114	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14115
14116#if defined(sun)
14117	state->dts_cleaner = CYCLIC_NONE;
14118	state->dts_deadman = CYCLIC_NONE;
14119#else
14120	callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
14121	callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
14122#endif
14123	state->dts_vstate.dtvs_state = state;
14124
14125	for (i = 0; i < DTRACEOPT_MAX; i++)
14126		state->dts_options[i] = DTRACEOPT_UNSET;
14127
14128	/*
14129	 * Set the default options.
14130	 */
14131	opt = state->dts_options;
14132	opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14133	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14134	opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14135	opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14136	opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14137	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14138	opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14139	opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14140	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14141	opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14142	opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14143	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14144	opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14145	opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14146
14147	state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
14148
14149	/*
14150	 * Depending on the user credentials, we set flag bits which alter probe
14151	 * visibility or the amount of destructiveness allowed.  In the case of
14152	 * actual anonymous tracing, or the possession of all privileges, all of
14153	 * the normal checks are bypassed.
14154	 */
14155	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14156		state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14157		state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14158	} else {
14159		/*
14160		 * Set up the credentials for this instantiation.  We take a
14161		 * hold on the credential to prevent it from disappearing on
14162		 * us; this in turn prevents the zone_t referenced by this
14163		 * credential from disappearing.  This means that we can
14164		 * examine the credential and the zone from probe context.
14165		 */
14166		crhold(cr);
14167		state->dts_cred.dcr_cred = cr;
14168
14169		/*
14170		 * CRA_PROC means "we have *some* privilege for dtrace" and
14171		 * unlocks the use of variables like pid, zonename, etc.
14172		 */
14173		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14174		    PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14175			state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14176		}
14177
14178		/*
14179		 * dtrace_user allows use of syscall and profile providers.
14180		 * If the user also has proc_owner and/or proc_zone, we
14181		 * extend the scope to include additional visibility and
14182		 * destructive power.
14183		 */
14184		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14185			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14186				state->dts_cred.dcr_visible |=
14187				    DTRACE_CRV_ALLPROC;
14188
14189				state->dts_cred.dcr_action |=
14190				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14191			}
14192
14193			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14194				state->dts_cred.dcr_visible |=
14195				    DTRACE_CRV_ALLZONE;
14196
14197				state->dts_cred.dcr_action |=
14198				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14199			}
14200
14201			/*
14202			 * If we have all privs in whatever zone this is,
14203			 * we can do destructive things to processes which
14204			 * have altered credentials.
14205			 */
14206#if defined(sun)
14207			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14208			    cr->cr_zone->zone_privset)) {
14209				state->dts_cred.dcr_action |=
14210				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14211			}
14212#endif
14213		}
14214
14215		/*
14216		 * Holding the dtrace_kernel privilege also implies that
14217		 * the user has the dtrace_user privilege from a visibility
14218		 * perspective.  But without further privileges, some
14219		 * destructive actions are not available.
14220		 */
14221		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14222			/*
14223			 * Make all probes in all zones visible.  However,
14224			 * this doesn't mean that all actions become available
14225			 * to all zones.
14226			 */
14227			state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14228			    DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14229
14230			state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14231			    DTRACE_CRA_PROC;
14232			/*
14233			 * Holding proc_owner means that destructive actions
14234			 * for *this* zone are allowed.
14235			 */
14236			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14237				state->dts_cred.dcr_action |=
14238				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14239
14240			/*
14241			 * Holding proc_zone means that destructive actions
14242			 * for this user/group ID in all zones are allowed.
14243			 */
14244			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14245				state->dts_cred.dcr_action |=
14246				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14247
14248#if defined(sun)
14249			/*
14250			 * If we have all privs in whatever zone this is,
14251			 * we can do destructive things to processes which
14252			 * have altered credentials.
14253			 */
14254			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14255			    cr->cr_zone->zone_privset)) {
14256				state->dts_cred.dcr_action |=
14257				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14258			}
14259#endif
14260		}
14261
14262		/*
14263		 * Holding the dtrace_proc privilege gives control over fasttrap
14264		 * and pid providers.  We need to grant wider destructive
14265		 * privileges in the event that the user has proc_owner and/or
14266		 * proc_zone.
14267		 */
14268		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14269			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14270				state->dts_cred.dcr_action |=
14271				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14272
14273			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14274				state->dts_cred.dcr_action |=
14275				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14276		}
14277	}
14278
14279	return (state);
14280}
14281
14282static int
14283dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14284{
14285	dtrace_optval_t *opt = state->dts_options, size;
14286	processorid_t cpu = 0;
14287	int flags = 0, rval, factor, divisor = 1;
14288
14289	ASSERT(MUTEX_HELD(&dtrace_lock));
14290	ASSERT(MUTEX_HELD(&cpu_lock));
14291	ASSERT(which < DTRACEOPT_MAX);
14292	ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14293	    (state == dtrace_anon.dta_state &&
14294	    state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14295
14296	if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14297		return (0);
14298
14299	if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14300		cpu = opt[DTRACEOPT_CPU];
14301
14302	if (which == DTRACEOPT_SPECSIZE)
14303		flags |= DTRACEBUF_NOSWITCH;
14304
14305	if (which == DTRACEOPT_BUFSIZE) {
14306		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14307			flags |= DTRACEBUF_RING;
14308
14309		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14310			flags |= DTRACEBUF_FILL;
14311
14312		if (state != dtrace_anon.dta_state ||
14313		    state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14314			flags |= DTRACEBUF_INACTIVE;
14315	}
14316
14317	for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
14318		/*
14319		 * The size must be 8-byte aligned.  If the size is not 8-byte
14320		 * aligned, drop it down by the difference.
14321		 */
14322		if (size & (sizeof (uint64_t) - 1))
14323			size -= size & (sizeof (uint64_t) - 1);
14324
14325		if (size < state->dts_reserve) {
14326			/*
14327			 * Buffers must always be large enough to accommodate
14328			 * their prereserved space.  We return E2BIG instead
14329			 * of ENOMEM in this case to allow for user-level
14330			 * software to differentiate the cases.
14331			 */
14332			return (E2BIG);
14333		}
14334
14335		rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
14336
14337		if (rval != ENOMEM) {
14338			opt[which] = size;
14339			return (rval);
14340		}
14341
14342		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14343			return (rval);
14344
14345		for (divisor = 2; divisor < factor; divisor <<= 1)
14346			continue;
14347	}
14348
14349	return (ENOMEM);
14350}
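
/*
 * Worked example of the auto-resize loop above (sizes hypothetical): if a
 * 4m request fails and dtrace_buffer_alloc() reports factor = 8, the
 * divisor is raised to 8 and subsequent passes try 512k, then 64k, and so
 * on -- until an allocation succeeds (the adjusted size is written back
 * to opt[which]) or the size drops below sizeof (uint64_t), at which
 * point ENOMEM is returned.
 */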
14351
14352static int
14353dtrace_state_buffers(dtrace_state_t *state)
14354{
14355	dtrace_speculation_t *spec = state->dts_speculations;
14356	int rval, i;
14357
14358	if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14359	    DTRACEOPT_BUFSIZE)) != 0)
14360		return (rval);
14361
14362	if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14363	    DTRACEOPT_AGGSIZE)) != 0)
14364		return (rval);
14365
14366	for (i = 0; i < state->dts_nspeculations; i++) {
14367		if ((rval = dtrace_state_buffer(state,
14368		    spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14369			return (rval);
14370	}
14371
14372	return (0);
14373}
14374
14375static void
14376dtrace_state_prereserve(dtrace_state_t *state)
14377{
14378	dtrace_ecb_t *ecb;
14379	dtrace_probe_t *probe;
14380
14381	state->dts_reserve = 0;
14382
14383	if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14384		return;
14385
14386	/*
14387	 * If our buffer policy is a "fill" buffer policy, we need to set the
14388	 * prereserved space to be the space required by the END probes.
14389	 */
14390	probe = dtrace_probes[dtrace_probeid_end - 1];
14391	ASSERT(probe != NULL);
14392
14393	for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14394		if (ecb->dte_state != state)
14395			continue;
14396
14397		state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14398	}
14399}
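
/*
 * Example with hypothetical ECB sizes: under a "fill" policy, two END
 * ECBs for this state needing 64 and 32 bytes at 8-byte alignment yield
 * dts_reserve = (64 + 8) + (32 + 8) = 112; any buffer smaller than this
 * reserve is later refused with E2BIG by dtrace_state_buffer().
 */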
14400
14401static int
14402dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14403{
14404	dtrace_optval_t *opt = state->dts_options, sz, nspec;
14405	dtrace_speculation_t *spec;
14406	dtrace_buffer_t *buf;
14407#if defined(sun)
14408	cyc_handler_t hdlr;
14409	cyc_time_t when;
14410#endif
14411	int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
14412	dtrace_icookie_t cookie;
14413
14414	mutex_enter(&cpu_lock);
14415	mutex_enter(&dtrace_lock);
14416
14417	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14418		rval = EBUSY;
14419		goto out;
14420	}
14421
14422	/*
14423	 * Before we can perform any checks, we must prime all of the
14424	 * retained enablings that correspond to this state.
14425	 */
14426	dtrace_enabling_prime(state);
14427
14428	if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14429		rval = EACCES;
14430		goto out;
14431	}
14432
14433	dtrace_state_prereserve(state);
14434
14435	/*
14436	 * What we want to do now is try to allocate our speculations.
14437	 * We do not automatically resize the number of speculations; if
14438	 * this fails, we will fail the operation.
14439	 */
14440	nspec = opt[DTRACEOPT_NSPEC];
14441	ASSERT(nspec != DTRACEOPT_UNSET);
14442
14443	if (nspec > INT_MAX) {
14444		rval = ENOMEM;
14445		goto out;
14446	}
14447
14448	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
14449	    KM_NOSLEEP | KM_NORMALPRI);
14450
14451	if (spec == NULL) {
14452		rval = ENOMEM;
14453		goto out;
14454	}
14455
14456	state->dts_speculations = spec;
14457	state->dts_nspeculations = (int)nspec;
14458
14459	for (i = 0; i < nspec; i++) {
14460		if ((buf = kmem_zalloc(bufsize,
14461		    KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
14462			rval = ENOMEM;
14463			goto err;
14464		}
14465
14466		spec[i].dtsp_buffer = buf;
14467	}
14468
14469	if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14470		if (dtrace_anon.dta_state == NULL) {
14471			rval = ENOENT;
14472			goto out;
14473		}
14474
14475		if (state->dts_necbs != 0) {
14476			rval = EALREADY;
14477			goto out;
14478		}
14479
14480		state->dts_anon = dtrace_anon_grab();
14481		ASSERT(state->dts_anon != NULL);
14482		state = state->dts_anon;
14483
14484		/*
14485		 * We want "grabanon" to be set in the grabbed state, so we'll
14486		 * copy that option value from the grabbing state into the
14487		 * grabbed state.
14488		 */
14489		state->dts_options[DTRACEOPT_GRABANON] =
14490		    opt[DTRACEOPT_GRABANON];
14491
14492		*cpu = dtrace_anon.dta_beganon;
14493
14494		/*
14495		 * If the anonymous state is active (as it almost certainly
14496		 * is if the anonymous enabling ultimately matched anything),
14497		 * we don't allow any further option processing -- but we
14498		 * don't return failure.
14499		 */
14500		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14501			goto out;
14502	}
14503
14504	if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14505	    opt[DTRACEOPT_AGGSIZE] != 0) {
14506		if (state->dts_aggregations == NULL) {
14507			/*
14508			 * We're not going to create an aggregation buffer
14509			 * because we don't have any ECBs that contain
14510			 * aggregations -- set this option to 0.
14511			 */
14512			opt[DTRACEOPT_AGGSIZE] = 0;
14513		} else {
14514			/*
14515			 * If we have an aggregation buffer, we must also have
14516			 * a buffer to use as scratch.
14517			 */
14518			if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14519			    opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14520				opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14521			}
14522		}
14523	}
14524
14525	if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14526	    opt[DTRACEOPT_SPECSIZE] != 0) {
14527		if (!state->dts_speculates) {
14528			/*
14529			 * We're not going to create speculation buffers
14530			 * because we don't have any ECBs that actually
14531			 * speculate -- set the speculation size to 0.
14532			 */
14533			opt[DTRACEOPT_SPECSIZE] = 0;
14534		}
14535	}
14536
14537	/*
14538	 * The bare minimum size for any buffer that we're actually going to
14539	 * do anything to is sizeof (uint64_t).
14540	 */
14541	sz = sizeof (uint64_t);
14542
14543	if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
14544	    (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
14545	    (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14546		/*
14547		 * A buffer size has been explicitly set to 0 (or to a size
14548		 * that will be adjusted to 0) and we need the space -- we
14549		 * need to return failure.  We return ENOSPC to differentiate
14550		 * it from failing to allocate a buffer due to failure to meet
14551		 * the reserve (for which we return E2BIG).
14552		 */
14553		rval = ENOSPC;
14554		goto out;
14555	}
14556
14557	if ((rval = dtrace_state_buffers(state)) != 0)
14558		goto err;
14559
14560	if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14561		sz = dtrace_dstate_defsize;
14562
14563	do {
14564		rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
14565
14566		if (rval == 0)
14567			break;
14568
14569		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14570			goto err;
14571	} while (sz >>= 1);
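	/*
	 * For example (assuming the stock 1m dtrace_dstate_defsize): if
	 * the initial 1m dtrace_dstate_init() allocation fails, the loop
	 * above retries at 512k, 256k, and so on, stopping at the first
	 * success or when sz shifts down to zero.
	 */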
14572
14573	opt[DTRACEOPT_DYNVARSIZE] = sz;
14574
14575	if (rval != 0)
14576		goto err;
14577
14578	if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14579		opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14580
14581	if (opt[DTRACEOPT_CLEANRATE] == 0)
14582		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14583
14584	if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14585		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14586
14587	if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14588		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14589
14590	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14591#if defined(sun)
14592	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14593	hdlr.cyh_arg = state;
14594	hdlr.cyh_level = CY_LOW_LEVEL;
14595
14596	when.cyt_when = 0;
14597	when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14598
14599	state->dts_cleaner = cyclic_add(&hdlr, &when);
14600
14601	hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14602	hdlr.cyh_arg = state;
14603	hdlr.cyh_level = CY_LOW_LEVEL;
14604
14605	when.cyt_when = 0;
14606	when.cyt_interval = dtrace_deadman_interval;
14607
14608	state->dts_deadman = cyclic_add(&hdlr, &when);
14609#else
14610	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14611	    dtrace_state_clean, state);
14612	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14613	    dtrace_state_deadman, state);
14614#endif
14615
14616	state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14617
14618#if defined(sun)
14619	if (state->dts_getf != 0 &&
14620	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
14621		/*
14622		 * We don't have kernel privs but we have at least one call
14623		 * to getf(); we need to bump our zone's count, and (if
14624		 * this is the first enabling to have an unprivileged call
14625		 * to getf()) we need to hook into closef().
14626		 */
14627		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
14628
14629		if (dtrace_getf++ == 0) {
14630			ASSERT(dtrace_closef == NULL);
14631			dtrace_closef = dtrace_getf_barrier;
14632		}
14633	}
14634#endif
14635
14636	/*
14637	 * Now it's time to actually fire the BEGIN probe.  We need to disable
14638	 * interrupts here both to record the CPU on which we fired the BEGIN
14639	 * probe (the data from this CPU will be processed first at user
14640	 * level) and to manually activate the buffer for this CPU.
14641	 */
14642	cookie = dtrace_interrupt_disable();
14643	*cpu = curcpu;
14644	ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14645	state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14646
14647	dtrace_probe(dtrace_probeid_begin,
14648	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14649	dtrace_interrupt_enable(cookie);
14650	/*
14651	 * We may have had an exit action from a BEGIN probe; only change our
14652	 * state to ACTIVE if we're still in WARMUP.
14653	 */
14654	ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
14655	    state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14656
14657	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14658		state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14659
14660	/*
14661	 * Regardless of whether we're now in ACTIVE or DRAINING, we
14662	 * want each CPU to transition its principal buffer out of the
14663	 * INACTIVE state.  Doing this assures that no CPU will suddenly begin
14664	 * processing an ECB halfway down a probe's ECB chain; all CPUs will
14665	 * atomically transition from processing none of a state's ECBs to
14666	 * processing all of them.
14667	 */
14668	dtrace_xcall(DTRACE_CPUALL,
14669	    (dtrace_xcall_t)dtrace_buffer_activate, state);
14670	goto out;
14671
14672err:
14673	dtrace_buffer_free(state->dts_buffer);
14674	dtrace_buffer_free(state->dts_aggbuffer);
14675
14676	if ((nspec = state->dts_nspeculations) == 0) {
14677		ASSERT(state->dts_speculations == NULL);
14678		goto out;
14679	}
14680
14681	spec = state->dts_speculations;
14682	ASSERT(spec != NULL);
14683
14684	for (i = 0; i < state->dts_nspeculations; i++) {
14685		if ((buf = spec[i].dtsp_buffer) == NULL)
14686			break;
14687
14688		dtrace_buffer_free(buf);
14689		kmem_free(buf, bufsize);
14690	}
14691
14692	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14693	state->dts_nspeculations = 0;
14694	state->dts_speculations = NULL;
14695
14696out:
14697	mutex_exit(&dtrace_lock);
14698	mutex_exit(&cpu_lock);
14699
14700	return (rval);
14701}
14702
14703static int
14704dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14705{
14706	dtrace_icookie_t cookie;
14707
14708	ASSERT(MUTEX_HELD(&dtrace_lock));
14709
14710	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14711	    state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14712		return (EINVAL);
14713
14714	/*
14715	 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14716	 * to be sure that every CPU has seen it.  See below for the details
14717	 * on why this is done.
14718	 */
14719	state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14720	dtrace_sync();
14721
14722	/*
14723	 * By this point, it is impossible for any CPU to be still processing
14724	 * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
14725	 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14726	 * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
14727	 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14728	 * iff we're in the END probe.
14729	 */
14730	state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14731	dtrace_sync();
14732	ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14733
14734	/*
14735	 * Finally, we can release the reserve and call the END probe.  We
14736	 * disable interrupts across calling the END probe to allow us to
14737	 * return the CPU on which we actually called the END probe.  This
14738	 * allows user-land to be sure that this CPU's principal buffer is
14739	 * processed last.
14740	 */
14741	state->dts_reserve = 0;
14742
14743	cookie = dtrace_interrupt_disable();
14744	*cpu = curcpu;
14745	dtrace_probe(dtrace_probeid_end,
14746	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14747	dtrace_interrupt_enable(cookie);
14748
14749	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14750	dtrace_sync();
14751
14752#if defined(sun)
14753	if (state->dts_getf != 0 &&
14754	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
14755		/*
14756		 * We don't have kernel privs but we have at least one call
14757		 * to getf(); we need to lower our zone's count, and (if
14758		 * this is the last enabling to have an unprivileged call
14759		 * to getf()) we need to clear the closef() hook.
14760		 */
14761		ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
14762		ASSERT(dtrace_closef == dtrace_getf_barrier);
14763		ASSERT(dtrace_getf > 0);
14764
14765		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
14766
14767		if (--dtrace_getf == 0)
14768			dtrace_closef = NULL;
14769	}
14770#endif
14771
14772	return (0);
14773}
14774
14775static int
14776dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14777    dtrace_optval_t val)
14778{
14779	ASSERT(MUTEX_HELD(&dtrace_lock));
14780
14781	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14782		return (EBUSY);
14783
14784	if (option >= DTRACEOPT_MAX)
14785		return (EINVAL);
14786
14787	if (option != DTRACEOPT_CPU && val < 0)
14788		return (EINVAL);
14789
14790	switch (option) {
14791	case DTRACEOPT_DESTRUCTIVE:
14792		if (dtrace_destructive_disallow)
14793			return (EACCES);
14794
14795		state->dts_cred.dcr_destructive = 1;
14796		break;
14797
14798	case DTRACEOPT_BUFSIZE:
14799	case DTRACEOPT_DYNVARSIZE:
14800	case DTRACEOPT_AGGSIZE:
14801	case DTRACEOPT_SPECSIZE:
14802	case DTRACEOPT_STRSIZE:
14803		if (val < 0)
14804			return (EINVAL);
14805
14806		if (val >= LONG_MAX) {
14807			/*
14808			 * If this is an otherwise negative value, set it to
14809			 * the highest multiple of 128m less than LONG_MAX.
14810			 * Technically, we're adjusting the size without
14811			 * regard to the buffer resizing policy, but in fact,
14812			 * this has no effect -- if we set the buffer size to
14813			 * ~LONG_MAX and the buffer policy is ultimately set to
14814			 * be "manual", the buffer allocation is guaranteed to
14815			 * fail, if only because the allocation requires two
14816			 * buffers.  (We set the size to the highest
14817			 * multiple of 128m because it ensures that the size
14818			 * will remain a multiple of a megabyte when
14819			 * repeatedly halved -- all the way down to 15m.)
14820			 */
14821			val = LONG_MAX - (1 << 27) + 1;
14822		}
14823	}
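
	/*
	 * Arithmetic behind the clamp above: LONG_MAX - (1 << 27) + 1 is
	 * 2^63 - 2^27 = 2^27 * (2^36 - 1), the highest multiple of 128m
	 * below LONG_MAX; halving only lowers the power-of-two factor, so
	 * the value stays megabyte-aligned through seven halvings.
	 */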
14824
14825	state->dts_options[option] = val;
14826
14827	return (0);
14828}
14829
14830static void
14831dtrace_state_destroy(dtrace_state_t *state)
14832{
14833	dtrace_ecb_t *ecb;
14834	dtrace_vstate_t *vstate = &state->dts_vstate;
14835#if defined(sun)
14836	minor_t minor = getminor(state->dts_dev);
14837#endif
14838	int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
14839	dtrace_speculation_t *spec = state->dts_speculations;
14840	int nspec = state->dts_nspeculations;
14841	uint32_t match;
14842
14843	ASSERT(MUTEX_HELD(&dtrace_lock));
14844	ASSERT(MUTEX_HELD(&cpu_lock));
14845
14846	/*
14847	 * First, retract any retained enablings for this state.
14848	 */
14849	dtrace_enabling_retract(state);
14850	ASSERT(state->dts_nretained == 0);
14851
14852	if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14853	    state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14854		/*
14855		 * We have managed to come into dtrace_state_destroy() on a
14856		 * hot enabling -- almost certainly because of a disorderly
14857		 * shutdown of a consumer.  (That is, a consumer that is
14858		 * exiting without having called dtrace_stop().) In this case,
14859		 * we're going to set our activity to be KILLED, and then
14860		 * issue a sync to be sure that everyone is out of probe
14861		 * context before we start blowing away ECBs.
14862		 */
14863		state->dts_activity = DTRACE_ACTIVITY_KILLED;
14864		dtrace_sync();
14865	}
14866
14867	/*
14868	 * Release the credential hold we took in dtrace_state_create().
14869	 */
14870	if (state->dts_cred.dcr_cred != NULL)
14871		crfree(state->dts_cred.dcr_cred);
14872
14873	/*
14874	 * Now we can safely disable and destroy any enabled probes.  Because
14875	 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14876	 * (especially if they're all enabled), we take two passes through the
14877	 * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14878	 * in the second we disable whatever is left over.
14879	 */
14880	for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14881		for (i = 0; i < state->dts_necbs; i++) {
14882			if ((ecb = state->dts_ecbs[i]) == NULL)
14883				continue;
14884
14885			if (match && ecb->dte_probe != NULL) {
14886				dtrace_probe_t *probe = ecb->dte_probe;
14887				dtrace_provider_t *prov = probe->dtpr_provider;
14888
14889				if (!(prov->dtpv_priv.dtpp_flags & match))
14890					continue;
14891			}
14892
14893			dtrace_ecb_disable(ecb);
14894			dtrace_ecb_destroy(ecb);
14895		}
14896
14897		if (!match)
14898			break;
14899	}
14900
14901	/*
14902	 * Before we free the buffers, perform one more sync to assure that
14903	 * every CPU is out of probe context.
14904	 */
14905	dtrace_sync();
14906
14907	dtrace_buffer_free(state->dts_buffer);
14908	dtrace_buffer_free(state->dts_aggbuffer);
14909
14910	for (i = 0; i < nspec; i++)
14911		dtrace_buffer_free(spec[i].dtsp_buffer);
14912
14913#if defined(sun)
14914	if (state->dts_cleaner != CYCLIC_NONE)
14915		cyclic_remove(state->dts_cleaner);
14916
14917	if (state->dts_deadman != CYCLIC_NONE)
14918		cyclic_remove(state->dts_deadman);
14919#else
14920	callout_stop(&state->dts_cleaner);
14921	callout_drain(&state->dts_cleaner);
14922	callout_stop(&state->dts_deadman);
14923	callout_drain(&state->dts_deadman);
14924#endif
14925
14926	dtrace_dstate_fini(&vstate->dtvs_dynvars);
14927	dtrace_vstate_fini(vstate);
14928	if (state->dts_ecbs != NULL)
14929		kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14930
14931	if (state->dts_aggregations != NULL) {
14932#ifdef DEBUG
14933		for (i = 0; i < state->dts_naggregations; i++)
14934			ASSERT(state->dts_aggregations[i] == NULL);
14935#endif
14936		ASSERT(state->dts_naggregations > 0);
14937		kmem_free(state->dts_aggregations,
14938		    state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14939	}
14940
14941	kmem_free(state->dts_buffer, bufsize);
14942	kmem_free(state->dts_aggbuffer, bufsize);
14943
14944	for (i = 0; i < nspec; i++)
14945		kmem_free(spec[i].dtsp_buffer, bufsize);
14946
14947	if (spec != NULL)
14948		kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14949
14950	dtrace_format_destroy(state);
14951
14952	if (state->dts_aggid_arena != NULL) {
14953#if defined(sun)
14954		vmem_destroy(state->dts_aggid_arena);
14955#else
14956		delete_unrhdr(state->dts_aggid_arena);
14957#endif
14958		state->dts_aggid_arena = NULL;
14959	}
14960#if defined(sun)
14961	ddi_soft_state_free(dtrace_softstate, minor);
14962	vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14963#endif
14964}
14965
14966/*
14967 * DTrace Anonymous Enabling Functions
14968 */
14969static dtrace_state_t *
14970dtrace_anon_grab(void)
14971{
14972	dtrace_state_t *state;
14973
14974	ASSERT(MUTEX_HELD(&dtrace_lock));
14975
14976	if ((state = dtrace_anon.dta_state) == NULL) {
14977		ASSERT(dtrace_anon.dta_enabling == NULL);
14978		return (NULL);
14979	}
14980
14981	ASSERT(dtrace_anon.dta_enabling != NULL);
14982	ASSERT(dtrace_retained != NULL);
14983
14984	dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14985	dtrace_anon.dta_enabling = NULL;
14986	dtrace_anon.dta_state = NULL;
14987
14988	return (state);
14989}
14990
14991static void
14992dtrace_anon_property(void)
14993{
14994	int i, rv;
14995	dtrace_state_t *state;
14996	dof_hdr_t *dof;
14997	char c[32];		/* enough for "dof-data-" + digits */
14998
14999	ASSERT(MUTEX_HELD(&dtrace_lock));
15000	ASSERT(MUTEX_HELD(&cpu_lock));
15001
15002	for (i = 0; ; i++) {
15003		(void) snprintf(c, sizeof (c), "dof-data-%d", i);
15004
15005		dtrace_err_verbose = 1;
15006
15007		if ((dof = dtrace_dof_property(c)) == NULL) {
15008			dtrace_err_verbose = 0;
15009			break;
15010		}
15011
15012#if defined(sun)
15013		/*
15014		 * We want to create anonymous state, so we need to transition
15015		 * the kernel debugger to indicate that DTrace is active.  If
15016		 * this fails (e.g. because the debugger has modified text in
15017		 * some way), we won't continue with the processing.
15018		 */
15019		if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15020			cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15021			    "enabling ignored.");
15022			dtrace_dof_destroy(dof);
15023			break;
15024		}
15025#endif
15026
15027		/*
15028		 * If we haven't allocated an anonymous state, we'll do so now.
15029		 */
15030		if ((state = dtrace_anon.dta_state) == NULL) {
15031#if defined(sun)
15032			state = dtrace_state_create(NULL, NULL);
15033#else
15034			state = dtrace_state_create(NULL);
15035#endif
15036			dtrace_anon.dta_state = state;
15037
15038			if (state == NULL) {
15039				/*
15040				 * This basically shouldn't happen:  the only
15041				 * failure mode from dtrace_state_create() is a
15042				 * failure of ddi_soft_state_zalloc() that
15043				 * itself should never happen.  Still, the
15044				 * interface allows for a failure mode, and
15045				 * we want to fail as gracefully as possible:
15046				 * we'll emit an error message and cease
15047				 * processing anonymous state in this case.
15048				 */
15049				cmn_err(CE_WARN, "failed to create "
15050				    "anonymous state");
15051				dtrace_dof_destroy(dof);
15052				break;
15053			}
15054		}
15055
15056		rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
15057		    &dtrace_anon.dta_enabling, 0, B_TRUE);
15058
15059		if (rv == 0)
15060			rv = dtrace_dof_options(dof, state);
15061
15062		dtrace_err_verbose = 0;
15063		dtrace_dof_destroy(dof);
15064
15065		if (rv != 0) {
15066			/*
15067			 * This is malformed DOF; chuck any anonymous state
15068			 * that we created.
15069			 */
15070			ASSERT(dtrace_anon.dta_enabling == NULL);
15071			dtrace_state_destroy(state);
15072			dtrace_anon.dta_state = NULL;
15073			break;
15074		}
15075
15076		ASSERT(dtrace_anon.dta_enabling != NULL);
15077	}
15078
15079	if (dtrace_anon.dta_enabling != NULL) {
15080		int rval;
15081
15082		/*
15083		 * dtrace_enabling_retain() can only fail because we are
15084		 * trying to retain more enablings than are allowed -- but
15085		 * we only have one anonymous enabling, and we are guaranteed
15086		 * to be allowed at least one retained enabling; we assert
15087		 * that dtrace_enabling_retain() returns success.
15088		 */
15089		rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
15090		ASSERT(rval == 0);
15091
15092		dtrace_enabling_dump(dtrace_anon.dta_enabling);
15093	}
15094}
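
/*
 * For illustration: with boot-time properties dof-data-0 and dof-data-1
 * present, the loop above slurps each DOF in turn into the single
 * anonymous enabling and stops when dtrace_dof_property("dof-data-2")
 * returns NULL.
 */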
15095
15096/*
15097 * DTrace Helper Functions
15098 */
15099static void
15100dtrace_helper_trace(dtrace_helper_action_t *helper,
15101    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15102{
15103	uint32_t size, next, nnext, i;
15104	dtrace_helptrace_t *ent;
15105	uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
15106
15107	if (!dtrace_helptrace_enabled)
15108		return;
15109
15110	ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15111
15112	/*
15113	 * What would a tracing framework be without its own tracing
15114	 * framework?  (Well, a hell of a lot simpler, for starters...)
15115	 */
15116	size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15117	    sizeof (uint64_t) - sizeof (uint64_t);
15118
15119	/*
15120	 * Iterate until we can allocate a slot in the trace buffer.
15121	 */
15122	do {
15123		next = dtrace_helptrace_next;
15124
15125		if (next + size < dtrace_helptrace_bufsize) {
15126			nnext = next + size;
15127		} else {
15128			nnext = size;
15129		}
15130	} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
15131
	/*
	 * We have our slot; fill it in.  nnext == size indicates that our
	 * slot begins at the top of the buffer -- either because the
	 * allocation above wrapped or because next was already 0.
	 */
15135	if (nnext == size)
15136		next = 0;
15137
15138	ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
15139	ent->dtht_helper = helper;
15140	ent->dtht_where = where;
15141	ent->dtht_nlocals = vstate->dtvs_nlocals;
15142
15143	ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15144	    mstate->dtms_fltoffs : -1;
15145	ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15146	ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
15147
15148	for (i = 0; i < vstate->dtvs_nlocals; i++) {
15149		dtrace_statvar_t *svar;
15150
15151		if ((svar = vstate->dtvs_locals[i]) == NULL)
15152			continue;
15153
15154		ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
15155		ent->dtht_locals[i] =
15156		    ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
15157	}
15158}
15159
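/*
 * Emulate the helper actions, if any, of the given type ('which') that are
 * installed for the current process, returning the value of the last action
 * emulated, or 0 if no helpers are installed or a fault occurs.
 */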
15160static uint64_t
15161dtrace_helper(int which, dtrace_mstate_t *mstate,
15162    dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15163{
15164	uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
15165	uint64_t sarg0 = mstate->dtms_arg[0];
15166	uint64_t sarg1 = mstate->dtms_arg[1];
15167	uint64_t rval = 0;
15168	dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15169	dtrace_helper_action_t *helper;
15170	dtrace_vstate_t *vstate;
15171	dtrace_difo_t *pred;
15172	int i, trace = dtrace_helptrace_enabled;
15173
15174	ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15175
15176	if (helpers == NULL)
15177		return (0);
15178
15179	if ((helper = helpers->dthps_actions[which]) == NULL)
15180		return (0);
15181
15182	vstate = &helpers->dthps_vstate;
15183	mstate->dtms_arg[0] = arg0;
15184	mstate->dtms_arg[1] = arg1;
15185
15186	/*
15187	 * Now iterate over each helper.  If its predicate evaluates to 'true',
15188	 * we'll call the corresponding actions.  Note that the below calls
15189	 * to dtrace_dif_emulate() may set faults in machine state.  This is
15190	 * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
15191	 * the stored DIF offset with its own (which is the desired behavior).
15192	 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15193	 * from machine state; this is okay, too.
15194	 */
15195	for (; helper != NULL; helper = helper->dtha_next) {
15196		if ((pred = helper->dtha_predicate) != NULL) {
15197			if (trace)
15198				dtrace_helper_trace(helper, mstate, vstate, 0);
15199
15200			if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15201				goto next;
15202
15203			if (*flags & CPU_DTRACE_FAULT)
15204				goto err;
15205		}
15206
15207		for (i = 0; i < helper->dtha_nactions; i++) {
15208			if (trace)
15209				dtrace_helper_trace(helper,
15210				    mstate, vstate, i + 1);
15211
15212			rval = dtrace_dif_emulate(helper->dtha_actions[i],
15213			    mstate, vstate, state);
15214
15215			if (*flags & CPU_DTRACE_FAULT)
15216				goto err;
15217		}
15218
15219next:
15220		if (trace)
15221			dtrace_helper_trace(helper, mstate, vstate,
15222			    DTRACE_HELPTRACE_NEXT);
15223	}
15224
15225	if (trace)
15226		dtrace_helper_trace(helper, mstate, vstate,
15227		    DTRACE_HELPTRACE_DONE);
15228
15229	/*
15230	 * Restore the arg0 that we saved upon entry.
15231	 */
15232	mstate->dtms_arg[0] = sarg0;
15233	mstate->dtms_arg[1] = sarg1;
15234
15235	return (rval);
15236
15237err:
15238	if (trace)
15239		dtrace_helper_trace(helper, mstate, vstate,
15240		    DTRACE_HELPTRACE_ERR);
15241
15242	/*
15243	 * Restore the arg0 that we saved upon entry.
15244	 */
15245	mstate->dtms_arg[0] = sarg0;
15246	mstate->dtms_arg[1] = sarg1;
15247
15248	return (0);
15249}
15250
15251static void
15252dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15253    dtrace_vstate_t *vstate)
15254{
15255	int i;
15256
15257	if (helper->dtha_predicate != NULL)
15258		dtrace_difo_release(helper->dtha_predicate, vstate);
15259
15260	for (i = 0; i < helper->dtha_nactions; i++) {
15261		ASSERT(helper->dtha_actions[i] != NULL);
15262		dtrace_difo_release(helper->dtha_actions[i], vstate);
15263	}
15264
15265	kmem_free(helper->dtha_actions,
15266	    helper->dtha_nactions * sizeof (dtrace_difo_t *));
15267	kmem_free(helper, sizeof (dtrace_helper_action_t));
15268}
15269
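/*
 * Destroy all of the current process's helper actions and helper providers
 * belonging to the specified generation.
 */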
15270static int
15271dtrace_helper_destroygen(int gen)
15272{
15273	proc_t *p = curproc;
15274	dtrace_helpers_t *help = p->p_dtrace_helpers;
15275	dtrace_vstate_t *vstate;
15276	int i;
15277
15278	ASSERT(MUTEX_HELD(&dtrace_lock));
15279
15280	if (help == NULL || gen > help->dthps_generation)
15281		return (EINVAL);
15282
15283	vstate = &help->dthps_vstate;
15284
15285	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15286		dtrace_helper_action_t *last = NULL, *h, *next;
15287
15288		for (h = help->dthps_actions[i]; h != NULL; h = next) {
15289			next = h->dtha_next;
15290
15291			if (h->dtha_generation == gen) {
15292				if (last != NULL) {
15293					last->dtha_next = next;
15294				} else {
15295					help->dthps_actions[i] = next;
15296				}
15297
15298				dtrace_helper_action_destroy(h, vstate);
15299			} else {
15300				last = h;
15301			}
15302		}
15303	}
15304
15305	/*
 * Iterate until we've cleared out all helper providers with the
15307	 * given generation number.
15308	 */
15309	for (;;) {
15310		dtrace_helper_provider_t *prov;
15311
15312		/*
15313		 * Look for a helper provider with the right generation. We
15314		 * have to start back at the beginning of the list each time
15315		 * because we drop dtrace_lock. It's unlikely that we'll make
15316		 * more than two passes.
15317		 */
15318		for (i = 0; i < help->dthps_nprovs; i++) {
15319			prov = help->dthps_provs[i];
15320
15321			if (prov->dthp_generation == gen)
15322				break;
15323		}
15324
15325		/*
15326		 * If there were no matches, we're done.
15327		 */
15328		if (i == help->dthps_nprovs)
15329			break;
15330
15331		/*
15332		 * Move the last helper provider into this slot.
15333		 */
15334		help->dthps_nprovs--;
15335		help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15336		help->dthps_provs[help->dthps_nprovs] = NULL;
15337
15338		mutex_exit(&dtrace_lock);
15339
15340		/*
15341		 * If we have a meta provider, remove this helper provider.
15342		 */
15343		mutex_enter(&dtrace_meta_lock);
15344		if (dtrace_meta_pid != NULL) {
15345			ASSERT(dtrace_deferred_pid == NULL);
15346			dtrace_helper_provider_remove(&prov->dthp_prov,
15347			    p->p_pid);
15348		}
15349		mutex_exit(&dtrace_meta_lock);
15350
15351		dtrace_helper_provider_destroy(prov);
15352
15353		mutex_enter(&dtrace_lock);
15354	}
15355
15356	return (0);
15357}
15358
15359static int
15360dtrace_helper_validate(dtrace_helper_action_t *helper)
15361{
15362	int err = 0, i;
15363	dtrace_difo_t *dp;
15364
15365	if ((dp = helper->dtha_predicate) != NULL)
15366		err += dtrace_difo_validate_helper(dp);
15367
15368	for (i = 0; i < helper->dtha_nactions; i++)
15369		err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15370
15371	return (err == 0);
15372}
15373
15374static int
15375dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
15376{
15377	dtrace_helpers_t *help;
15378	dtrace_helper_action_t *helper, *last;
15379	dtrace_actdesc_t *act;
15380	dtrace_vstate_t *vstate;
15381	dtrace_predicate_t *pred;
15382	int count = 0, nactions = 0, i;
15383
15384	if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15385		return (EINVAL);
15386
15387	help = curproc->p_dtrace_helpers;
15388	last = help->dthps_actions[which];
15389	vstate = &help->dthps_vstate;
15390
15391	for (count = 0; last != NULL; last = last->dtha_next) {
15392		count++;
15393		if (last->dtha_next == NULL)
15394			break;
15395	}
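
	/*
	 * Note that the loop above leaves 'last' pointing at the tail of
	 * the action list (or NULL if the list is empty) so that the
	 * validated helper can be appended below.
	 */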
15396
15397	/*
15398	 * If we already have dtrace_helper_actions_max helper actions for this
15399	 * helper action type, we'll refuse to add a new one.
15400	 */
15401	if (count >= dtrace_helper_actions_max)
15402		return (ENOSPC);
15403
15404	helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15405	helper->dtha_generation = help->dthps_generation;
15406
15407	if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15408		ASSERT(pred->dtp_difo != NULL);
15409		dtrace_difo_hold(pred->dtp_difo);
15410		helper->dtha_predicate = pred->dtp_difo;
15411	}
15412
15413	for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15414		if (act->dtad_kind != DTRACEACT_DIFEXPR)
15415			goto err;
15416
15417		if (act->dtad_difo == NULL)
15418			goto err;
15419
15420		nactions++;
15421	}
15422
15423	helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15424	    (helper->dtha_nactions = nactions), KM_SLEEP);
15425
15426	for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15427		dtrace_difo_hold(act->dtad_difo);
15428		helper->dtha_actions[i++] = act->dtad_difo;
15429	}
15430
15431	if (!dtrace_helper_validate(helper))
15432		goto err;
15433
15434	if (last == NULL) {
15435		help->dthps_actions[which] = helper;
15436	} else {
15437		last->dtha_next = helper;
15438	}
15439
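	/*
	 * Each helper trace record reserves space for every local variable,
	 * so a new high-water mark in dtvs_nlocals grows the record size;
	 * the trace cursor is reset to the top of the buffer accordingly.
	 */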
15440	if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15441		dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15442		dtrace_helptrace_next = 0;
15443	}
15444
15445	return (0);
15446err:
15447	dtrace_helper_action_destroy(helper, vstate);
15448	return (EINVAL);
15449}
15450
15451static void
15452dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15453    dof_helper_t *dofhp)
15454{
15455	ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
15456
15457	mutex_enter(&dtrace_meta_lock);
15458	mutex_enter(&dtrace_lock);
15459
15460	if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15461		/*
15462		 * If the dtrace module is loaded but not attached, or if
		 * there isn't a meta provider registered to deal with
15464		 * these provider descriptions, we need to postpone creating
15465		 * the actual providers until later.
15466		 */
15467
15468		if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15469		    dtrace_deferred_pid != help) {
15470			help->dthps_deferred = 1;
15471			help->dthps_pid = p->p_pid;
15472			help->dthps_next = dtrace_deferred_pid;
15473			help->dthps_prev = NULL;
15474			if (dtrace_deferred_pid != NULL)
15475				dtrace_deferred_pid->dthps_prev = help;
15476			dtrace_deferred_pid = help;
15477		}
15478
15479		mutex_exit(&dtrace_lock);
15480
15481	} else if (dofhp != NULL) {
15482		/*
15483		 * If the dtrace module is loaded and we have a particular
15484		 * helper provider description, pass that off to the
15485		 * meta provider.
15486		 */
15487
15488		mutex_exit(&dtrace_lock);
15489
15490		dtrace_helper_provide(dofhp, p->p_pid);
15491
15492	} else {
15493		/*
15494		 * Otherwise, just pass all the helper provider descriptions
15495		 * off to the meta provider.
15496		 */
15497
15498		int i;
15499		mutex_exit(&dtrace_lock);
15500
15501		for (i = 0; i < help->dthps_nprovs; i++) {
15502			dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15503			    p->p_pid);
15504		}
15505	}
15506
15507	mutex_exit(&dtrace_meta_lock);
15508}
15509
15510static int
15511dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
15512{
15513	dtrace_helpers_t *help;
15514	dtrace_helper_provider_t *hprov, **tmp_provs;
15515	uint_t tmp_maxprovs, i;
15516
15517	ASSERT(MUTEX_HELD(&dtrace_lock));
15518
15519	help = curproc->p_dtrace_helpers;
15520	ASSERT(help != NULL);
15521
15522	/*
15523	 * If we already have dtrace_helper_providers_max helper providers,
	 * we'll refuse to add a new one.
15525	 */
15526	if (help->dthps_nprovs >= dtrace_helper_providers_max)
15527		return (ENOSPC);
15528
15529	/*
15530	 * Check to make sure this isn't a duplicate.
15531	 */
15532	for (i = 0; i < help->dthps_nprovs; i++) {
15533		if (dofhp->dofhp_dof ==
15534		    help->dthps_provs[i]->dthp_prov.dofhp_dof)
15535			return (EALREADY);
15536	}
15537
15538	hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15539	hprov->dthp_prov = *dofhp;
15540	hprov->dthp_ref = 1;
15541	hprov->dthp_generation = gen;
15542
15543	/*
15544	 * Allocate a bigger table for helper providers if it's already full.
15545	 */
15546	if (help->dthps_maxprovs == help->dthps_nprovs) {
15547		tmp_maxprovs = help->dthps_maxprovs;
15548		tmp_provs = help->dthps_provs;
15549
15550		if (help->dthps_maxprovs == 0)
15551			help->dthps_maxprovs = 2;
15552		else
15553			help->dthps_maxprovs *= 2;
15554		if (help->dthps_maxprovs > dtrace_helper_providers_max)
15555			help->dthps_maxprovs = dtrace_helper_providers_max;
15556
15557		ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15558
15559		help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15560		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15561
15562		if (tmp_provs != NULL) {
15563			bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15564			    sizeof (dtrace_helper_provider_t *));
15565			kmem_free(tmp_provs, tmp_maxprovs *
15566			    sizeof (dtrace_helper_provider_t *));
15567		}
15568	}
15569
15570	help->dthps_provs[help->dthps_nprovs] = hprov;
15571	help->dthps_nprovs++;
15572
15573	return (0);
15574}
15575
15576static void
15577dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15578{
15579	mutex_enter(&dtrace_lock);
15580
15581	if (--hprov->dthp_ref == 0) {
15582		dof_hdr_t *dof;
15583		mutex_exit(&dtrace_lock);
15584		dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15585		dtrace_dof_destroy(dof);
15586		kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15587	} else {
15588		mutex_exit(&dtrace_lock);
15589	}
15590}
15591
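/*
 * Validate a DOF_SECT_PROVIDER section:  check its alignment and size,
 * resolve the string, probe, argument and offset sections to which it
 * refers, and walk each of its probes to verify that every name, offset
 * and argument type lies within the bounds of those sections.
 */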
15592static int
15593dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15594{
15595	uintptr_t daddr = (uintptr_t)dof;
15596	dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15597	dof_provider_t *provider;
15598	dof_probe_t *probe;
15599	uint8_t *arg;
15600	char *strtab, *typestr;
15601	dof_stridx_t typeidx;
15602	size_t typesz;
15603	uint_t nprobes, j, k;
15604
15605	ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15606
15607	if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15608		dtrace_dof_error(dof, "misaligned section offset");
15609		return (-1);
15610	}
15611
15612	/*
15613	 * The section needs to be large enough to contain the DOF provider
15614	 * structure appropriate for the given version.
15615	 */
15616	if (sec->dofs_size <
15617	    ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15618	    offsetof(dof_provider_t, dofpv_prenoffs) :
15619	    sizeof (dof_provider_t))) {
15620		dtrace_dof_error(dof, "provider section too small");
15621		return (-1);
15622	}
15623
15624	provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15625	str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15626	prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15627	arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15628	off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15629
15630	if (str_sec == NULL || prb_sec == NULL ||
15631	    arg_sec == NULL || off_sec == NULL)
15632		return (-1);
15633
15634	enoff_sec = NULL;
15635
15636	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15637	    provider->dofpv_prenoffs != DOF_SECT_NONE &&
15638	    (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15639	    provider->dofpv_prenoffs)) == NULL)
15640		return (-1);
15641
15642	strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15643
15644	if (provider->dofpv_name >= str_sec->dofs_size ||
15645	    strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15646		dtrace_dof_error(dof, "invalid provider name");
15647		return (-1);
15648	}
15649
15650	if (prb_sec->dofs_entsize == 0 ||
15651	    prb_sec->dofs_entsize > prb_sec->dofs_size) {
15652		dtrace_dof_error(dof, "invalid entry size");
15653		return (-1);
15654	}
15655
15656	if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15657		dtrace_dof_error(dof, "misaligned entry size");
15658		return (-1);
15659	}
15660
15661	if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15662		dtrace_dof_error(dof, "invalid entry size");
15663		return (-1);
15664	}
15665
15666	if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15667		dtrace_dof_error(dof, "misaligned section offset");
15668		return (-1);
15669	}
15670
15671	if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15672		dtrace_dof_error(dof, "invalid entry size");
15673		return (-1);
15674	}
15675
15676	arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15677
15678	nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15679
15680	/*
15681	 * Take a pass through the probes to check for errors.
15682	 */
15683	for (j = 0; j < nprobes; j++) {
15684		probe = (dof_probe_t *)(uintptr_t)(daddr +
15685		    prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15686
15687		if (probe->dofpr_func >= str_sec->dofs_size) {
15688			dtrace_dof_error(dof, "invalid function name");
15689			return (-1);
15690		}
15691
15692		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15693			dtrace_dof_error(dof, "function name too long");
15694			return (-1);
15695		}
15696
15697		if (probe->dofpr_name >= str_sec->dofs_size ||
15698		    strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15699			dtrace_dof_error(dof, "invalid probe name");
15700			return (-1);
15701		}
15702
15703		/*
15704		 * The offset count must not wrap the index, and the offsets
15705		 * must also not overflow the section's data.
15706		 */
15707		if (probe->dofpr_offidx + probe->dofpr_noffs <
15708		    probe->dofpr_offidx ||
15709		    (probe->dofpr_offidx + probe->dofpr_noffs) *
15710		    off_sec->dofs_entsize > off_sec->dofs_size) {
15711			dtrace_dof_error(dof, "invalid probe offset");
15712			return (-1);
15713		}
15714
15715		if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15716			/*
15717			 * If there's no is-enabled offset section, make sure
15718			 * there aren't any is-enabled offsets. Otherwise
15719			 * perform the same checks as for probe offsets
15720			 * (immediately above).
15721			 */
15722			if (enoff_sec == NULL) {
15723				if (probe->dofpr_enoffidx != 0 ||
15724				    probe->dofpr_nenoffs != 0) {
15725					dtrace_dof_error(dof, "is-enabled "
15726					    "offsets with null section");
15727					return (-1);
15728				}
15729			} else if (probe->dofpr_enoffidx +
15730			    probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15731			    (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15732			    enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15733				dtrace_dof_error(dof, "invalid is-enabled "
15734				    "offset");
15735				return (-1);
15736			}
15737
15738			if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15739				dtrace_dof_error(dof, "zero probe and "
15740				    "is-enabled offsets");
15741				return (-1);
15742			}
15743		} else if (probe->dofpr_noffs == 0) {
15744			dtrace_dof_error(dof, "zero probe offsets");
15745			return (-1);
15746		}
15747
15748		if (probe->dofpr_argidx + probe->dofpr_xargc <
15749		    probe->dofpr_argidx ||
15750		    (probe->dofpr_argidx + probe->dofpr_xargc) *
15751		    arg_sec->dofs_entsize > arg_sec->dofs_size) {
15752			dtrace_dof_error(dof, "invalid args");
15753			return (-1);
15754		}
15755
15756		typeidx = probe->dofpr_nargv;
15757		typestr = strtab + probe->dofpr_nargv;
15758		for (k = 0; k < probe->dofpr_nargc; k++) {
15759			if (typeidx >= str_sec->dofs_size) {
15760				dtrace_dof_error(dof, "bad "
15761				    "native argument type");
15762				return (-1);
15763			}
15764
15765			typesz = strlen(typestr) + 1;
15766			if (typesz > DTRACE_ARGTYPELEN) {
15767				dtrace_dof_error(dof, "native "
15768				    "argument type too long");
15769				return (-1);
15770			}
15771			typeidx += typesz;
15772			typestr += typesz;
15773		}
15774
15775		typeidx = probe->dofpr_xargv;
15776		typestr = strtab + probe->dofpr_xargv;
15777		for (k = 0; k < probe->dofpr_xargc; k++) {
15778			if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15779				dtrace_dof_error(dof, "bad "
15780				    "native argument index");
15781				return (-1);
15782			}
15783
15784			if (typeidx >= str_sec->dofs_size) {
15785				dtrace_dof_error(dof, "bad "
15786				    "translated argument type");
15787				return (-1);
15788			}
15789
15790			typesz = strlen(typestr) + 1;
15791			if (typesz > DTRACE_ARGTYPELEN) {
15792				dtrace_dof_error(dof, "translated argument "
15793				    "type too long");
15794				return (-1);
15795			}
15796
15797			typeidx += typesz;
15798			typestr += typesz;
15799		}
15800	}
15801
15802	return (0);
15803}
15804
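/*
 * Consume a DOF object containing helper actions and, optionally, helper
 * provider descriptions on behalf of the current process.  On success,
 * the new helper generation number is returned; a validation failure
 * returns -1, while a failed dtrace_dof_slurp() returns its error code.
 */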
15805static int
15806dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
15807{
15808	dtrace_helpers_t *help;
15809	dtrace_vstate_t *vstate;
15810	dtrace_enabling_t *enab = NULL;
15811	int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15812	uintptr_t daddr = (uintptr_t)dof;
15813
15814	ASSERT(MUTEX_HELD(&dtrace_lock));
15815
15816	if ((help = curproc->p_dtrace_helpers) == NULL)
15817		help = dtrace_helpers_create(curproc);
15818
15819	vstate = &help->dthps_vstate;
15820
15821	if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15822	    dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15823		dtrace_dof_destroy(dof);
15824		return (rv);
15825	}
15826
15827	/*
15828	 * Look for helper providers and validate their descriptions.
15829	 */
15830	if (dhp != NULL) {
15831		for (i = 0; i < dof->dofh_secnum; i++) {
15832			dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15833			    dof->dofh_secoff + i * dof->dofh_secsize);
15834
15835			if (sec->dofs_type != DOF_SECT_PROVIDER)
15836				continue;
15837
15838			if (dtrace_helper_provider_validate(dof, sec) != 0) {
15839				dtrace_enabling_destroy(enab);
15840				dtrace_dof_destroy(dof);
15841				return (-1);
15842			}
15843
15844			nprovs++;
15845		}
15846	}
15847
	/*
	 * Now we need to walk through the ECB descriptions in the enabling,
	 * recognizing only descriptions of the form dtrace:helper:ustack as
	 * helper actions; any other description is skipped here and will be
	 * reported as an unmatched helper below.
	 */
15851	for (i = 0; i < enab->dten_ndesc; i++) {
15852		dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15853		dtrace_probedesc_t *desc = &ep->dted_probe;
15854
15855		if (strcmp(desc->dtpd_provider, "dtrace") != 0)
15856			continue;
15857
15858		if (strcmp(desc->dtpd_mod, "helper") != 0)
15859			continue;
15860
15861		if (strcmp(desc->dtpd_func, "ustack") != 0)
15862			continue;
15863
15864		if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
15865		    ep)) != 0) {
15866			/*
15867			 * Adding this helper action failed -- we are now going
15868			 * to rip out the entire generation and return failure.
15869			 */
15870			(void) dtrace_helper_destroygen(help->dthps_generation);
15871			dtrace_enabling_destroy(enab);
15872			dtrace_dof_destroy(dof);
15873			return (-1);
15874		}
15875
15876		nhelpers++;
15877	}
15878
15879	if (nhelpers < enab->dten_ndesc)
15880		dtrace_dof_error(dof, "unmatched helpers");
15881
15882	gen = help->dthps_generation++;
15883	dtrace_enabling_destroy(enab);
15884
15885	if (dhp != NULL && nprovs > 0) {
15886		dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15887		if (dtrace_helper_provider_add(dhp, gen) == 0) {
15888			mutex_exit(&dtrace_lock);
15889			dtrace_helper_provider_register(curproc, help, dhp);
15890			mutex_enter(&dtrace_lock);
15891
15892			destroy = 0;
15893		}
15894	}
15895
15896	if (destroy)
15897		dtrace_dof_destroy(dof);
15898
15899	return (gen);
15900}
15901
15902static dtrace_helpers_t *
15903dtrace_helpers_create(proc_t *p)
15904{
15905	dtrace_helpers_t *help;
15906
15907	ASSERT(MUTEX_HELD(&dtrace_lock));
15908	ASSERT(p->p_dtrace_helpers == NULL);
15909
15910	help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
15911	help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
15912	    DTRACE_NHELPER_ACTIONS, KM_SLEEP);
15913
15914	p->p_dtrace_helpers = help;
15915	dtrace_helpers++;
15916
15917	return (help);
15918}
15919
15920#if defined(sun)
15921static
15922#endif
void
#if defined(sun)
dtrace_helpers_destroy(void)
#else
dtrace_helpers_destroy(proc_t *p)
#endif
15925{
15926	dtrace_helpers_t *help;
15927	dtrace_vstate_t *vstate;
15928#if defined(sun)
15929	proc_t *p = curproc;
15930#endif
15931	int i;
15932
15933	mutex_enter(&dtrace_lock);
15934
15935	ASSERT(p->p_dtrace_helpers != NULL);
15936	ASSERT(dtrace_helpers > 0);
15937
15938	help = p->p_dtrace_helpers;
15939	vstate = &help->dthps_vstate;
15940
15941	/*
15942	 * We're now going to lose the help from this process.
15943	 */
15944	p->p_dtrace_helpers = NULL;
15945	dtrace_sync();
15946
15947	/*
	 * Destroy the helper actions.
15949	 */
15950	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15951		dtrace_helper_action_t *h, *next;
15952
		for (h = help->dthps_actions[i]; h != NULL; h = next) {
			next = h->dtha_next;
			dtrace_helper_action_destroy(h, vstate);
		}
15958	}
15959
15960	mutex_exit(&dtrace_lock);
15961
15962	/*
15963	 * Destroy the helper providers.
15964	 */
15965	if (help->dthps_maxprovs > 0) {
15966		mutex_enter(&dtrace_meta_lock);
15967		if (dtrace_meta_pid != NULL) {
15968			ASSERT(dtrace_deferred_pid == NULL);
15969
15970			for (i = 0; i < help->dthps_nprovs; i++) {
15971				dtrace_helper_provider_remove(
15972				    &help->dthps_provs[i]->dthp_prov, p->p_pid);
15973			}
15974		} else {
15975			mutex_enter(&dtrace_lock);
15976			ASSERT(help->dthps_deferred == 0 ||
15977			    help->dthps_next != NULL ||
15978			    help->dthps_prev != NULL ||
15979			    help == dtrace_deferred_pid);
15980
15981			/*
15982			 * Remove the helper from the deferred list.
15983			 */
15984			if (help->dthps_next != NULL)
15985				help->dthps_next->dthps_prev = help->dthps_prev;
15986			if (help->dthps_prev != NULL)
15987				help->dthps_prev->dthps_next = help->dthps_next;
15988			if (dtrace_deferred_pid == help) {
15989				dtrace_deferred_pid = help->dthps_next;
15990				ASSERT(help->dthps_prev == NULL);
15991			}
15992
15993			mutex_exit(&dtrace_lock);
15994		}
15995
15996		mutex_exit(&dtrace_meta_lock);
15997
15998		for (i = 0; i < help->dthps_nprovs; i++) {
15999			dtrace_helper_provider_destroy(help->dthps_provs[i]);
16000		}
16001
16002		kmem_free(help->dthps_provs, help->dthps_maxprovs *
16003		    sizeof (dtrace_helper_provider_t *));
16004	}
16005
16006	mutex_enter(&dtrace_lock);
16007
16008	dtrace_vstate_fini(&help->dthps_vstate);
16009	kmem_free(help->dthps_actions,
16010	    sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16011	kmem_free(help, sizeof (dtrace_helpers_t));
16012
16013	--dtrace_helpers;
16014	mutex_exit(&dtrace_lock);
16015}
16016
16017#if defined(sun)
16018static
16019#endif
16020void
16021dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16022{
16023	dtrace_helpers_t *help, *newhelp;
16024	dtrace_helper_action_t *helper, *new, *last;
16025	dtrace_difo_t *dp;
16026	dtrace_vstate_t *vstate;
16027	int i, j, sz, hasprovs = 0;
16028
16029	mutex_enter(&dtrace_lock);
16030	ASSERT(from->p_dtrace_helpers != NULL);
16031	ASSERT(dtrace_helpers > 0);
16032
16033	help = from->p_dtrace_helpers;
16034	newhelp = dtrace_helpers_create(to);
16035	ASSERT(to->p_dtrace_helpers != NULL);
16036
16037	newhelp->dthps_generation = help->dthps_generation;
16038	vstate = &newhelp->dthps_vstate;
16039
16040	/*
16041	 * Duplicate the helper actions.
16042	 */
16043	for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16044		if ((helper = help->dthps_actions[i]) == NULL)
16045			continue;
16046
16047		for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16048			new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16049			    KM_SLEEP);
16050			new->dtha_generation = helper->dtha_generation;
16051
16052			if ((dp = helper->dtha_predicate) != NULL) {
16053				dp = dtrace_difo_duplicate(dp, vstate);
16054				new->dtha_predicate = dp;
16055			}
16056
16057			new->dtha_nactions = helper->dtha_nactions;
16058			sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16059			new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16060
16061			for (j = 0; j < new->dtha_nactions; j++) {
16062				dtrace_difo_t *dp = helper->dtha_actions[j];
16063
16064				ASSERT(dp != NULL);
16065				dp = dtrace_difo_duplicate(dp, vstate);
16066				new->dtha_actions[j] = dp;
16067			}
16068
16069			if (last != NULL) {
16070				last->dtha_next = new;
16071			} else {
16072				newhelp->dthps_actions[i] = new;
16073			}
16074
16075			last = new;
16076		}
16077	}
16078
16079	/*
16080	 * Duplicate the helper providers and register them with the
16081	 * DTrace framework.
16082	 */
16083	if (help->dthps_nprovs > 0) {
16084		newhelp->dthps_nprovs = help->dthps_nprovs;
16085		newhelp->dthps_maxprovs = help->dthps_nprovs;
16086		newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16087		    sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16088		for (i = 0; i < newhelp->dthps_nprovs; i++) {
16089			newhelp->dthps_provs[i] = help->dthps_provs[i];
16090			newhelp->dthps_provs[i]->dthp_ref++;
16091		}
16092
16093		hasprovs = 1;
16094	}
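
	/*
	 * Note the asymmetry:  helper actions are deep-copied above (each
	 * DIFO is duplicated into the child's vstate), whereas helper
	 * providers are shared with the parent by reference, with dthp_ref
	 * counting the processes that hold each one.
	 */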
16095
16096	mutex_exit(&dtrace_lock);
16097
16098	if (hasprovs)
16099		dtrace_helper_provider_register(to, newhelp, NULL);
16100}
16101
16102/*
16103 * DTrace Hook Functions
16104 */
16105static void
16106dtrace_module_loaded(modctl_t *ctl)
16107{
16108	dtrace_provider_t *prv;
16109
16110	mutex_enter(&dtrace_provider_lock);
16111#if defined(sun)
16112	mutex_enter(&mod_lock);
16113#endif
16114
16115#if defined(sun)
16116	ASSERT(ctl->mod_busy);
16117#endif
16118
16119	/*
	 * We're going to call each provider's per-module provide operation
16121	 * specifying only this module.
16122	 */
16123	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16124		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16125
16126#if defined(sun)
16127	mutex_exit(&mod_lock);
16128#endif
16129	mutex_exit(&dtrace_provider_lock);
16130
16131	/*
16132	 * If we have any retained enablings, we need to match against them.
16133	 * Enabling probes requires that cpu_lock be held, and we cannot hold
16134	 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16135	 * module.  (In particular, this happens when loading scheduling
16136	 * classes.)  So if we have any retained enablings, we need to dispatch
16137	 * our task queue to do the match for us.
16138	 */
16139	mutex_enter(&dtrace_lock);
16140
16141	if (dtrace_retained == NULL) {
16142		mutex_exit(&dtrace_lock);
16143		return;
16144	}
16145
16146	(void) taskq_dispatch(dtrace_taskq,
16147	    (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
16148
16149	mutex_exit(&dtrace_lock);
16150
16151	/*
16152	 * And now, for a little heuristic sleaze:  in general, we want to
16153	 * match modules as soon as they load.  However, we cannot guarantee
16154	 * this, because it would lead us to the lock ordering violation
16155	 * outlined above.  The common case, of course, is that cpu_lock is
16156	 * _not_ held -- so we delay here for a clock tick, hoping that that's
16157	 * long enough for the task queue to do its work.  If it's not, it's
16158	 * not a serious problem -- it just means that the module that we
16159	 * just loaded may not be immediately instrumentable.
16160	 */
16161	delay(1);
16162}
16163
16164static void
16165#if defined(sun)
16166dtrace_module_unloaded(modctl_t *ctl)
16167#else
16168dtrace_module_unloaded(modctl_t *ctl, int *error)
16169#endif
16170{
16171	dtrace_probe_t template, *probe, *first, *next;
16172	dtrace_provider_t *prov;
16173#if !defined(sun)
16174	char modname[DTRACE_MODNAMELEN];
16175	size_t len;
16176#endif
16177
16178#if defined(sun)
16179	template.dtpr_mod = ctl->mod_modname;
16180#else
16181	/* Handle the fact that ctl->filename may end in ".ko". */
16182	strlcpy(modname, ctl->filename, sizeof(modname));
16183	len = strlen(ctl->filename);
16184	if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
16185		modname[len - 3] = '\0';
16186	template.dtpr_mod = modname;
16187#endif
16188
16189	mutex_enter(&dtrace_provider_lock);
16190#if defined(sun)
16191	mutex_enter(&mod_lock);
16192#endif
16193	mutex_enter(&dtrace_lock);
16194
16195#if !defined(sun)
16196	if (ctl->nenabled > 0) {
16197		/* Don't allow unloads if a probe is enabled. */
16198		mutex_exit(&dtrace_provider_lock);
16199		mutex_exit(&dtrace_lock);
16200		*error = -1;
16201		printf(
16202	"kldunload: attempt to unload module that has DTrace probes enabled\n");
16203		return;
16204	}
16205#endif
16206
16207	if (dtrace_bymod == NULL) {
16208		/*
16209		 * The DTrace module is loaded (obviously) but not attached;
16210		 * we don't have any work to do.
16211		 */
16212		mutex_exit(&dtrace_provider_lock);
16213#if defined(sun)
16214		mutex_exit(&mod_lock);
16215#endif
16216		mutex_exit(&dtrace_lock);
16217		return;
16218	}
16219
16220	for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16221	    probe != NULL; probe = probe->dtpr_nextmod) {
16222		if (probe->dtpr_ecb != NULL) {
16223			mutex_exit(&dtrace_provider_lock);
16224#if defined(sun)
16225			mutex_exit(&mod_lock);
16226#endif
16227			mutex_exit(&dtrace_lock);
16228
16229			/*
16230			 * This shouldn't _actually_ be possible -- we're
16231			 * unloading a module that has an enabled probe in it.
16232			 * (It's normally up to the provider to make sure that
16233			 * this can't happen.)  However, because dtps_enable()
16234			 * doesn't have a failure mode, there can be an
16235			 * enable/unload race.  Upshot:  we don't want to
16236			 * assert, but we're not going to disable the
16237			 * probe, either.
16238			 */
16239			if (dtrace_err_verbose) {
16240#if defined(sun)
16241				cmn_err(CE_WARN, "unloaded module '%s' had "
16242				    "enabled probes", ctl->mod_modname);
16243#else
16244				cmn_err(CE_WARN, "unloaded module '%s' had "
16245				    "enabled probes", modname);
16246#endif
16247			}
16248
16249			return;
16250		}
16251	}
16252
16253	probe = first;
16254
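	/*
	 * Unhook each of the module's probes from the probe array and the
	 * hash chains, collecting them onto a private list (reusing the
	 * dtpr_nextmod linkage) so that they can be destroyed only after
	 * the dtrace_sync() below has drained probe context.
	 */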
16255	for (first = NULL; probe != NULL; probe = next) {
16256		ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16257
16258		dtrace_probes[probe->dtpr_id - 1] = NULL;
16259
16260		next = probe->dtpr_nextmod;
16261		dtrace_hash_remove(dtrace_bymod, probe);
16262		dtrace_hash_remove(dtrace_byfunc, probe);
16263		dtrace_hash_remove(dtrace_byname, probe);
16264
16265		if (first == NULL) {
16266			first = probe;
16267			probe->dtpr_nextmod = NULL;
16268		} else {
16269			probe->dtpr_nextmod = first;
16270			first = probe;
16271		}
16272	}
16273
16274	/*
16275	 * We've removed all of the module's probes from the hash chains and
16276	 * from the probe array.  Now issue a dtrace_sync() to be sure that
16277	 * everyone has cleared out from any probe array processing.
16278	 */
16279	dtrace_sync();
16280
16281	for (probe = first; probe != NULL; probe = first) {
16282		first = probe->dtpr_nextmod;
16283		prov = probe->dtpr_provider;
16284		prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16285		    probe->dtpr_arg);
16286		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16287		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16288		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16289#if defined(sun)
16290		vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16291#else
16292		free_unr(dtrace_arena, probe->dtpr_id);
16293#endif
16294		kmem_free(probe, sizeof (dtrace_probe_t));
16295	}
16296
16297	mutex_exit(&dtrace_lock);
16298#if defined(sun)
16299	mutex_exit(&mod_lock);
16300#endif
16301	mutex_exit(&dtrace_provider_lock);
16302}
16303
16304#if !defined(sun)
16305static void
16306dtrace_kld_load(void *arg __unused, linker_file_t lf)
16307{
16308
16309	dtrace_module_loaded(lf);
16310}
16311
16312static void
16313dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
16314{
16315
16316	if (*error != 0)
16317		/* We already have an error, so don't do anything. */
16318		return;
16319	dtrace_module_unloaded(lf, error);
16320}
16321#endif
16322
16323#if defined(sun)
16324static void
16325dtrace_suspend(void)
16326{
16327	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16328}
16329
16330static void
16331dtrace_resume(void)
16332{
16333	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16334}
16335#endif
16336
16337static int
16338dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16339{
16340	ASSERT(MUTEX_HELD(&cpu_lock));
16341	mutex_enter(&dtrace_lock);
16342
16343	switch (what) {
16344	case CPU_CONFIG: {
16345		dtrace_state_t *state;
16346		dtrace_optval_t *opt, rs, c;
16347
16348		/*
16349		 * For now, we only allocate a new buffer for anonymous state.
16350		 */
16351		if ((state = dtrace_anon.dta_state) == NULL)
16352			break;
16353
16354		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16355			break;
16356
16357		opt = state->dts_options;
16358		c = opt[DTRACEOPT_CPU];
16359
16360		if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16361			break;
16362
16363		/*
16364		 * Regardless of what the actual policy is, we're going to
16365		 * temporarily set our resize policy to be manual.  We're
16366		 * also going to temporarily set our CPU option to denote
16367		 * the newly configured CPU.
16368		 */
16369		rs = opt[DTRACEOPT_BUFRESIZE];
16370		opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16371		opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16372
16373		(void) dtrace_state_buffers(state);
16374
16375		opt[DTRACEOPT_BUFRESIZE] = rs;
16376		opt[DTRACEOPT_CPU] = c;
16377
16378		break;
16379	}
16380
16381	case CPU_UNCONFIG:
16382		/*
16383		 * We don't free the buffer in the CPU_UNCONFIG case.  (The
16384		 * buffer will be freed when the consumer exits.)
16385		 */
16386		break;
16387
16388	default:
16389		break;
16390	}
16391
16392	mutex_exit(&dtrace_lock);
16393	return (0);
16394}
16395
16396#if defined(sun)
16397static void
16398dtrace_cpu_setup_initial(processorid_t cpu)
16399{
16400	(void) dtrace_cpu_setup(CPU_CONFIG, cpu);
16401}
16402#endif
16403
16404static void
16405dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16406{
16407	if (dtrace_toxranges >= dtrace_toxranges_max) {
16408		int osize, nsize;
16409		dtrace_toxrange_t *range;
16410
16411		osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16412
16413		if (osize == 0) {
16414			ASSERT(dtrace_toxrange == NULL);
16415			ASSERT(dtrace_toxranges_max == 0);
16416			dtrace_toxranges_max = 1;
16417		} else {
16418			dtrace_toxranges_max <<= 1;
16419		}
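
		/*
		 * The table grows by doubling, starting from a single
		 * entry; any existing contents are copied into the new
		 * allocation below before the old table is freed.
		 */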
16420
16421		nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16422		range = kmem_zalloc(nsize, KM_SLEEP);
16423
16424		if (dtrace_toxrange != NULL) {
16425			ASSERT(osize != 0);
16426			bcopy(dtrace_toxrange, range, osize);
16427			kmem_free(dtrace_toxrange, osize);
16428		}
16429
16430		dtrace_toxrange = range;
16431	}
16432
16433	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
16434	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
16435
16436	dtrace_toxrange[dtrace_toxranges].dtt_base = base;
16437	dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
16438	dtrace_toxranges++;
16439}
16440
16441static void
dtrace_getf_barrier(void)
16443{
16444#if defined(sun)
16445	/*
16446	 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
16447	 * that contain calls to getf(), this routine will be called on every
16448	 * closef() before either the underlying vnode is released or the
16449	 * file_t itself is freed.  By the time we are here, it is essential
16450	 * that the file_t can no longer be accessed from a call to getf()
16451	 * in probe context -- that assures that a dtrace_sync() can be used
16452	 * to clear out any enablings referring to the old structures.
16453	 */
16454	if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
16455	    kcred->cr_zone->zone_dtrace_getf != 0)
16456		dtrace_sync();
16457#endif
16458}
16459
16460/*
16461 * DTrace Driver Cookbook Functions
16462 */
16463#if defined(sun)
16464/*ARGSUSED*/
16465static int
16466dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
16467{
16468	dtrace_provider_id_t id;
16469	dtrace_state_t *state = NULL;
16470	dtrace_enabling_t *enab;
16471
16472	mutex_enter(&cpu_lock);
16473	mutex_enter(&dtrace_provider_lock);
16474	mutex_enter(&dtrace_lock);
16475
16476	if (ddi_soft_state_init(&dtrace_softstate,
16477	    sizeof (dtrace_state_t), 0) != 0) {
16478		cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
16479		mutex_exit(&cpu_lock);
16480		mutex_exit(&dtrace_provider_lock);
16481		mutex_exit(&dtrace_lock);
16482		return (DDI_FAILURE);
16483	}
16484
16485	if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
16486	    DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
16487	    ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
16488	    DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
16489		cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
16490		ddi_remove_minor_node(devi, NULL);
16491		ddi_soft_state_fini(&dtrace_softstate);
16492		mutex_exit(&cpu_lock);
16493		mutex_exit(&dtrace_provider_lock);
16494		mutex_exit(&dtrace_lock);
16495		return (DDI_FAILURE);
16496	}
16497
16498	ddi_report_dev(devi);
16499	dtrace_devi = devi;
16500
16501	dtrace_modload = dtrace_module_loaded;
16502	dtrace_modunload = dtrace_module_unloaded;
16503	dtrace_cpu_init = dtrace_cpu_setup_initial;
16504	dtrace_helpers_cleanup = dtrace_helpers_destroy;
16505	dtrace_helpers_fork = dtrace_helpers_duplicate;
16506	dtrace_cpustart_init = dtrace_suspend;
16507	dtrace_cpustart_fini = dtrace_resume;
16508	dtrace_debugger_init = dtrace_suspend;
16509	dtrace_debugger_fini = dtrace_resume;
16510
16511	register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16512
16513	ASSERT(MUTEX_HELD(&cpu_lock));
16514
16515	dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
16516	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
16517	dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
16518	    UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
16519	    VM_SLEEP | VMC_IDENTIFIER);
16520	dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
16521	    1, INT_MAX, 0);
16522
16523	dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
16524	    sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
16525	    NULL, NULL, NULL, NULL, NULL, 0);
16526
16527	ASSERT(MUTEX_HELD(&cpu_lock));
16528	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
16529	    offsetof(dtrace_probe_t, dtpr_nextmod),
16530	    offsetof(dtrace_probe_t, dtpr_prevmod));
16531
16532	dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
16533	    offsetof(dtrace_probe_t, dtpr_nextfunc),
16534	    offsetof(dtrace_probe_t, dtpr_prevfunc));
16535
16536	dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
16537	    offsetof(dtrace_probe_t, dtpr_nextname),
16538	    offsetof(dtrace_probe_t, dtpr_prevname));
16539
16540	if (dtrace_retain_max < 1) {
16541		cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
16542		    "setting to 1", dtrace_retain_max);
16543		dtrace_retain_max = 1;
16544	}
16545
16546	/*
16547	 * Now discover our toxic ranges.
16548	 */
16549	dtrace_toxic_ranges(dtrace_toxrange_add);
16550
16551	/*
16552	 * Before we register ourselves as a provider to our own framework,
16553	 * we would like to assert that dtrace_provider is NULL -- but that's
16554	 * not true if we were loaded as a dependency of a DTrace provider.
16555	 * Once we've registered, we can assert that dtrace_provider is our
16556	 * pseudo provider.
16557	 */
16558	(void) dtrace_register("dtrace", &dtrace_provider_attr,
16559	    DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
16560
16561	ASSERT(dtrace_provider != NULL);
16562	ASSERT((dtrace_provider_id_t)dtrace_provider == id);
16563
16564	dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16565	    dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
16566	dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16567	    dtrace_provider, NULL, NULL, "END", 0, NULL);
16568	dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16569	    dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
16570
16571	dtrace_anon_property();
16572	mutex_exit(&cpu_lock);
16573
16574	/*
16575	 * If DTrace helper tracing is enabled, we need to allocate the
16576	 * trace buffer and initialize the values.
16577	 */
16578	if (dtrace_helptrace_enabled) {
16579		ASSERT(dtrace_helptrace_buffer == NULL);
16580		dtrace_helptrace_buffer =
16581		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
16582		dtrace_helptrace_next = 0;
16583	}
16584
16585	/*
16586	 * If there are already providers, we must ask them to provide their
16587	 * probes, and then match any anonymous enabling against them.  Note
16588	 * that there should be no other retained enablings at this time:
16589	 * the only retained enablings at this time should be the anonymous
16590	 * enabling.
16591	 */
16592	if (dtrace_anon.dta_enabling != NULL) {
16593		ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
16594
16595		dtrace_enabling_provide(NULL);
16596		state = dtrace_anon.dta_state;
16597
16598		/*
16599		 * We couldn't hold cpu_lock across the above call to
16600		 * dtrace_enabling_provide(), but we must hold it to actually
16601		 * enable the probes.  We have to drop all of our locks, pick
16602		 * up cpu_lock, and regain our locks before matching the
16603		 * retained anonymous enabling.
16604		 */
16605		mutex_exit(&dtrace_lock);
16606		mutex_exit(&dtrace_provider_lock);
16607
16608		mutex_enter(&cpu_lock);
16609		mutex_enter(&dtrace_provider_lock);
16610		mutex_enter(&dtrace_lock);
16611
16612		if ((enab = dtrace_anon.dta_enabling) != NULL)
16613			(void) dtrace_enabling_match(enab, NULL);
16614
16615		mutex_exit(&cpu_lock);
16616	}
16617
16618	mutex_exit(&dtrace_lock);
16619	mutex_exit(&dtrace_provider_lock);
16620
16621	if (state != NULL) {
16622		/*
16623		 * If we created any anonymous state, set it going now.
16624		 */
16625		(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
16626	}
16627
16628	return (DDI_SUCCESS);
16629}
16630#endif
16631
16632#if !defined(sun)
16633#if __FreeBSD_version >= 800039
16634static void dtrace_dtr(void *);
16635#endif
16636#endif
16637
16638/*ARGSUSED*/
16639static int
16640#if defined(sun)
16641dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
16642#else
16643dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
16644#endif
16645{
16646	dtrace_state_t *state;
16647	uint32_t priv;
16648	uid_t uid;
16649	zoneid_t zoneid;
16650
16651#if defined(sun)
16652	if (getminor(*devp) == DTRACEMNRN_HELPER)
16653		return (0);
16654
16655	/*
16656	 * If this wasn't an open with the "helper" minor, then it must be
16657	 * the "dtrace" minor.
16658	 */
16659	if (getminor(*devp) == DTRACEMNRN_DTRACE)
16660		return (ENXIO);
16661#else
16662	cred_t *cred_p = NULL;
16663
16664#if __FreeBSD_version < 800039
16665	/*
16666	 * The first minor device is the one that is cloned so there is
16667	 * nothing more to do here.
16668	 */
16669	if (dev2unit(dev) == 0)
		return (0);
16671
16672	/*
16673	 * Devices are cloned, so if the DTrace state has already
16674	 * been allocated, that means this device belongs to a
16675	 * different client. Each client should open '/dev/dtrace'
16676	 * to get a cloned device.
16677	 */
16678	if (dev->si_drv1 != NULL)
16679		return (EBUSY);
16680#endif
16681
16682	cred_p = dev->si_cred;
16683#endif
16684
16685	/*
16686	 * If no DTRACE_PRIV_* bits are set in the credential, then the
16687	 * caller lacks sufficient permission to do anything with DTrace.
16688	 */
16689	dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
16690	if (priv == DTRACE_PRIV_NONE) {
16691#if !defined(sun)
16692#if __FreeBSD_version < 800039
16693		/* Destroy the cloned device. */
		destroy_dev(dev);
16695#endif
16696#endif
16697
16698		return (EACCES);
16699	}
16700
16701	/*
16702	 * Ask all providers to provide all their probes.
16703	 */
16704	mutex_enter(&dtrace_provider_lock);
16705	dtrace_probe_provide(NULL, NULL);
16706	mutex_exit(&dtrace_provider_lock);
16707
16708	mutex_enter(&cpu_lock);
16709	mutex_enter(&dtrace_lock);
16710	dtrace_opens++;
16711	dtrace_membar_producer();
16712
16713#if defined(sun)
16714	/*
16715	 * If the kernel debugger is active (that is, if the kernel debugger
16716	 * modified text in some way), we won't allow the open.
16717	 */
16718	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
16719		dtrace_opens--;
16720		mutex_exit(&cpu_lock);
16721		mutex_exit(&dtrace_lock);
16722		return (EBUSY);
16723	}
16724
16725	state = dtrace_state_create(devp, cred_p);
16726#else
16727	state = dtrace_state_create(dev);
16728#if __FreeBSD_version < 800039
16729	dev->si_drv1 = state;
16730#else
16731	devfs_set_cdevpriv(state, dtrace_dtr);
16732#endif
16733#endif
16734
16735	mutex_exit(&cpu_lock);
16736
16737	if (state == NULL) {
16738#if defined(sun)
16739		if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
16740			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16741#else
16742		--dtrace_opens;
16743#endif
16744		mutex_exit(&dtrace_lock);
16745#if !defined(sun)
16746#if __FreeBSD_version < 800039
16747		/* Destroy the cloned device. */
		destroy_dev(dev);
16749#endif
16750#endif
16751		return (EAGAIN);
16752	}
16753
16754	mutex_exit(&dtrace_lock);
16755
16756	return (0);
16757}
16758
16759/*ARGSUSED*/
16760#if defined(sun)
16761static int
16762dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
16763#elif __FreeBSD_version < 800039
16764static int
16765dtrace_close(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
16766#else
16767static void
16768dtrace_dtr(void *data)
16769#endif
16770{
16771#if defined(sun)
16772	minor_t minor = getminor(dev);
16773	dtrace_state_t *state;
16774
16775	if (minor == DTRACEMNRN_HELPER)
16776		return (0);
16777
16778	state = ddi_get_soft_state(dtrace_softstate, minor);
16779#else
16780#if __FreeBSD_version < 800039
16781	dtrace_state_t *state = dev->si_drv1;
16782
16783	/* Check if this is not a cloned device. */
16784	if (dev2unit(dev) == 0)
16785		return (0);
16786#else
16787	dtrace_state_t *state = data;
16788#endif
16789
16790#endif
16791
16792	mutex_enter(&cpu_lock);
16793	mutex_enter(&dtrace_lock);
16794
16795	if (state != NULL) {
16796		if (state->dts_anon) {
16797			/*
16798			 * There is anonymous state. Destroy that first.
16799			 */
16800			ASSERT(dtrace_anon.dta_state == NULL);
16801			dtrace_state_destroy(state->dts_anon);
16802		}
16803
16804		dtrace_state_destroy(state);
16805
16806#if !defined(sun)
16807		kmem_free(state, 0);
16808#if __FreeBSD_version < 800039
16809		dev->si_drv1 = NULL;
16810#endif
16811#endif
16812	}
16813
16814	ASSERT(dtrace_opens > 0);
16815#if defined(sun)
16816	/*
16817	 * Only relinquish control of the kernel debugger interface when there
16818	 * are no consumers and no anonymous enablings.
16819	 */
16820	if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
16821		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16822#else
16823	--dtrace_opens;
16824#endif
16825
16826	mutex_exit(&dtrace_lock);
16827	mutex_exit(&cpu_lock);
16828
16829#if __FreeBSD_version < 800039
16830	/* Schedule this cloned device to be destroyed. */
16831	destroy_dev_sched(dev);
16832#endif
16833
16834#if defined(sun) || __FreeBSD_version < 800039
16835	return (0);
16836#endif
16837}
16838
16839#if defined(sun)
16840/*ARGSUSED*/
16841static int
16842dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
16843{
16844	int rval;
16845	dof_helper_t help, *dhp = NULL;
16846
16847	switch (cmd) {
16848	case DTRACEHIOC_ADDDOF:
16849		if (copyin((void *)arg, &help, sizeof (help)) != 0) {
16850			dtrace_dof_error(NULL, "failed to copyin DOF helper");
16851			return (EFAULT);
16852		}
16853
16854		dhp = &help;
16855		arg = (intptr_t)help.dofhp_dof;
16856		/*FALLTHROUGH*/
16857
16858	case DTRACEHIOC_ADD: {
16859		dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
16860
16861		if (dof == NULL)
16862			return (rval);
16863
16864		mutex_enter(&dtrace_lock);
16865
16866		/*
16867		 * dtrace_helper_slurp() takes responsibility for the dof --
16868		 * it may free it now or it may save it and free it later.
16869		 */
16870		if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
16871			*rv = rval;
16872			rval = 0;
16873		} else {
16874			rval = EINVAL;
16875		}
16876
16877		mutex_exit(&dtrace_lock);
16878		return (rval);
16879	}
16880
16881	case DTRACEHIOC_REMOVE: {
16882		mutex_enter(&dtrace_lock);
16883		rval = dtrace_helper_destroygen(arg);
16884		mutex_exit(&dtrace_lock);
16885
16886		return (rval);
16887	}
16888
16889	default:
16890		break;
16891	}
16892
16893	return (ENOTTY);
16894}
16895
16896/*ARGSUSED*/
16897static int
16898dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
16899{
16900	minor_t minor = getminor(dev);
16901	dtrace_state_t *state;
16902	int rval;
16903
16904	if (minor == DTRACEMNRN_HELPER)
16905		return (dtrace_ioctl_helper(cmd, arg, rv));
16906
16907	state = ddi_get_soft_state(dtrace_softstate, minor);
16908
16909	if (state->dts_anon) {
16910		ASSERT(dtrace_anon.dta_state == NULL);
16911		state = state->dts_anon;
16912	}
16913
16914	switch (cmd) {
16915	case DTRACEIOC_PROVIDER: {
16916		dtrace_providerdesc_t pvd;
16917		dtrace_provider_t *pvp;
16918
16919		if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
16920			return (EFAULT);
16921
16922		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
16923		mutex_enter(&dtrace_provider_lock);
16924
16925		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
16926			if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
16927				break;
16928		}
16929
16930		mutex_exit(&dtrace_provider_lock);
16931
16932		if (pvp == NULL)
16933			return (ESRCH);
16934
16935		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
16936		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
16937
16938		if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
16939			return (EFAULT);
16940
16941		return (0);
16942	}
16943
16944	case DTRACEIOC_EPROBE: {
16945		dtrace_eprobedesc_t epdesc;
16946		dtrace_ecb_t *ecb;
16947		dtrace_action_t *act;
16948		void *buf;
16949		size_t size;
16950		uintptr_t dest;
16951		int nrecs;
16952
16953		if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
16954			return (EFAULT);
16955
16956		mutex_enter(&dtrace_lock);
16957
16958		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
16959			mutex_exit(&dtrace_lock);
16960			return (EINVAL);
16961		}
16962
16963		if (ecb->dte_probe == NULL) {
16964			mutex_exit(&dtrace_lock);
16965			return (EINVAL);
16966		}
16967
16968		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
16969		epdesc.dtepd_uarg = ecb->dte_uarg;
16970		epdesc.dtepd_size = ecb->dte_size;
16971
16972		nrecs = epdesc.dtepd_nrecs;
16973		epdesc.dtepd_nrecs = 0;
16974		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
16975			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
16976				continue;
16977
16978			epdesc.dtepd_nrecs++;
16979		}
16980
16981		/*
16982		 * Now that we have the size, we need to allocate a temporary
16983		 * buffer in which to store the complete description.  We need
16984		 * the temporary buffer to be able to drop dtrace_lock()
16985		 * across the copyout(), below.
16986		 */
16987		size = sizeof (dtrace_eprobedesc_t) +
16988		    (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
16989
16990		buf = kmem_alloc(size, KM_SLEEP);
16991		dest = (uintptr_t)buf;
16992
16993		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
16994		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
16995
16996		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
16997			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
16998				continue;
16999
17000			if (nrecs-- == 0)
17001				break;
17002
17003			bcopy(&act->dta_rec, (void *)dest,
17004			    sizeof (dtrace_recdesc_t));
17005			dest += sizeof (dtrace_recdesc_t);
17006		}
17007
17008		mutex_exit(&dtrace_lock);
17009
17010		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17011			kmem_free(buf, size);
17012			return (EFAULT);
17013		}
17014
17015		kmem_free(buf, size);
17016		return (0);
17017	}
17018
17019	case DTRACEIOC_AGGDESC: {
17020		dtrace_aggdesc_t aggdesc;
17021		dtrace_action_t *act;
17022		dtrace_aggregation_t *agg;
17023		int nrecs;
17024		uint32_t offs;
17025		dtrace_recdesc_t *lrec;
17026		void *buf;
17027		size_t size;
17028		uintptr_t dest;
17029
17030		if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
17031			return (EFAULT);
17032
17033		mutex_enter(&dtrace_lock);
17034
17035		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17036			mutex_exit(&dtrace_lock);
17037			return (EINVAL);
17038		}
17039
17040		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17041
17042		nrecs = aggdesc.dtagd_nrecs;
17043		aggdesc.dtagd_nrecs = 0;
17044
17045		offs = agg->dtag_base;
17046		lrec = &agg->dtag_action.dta_rec;
17047		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
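		/*
		 * For example (illustrative numbers only): with dtag_base
		 * at 16 and a final record at offset 24 with size 8, the
		 * aggregation's data spans [16, 32) and dtagd_size is
		 * 24 + 8 - 16 = 16.
		 */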
17048
17049		for (act = agg->dtag_first; ; act = act->dta_next) {
17050			ASSERT(act->dta_intuple ||
17051			    DTRACEACT_ISAGG(act->dta_kind));
17052
17053			/*
17054			 * If this action has a record size of zero, it
17055			 * denotes an argument to the aggregating action.
17056			 * Because the presence of this record doesn't (or
17057			 * shouldn't) affect the way the data is interpreted,
17058		 * we don't copy it out, sparing user level the
17059		 * confusion of dealing with a zero-length record.
17060			 */
17061			if (act->dta_rec.dtrd_size == 0) {
17062				ASSERT(agg->dtag_hasarg);
17063				continue;
17064			}
17065
17066			aggdesc.dtagd_nrecs++;
17067
17068			if (act == &agg->dtag_action)
17069				break;
17070		}
17071
17072		/*
17073		 * Now that we have the size, we need to allocate a temporary
17074		 * buffer in which to store the complete description.  We need
17075		 * the temporary buffer to be able to drop dtrace_lock
17076		 * across the copyout(), below.
17077		 */
17078		size = sizeof (dtrace_aggdesc_t) +
17079		    (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17080
17081		buf = kmem_alloc(size, KM_SLEEP);
17082		dest = (uintptr_t)buf;
17083
17084		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17085		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17086
17087		for (act = agg->dtag_first; ; act = act->dta_next) {
17088			dtrace_recdesc_t rec = act->dta_rec;
17089
17090			/*
17091			 * See the comment in the above loop for why we pass
17092			 * over zero-length records.
17093			 */
17094			if (rec.dtrd_size == 0) {
17095				ASSERT(agg->dtag_hasarg);
17096				continue;
17097			}
17098
17099			if (nrecs-- == 0)
17100				break;
17101
17102			rec.dtrd_offset -= offs;
17103			bcopy(&rec, (void *)dest, sizeof (rec));
17104			dest += sizeof (dtrace_recdesc_t);
17105
17106			if (act == &agg->dtag_action)
17107				break;
17108		}
17109
17110		mutex_exit(&dtrace_lock);
17111
17112		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17113			kmem_free(buf, size);
17114			return (EFAULT);
17115		}
17116
17117		kmem_free(buf, size);
17118		return (0);
17119	}
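	/*
	 * As with DTRACEIOC_EPROBE above, a consumer typically calls
	 * DTRACEIOC_AGGDESC twice: first with dtagd_nrecs of zero to size
	 * the description, then with a buffer large enough to hold it.
	 */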
17120
17121	case DTRACEIOC_ENABLE: {
17122		dof_hdr_t *dof;
17123		dtrace_enabling_t *enab = NULL;
17124		dtrace_vstate_t *vstate;
17125		int err = 0;
17126
17127		*rv = 0;
17128
17129		/*
17130		 * If a NULL argument has been passed, we take this as our
17131		 * cue to reevaluate our enablings.
17132		 */
17133		if (arg == NULL) {
17134			dtrace_enabling_matchall();
17135
17136			return (0);
17137		}
17138
17139		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17140			return (rval);
17141
17142		mutex_enter(&cpu_lock);
17143		mutex_enter(&dtrace_lock);
17144		vstate = &state->dts_vstate;
17145
17146		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17147			mutex_exit(&dtrace_lock);
17148			mutex_exit(&cpu_lock);
17149			dtrace_dof_destroy(dof);
17150			return (EBUSY);
17151		}
17152
17153		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17154			mutex_exit(&dtrace_lock);
17155			mutex_exit(&cpu_lock);
17156			dtrace_dof_destroy(dof);
17157			return (EINVAL);
17158		}
17159
17160		if ((rval = dtrace_dof_options(dof, state)) != 0) {
17161			dtrace_enabling_destroy(enab);
17162			mutex_exit(&dtrace_lock);
17163			mutex_exit(&cpu_lock);
17164			dtrace_dof_destroy(dof);
17165			return (rval);
17166		}
17167
17168		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
17169			err = dtrace_enabling_retain(enab);
17170		} else {
17171			dtrace_enabling_destroy(enab);
17172		}
17173
17174		mutex_exit(&cpu_lock);
17175		mutex_exit(&dtrace_lock);
17176		dtrace_dof_destroy(dof);
17177
17178		return (err);
17179	}
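	/*
	 * An illustrative consumer-side use of DTRACEIOC_ENABLE (a
	 * sketch; "dof" is assumed to point at a complete DOF image such
	 * as the one libdtrace builds from a compiled program):
	 *
	 *	int matched = ioctl(fd, DTRACEIOC_ENABLE, dof);
	 *
	 * On Solaris the matched-probe count stored through rv above is
	 * what the consumer sees as the ioctl(2) return value; passing a
	 * NULL argument instead asks the framework to reevaluate its
	 * retained enablings.
	 */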
17180
17181	case DTRACEIOC_REPLICATE: {
17182		dtrace_repldesc_t desc;
17183		dtrace_probedesc_t *match = &desc.dtrpd_match;
17184		dtrace_probedesc_t *create = &desc.dtrpd_create;
17185		int err;
17186
17187		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17188			return (EFAULT);
17189
17190		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17191		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17192		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17193		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17194
17195		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17196		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17197		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17198		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17199
17200		mutex_enter(&dtrace_lock);
17201		err = dtrace_enabling_replicate(state, match, create);
17202		mutex_exit(&dtrace_lock);
17203
17204		return (err);
17205	}
17206
17207	case DTRACEIOC_PROBEMATCH:
17208	case DTRACEIOC_PROBES: {
17209		dtrace_probe_t *probe = NULL;
17210		dtrace_probedesc_t desc;
17211		dtrace_probekey_t pkey;
17212		dtrace_id_t i;
17213		int m = 0;
17214		uint32_t priv;
17215		uid_t uid;
17216		zoneid_t zoneid;
17217
17218		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17219			return (EFAULT);
17220
17221		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17222		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17223		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17224		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17225
17226		/*
17227		 * Before we attempt to match this probe, we want to give
17228		 * all providers the opportunity to provide it.
17229		 */
17230		if (desc.dtpd_id == DTRACE_IDNONE) {
17231			mutex_enter(&dtrace_provider_lock);
17232			dtrace_probe_provide(&desc, NULL);
17233			mutex_exit(&dtrace_provider_lock);
17234			desc.dtpd_id++;
17235		}
17236
17237		if (cmd == DTRACEIOC_PROBEMATCH)  {
17238			dtrace_probekey(&desc, &pkey);
17239			pkey.dtpk_id = DTRACE_IDNONE;
17240		}
17241
17242		dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17243
17244		mutex_enter(&dtrace_lock);
17245
17246		if (cmd == DTRACEIOC_PROBEMATCH) {
17247			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17248				if ((probe = dtrace_probes[i - 1]) != NULL &&
17249				    (m = dtrace_match_probe(probe, &pkey,
17250				    priv, uid, zoneid)) != 0)
17251					break;
17252			}
17253
17254			if (m < 0) {
17255				mutex_exit(&dtrace_lock);
17256				return (EINVAL);
17257			}
17258
17259		} else {
17260			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17261				if ((probe = dtrace_probes[i - 1]) != NULL &&
17262				    dtrace_match_priv(probe, priv, uid, zoneid))
17263					break;
17264			}
17265		}
17266
17267		if (probe == NULL) {
17268			mutex_exit(&dtrace_lock);
17269			return (ESRCH);
17270		}
17271
17272		dtrace_probe_description(probe, &desc);
17273		mutex_exit(&dtrace_lock);
17274
17275		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17276			return (EFAULT);
17277
17278		return (0);
17279	}
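	/*
	 * A sketch of how a consumer might enumerate every probe visible
	 * to it (illustrative only; error handling elided):
	 *
	 *	dtrace_probedesc_t desc;
	 *	bzero(&desc, sizeof (desc));
	 *	while (ioctl(fd, DTRACEIOC_PROBES, &desc) == 0) {
	 *		(process desc here)
	 *		desc.dtpd_id++;
	 *	}
	 *
	 * Each call returns the first matching probe whose ID is at least
	 * dtpd_id, so advancing dtpd_id past the returned probe resumes
	 * the walk; ESRCH ends it.
	 */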
17280
17281	case DTRACEIOC_PROBEARG: {
17282		dtrace_argdesc_t desc;
17283		dtrace_probe_t *probe;
17284		dtrace_provider_t *prov;
17285
17286		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17287			return (EFAULT);
17288
17289		if (desc.dtargd_id == DTRACE_IDNONE)
17290			return (EINVAL);
17291
17292		if (desc.dtargd_ndx == DTRACE_ARGNONE)
17293			return (EINVAL);
17294
17295		mutex_enter(&dtrace_provider_lock);
17296		mutex_enter(&mod_lock);
17297		mutex_enter(&dtrace_lock);
17298
17299		if (desc.dtargd_id > dtrace_nprobes) {
17300			mutex_exit(&dtrace_lock);
17301			mutex_exit(&mod_lock);
17302			mutex_exit(&dtrace_provider_lock);
17303			return (EINVAL);
17304		}
17305
17306		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
17307			mutex_exit(&dtrace_lock);
17308			mutex_exit(&mod_lock);
17309			mutex_exit(&dtrace_provider_lock);
17310			return (EINVAL);
17311		}
17312
17313		mutex_exit(&dtrace_lock);
17314
17315		prov = probe->dtpr_provider;
17316
17317		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
17318			/*
17319			 * There isn't any typed information for this probe.
17320			 * Set the argument number to DTRACE_ARGNONE.
17321			 */
17322			desc.dtargd_ndx = DTRACE_ARGNONE;
17323		} else {
17324			desc.dtargd_native[0] = '\0';
17325			desc.dtargd_xlate[0] = '\0';
17326			desc.dtargd_mapping = desc.dtargd_ndx;
17327
17328			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
17329			    probe->dtpr_id, probe->dtpr_arg, &desc);
17330		}
17331
17332		mutex_exit(&mod_lock);
17333		mutex_exit(&dtrace_provider_lock);
17334
17335		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17336			return (EFAULT);
17337
17338		return (0);
17339	}
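	/*
	 * An illustrative use of DTRACEIOC_PROBEARG (a sketch; "id" is an
	 * assumed probe ID):
	 *
	 *	dtrace_argdesc_t adesc;
	 *	bzero(&adesc, sizeof (adesc));
	 *	adesc.dtargd_id = id;
	 *	adesc.dtargd_ndx = 0;	(describe args[0])
	 *	if (ioctl(fd, DTRACEIOC_PROBEARG, &adesc) == 0 &&
	 *	    adesc.dtargd_ndx != DTRACE_ARGNONE) {
	 *		(adesc.dtargd_native names the argument's type)
	 *	}
	 */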
17340
17341	case DTRACEIOC_GO: {
17342		processorid_t cpuid;
17343		rval = dtrace_state_go(state, &cpuid);
17344
17345		if (rval != 0)
17346			return (rval);
17347
17348		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17349			return (EFAULT);
17350
17351		return (0);
17352	}
17353
17354	case DTRACEIOC_STOP: {
17355		processorid_t cpuid;
17356
17357		mutex_enter(&dtrace_lock);
17358		rval = dtrace_state_stop(state, &cpuid);
17359		mutex_exit(&dtrace_lock);
17360
17361		if (rval != 0)
17362			return (rval);
17363
17364		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17365			return (EFAULT);
17366
17367		return (0);
17368	}
17369
17370	case DTRACEIOC_DOFGET: {
17371		dof_hdr_t hdr, *dof;
17372		uint64_t len;
17373
17374		if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
17375			return (EFAULT);
17376
17377		mutex_enter(&dtrace_lock);
17378		dof = dtrace_dof_create(state);
17379		mutex_exit(&dtrace_lock);
17380
17381		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
17382		rval = copyout(dof, (void *)arg, len);
17383		dtrace_dof_destroy(dof);
17384
17385		return (rval == 0 ? 0 : EFAULT);
17386	}
17387
17388	case DTRACEIOC_AGGSNAP:
17389	case DTRACEIOC_BUFSNAP: {
17390		dtrace_bufdesc_t desc;
17391		caddr_t cached;
17392		dtrace_buffer_t *buf;
17393
17394		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17395			return (EFAULT);
17396
17397		if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
17398			return (EINVAL);
17399
17400		mutex_enter(&dtrace_lock);
17401
17402		if (cmd == DTRACEIOC_BUFSNAP) {
17403			buf = &state->dts_buffer[desc.dtbd_cpu];
17404		} else {
17405			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
17406		}
17407
17408		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
17409			size_t sz = buf->dtb_offset;
17410
17411			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
17412				mutex_exit(&dtrace_lock);
17413				return (EBUSY);
17414			}
17415
17416			/*
17417			 * If this buffer has already been consumed, we're
17418			 * going to indicate that there's nothing left here
17419			 * to consume.
17420			 */
17421			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
17422				mutex_exit(&dtrace_lock);
17423
17424				desc.dtbd_size = 0;
17425				desc.dtbd_drops = 0;
17426				desc.dtbd_errors = 0;
17427				desc.dtbd_oldest = 0;
17428				sz = sizeof (desc);
17429
17430				if (copyout(&desc, (void *)arg, sz) != 0)
17431					return (EFAULT);
17432
17433				return (0);
17434			}
17435
17436			/*
17437			 * If this is a ring buffer that has wrapped, we want
17438			 * to copy the whole thing out.
17439			 */
17440			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
17441				dtrace_buffer_polish(buf);
17442				sz = buf->dtb_size;
17443			}
17444
17445			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
17446				mutex_exit(&dtrace_lock);
17447				return (EFAULT);
17448			}
17449
17450			desc.dtbd_size = sz;
17451			desc.dtbd_drops = buf->dtb_drops;
17452			desc.dtbd_errors = buf->dtb_errors;
17453			desc.dtbd_oldest = buf->dtb_xamot_offset;
17454			desc.dtbd_timestamp = dtrace_gethrtime();
17455
17456			mutex_exit(&dtrace_lock);
17457
17458			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17459				return (EFAULT);
17460
17461			buf->dtb_flags |= DTRACEBUF_CONSUMED;
17462
17463			return (0);
17464		}
17465
17466		if (buf->dtb_tomax == NULL) {
17467			ASSERT(buf->dtb_xamot == NULL);
17468			mutex_exit(&dtrace_lock);
17469			return (ENOENT);
17470		}
17471
17472		cached = buf->dtb_tomax;
17473		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
17474
17475		dtrace_xcall(desc.dtbd_cpu,
17476		    (dtrace_xcall_t)dtrace_buffer_switch, buf);
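		/*
		 * The cross call runs dtrace_buffer_switch() on the
		 * target CPU, exchanging the active (dtb_tomax) and
		 * inactive (dtb_xamot) buffers atomically with respect to
		 * probe context; the retired snapshot is then copied out
		 * of dtb_xamot below.
		 */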
17477
17478		state->dts_errors += buf->dtb_xamot_errors;
17479
17480		/*
17481		 * If the buffers did not actually switch, then the cross call
17482		 * did not take place -- presumably because the given CPU is
17483		 * not in the ready set.  If this is the case, we'll return
17484		 * ENOENT.
17485		 */
17486		if (buf->dtb_tomax == cached) {
17487			ASSERT(buf->dtb_xamot != cached);
17488			mutex_exit(&dtrace_lock);
17489			return (ENOENT);
17490		}
17491
17492		ASSERT(cached == buf->dtb_xamot);
17493
17494		/*
17495		 * We have our snapshot; now copy it out.
17496		 */
17497		if (copyout(buf->dtb_xamot, desc.dtbd_data,
17498		    buf->dtb_xamot_offset) != 0) {
17499			mutex_exit(&dtrace_lock);
17500			return (EFAULT);
17501		}
17502
17503		desc.dtbd_size = buf->dtb_xamot_offset;
17504		desc.dtbd_drops = buf->dtb_xamot_drops;
17505		desc.dtbd_errors = buf->dtb_xamot_errors;
17506		desc.dtbd_oldest = 0;
17507		desc.dtbd_timestamp = buf->dtb_switched;
17508
17509		mutex_exit(&dtrace_lock);
17510
17511		/*
17512		 * Finally, copy out the buffer description.
17513		 */
17514		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17515			return (EFAULT);
17516
17517		return (0);
17518	}
17519
17520	case DTRACEIOC_CONF: {
17521		dtrace_conf_t conf;
17522
17523		bzero(&conf, sizeof (conf));
17524		conf.dtc_difversion = DIF_VERSION;
17525		conf.dtc_difintregs = DIF_DIR_NREGS;
17526		conf.dtc_diftupregs = DIF_DTR_NREGS;
17527		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
17528
17529		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
17530			return (EFAULT);
17531
17532		return (0);
17533	}
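	/*
	 * DTRACEIOC_CONF is a pure query -- illustratively:
	 *
	 *	dtrace_conf_t conf;
	 *	if (ioctl(fd, DTRACEIOC_CONF, &conf) == 0)
	 *		(conf.dtc_difversion is the DIF version that the
	 *		framework implements)
	 */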
17534
17535	case DTRACEIOC_STATUS: {
17536		dtrace_status_t stat;
17537		dtrace_dstate_t *dstate;
17538		int i, j;
17539		uint64_t nerrs;
17540
17541		/*
17542		 * See the comment in dtrace_state_deadman() for the reason
17543		 * for setting dts_laststatus to INT64_MAX before setting
17544		 * it to the correct value.
17545		 */
17546		state->dts_laststatus = INT64_MAX;
17547		dtrace_membar_producer();
17548		state->dts_laststatus = dtrace_gethrtime();
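		/*
		 * This ioctl doubles as the consumer's heartbeat:
		 * updating dts_laststatus here is what prevents
		 * dtrace_state_deadman() from concluding that the
		 * consumer has wedged and killing the state.
		 */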
17549
17550		bzero(&stat, sizeof (stat));
17551
17552		mutex_enter(&dtrace_lock);
17553
17554		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
17555			mutex_exit(&dtrace_lock);
17556			return (ENOENT);
17557		}
17558
17559		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
17560			stat.dtst_exiting = 1;
17561
17562		nerrs = state->dts_errors;
17563		dstate = &state->dts_vstate.dtvs_dynvars;
17564
17565		for (i = 0; i < NCPU; i++) {
17566			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
17567
17568			stat.dtst_dyndrops += dcpu->dtdsc_drops;
17569			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
17570			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
17571
17572			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
17573				stat.dtst_filled++;
17574
17575			nerrs += state->dts_buffer[i].dtb_errors;
17576
17577			for (j = 0; j < state->dts_nspeculations; j++) {
17578				dtrace_speculation_t *spec;
17579				dtrace_buffer_t *buf;
17580
17581				spec = &state->dts_speculations[j];
17582				buf = &spec->dtsp_buffer[i];
17583				stat.dtst_specdrops += buf->dtb_xamot_drops;
17584			}
17585		}
17586
17587		stat.dtst_specdrops_busy = state->dts_speculations_busy;
17588		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
17589		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
17590		stat.dtst_dblerrors = state->dts_dblerrors;
17591		stat.dtst_killed =
17592		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
17593		stat.dtst_errors = nerrs;
17594
17595		mutex_exit(&dtrace_lock);
17596
17597		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
17598			return (EFAULT);
17599
17600		return (0);
17601	}
17602
17603	case DTRACEIOC_FORMAT: {
17604		dtrace_fmtdesc_t fmt;
17605		char *str;
17606		int len;
17607
17608		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
17609			return (EFAULT);
17610
17611		mutex_enter(&dtrace_lock);
17612
17613		if (fmt.dtfd_format == 0 ||
17614		    fmt.dtfd_format > state->dts_nformats) {
17615			mutex_exit(&dtrace_lock);
17616			return (EINVAL);
17617		}
17618
17619		/*
17620		 * Format strings are allocated contiguously and they are
17621		 * never freed; if a format index is less than the number
17622		 * of formats, we can assert that the format map is non-NULL
17623		 * and that the format for the specified index is non-NULL.
17624		 */
17625		ASSERT(state->dts_formats != NULL);
17626		str = state->dts_formats[fmt.dtfd_format - 1];
17627		ASSERT(str != NULL);
17628
17629		len = strlen(str) + 1;
17630
17631		if (len > fmt.dtfd_length) {
17632			fmt.dtfd_length = len;
17633
17634			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
17635				mutex_exit(&dtrace_lock);
17636				return (EFAULT);
17637			}
17638		} else {
17639			if (copyout(str, fmt.dtfd_string, len) != 0) {
17640				mutex_exit(&dtrace_lock);
17641				return (EFAULT);
17642			}
17643		}
17644
17645		mutex_exit(&dtrace_lock);
17646		return (0);
17647	}
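	/*
	 * A sketch of the retry pattern a consumer might use with
	 * DTRACEIOC_FORMAT ("id", "buf" and "buflen" are assumed):
	 *
	 *	dtrace_fmtdesc_t fmt;
	 *	bzero(&fmt, sizeof (fmt));
	 *	fmt.dtfd_format = id;
	 *	fmt.dtfd_string = buf;
	 *	fmt.dtfd_length = buflen;
	 *	if (ioctl(fd, DTRACEIOC_FORMAT, &fmt) == 0 &&
	 *	    fmt.dtfd_length > buflen)
	 *		(grow buf to fmt.dtfd_length and retry)
	 */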
17648
17649	default:
17650		break;
17651	}
17652
17653	return (ENOTTY);
17654}
17655
17656/*ARGSUSED*/
17657static int
17658dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
17659{
17660	dtrace_state_t *state;
17661
17662	switch (cmd) {
17663	case DDI_DETACH:
17664		break;
17665
17666	case DDI_SUSPEND:
17667		return (DDI_SUCCESS);
17668
17669	default:
17670		return (DDI_FAILURE);
17671	}
17672
17673	mutex_enter(&cpu_lock);
17674	mutex_enter(&dtrace_provider_lock);
17675	mutex_enter(&dtrace_lock);
17676
17677	ASSERT(dtrace_opens == 0);
17678
17679	if (dtrace_helpers > 0) {
17680		mutex_exit(&dtrace_provider_lock);
17681		mutex_exit(&dtrace_lock);
17682		mutex_exit(&cpu_lock);
17683		return (DDI_FAILURE);
17684	}
17685
17686	if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
17687		mutex_exit(&dtrace_provider_lock);
17688		mutex_exit(&dtrace_lock);
17689		mutex_exit(&cpu_lock);
17690		return (DDI_FAILURE);
17691	}
17692
17693	dtrace_provider = NULL;
17694
17695	if ((state = dtrace_anon_grab()) != NULL) {
17696		/*
17697		 * If there were ECBs on this state, the provider should
17698		 * not have been allowed to detach; assert that there are
17699		 * none.
17700		 */
17701		ASSERT(state->dts_necbs == 0);
17702		dtrace_state_destroy(state);
17703
17704		/*
17705		 * If we're being detached with anonymous state, we need to
17706		 * indicate to the kernel debugger that DTrace is now inactive.
17707		 */
17708		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17709	}
17710
17711	bzero(&dtrace_anon, sizeof (dtrace_anon_t));
17712	unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17713	dtrace_cpu_init = NULL;
17714	dtrace_helpers_cleanup = NULL;
17715	dtrace_helpers_fork = NULL;
17716	dtrace_cpustart_init = NULL;
17717	dtrace_cpustart_fini = NULL;
17718	dtrace_debugger_init = NULL;
17719	dtrace_debugger_fini = NULL;
17720	dtrace_modload = NULL;
17721	dtrace_modunload = NULL;
17722
17723	ASSERT(dtrace_getf == 0);
17724	ASSERT(dtrace_closef == NULL);
17725
17726	mutex_exit(&cpu_lock);
17727
17728	if (dtrace_helptrace_enabled) {
17729		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
17730		dtrace_helptrace_buffer = NULL;
17731	}
17732
17733	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
17734	dtrace_probes = NULL;
17735	dtrace_nprobes = 0;
17736
17737	dtrace_hash_destroy(dtrace_bymod);
17738	dtrace_hash_destroy(dtrace_byfunc);
17739	dtrace_hash_destroy(dtrace_byname);
17740	dtrace_bymod = NULL;
17741	dtrace_byfunc = NULL;
17742	dtrace_byname = NULL;
17743
17744	kmem_cache_destroy(dtrace_state_cache);
17745	vmem_destroy(dtrace_minor);
17746	vmem_destroy(dtrace_arena);
17747
17748	if (dtrace_toxrange != NULL) {
17749		kmem_free(dtrace_toxrange,
17750		    dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
17751		dtrace_toxrange = NULL;
17752		dtrace_toxranges = 0;
17753		dtrace_toxranges_max = 0;
17754	}
17755
17756	ddi_remove_minor_node(dtrace_devi, NULL);
17757	dtrace_devi = NULL;
17758
17759	ddi_soft_state_fini(&dtrace_softstate);
17760
17761	ASSERT(dtrace_vtime_references == 0);
17762	ASSERT(dtrace_opens == 0);
17763	ASSERT(dtrace_retained == NULL);
17764
17765	mutex_exit(&dtrace_lock);
17766	mutex_exit(&dtrace_provider_lock);
17767
17768	/*
17769	 * We don't destroy the task queue until after we have dropped our
17770	 * locks (taskq_destroy() may block on running tasks).  To prevent
17771	 * attempting to do work after we have effectively detached but before
17772	 * the task queue has been destroyed, all tasks dispatched via the
17773	 * task queue must check that DTrace is still attached before
17774	 * performing any operation.
17775	 */
17776	taskq_destroy(dtrace_taskq);
17777	dtrace_taskq = NULL;
17778
17779	return (DDI_SUCCESS);
17780}
17781#endif
17782
17783#if defined(sun)
17784/*ARGSUSED*/
17785static int
17786dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
17787{
17788	int error;
17789
17790	switch (infocmd) {
17791	case DDI_INFO_DEVT2DEVINFO:
17792		*result = (void *)dtrace_devi;
17793		error = DDI_SUCCESS;
17794		break;
17795	case DDI_INFO_DEVT2INSTANCE:
17796		*result = (void *)0;
17797		error = DDI_SUCCESS;
17798		break;
17799	default:
17800		error = DDI_FAILURE;
17801	}
17802	return (error);
17803}
17804#endif
17805
17806#if defined(sun)
17807static struct cb_ops dtrace_cb_ops = {
17808	dtrace_open,		/* open */
17809	dtrace_close,		/* close */
17810	nulldev,		/* strategy */
17811	nulldev,		/* print */
17812	nodev,			/* dump */
17813	nodev,			/* read */
17814	nodev,			/* write */
17815	dtrace_ioctl,		/* ioctl */
17816	nodev,			/* devmap */
17817	nodev,			/* mmap */
17818	nodev,			/* segmap */
17819	nochpoll,		/* poll */
17820	ddi_prop_op,		/* cb_prop_op */
17821	0,			/* streamtab  */
17822	D_NEW | D_MP		/* Driver compatibility flag */
17823};
17824
17825static struct dev_ops dtrace_ops = {
17826	DEVO_REV,		/* devo_rev */
17827	0,			/* refcnt */
17828	dtrace_info,		/* get_dev_info */
17829	nulldev,		/* identify */
17830	nulldev,		/* probe */
17831	dtrace_attach,		/* attach */
17832	dtrace_detach,		/* detach */
17833	nodev,			/* reset */
17834	&dtrace_cb_ops,		/* driver operations */
17835	NULL,			/* bus operations */
17836	nodev			/* dev power */
17837};
17838
17839static struct modldrv modldrv = {
17840	&mod_driverops,		/* module type (this is a pseudo driver) */
17841	"Dynamic Tracing",	/* name of module */
17842	&dtrace_ops,		/* driver ops */
17843};
17844
17845static struct modlinkage modlinkage = {
17846	MODREV_1,
17847	(void *)&modldrv,
17848	NULL
17849};
17850
17851int
17852_init(void)
17853{
17854	return (mod_install(&modlinkage));
17855}
17856
17857int
17858_info(struct modinfo *modinfop)
17859{
17860	return (mod_info(&modlinkage, modinfop));
17861}
17862
17863int
17864_fini(void)
17865{
17866	return (mod_remove(&modlinkage));
17867}
17868#else
17869
17870static d_ioctl_t	dtrace_ioctl;
17871static d_ioctl_t	dtrace_ioctl_helper;
17872static void		dtrace_load(void *);
17873static int		dtrace_unload(void);
17874#if __FreeBSD_version < 800039
17875	static void		dtrace_clone(void *, struct ucred *, char *, int, struct cdev **);
17876static struct clonedevs	*dtrace_clones;		/* Ptr to the array of cloned devices. */
17877static eventhandler_tag	eh_tag;			/* Event handler tag. */
17878#else
17879static struct cdev	*dtrace_dev;
17880static struct cdev	*helper_dev;
17881#endif
17882
17883void dtrace_invop_init(void);
17884void dtrace_invop_uninit(void);
17885
17886static struct cdevsw dtrace_cdevsw = {
17887	.d_version	= D_VERSION,
17888#if __FreeBSD_version < 800039
17889	.d_flags	= D_TRACKCLOSE | D_NEEDMINOR,
17890	.d_close	= dtrace_close,
17891#endif
17892	.d_ioctl	= dtrace_ioctl,
17893	.d_open		= dtrace_open,
17894	.d_name		= "dtrace",
17895};
17896
17897static struct cdevsw helper_cdevsw = {
17898	.d_version	= D_VERSION,
17899	.d_ioctl	= dtrace_ioctl_helper,
17900	.d_name		= "helper",
17901};
17902
17903#include <dtrace_anon.c>
17904#if __FreeBSD_version < 800039
17905#include <dtrace_clone.c>
17906#endif
17907#include <dtrace_ioctl.c>
17908#include <dtrace_load.c>
17909#include <dtrace_modevent.c>
17910#include <dtrace_sysctl.c>
17911#include <dtrace_unload.c>
17912#include <dtrace_vtime.c>
17913#include <dtrace_hacks.c>
17914#include <dtrace_isa.c>
17915
17916SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
17917SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
17918SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);
17919
17920DEV_MODULE(dtrace, dtrace_modevent, NULL);
17921MODULE_VERSION(dtrace, 1);
17922MODULE_DEPEND(dtrace, cyclic, 1, 1, 1);
17923MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
17924#endif
17925