1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Mike Karels at Berkeley Software Design, Inc.
9 *
10 * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
11 * project, to make these variables more userfriendly.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38#include <sys/cdefs.h>
39#include "opt_capsicum.h"
40#include "opt_ddb.h"
41#include "opt_ktrace.h"
42#include "opt_sysctl.h"
43
44#include <sys/param.h>
45#include <sys/fail.h>
46#include <sys/systm.h>
47#include <sys/capsicum.h>
48#include <sys/kernel.h>
49#include <sys/limits.h>
50#include <sys/sysctl.h>
51#include <sys/malloc.h>
52#include <sys/priv.h>
53#include <sys/proc.h>
54#include <sys/jail.h>
55#include <sys/kdb.h>
56#include <sys/lock.h>
57#include <sys/mutex.h>
58#include <sys/rmlock.h>
59#include <sys/sbuf.h>
60#include <sys/sx.h>
61#include <sys/sysproto.h>
62#include <sys/uio.h>
63#ifdef KTRACE
64#include <sys/ktrace.h>
65#endif
66
67#ifdef DDB
68#include <ddb/ddb.h>
69#include <ddb/db_lex.h>
70#endif
71
72#include <net/vnet.h>
73
74#include <security/mac/mac_framework.h>
75
76#include <vm/vm.h>
77#include <vm/vm_extern.h>
78
79static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
80static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
81static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
82
83RB_GENERATE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid);
84
85/*
86 * The sysctllock protects the MIB tree.  It also protects sysctl
87 * contexts used with dynamic sysctls.  The sysctl_register_oid() and
88 * sysctl_unregister_oid() routines require the sysctllock to already
89 * be held, so the sysctl_wlock() and sysctl_wunlock() routines are
90 * provided for the few places in the kernel which need to use that
91 * API rather than using the dynamic API.  Use of the dynamic API is
92 * strongly encouraged for most code.
93 *
94 * The sysctlmemlock is used to limit the amount of user memory wired for
95 * sysctl requests.  This is implemented by serializing any userland
96 * sysctl requests larger than a single page via an exclusive lock.
97 *
98 * The sysctlstringlock is used to protect concurrent access to writable
99 * string nodes in sysctl_handle_string().
100 */
/* Lock guarding the MIB tree and dynamic sysctl contexts (see above). */
static struct rmlock sysctllock;
/* Serializes large userland requests that wire memory (see above). */
static struct sx __exclusive_cache_line sysctlmemlock;
/* Guards writable string nodes in sysctl_handle_string() (see above). */
static struct sx sysctlstringlock;

/* Write-side acquire/release of sysctllock. */
#define	SYSCTL_WLOCK()		rm_wlock(&sysctllock)
#define	SYSCTL_WUNLOCK()	rm_wunlock(&sysctllock)
/* Read-side acquire/release; the caller supplies the rm_priotracker. */
#define	SYSCTL_RLOCK(tracker)	rm_rlock(&sysctllock, (tracker))
#define	SYSCTL_RUNLOCK(tracker)	rm_runlock(&sysctllock, (tracker))
/* Ownership query and lock-state assertions on sysctllock. */
#define	SYSCTL_WLOCKED()	rm_wowned(&sysctllock)
#define	SYSCTL_ASSERT_LOCKED()	rm_assert(&sysctllock, RA_LOCKED)
#define	SYSCTL_ASSERT_WLOCKED()	rm_assert(&sysctllock, RA_WLOCKED)
#define	SYSCTL_ASSERT_RLOCKED()	rm_assert(&sysctllock, RA_RLOCKED)
/* One-time initialization; the lock is created with RM_SLEEPABLE. */
#define	SYSCTL_INIT()		rm_init_flags(&sysctllock, "sysctl lock", \
				    RM_SLEEPABLE)
/* Sleep on 'ch' via rm_sleep(), passing sysctllock as the interlock. */
#define	SYSCTL_SLEEP(ch, wmesg, timo)					\
				rm_sleep(ch, &sysctllock, 0, wmesg, timo)
117
118static int sysctl_root(SYSCTL_HANDLER_ARGS);
119
120/* Root list */
121struct sysctl_oid_list sysctl__children = RB_INITIALIZER(&sysctl__children);
122
123static char*	sysctl_escape_name(const char*);
124static int	sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del,
125		    int recurse);
126static int	sysctl_old_kernel(struct sysctl_req *, const void *, size_t);
127static int	sysctl_new_kernel(struct sysctl_req *, void *, size_t);
128static int	name2oid(const char *, int *, int *, struct sysctl_oid **);
129
130static struct sysctl_oid *
131sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
132{
133	struct sysctl_oid *oidp;
134
135	SYSCTL_ASSERT_LOCKED();
136	SYSCTL_FOREACH(oidp, list) {
137		if (strcmp(oidp->oid_name, name) == 0) {
138			return (oidp);
139		}
140	}
141	return (NULL);
142}
143
144static struct sysctl_oid *
145sysctl_find_oidnamelen(const char *name, size_t len,
146    struct sysctl_oid_list *list)
147{
148	struct sysctl_oid *oidp;
149
150	SYSCTL_ASSERT_LOCKED();
151	SYSCTL_FOREACH(oidp, list) {
152		if (strncmp(oidp->oid_name, name, len) == 0 &&
153		    oidp->oid_name[len] == '\0')
154			return (oidp);
155	}
156	return (NULL);
157}
158
159/*
160 * Initialization of the MIB tree.
161 *
162 * Order by number in each list.
163 */
/*
 * Take the sysctl lock for writing.  Exported for the few callers that use
 * sysctl_register_oid() / sysctl_unregister_oid() directly, both of which
 * assert that the write lock is held.
 */
void
sysctl_wlock(void)
{

	SYSCTL_WLOCK();
}
170
/*
 * Release the sysctl write lock previously taken with sysctl_wlock().
 */
void
sysctl_wunlock(void)
{

	SYSCTL_WUNLOCK();
}
177
/*
 * Invoke an oid's handler with the sysctl lock dropped.
 *
 * The function is entered with the sysctl lock held: read-locked through
 * 'tracker' when tracker is non-NULL, write-locked when it is NULL.  The
 * lock is released around the handler call and re-acquired in the same
 * mode before returning.
 *
 * For dynamic oids (CTLFLAG_DYN) oid_running is raised for the duration of
 * the call; when it drops back to zero on an oid marked CTLFLAG_DYING, the
 * thread sleeping on &oid->oid_running is woken up.
 *
 * Returns the handler's error code, which may be overridden by the
 * sysctl_running fail point.
 */
static int
sysctl_root_handler_locked(struct sysctl_oid *oid, void *arg1, intmax_t arg2,
    struct sysctl_req *req, struct rm_priotracker *tracker)
{
	int error;

	/* Account for this invocation before the lock is dropped. */
	if (oid->oid_kind & CTLFLAG_DYN)
		atomic_add_int(&oid->oid_running, 1);

	if (tracker != NULL)
		SYSCTL_RUNLOCK(tracker);
	else
		SYSCTL_WUNLOCK();

	/*
	 * Treat set CTLFLAG_NEEDGIANT and unset CTLFLAG_MPSAFE flags the same,
	 * until we're ready to remove all traces of Giant from sysctl(9).
	 */
	if ((oid->oid_kind & CTLFLAG_NEEDGIANT) ||
	    (!(oid->oid_kind & CTLFLAG_MPSAFE)))
		mtx_lock(&Giant);
	error = oid->oid_handler(oid, arg1, arg2, req);
	if ((oid->oid_kind & CTLFLAG_NEEDGIANT) ||
	    (!(oid->oid_kind & CTLFLAG_MPSAFE)))
		mtx_unlock(&Giant);

	KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error);

	/* Re-take the lock in the mode the caller held it in. */
	if (tracker != NULL)
		SYSCTL_RLOCK(tracker);
	else
		SYSCTL_WLOCK();

	if (oid->oid_kind & CTLFLAG_DYN) {
		/* Last runner out wakes a remover waiting for the drain. */
		if (atomic_fetchadd_int(&oid->oid_running, -1) == 1 &&
		    (oid->oid_kind & CTLFLAG_DYING) != 0)
			wakeup(&oid->oid_running);
	}

	return (error);
}
219
220static void
221sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp)
222{
223	struct sysctl_req req;
224	struct sysctl_oid *curr;
225	char *penv = NULL;
226	char path[96];
227	ssize_t rem = sizeof(path);
228	ssize_t len;
229	uint8_t data[512] __aligned(sizeof(uint64_t));
230	int size;
231	int error;
232
233	path[--rem] = 0;
234
235	for (curr = oidp; curr != NULL; curr = SYSCTL_PARENT(curr)) {
236		len = strlen(curr->oid_name);
237		rem -= len;
238		if (curr != oidp)
239			rem -= 1;
240		if (rem < 0) {
241			printf("OID path exceeds %d bytes\n", (int)sizeof(path));
242			return;
243		}
244		memcpy(path + rem, curr->oid_name, len);
245		if (curr != oidp)
246			path[rem + len] = '.';
247	}
248
249	memset(&req, 0, sizeof(req));
250
251	req.td = curthread;
252	req.oldfunc = sysctl_old_kernel;
253	req.newfunc = sysctl_new_kernel;
254	req.lock = REQ_UNWIRED;
255
256	switch (oidp->oid_kind & CTLTYPE) {
257	case CTLTYPE_INT:
258		if (getenv_array(path + rem, data, sizeof(data), &size,
259		    sizeof(int), GETENV_SIGNED) == 0)
260			return;
261		req.newlen = size;
262		req.newptr = data;
263		break;
264	case CTLTYPE_UINT:
265		if (getenv_array(path + rem, data, sizeof(data), &size,
266		    sizeof(int), GETENV_UNSIGNED) == 0)
267			return;
268		req.newlen = size;
269		req.newptr = data;
270		break;
271	case CTLTYPE_LONG:
272		if (getenv_array(path + rem, data, sizeof(data), &size,
273		    sizeof(long), GETENV_SIGNED) == 0)
274			return;
275		req.newlen = size;
276		req.newptr = data;
277		break;
278	case CTLTYPE_ULONG:
279		if (getenv_array(path + rem, data, sizeof(data), &size,
280		    sizeof(long), GETENV_UNSIGNED) == 0)
281			return;
282		req.newlen = size;
283		req.newptr = data;
284		break;
285	case CTLTYPE_S8:
286		if (getenv_array(path + rem, data, sizeof(data), &size,
287		    sizeof(int8_t), GETENV_SIGNED) == 0)
288			return;
289		req.newlen = size;
290		req.newptr = data;
291		break;
292	case CTLTYPE_S16:
293		if (getenv_array(path + rem, data, sizeof(data), &size,
294		    sizeof(int16_t), GETENV_SIGNED) == 0)
295			return;
296		req.newlen = size;
297		req.newptr = data;
298		break;
299	case CTLTYPE_S32:
300		if (getenv_array(path + rem, data, sizeof(data), &size,
301		    sizeof(int32_t), GETENV_SIGNED) == 0)
302			return;
303		req.newlen = size;
304		req.newptr = data;
305		break;
306	case CTLTYPE_S64:
307		if (getenv_array(path + rem, data, sizeof(data), &size,
308		    sizeof(int64_t), GETENV_SIGNED) == 0)
309			return;
310		req.newlen = size;
311		req.newptr = data;
312		break;
313	case CTLTYPE_U8:
314		if (getenv_array(path + rem, data, sizeof(data), &size,
315		    sizeof(uint8_t), GETENV_UNSIGNED) == 0)
316			return;
317		req.newlen = size;
318		req.newptr = data;
319		break;
320	case CTLTYPE_U16:
321		if (getenv_array(path + rem, data, sizeof(data), &size,
322		    sizeof(uint16_t), GETENV_UNSIGNED) == 0)
323			return;
324		req.newlen = size;
325		req.newptr = data;
326		break;
327	case CTLTYPE_U32:
328		if (getenv_array(path + rem, data, sizeof(data), &size,
329		    sizeof(uint32_t), GETENV_UNSIGNED) == 0)
330			return;
331		req.newlen = size;
332		req.newptr = data;
333		break;
334	case CTLTYPE_U64:
335		if (getenv_array(path + rem, data, sizeof(data), &size,
336		    sizeof(uint64_t), GETENV_UNSIGNED) == 0)
337			return;
338		req.newlen = size;
339		req.newptr = data;
340		break;
341	case CTLTYPE_STRING:
342		penv = kern_getenv(path + rem);
343		if (penv == NULL)
344			return;
345		req.newlen = strlen(penv);
346		req.newptr = penv;
347		break;
348	default:
349		return;
350	}
351	error = sysctl_root_handler_locked(oidp, oidp->oid_arg1,
352	    oidp->oid_arg2, &req, NULL);
353	if (error != 0)
354		printf("Setting sysctl %s failed: %d\n", path + rem, error);
355	if (penv != NULL)
356		freeenv(penv);
357}
358
359/*
360 * Locate the path to a given oid.  Returns the length of the resulting path,
361 * or -1 if the oid was not found.  nodes must have room for CTL_MAXNAME
362 * elements.
363 */
static int
sysctl_search_oid(struct sysctl_oid **nodes, struct sysctl_oid *needle)
{
	int indx;

	SYSCTL_ASSERT_LOCKED();
	indx = 0;
	/*
	 * Do a depth-first search of the oid tree, looking for 'needle'. Start
	 * with the first child of the root.
	 *
	 * nodes[0..indx] always holds the chain of oids from a top-level
	 * entry down to the one currently being examined, so on a match it
	 * is already the path to 'needle'.
	 */
	nodes[indx] = RB_MIN(sysctl_oid_list, &sysctl__children);
	for (;;) {
		/* Found it: the path has indx + 1 components. */
		if (nodes[indx] == needle)
			return (indx + 1);

		if (nodes[indx] == NULL) {
			/* Node has no more siblings, so back up to parent. */
			if (indx-- == 0) {
				/* Retreat to root, so give up. */
				break;
			}
		} else if ((nodes[indx]->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
			/* Node has children. */
			if (++indx == CTL_MAXNAME) {
				/* Max search depth reached, so give up. */
				break;
			}
			/* Start with the first child. */
			nodes[indx] = RB_MIN(sysctl_oid_list,
			    &nodes[indx - 1]->oid_children);
			continue;
		}
		/*
		 * Consider next sibling.  This is reached both for a leaf
		 * and, after backing up, for the parent whose subtree was
		 * just exhausted.
		 */
		nodes[indx] = RB_NEXT(sysctl_oid_list, NULL, nodes[indx]);
	}
	return (-1);
}
402
403static void
404sysctl_warn_reuse(const char *func, struct sysctl_oid *leaf)
405{
406	struct sysctl_oid *nodes[CTL_MAXNAME];
407	char buf[128];
408	struct sbuf sb;
409	int rc, i;
410
411	(void)sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN | SBUF_INCLUDENUL);
412	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
413
414	sbuf_printf(&sb, "%s: can't re-use a leaf (", __func__);
415
416	rc = sysctl_search_oid(nodes, leaf);
417	if (rc > 0) {
418		for (i = 0; i < rc; i++)
419			sbuf_printf(&sb, "%s%.*s", nodes[i]->oid_name,
420			    i != (rc - 1), ".");
421	} else {
422		sbuf_cat(&sb, leaf->oid_name);
423	}
424	sbuf_cat(&sb, ")!\n");
425
426	(void)sbuf_finish(&sb);
427}
428
429#ifdef SYSCTL_DEBUG
/*
 * Debug-only handler: exercises sysctl_warn_reuse() on its own oid under
 * the sysctl read lock.  Always returns 0.
 */
static int
sysctl_reuse_test(SYSCTL_HANDLER_ARGS)
{
	struct rm_priotracker tracker;

	SYSCTL_RLOCK(&tracker);
	sysctl_warn_reuse(__func__, oidp);
	SYSCTL_RUNLOCK(&tracker);
	return (0);
}
/* Exposed as the read-only string node sysctl.reuse_test. */
SYSCTL_PROC(_sysctl, OID_AUTO, reuse_test,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_reuse_test, "-",
    "");
443#endif
444
/*
 * Insert 'oidp' into its parent's list.  The sysctl write lock must be held.
 *
 * If the parent already has a child of the same name, nothing is inserted:
 * for an existing node only its reference count is bumped, for an existing
 * leaf a warning is printed.  Otherwise an OID number is settled on (a
 * negative oid_number requests automatic assignment), the oid is inserted,
 * and, for tunable leaves without CTLFLAG_NOFETCH, the initial value is
 * loaded from the kernel environment.
 */
void
sysctl_register_oid(struct sysctl_oid *oidp)
{
	struct sysctl_oid_list *parent = oidp->oid_parent;
	struct sysctl_oid *p, key;
	int oid_number;
	int timeout = 2;

	/*
	 * First check if another oid with the same name already
	 * exists in the parent's list.
	 */
	SYSCTL_ASSERT_WLOCKED();
	p = sysctl_find_oidname(oidp->oid_name, parent);
	if (p != NULL) {
		if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
			p->oid_refcnt++;
			return;
		} else {
			sysctl_warn_reuse(__func__, p);
			return;
		}
	}
	/* get current OID number */
	oid_number = oidp->oid_number;

#if (OID_AUTO >= 0)
#error "OID_AUTO is expected to be a negative value"
#endif
	/*
	 * Any negative OID number qualifies as OID_AUTO. Valid OID
	 * numbers should always be positive.
	 *
	 * NOTE: DO NOT change the starting value here, change it in
	 * <sys/sysctl.h>, and make sure it is at least 256 to
	 * accommodate e.g. net.inet.raw as a static sysctl node.
	 */
	if (oid_number < 0) {
		/* Last automatically assigned number; persists across calls. */
		static int newoid;

		/*
		 * By decrementing the next OID number we spend less
		 * time inserting the OIDs into a sorted list.
		 */
		if (--newoid < CTL_AUTO_START)
			newoid = 0x7fffffff;

		oid_number = newoid;
	}

	/*
	 * Insert the OID into the parent's list sorted by OID number.
	 *
	 * While the candidate number is already taken, advance to the next
	 * one; 'p' walks the list in step with oid_number.
	 */
	key.oid_number = oid_number;
	p = RB_NFIND(sysctl_oid_list, parent, &key);
	while (p != NULL && oid_number == p->oid_number) {
		/* get the next valid OID number */
		if (oid_number < CTL_AUTO_START ||
		    oid_number == 0x7fffffff) {
			/* wraparound - restart */
			oid_number = CTL_AUTO_START;
			/* don't loop forever */
			if (!timeout--)
				panic("sysctl: Out of OID numbers\n");
			key.oid_number = oid_number;
			p = RB_NFIND(sysctl_oid_list, parent, &key);
			continue;
		}
		p = RB_NEXT(sysctl_oid_list, NULL, p);
		oid_number++;
	}
	/* check for non-auto OID number collision */
	if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START &&
	    oid_number >= CTL_AUTO_START) {
		printf("sysctl: OID number(%d) is already in use for '%s'\n",
		    oidp->oid_number, oidp->oid_name);
	}
	/* update the OID number, if any */
	oidp->oid_number = oid_number;
	RB_INSERT(sysctl_oid_list, parent, oidp);

	if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE &&
	    (oidp->oid_kind & CTLFLAG_TUN) != 0 &&
	    (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) {
#ifdef VIMAGE
		/*
		 * Can fetch value multiple times for VNET loader tunables.
		 * Only fetch once for non-VNET loader tunables.
		 */
		if ((oidp->oid_kind & CTLFLAG_VNET) == 0)
#endif
			oidp->oid_kind |= CTLFLAG_NOFETCH;
		/* try to fetch value from kernel environment */
		sysctl_load_tunable_by_oid_locked(oidp);
	}
}
541
/*
 * Register 'oidp' in the dormant state.  Leaves get CTLFLAG_DORMANT set
 * before being handed to sysctl_register_oid(); nodes are registered
 * unchanged.  The flag is cleared again by sysctl_enable_oid().
 */
void
sysctl_register_disabled_oid(struct sysctl_oid *oidp)
{

	/*
	 * Mark the leaf as dormant if it's not to be immediately enabled.
	 * We do not disable nodes as they can be shared between modules
	 * and it is always safe to access a node.
	 */
	KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0,
	    ("internal flag is set in oid_kind"));
	if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
		oidp->oid_kind |= CTLFLAG_DORMANT;
	sysctl_register_oid(oidp);
}
557
558void
559sysctl_enable_oid(struct sysctl_oid *oidp)
560{
561
562	SYSCTL_ASSERT_WLOCKED();
563	if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
564		KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0,
565		    ("sysctl node is marked as dormant"));
566		return;
567	}
568	KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) != 0,
569	    ("enabling already enabled sysctl oid"));
570	oidp->oid_kind &= ~CTLFLAG_DORMANT;
571}
572
573void
574sysctl_unregister_oid(struct sysctl_oid *oidp)
575{
576	int error;
577
578	SYSCTL_ASSERT_WLOCKED();
579	if (oidp->oid_number == OID_AUTO) {
580		error = EINVAL;
581	} else {
582		error = ENOENT;
583		if (RB_REMOVE(sysctl_oid_list, oidp->oid_parent, oidp))
584			error = 0;
585	}
586
587	/*
588	 * This can happen when a module fails to register and is
589	 * being unloaded afterwards.  It should not be a panic()
590	 * for normal use.
591	 */
592	if (error) {
593		printf("%s: failed(%d) to unregister sysctl(%s)\n",
594		    __func__, error, oidp->oid_name);
595	}
596}
597
598/* Initialize a new context to keep track of dynamically added sysctls. */
int
sysctl_ctx_init(struct sysctl_ctx_list *c)
{

	/*
	 * The list is initialized without taking any lock: it is up to
	 * the caller not to add nodes to the context before this
	 * function has returned.
	 */
	if (c != NULL) {
		TAILQ_INIT(c);
		return (0);
	}
	return (EINVAL);
}
615
616/* Free the context, and destroy all dynamic oids registered in this context */
/*
 * Returns 0 when every oid of the context was removed and its entries
 * freed, or EBUSY when the dry run failed for some entry, in which case
 * the context and the tree are left as they were.
 */
int
sysctl_ctx_free(struct sysctl_ctx_list *clist)
{
	struct sysctl_ctx_entry *e, *e1;
	int error;

	error = 0;
	/*
	 * First perform a "dry run" to check if it's ok to remove oids.
	 * XXX FIXME
	 * XXX This algorithm is a hack. But I don't know any
	 * XXX better solution for now...
	 */
	SYSCTL_WLOCK();
	TAILQ_FOREACH(e, clist, link) {
		/* del == 0: deregister only, nothing is freed yet. */
		error = sysctl_remove_oid_locked(e->entry, 0, 0);
		if (error)
			break;
	}
	/*
	 * Restore deregistered entries, either from the end,
	 * or from the place where error occurred.
	 * e contains the entry that was not unregistered
	 */
	if (error)
		e1 = TAILQ_PREV(e, sysctl_ctx_list, link);
	else
		e1 = TAILQ_LAST(clist, sysctl_ctx_list);
	/* Walk backwards, i.e. in the reverse order of the dry run. */
	while (e1 != NULL) {
		sysctl_register_oid(e1->entry);
		e1 = TAILQ_PREV(e1, sysctl_ctx_list, link);
	}
	if (error) {
		SYSCTL_WUNLOCK();
		return(EBUSY);
	}
	/* Now really delete the entries */
	TAILQ_FOREACH_SAFE(e, clist, link, e1) {
		error = sysctl_remove_oid_locked(e->entry, 1, 0);
		if (error)
			panic("sysctl_remove_oid: corrupt tree, entry: %s",
			    e->entry->oid_name);
		free(e, M_SYSCTLOID);
	}
	SYSCTL_WUNLOCK();
	/* Every entry has been freed; reset the head to an empty list. */
	TAILQ_INIT(clist);
	return (error);
}
665
666/* Add an entry to the context */
667struct sysctl_ctx_entry *
668sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
669{
670	struct sysctl_ctx_entry *e;
671
672	SYSCTL_ASSERT_WLOCKED();
673	if (clist == NULL || oidp == NULL)
674		return(NULL);
675	e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK);
676	e->entry = oidp;
677	TAILQ_INSERT_HEAD(clist, e, link);
678	return (e);
679}
680
681/* Find an entry in the context */
682struct sysctl_ctx_entry *
683sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
684{
685	struct sysctl_ctx_entry *e;
686
687	SYSCTL_ASSERT_WLOCKED();
688	if (clist == NULL || oidp == NULL)
689		return(NULL);
690	TAILQ_FOREACH(e, clist, link) {
691		if (e->entry == oidp)
692			return(e);
693	}
694	return (e);
695}
696
697/*
698 * Delete an entry from the context.
699 * NOTE: this function doesn't free oidp! You have to remove it
700 * with sysctl_remove_oid().
701 */
702int
703sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
704{
705	struct sysctl_ctx_entry *e;
706
707	if (clist == NULL || oidp == NULL)
708		return (EINVAL);
709	SYSCTL_WLOCK();
710	e = sysctl_ctx_entry_find(clist, oidp);
711	if (e != NULL) {
712		TAILQ_REMOVE(clist, e, link);
713		SYSCTL_WUNLOCK();
714		free(e, M_SYSCTLOID);
715		return (0);
716	} else {
717		SYSCTL_WUNLOCK();
718		return (ENOENT);
719	}
720}
721
722/*
723 * Remove dynamically created sysctl trees.
724 * oidp - top of the tree to be removed
725 * del - if 0 - just deregister, otherwise free up entries as well
726 * recurse - if != 0 traverse the subtree to be deleted
727 */
/*
 * Locking wrapper: takes the sysctl write lock around
 * sysctl_remove_oid_locked() and returns its result.
 */
int
sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse)
{
	int error;

	SYSCTL_WLOCK();
	error = sysctl_remove_oid_locked(oidp, del, recurse);
	SYSCTL_WUNLOCK();
	return (error);
}
738
739int
740sysctl_remove_name(struct sysctl_oid *parent, const char *name,
741    int del, int recurse)
742{
743	struct sysctl_oid *p;
744	int error;
745
746	error = ENOENT;
747	SYSCTL_WLOCK();
748	p = sysctl_find_oidname(name, &parent->oid_children);
749	if (p)
750		error = sysctl_remove_oid_locked(p, del, recurse);
751	SYSCTL_WUNLOCK();
752
753	return (error);
754}
755
756/*
757 * Duplicate the provided string, escaping any illegal characters.  The result
758 * must be freed when no longer in use.
759 *
760 * The list of illegal characters is ".".
761 */
762static char*
763sysctl_escape_name(const char* orig)
764{
765	int i, s = 0, d = 0, nillegals = 0;
766	char *new;
767
768	/* First count the number of illegal characters */
769	for (i = 0; orig[i] != '\0'; i++) {
770		if (orig[i] == '.')
771			nillegals++;
772	}
773
774	/* Allocate storage for new string */
775	new = malloc(i + 2 * nillegals + 1, M_SYSCTLOID, M_WAITOK);
776
777	/* Copy the name, escaping characters as we go */
778	while (orig[s] != '\0') {
779		if (orig[s] == '.') {
780			/* %25 is the hexadecimal representation of '.' */
781			new[d++] = '%';
782			new[d++] = '2';
783			new[d++] = '5';
784			s++;
785		} else {
786			new[d++] = orig[s++];
787		}
788	}
789
790	/* Finally, nul-terminate */
791	new[d] = '\0';
792
793	return (new);
794}
795
/*
 * Remove a dynamic oid; the sysctl write lock must be held.
 *
 * oidp    - oid to remove; must have CTLFLAG_DYN set
 * del     - when non-zero, also free the oid and its strings
 * recurse - when non-zero, remove the children of a node first
 *
 * Returns 0 on success, EINVAL for a NULL or non-dynamic oid or for a
 * zero reference count, ENOTEMPTY for a node with children when 'recurse'
 * is zero, or the error from removing a child.
 */
static int
sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse)
{
	struct sysctl_oid *p, *tmp;
	int error;

	SYSCTL_ASSERT_WLOCKED();
	if (oidp == NULL)
		return(EINVAL);
	if ((oidp->oid_kind & CTLFLAG_DYN) == 0) {
		printf("Warning: can't remove non-dynamic nodes (%s)!\n",
		    oidp->oid_name);
		return (EINVAL);
	}
	/*
	 * WARNING: normal method to do this should be through
	 * sysctl_ctx_free(). Use recursing as the last resort
	 * method to purge your sysctl tree of leftovers...
	 * However, if some other code still references these nodes,
	 * it will panic.
	 */
	if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
		/* Children are only touched when the last reference goes. */
		if (oidp->oid_refcnt == 1) {
			for(p = RB_MIN(sysctl_oid_list, &oidp->oid_children);
			    p != NULL; p = tmp) {
				if (!recurse) {
					printf("Warning: failed attempt to "
					    "remove oid %s with child %s\n",
					    oidp->oid_name, p->oid_name);
					return (ENOTEMPTY);
				}
				/* Fetch the successor before 'p' goes away. */
				tmp = RB_NEXT(sysctl_oid_list,
				    &oidp->oid_children, p);
				error = sysctl_remove_oid_locked(p, del,
				    recurse);
				if (error)
					return (error);
			}
		}
	}
	if (oidp->oid_refcnt > 1 ) {
		/* Still referenced elsewhere: just drop one reference. */
		oidp->oid_refcnt--;
	} else {
		if (oidp->oid_refcnt == 0) {
			printf("Warning: bad oid_refcnt=%u (%s)!\n",
				oidp->oid_refcnt, oidp->oid_name);
			return (EINVAL);
		}
		sysctl_unregister_oid(oidp);
		if (del) {
			/*
			 * Wait for all threads running the handler to drain.
			 * This preserves the previous behavior when the
			 * sysctl lock was held across a handler invocation,
			 * and is necessary for module unload correctness.
			 */
			while (oidp->oid_running > 0) {
				oidp->oid_kind |= CTLFLAG_DYING;
				SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0);
			}
			if (oidp->oid_descr)
				free(__DECONST(char *, oidp->oid_descr),
				    M_SYSCTLOID);
			if (oidp->oid_label)
				free(__DECONST(char *, oidp->oid_label),
				    M_SYSCTLOID);
			free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID);
			free(oidp, M_SYSCTLOID);
		}
	}
	return (0);
}
868/*
869 * Create new sysctls at run time.
870 * clist may point to a valid context initialized with sysctl_ctx_init().
871 */
872struct sysctl_oid *
873sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
874	int number, const char *name, int kind, void *arg1, intmax_t arg2,
875	int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr,
876	const char *label)
877{
878	struct sysctl_oid *oidp;
879	char *escaped;
880
881	/* You have to hook up somewhere.. */
882	if (parent == NULL)
883		return(NULL);
884	escaped = sysctl_escape_name(name);
885	/* Check if the node already exists, otherwise create it */
886	SYSCTL_WLOCK();
887	oidp = sysctl_find_oidname(escaped, parent);
888	if (oidp != NULL) {
889		free(escaped, M_SYSCTLOID);
890		if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
891			oidp->oid_refcnt++;
892			/* Update the context */
893			if (clist != NULL)
894				sysctl_ctx_entry_add(clist, oidp);
895			SYSCTL_WUNLOCK();
896			return (oidp);
897		} else {
898			sysctl_warn_reuse(__func__, oidp);
899			SYSCTL_WUNLOCK();
900			return (NULL);
901		}
902	}
903	oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO);
904	oidp->oid_parent = parent;
905	RB_INIT(&oidp->oid_children);
906	oidp->oid_number = number;
907	oidp->oid_refcnt = 1;
908	oidp->oid_name = escaped;
909	oidp->oid_handler = handler;
910	oidp->oid_kind = CTLFLAG_DYN | kind;
911	oidp->oid_arg1 = arg1;
912	oidp->oid_arg2 = arg2;
913	oidp->oid_fmt = fmt;
914	if (descr != NULL)
915		oidp->oid_descr = strdup(descr, M_SYSCTLOID);
916	if (label != NULL)
917		oidp->oid_label = strdup(label, M_SYSCTLOID);
918	/* Update the context, if used */
919	if (clist != NULL)
920		sysctl_ctx_entry_add(clist, oidp);
921	/* Register this oid */
922	sysctl_register_oid(oidp);
923	SYSCTL_WUNLOCK();
924	return (oidp);
925}
926
927/*
928 * Rename an existing oid.
929 */
/*
 * Replaces oidp->oid_name with a copy of 'name' and frees the old name.
 * The new name is stored as given.
 */
void
sysctl_rename_oid(struct sysctl_oid *oidp, const char *name)
{
	char *newname;
	char *oldname;

	/* Duplicate before taking the lock; only the swap happens under it. */
	newname = strdup(name, M_SYSCTLOID);
	SYSCTL_WLOCK();
	oldname = __DECONST(char *, oidp->oid_name);
	oidp->oid_name = newname;
	SYSCTL_WUNLOCK();
	free(oldname, M_SYSCTLOID);
}
943
944/*
945 * Reparent an existing oid.
946 */
947int
948sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent)
949{
950	struct sysctl_oid *oidp;
951
952	SYSCTL_WLOCK();
953	if (oid->oid_parent == parent) {
954		SYSCTL_WUNLOCK();
955		return (0);
956	}
957	oidp = sysctl_find_oidname(oid->oid_name, parent);
958	if (oidp != NULL) {
959		SYSCTL_WUNLOCK();
960		return (EEXIST);
961	}
962	sysctl_unregister_oid(oid);
963	oid->oid_parent = parent;
964	oid->oid_number = OID_AUTO;
965	sysctl_register_oid(oid);
966	SYSCTL_WUNLOCK();
967	return (0);
968}
969
970/*
971 * Register the kernel's oids on startup.
972 */
/* Set holding pointers to the oids that are registered at startup. */
SET_DECLARE(sysctl_set, struct sysctl_oid);

/*
 * SYSINIT callback: initializes the three sysctl locks and then registers
 * every oid found in sysctl_set under the write lock.  'arg' is unused.
 */
static void
sysctl_register_all(void *arg)
{
	struct sysctl_oid **oidp;

	sx_init(&sysctlmemlock, "sysctl mem");
	sx_init(&sysctlstringlock, "sysctl string handler");
	SYSCTL_INIT();
	SYSCTL_WLOCK();
	SET_FOREACH(oidp, sysctl_set)
		sysctl_register_oid(*oidp);
	SYSCTL_WUNLOCK();
}
SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, NULL);
989
990#ifdef VIMAGE
991static void
992sysctl_setenv_vnet(void *arg __unused, const char *name)
993{
994	struct sysctl_oid *oidp;
995	int oid[CTL_MAXNAME];
996	int error, nlen;
997
998	SYSCTL_WLOCK();
999	error = name2oid(name, oid, &nlen, &oidp);
1000	if (error)
1001		goto out;
1002
1003	if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE &&
1004	    (oidp->oid_kind & CTLFLAG_VNET) != 0 &&
1005	    (oidp->oid_kind & CTLFLAG_TUN) != 0 &&
1006	    (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) {
1007		/* Update value from kernel environment */
1008		sysctl_load_tunable_by_oid_locked(oidp);
1009	}
1010out:
1011	SYSCTL_WUNLOCK();
1012}
1013
1014static void
1015sysctl_unsetenv_vnet(void *arg __unused, const char *name)
1016{
1017	struct sysctl_oid *oidp;
1018	int oid[CTL_MAXNAME];
1019	int error, nlen;
1020
1021	SYSCTL_WLOCK();
1022	/*
1023	 * The setenv / unsetenv event handlers are invoked by kern_setenv() /
1024	 * kern_unsetenv() without exclusive locks. It is rare but still possible
1025	 * that the invoke order of event handlers is different from that of
1026	 * kern_setenv() and kern_unsetenv().
1027	 * Re-check environment variable string to make sure it is unset.
1028	 */
1029	if (testenv(name))
1030		goto out;
1031	error = name2oid(name, oid, &nlen, &oidp);
1032	if (error)
1033		goto out;
1034
1035	if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE &&
1036	    (oidp->oid_kind & CTLFLAG_VNET) != 0 &&
1037	    (oidp->oid_kind & CTLFLAG_TUN) != 0 &&
1038	    (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) {
1039		size_t size;
1040
1041		switch (oidp->oid_kind & CTLTYPE) {
1042		case CTLTYPE_INT:
1043		case CTLTYPE_UINT:
1044			size = sizeof(int);
1045			break;
1046		case CTLTYPE_LONG:
1047		case CTLTYPE_ULONG:
1048			size = sizeof(long);
1049			break;
1050		case CTLTYPE_S8:
1051		case CTLTYPE_U8:
1052			size = sizeof(int8_t);
1053			break;
1054		case CTLTYPE_S16:
1055		case CTLTYPE_U16:
1056			size = sizeof(int16_t);
1057			break;
1058		case CTLTYPE_S32:
1059		case CTLTYPE_U32:
1060			size = sizeof(int32_t);
1061			break;
1062		case CTLTYPE_S64:
1063		case CTLTYPE_U64:
1064			size = sizeof(int64_t);
1065			break;
1066		case CTLTYPE_STRING:
1067			MPASS(oidp->oid_arg2 > 0);
1068			size = oidp->oid_arg2;
1069			break;
1070		default:
1071			goto out;
1072		}
1073		vnet_restore_init(oidp->oid_arg1, size);
1074	}
1075out:
1076	SYSCTL_WUNLOCK();
1077}
1078
1079/*
1080 * Register the kernel's setenv / unsetenv events.
1081 */
1082EVENTHANDLER_DEFINE(setenv, sysctl_setenv_vnet, NULL, EVENTHANDLER_PRI_ANY);
1083EVENTHANDLER_DEFINE(unsetenv, sysctl_unsetenv_vnet, NULL, EVENTHANDLER_PRI_ANY);
1084#endif
1085
1086/*
1087 * "Staff-functions"
1088 *
1089 * These functions implement a presently undocumented interface
1090 * used by the sysctl program to walk the tree, and get the type
1091 * so it can print the value.
1092 * This interface is under work and consideration, and should probably
1093 * be killed with a big axe by the first person who can find the time.
 * (Be aware, though, that the proper interface isn't as obvious as it
 * may seem; there are various conflicting requirements.)
1096 *
1097 * {CTL_SYSCTL, CTL_SYSCTL_DEBUG}		printf the entire MIB-tree.
1098 * {CTL_SYSCTL, CTL_SYSCTL_NAME, ...}		return the name of the "..."
1099 *						OID.
1100 * {CTL_SYSCTL, CTL_SYSCTL_NEXT, ...}		return the next OID, honoring
1101 *						CTLFLAG_SKIP.
1102 * {CTL_SYSCTL, CTL_SYSCTL_NAME2OID}		return the OID of the name in
1103 *						"new"
1104 * {CTL_SYSCTL, CTL_SYSCTL_OIDFMT, ...}		return the kind & format info
1105 *						for the "..." OID.
1106 * {CTL_SYSCTL, CTL_SYSCTL_OIDDESCR, ...}	return the description of the
1107 *						"..." OID.
1108 * {CTL_SYSCTL, CTL_SYSCTL_OIDLABEL, ...}	return the aggregation label of
1109 *						the "..." OID.
1110 * {CTL_SYSCTL, CTL_SYSCTL_NEXTNOSKIP, ...}	return the next OID, ignoring
1111 *						CTLFLAG_SKIP.
1112 */
1113
1114#ifdef SYSCTL_DEBUG
1115static void
1116sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
1117{
1118	int k;
1119	struct sysctl_oid *oidp;
1120
1121	SYSCTL_ASSERT_LOCKED();
1122	SYSCTL_FOREACH(oidp, l) {
1123		for (k=0; k<i; k++)
1124			printf(" ");
1125
1126		printf("%d %s ", oidp->oid_number, oidp->oid_name);
1127
1128		printf("%c%c",
1129			oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
1130			oidp->oid_kind & CTLFLAG_WR ? 'W':' ');
1131
1132		if (oidp->oid_handler)
1133			printf(" *Handler");
1134
1135		switch (oidp->oid_kind & CTLTYPE) {
1136			case CTLTYPE_NODE:
1137				printf(" Node\n");
1138				if (!oidp->oid_handler) {
1139					sysctl_sysctl_debug_dump_node(
1140					    SYSCTL_CHILDREN(oidp), i + 2);
1141				}
1142				break;
1143			case CTLTYPE_INT:    printf(" Int\n"); break;
1144			case CTLTYPE_UINT:   printf(" u_int\n"); break;
1145			case CTLTYPE_LONG:   printf(" Long\n"); break;
1146			case CTLTYPE_ULONG:  printf(" u_long\n"); break;
1147			case CTLTYPE_STRING: printf(" String\n"); break;
1148			case CTLTYPE_S8:     printf(" int8_t\n"); break;
1149			case CTLTYPE_S16:    printf(" int16_t\n"); break;
1150			case CTLTYPE_S32:    printf(" int32_t\n"); break;
1151			case CTLTYPE_S64:    printf(" int64_t\n"); break;
1152			case CTLTYPE_U8:     printf(" uint8_t\n"); break;
1153			case CTLTYPE_U16:    printf(" uint16_t\n"); break;
1154			case CTLTYPE_U32:    printf(" uint32_t\n"); break;
1155			case CTLTYPE_U64:    printf(" uint64_t\n"); break;
1156			case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
1157			default:	     printf("\n");
1158		}
1159	}
1160}
1161
1162static int
1163sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS)
1164{
1165	struct rm_priotracker tracker;
1166	int error;
1167
1168	error = priv_check(req->td, PRIV_SYSCTL_DEBUG);
1169	if (error)
1170		return (error);
1171	SYSCTL_RLOCK(&tracker);
1172	sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
1173	SYSCTL_RUNLOCK(&tracker);
1174	return (ENOENT);
1175}
1176
1177SYSCTL_PROC(_sysctl, CTL_SYSCTL_DEBUG, debug, CTLTYPE_STRING | CTLFLAG_RD |
1178    CTLFLAG_MPSAFE, 0, 0, sysctl_sysctl_debug, "-", "");
1179#endif
1180
1181static int
1182sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
1183{
1184	int *name = (int *) arg1;
1185	u_int namelen = arg2;
1186	int error;
1187	struct sysctl_oid *oid, key;
1188	struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
1189	struct rm_priotracker tracker;
1190	char buf[10];
1191
1192	error = sysctl_wire_old_buffer(req, 0);
1193	if (error)
1194		return (error);
1195
1196	SYSCTL_RLOCK(&tracker);
1197	while (namelen) {
1198		if (!lsp) {
1199			snprintf(buf,sizeof(buf),"%d",*name);
1200			if (req->oldidx)
1201				error = SYSCTL_OUT(req, ".", 1);
1202			if (!error)
1203				error = SYSCTL_OUT(req, buf, strlen(buf));
1204			if (error)
1205				goto out;
1206			namelen--;
1207			name++;
1208			continue;
1209		}
1210		lsp2 = NULL;
1211		key.oid_number = *name;
1212		oid = RB_FIND(sysctl_oid_list, lsp, &key);
1213		if (oid) {
1214			if (req->oldidx)
1215				error = SYSCTL_OUT(req, ".", 1);
1216			if (!error)
1217				error = SYSCTL_OUT(req, oid->oid_name,
1218					strlen(oid->oid_name));
1219			if (error)
1220				goto out;
1221
1222			namelen--;
1223			name++;
1224
1225			if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE &&
1226				!oid->oid_handler)
1227				lsp2 = SYSCTL_CHILDREN(oid);
1228		}
1229		lsp = lsp2;
1230	}
1231	error = SYSCTL_OUT(req, "", 1);
1232 out:
1233	SYSCTL_RUNLOCK(&tracker);
1234	return (error);
1235}
1236
/*
 * Register sysctl_sysctl_name() as the handler of the read-only
 * {CTL_SYSCTL, CTL_SYSCTL_NAME} node.  CTLFLAG_CAPRD keeps the node readable
 * from capability mode.
 *
 * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in
 * capability mode.
 */
static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NAME, name, CTLFLAG_RD |
    CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_name, "");
1243
/*
 * Verdict returned by sysctl_sysctl_next_node() for the OID it examined,
 * telling the tree walker in sysctl_sysctl_next_action() how to proceed.
 */
enum sysctl_iter_action {
	ITER_SIBLINGS,	/* Not matched, continue iterating siblings */
	ITER_CHILDREN,	/* Node has children we need to iterate over them */
	ITER_FOUND,	/* Matching node was found */
};
1249
1250/*
1251 * Tries to find the next node for @name and @namelen.
1252 *
1253 * Returns next action to take.
1254 */
1255static enum sysctl_iter_action
1256sysctl_sysctl_next_node(struct sysctl_oid *oidp, int *name, unsigned int namelen,
1257    bool honor_skip)
1258{
1259
1260	if ((oidp->oid_kind & CTLFLAG_DORMANT) != 0)
1261		return (ITER_SIBLINGS);
1262
1263	if (honor_skip && (oidp->oid_kind & CTLFLAG_SKIP) != 0)
1264		return (ITER_SIBLINGS);
1265
1266	if (namelen == 0) {
1267		/*
1268		 * We have reached a node with a full name match and are
1269		 * looking for the next oid in its children.
1270		 *
1271		 * For CTL_SYSCTL_NEXTNOSKIP we are done.
1272		 *
1273		 * For CTL_SYSCTL_NEXT we skip CTLTYPE_NODE (unless it
1274		 * has a handler) and move on to the children.
1275		 */
1276		if (!honor_skip)
1277			return (ITER_FOUND);
1278		if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
1279			return (ITER_FOUND);
1280		/* If node does not have an iterator, treat it as leaf */
1281		if (oidp->oid_handler)
1282			return (ITER_FOUND);
1283
1284		/* Report oid as a node to iterate */
1285		return (ITER_CHILDREN);
1286	}
1287
1288	/*
1289	 * No match yet. Continue seeking the given name.
1290	 *
1291	 * We are iterating in order by oid_number, so skip oids lower
1292	 * than the one we are looking for.
1293	 *
1294	 * When the current oid_number is higher than the one we seek,
1295	 * that means we have reached the next oid in the sequence and
1296	 * should return it.
1297	 *
1298	 * If the oid_number matches the name at this level then we
1299	 * have to find a node to continue searching at the next level.
1300	 */
1301	if (oidp->oid_number < *name)
1302		return (ITER_SIBLINGS);
1303	if (oidp->oid_number > *name) {
1304		/*
1305		 * We have reached the next oid.
1306		 *
1307		 * For CTL_SYSCTL_NEXTNOSKIP we are done.
1308		 *
1309		 * For CTL_SYSCTL_NEXT we skip CTLTYPE_NODE (unless it
1310		 * has a handler) and move on to the children.
1311		 */
1312		if (!honor_skip)
1313			return (ITER_FOUND);
1314		if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
1315			return (ITER_FOUND);
1316		/* If node does not have an iterator, treat it as leaf */
1317		if (oidp->oid_handler)
1318			return (ITER_FOUND);
1319		return (ITER_CHILDREN);
1320	}
1321
1322	/* match at a current level */
1323	if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
1324		return (ITER_SIBLINGS);
1325	if (oidp->oid_handler)
1326		return (ITER_SIBLINGS);
1327
1328	return (ITER_CHILDREN);
1329}
1330
/*
 * Recursively walk the sysctl subtree at lsp until we find the given name.
 * Returns true and fills in next oid data in @next and @len if oid is found.
 *
 * @lsp        list of OIDs at the current tree level.
 * @name       remaining components of the requested OID (may be NULL when
 *             @namelen is 0).
 * @namelen    number of remaining components in @name.
 * @next       slot receiving the OID number chosen at this level; deeper
 *             levels write to the following slots.
 * @len        raised to the deepest @level that contributed to the result.
 * @level      1-based depth of @lsp in the tree.
 * @honor_skip forwarded to sysctl_sysctl_next_node().
 *
 * The sysctl lock must be held by the caller.
 */
static bool
sysctl_sysctl_next_action(struct sysctl_oid_list *lsp, int *name, u_int namelen,
    int *next, int *len, int level, bool honor_skip)
{
	struct sysctl_oid_list *next_lsp;
	struct sysctl_oid *oidp = NULL, key;
	bool success = false;
	enum sysctl_iter_action action;

	SYSCTL_ASSERT_LOCKED();
	/*
	 * Start the search at the requested oid.  But if not found, then scan
	 * through all children.
	 */
	if (namelen > 0) {
		key.oid_number = *name;
		oidp = RB_FIND(sysctl_oid_list, lsp, &key);
	}
	if (!oidp)
		oidp = RB_MIN(sysctl_oid_list, lsp);
	for(; oidp != NULL; oidp = RB_NEXT(sysctl_oid_list, lsp, oidp)) {
		action = sysctl_sysctl_next_node(oidp, name, namelen,
		    honor_skip);
		if (action == ITER_SIBLINGS)
			continue;
		if (action == ITER_FOUND) {
			success = true;
			break;
		}
		KASSERT((action== ITER_CHILDREN), ("ret(%d)!=ITER_CHILDREN", action));

		/* Descend: the answer, if any, lies below this node. */
		next_lsp = SYSCTL_CHILDREN(oidp);
		if (namelen == 0) {
			/* No name left to match: take the first usable child. */
			success = sysctl_sysctl_next_action(next_lsp, NULL, 0,
			    next + 1, len, level + 1, honor_skip);
		} else {
			/* Keep matching the rest of the name one level down. */
			success = sysctl_sysctl_next_action(next_lsp, name + 1,
			    namelen - 1, next + 1, len, level + 1, honor_skip);
			if (!success) {

				/*
				 * We maintain the invariant that current node oid
				 * is >= the oid provided in @name.
				 * As there are no usable children at this node,
				 *  current node oid is strictly > than the requested
				 *  oid.
				 * Hence, reduce namelen to 0 to allow for picking first
				 *  nodes/leafs in the next node in list.
				 */
				namelen = 0;
			}
		}
		if (success)
			break;
	}

	/*
	 * On success oidp is the OID selected at this level (either the match
	 * itself or the ancestor of a match found deeper down); record it.
	 */
	if (success) {
		*next = oidp->oid_number;
		if (level > *len)
			*len = level;
	}

	return (success);
}
1399
1400static int
1401sysctl_sysctl_next(SYSCTL_HANDLER_ARGS)
1402{
1403	int *name = (int *) arg1;
1404	u_int namelen = arg2;
1405	int len, error;
1406	bool success;
1407	struct sysctl_oid_list *lsp = &sysctl__children;
1408	struct rm_priotracker tracker;
1409	int next[CTL_MAXNAME];
1410
1411	len = 0;
1412	SYSCTL_RLOCK(&tracker);
1413	success = sysctl_sysctl_next_action(lsp, name, namelen, next, &len, 1,
1414	    oidp->oid_number == CTL_SYSCTL_NEXT);
1415	SYSCTL_RUNLOCK(&tracker);
1416	if (!success)
1417		return (ENOENT);
1418	error = SYSCTL_OUT(req, next, len * sizeof (int));
1419	return (error);
1420}
1421
/*
 * Register sysctl_sysctl_next() for both iteration nodes.  The handler
 * tells them apart by its own OID number: CTL_SYSCTL_NEXT honors
 * CTLFLAG_SKIP, CTL_SYSCTL_NEXTNOSKIP does not.
 *
 * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in
 * capability mode.
 */
static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXT, next, CTLFLAG_RD |
    CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, "");

static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXTNOSKIP, nextnoskip, CTLFLAG_RD |
    CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, "");
1431
1432static int
1433name2oid(const char *name, int *oid, int *len, struct sysctl_oid **oidpp)
1434{
1435	struct sysctl_oid *oidp;
1436	struct sysctl_oid_list *lsp = &sysctl__children;
1437	const char *n;
1438
1439	SYSCTL_ASSERT_LOCKED();
1440
1441	for (*len = 0; *len < CTL_MAXNAME;) {
1442		n = strchrnul(name, '.');
1443		oidp = sysctl_find_oidnamelen(name, n - name, lsp);
1444		if (oidp == NULL)
1445			return (ENOENT);
1446		*oid++ = oidp->oid_number;
1447		(*len)++;
1448
1449		name = n;
1450		if (*name == '.')
1451			name++;
1452		if (*name == '\0') {
1453			if (oidpp)
1454				*oidpp = oidp;
1455			return (0);
1456		}
1457
1458		if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
1459			break;
1460
1461		if (oidp->oid_handler)
1462			break;
1463
1464		lsp = SYSCTL_CHILDREN(oidp);
1465	}
1466	return (ENOENT);
1467}
1468
1469static int
1470sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS)
1471{
1472	char *p;
1473	int error, oid[CTL_MAXNAME], len = 0;
1474	struct sysctl_oid *op = NULL;
1475	struct rm_priotracker tracker;
1476	char buf[32];
1477
1478	if (!req->newlen)
1479		return (ENOENT);
1480	if (req->newlen >= MAXPATHLEN)	/* XXX arbitrary, undocumented */
1481		return (ENAMETOOLONG);
1482
1483	p = buf;
1484	if (req->newlen >= sizeof(buf))
1485		p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
1486
1487	error = SYSCTL_IN(req, p, req->newlen);
1488	if (error) {
1489		if (p != buf)
1490			free(p, M_SYSCTL);
1491		return (error);
1492	}
1493
1494	p [req->newlen] = '\0';
1495
1496	SYSCTL_RLOCK(&tracker);
1497	error = name2oid(p, oid, &len, &op);
1498	SYSCTL_RUNLOCK(&tracker);
1499
1500	if (p != buf)
1501		free(p, M_SYSCTL);
1502
1503	if (error)
1504		return (error);
1505
1506	error = SYSCTL_OUT(req, oid, len * sizeof *oid);
1507	return (error);
1508}
1509
/*
 * Register sysctl_sysctl_name2oid() as {CTL_SYSCTL, CTL_SYSCTL_NAME2OID}.
 * The node is read-write because the name to translate arrives as the new
 * value; CTLFLAG_ANYBODY exempts that write from the privilege check and
 * CTLFLAG_CAPRW allows it in capability mode.
 *
 * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in
 * capability mode.
 */
SYSCTL_PROC(_sysctl, CTL_SYSCTL_NAME2OID, name2oid, CTLTYPE_INT | CTLFLAG_RW |
    CTLFLAG_ANYBODY | CTLFLAG_MPSAFE | CTLFLAG_CAPRW, 0, 0,
    sysctl_sysctl_name2oid, "I", "");
1517
1518static int
1519sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS)
1520{
1521	struct sysctl_oid *oid;
1522	struct rm_priotracker tracker;
1523	int error;
1524
1525	error = sysctl_wire_old_buffer(req, 0);
1526	if (error)
1527		return (error);
1528
1529	SYSCTL_RLOCK(&tracker);
1530	error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
1531	if (error)
1532		goto out;
1533
1534	if (oid->oid_fmt == NULL) {
1535		error = ENOENT;
1536		goto out;
1537	}
1538	error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind));
1539	if (error)
1540		goto out;
1541	error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1);
1542 out:
1543	SYSCTL_RUNLOCK(&tracker);
1544	return (error);
1545}
1546
1547static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDFMT, oidfmt, CTLFLAG_RD |
1548    CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, "");
1549
1550static int
1551sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS)
1552{
1553	struct sysctl_oid *oid;
1554	struct rm_priotracker tracker;
1555	int error;
1556
1557	error = sysctl_wire_old_buffer(req, 0);
1558	if (error)
1559		return (error);
1560
1561	SYSCTL_RLOCK(&tracker);
1562	error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
1563	if (error)
1564		goto out;
1565
1566	if (oid->oid_descr == NULL) {
1567		error = ENOENT;
1568		goto out;
1569	}
1570	error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1);
1571 out:
1572	SYSCTL_RUNLOCK(&tracker);
1573	return (error);
1574}
1575
1576static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDDESCR, oiddescr, CTLFLAG_RD |
1577    CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oiddescr, "");
1578
1579static int
1580sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS)
1581{
1582	struct sysctl_oid *oid;
1583	struct rm_priotracker tracker;
1584	int error;
1585
1586	error = sysctl_wire_old_buffer(req, 0);
1587	if (error)
1588		return (error);
1589
1590	SYSCTL_RLOCK(&tracker);
1591	error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
1592	if (error)
1593		goto out;
1594
1595	if (oid->oid_label == NULL) {
1596		error = ENOENT;
1597		goto out;
1598	}
1599	error = SYSCTL_OUT(req, oid->oid_label, strlen(oid->oid_label) + 1);
1600 out:
1601	SYSCTL_RUNLOCK(&tracker);
1602	return (error);
1603}
1604
1605static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDLABEL, oidlabel, CTLFLAG_RD |
1606    CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, "");
1607
1608/*
1609 * Default "handler" functions.
1610 */
1611
1612/*
1613 * Handle a bool.
1614 * Two cases:
1615 *     a variable:  point arg1 at it.
1616 *     a constant:  pass it in arg2.
1617 */
1618
1619int
1620sysctl_handle_bool(SYSCTL_HANDLER_ARGS)
1621{
1622	uint8_t temp;
1623	int error;
1624
1625	/*
1626	 * Attempt to get a coherent snapshot by making a copy of the data.
1627	 */
1628	if (arg1)
1629		temp = *(bool *)arg1 ? 1 : 0;
1630	else
1631		temp = arg2 ? 1 : 0;
1632
1633	error = SYSCTL_OUT(req, &temp, sizeof(temp));
1634	if (error || !req->newptr)
1635		return (error);
1636
1637	if (!arg1)
1638		error = EPERM;
1639	else {
1640		error = SYSCTL_IN(req, &temp, sizeof(temp));
1641		if (!error)
1642			*(bool *)arg1 = temp ? 1 : 0;
1643	}
1644	return (error);
1645}
1646
1647/*
1648 * Handle an int8_t, signed or unsigned.
1649 * Two cases:
1650 *     a variable:  point arg1 at it.
1651 *     a constant:  pass it in arg2.
1652 */
1653
1654int
1655sysctl_handle_8(SYSCTL_HANDLER_ARGS)
1656{
1657	int8_t tmpout;
1658	int error = 0;
1659
1660	/*
1661	 * Attempt to get a coherent snapshot by making a copy of the data.
1662	 */
1663	if (arg1)
1664		tmpout = *(int8_t *)arg1;
1665	else
1666		tmpout = arg2;
1667	error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout));
1668
1669	if (error || !req->newptr)
1670		return (error);
1671
1672	if (!arg1)
1673		error = EPERM;
1674	else
1675		error = SYSCTL_IN(req, arg1, sizeof(tmpout));
1676	return (error);
1677}
1678
1679/*
1680 * Handle an int16_t, signed or unsigned.
1681 * Two cases:
1682 *     a variable:  point arg1 at it.
1683 *     a constant:  pass it in arg2.
1684 */
1685
1686int
1687sysctl_handle_16(SYSCTL_HANDLER_ARGS)
1688{
1689	int16_t tmpout;
1690	int error = 0;
1691
1692	/*
1693	 * Attempt to get a coherent snapshot by making a copy of the data.
1694	 */
1695	if (arg1)
1696		tmpout = *(int16_t *)arg1;
1697	else
1698		tmpout = arg2;
1699	error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout));
1700
1701	if (error || !req->newptr)
1702		return (error);
1703
1704	if (!arg1)
1705		error = EPERM;
1706	else
1707		error = SYSCTL_IN(req, arg1, sizeof(tmpout));
1708	return (error);
1709}
1710
1711/*
1712 * Handle an int32_t, signed or unsigned.
1713 * Two cases:
1714 *     a variable:  point arg1 at it.
1715 *     a constant:  pass it in arg2.
1716 */
1717
1718int
1719sysctl_handle_32(SYSCTL_HANDLER_ARGS)
1720{
1721	int32_t tmpout;
1722	int error = 0;
1723
1724	/*
1725	 * Attempt to get a coherent snapshot by making a copy of the data.
1726	 */
1727	if (arg1)
1728		tmpout = *(int32_t *)arg1;
1729	else
1730		tmpout = arg2;
1731	error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout));
1732
1733	if (error || !req->newptr)
1734		return (error);
1735
1736	if (!arg1)
1737		error = EPERM;
1738	else
1739		error = SYSCTL_IN(req, arg1, sizeof(tmpout));
1740	return (error);
1741}
1742
1743/*
1744 * Handle an int, signed or unsigned.
1745 * Two cases:
1746 *     a variable:  point arg1 at it.
1747 *     a constant:  pass it in arg2.
1748 */
1749
1750int
1751sysctl_handle_int(SYSCTL_HANDLER_ARGS)
1752{
1753	int tmpout, error = 0;
1754
1755	/*
1756	 * Attempt to get a coherent snapshot by making a copy of the data.
1757	 */
1758	if (arg1)
1759		tmpout = *(int *)arg1;
1760	else
1761		tmpout = arg2;
1762	error = SYSCTL_OUT(req, &tmpout, sizeof(int));
1763
1764	if (error || !req->newptr)
1765		return (error);
1766
1767	if (!arg1)
1768		error = EPERM;
1769	else
1770		error = SYSCTL_IN(req, arg1, sizeof(int));
1771	return (error);
1772}
1773
1774/*
1775 * Based on sysctl_handle_int() convert milliseconds into ticks.
1776 * Note: this is used by TCP.
1777 */
1778
1779int
1780sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
1781{
1782	int error, s, tt;
1783
1784	tt = *(int *)arg1;
1785	s = (int)((int64_t)tt * 1000 / hz);
1786
1787	error = sysctl_handle_int(oidp, &s, 0, req);
1788	if (error || !req->newptr)
1789		return (error);
1790
1791	tt = (int)((int64_t)s * hz / 1000);
1792	if (tt < 1)
1793		return (EINVAL);
1794
1795	*(int *)arg1 = tt;
1796	return (0);
1797}
1798
1799/*
1800 * Handle a long, signed or unsigned.
1801 * Two cases:
1802 *     a variable:  point arg1 at it.
1803 *     a constant:  pass it in arg2.
1804 */
1805
1806int
1807sysctl_handle_long(SYSCTL_HANDLER_ARGS)
1808{
1809	int error = 0;
1810	long tmplong;
1811#ifdef SCTL_MASK32
1812	int tmpint;
1813#endif
1814
1815	/*
1816	 * Attempt to get a coherent snapshot by making a copy of the data.
1817	 */
1818	if (arg1)
1819		tmplong = *(long *)arg1;
1820	else
1821		tmplong = arg2;
1822#ifdef SCTL_MASK32
1823	if (req->flags & SCTL_MASK32) {
1824		tmpint = tmplong;
1825		error = SYSCTL_OUT(req, &tmpint, sizeof(int));
1826	} else
1827#endif
1828		error = SYSCTL_OUT(req, &tmplong, sizeof(long));
1829
1830	if (error || !req->newptr)
1831		return (error);
1832
1833	if (!arg1)
1834		error = EPERM;
1835#ifdef SCTL_MASK32
1836	else if (req->flags & SCTL_MASK32) {
1837		error = SYSCTL_IN(req, &tmpint, sizeof(int));
1838		*(long *)arg1 = (long)tmpint;
1839	}
1840#endif
1841	else
1842		error = SYSCTL_IN(req, arg1, sizeof(long));
1843	return (error);
1844}
1845
1846/*
1847 * Handle a 64 bit int, signed or unsigned.
1848 * Two cases:
1849 *     a variable:  point arg1 at it.
1850 *     a constant:  pass it in arg2.
1851 */
1852int
1853sysctl_handle_64(SYSCTL_HANDLER_ARGS)
1854{
1855	int error = 0;
1856	uint64_t tmpout;
1857
1858	/*
1859	 * Attempt to get a coherent snapshot by making a copy of the data.
1860	 */
1861	if (arg1)
1862		tmpout = *(uint64_t *)arg1;
1863	else
1864		tmpout = arg2;
1865	error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t));
1866
1867	if (error || !req->newptr)
1868		return (error);
1869
1870	if (!arg1)
1871		error = EPERM;
1872	else
1873		error = SYSCTL_IN(req, arg1, sizeof(uint64_t));
1874	return (error);
1875}
1876
/*
 * Handle our generic '\0' terminated 'C' string.
 * Two cases:
 * 	a variable string:  point arg1 at it, arg2 is max length.
 * 	a constant string:  point arg1 at it, arg2 is zero.
 *
 * The current string (including its terminating NUL) is copied out first,
 * or only its size is reported when no old buffer was supplied.  A new
 * value, if present, must be strictly shorter than arg2 bytes so that the
 * terminating NUL appended here still fits; otherwise EINVAL is returned.
 * Accesses to writable strings are serialized with sysctlstringlock.
 */

int
sysctl_handle_string(SYSCTL_HANDLER_ARGS)
{
	char *tmparg;
	size_t outlen;
	int error = 0, ro_string = 0;

	/*
	 * If the sysctl isn't writable and isn't a preallocated tunable that
	 * can be modified by kenv(2), microoptimise and treat it as a
	 * read-only string.
	 * A zero-length buffer indicates a fixed size read-only
	 * string.  In ddb, don't worry about trying to make a malloced
	 * snapshot.
	 */
	if ((oidp->oid_kind & (CTLFLAG_WR | CTLFLAG_TUN)) == 0 ||
	    arg2 == 0 || kdb_active) {
		/* From here on arg2 is the current string size, NUL included. */
		arg2 = strlen((char *)arg1) + 1;
		ro_string = 1;
	}

	if (req->oldptr != NULL) {
		if (ro_string) {
			/* Read-only strings are output in place, unlocked. */
			tmparg = arg1;
			outlen = strlen(tmparg) + 1;
		} else {
			/*
			 * Take a private copy of the whole buffer under the
			 * shared lock and output from that copy after the
			 * lock has been dropped.
			 */
			tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK);
			sx_slock(&sysctlstringlock);
			memcpy(tmparg, arg1, arg2);
			sx_sunlock(&sysctlstringlock);
			/*
			 * NOTE(review): relies on the buffer holding a NUL
			 * within its first arg2 bytes -- confirm with callers.
			 */
			outlen = strlen(tmparg) + 1;
		}

		error = SYSCTL_OUT(req, tmparg, outlen);

		if (!ro_string)
			free(tmparg, M_SYSCTLTMP);
	} else {
		/* No output buffer: only account for the size needed. */
		if (!ro_string)
			sx_slock(&sysctlstringlock);
		outlen = strlen((char *)arg1) + 1;
		if (!ro_string)
			sx_sunlock(&sysctlstringlock);
		error = SYSCTL_OUT(req, NULL, outlen);
	}
	if (error || !req->newptr)
		return (error);

	/*
	 * NOTE(review): if newlen and newidx are unsigned (size_t), the
	 * "< 0" test below can never be true -- confirm against the
	 * declaration of struct sysctl_req.
	 */
	if (req->newlen - req->newidx >= arg2 ||
	    req->newlen - req->newidx < 0) {
		error = EINVAL;
	} else if (req->newlen - req->newidx == 0) {
		/* An empty new value resets the string to "". */
		sx_xlock(&sysctlstringlock);
		((char *)arg1)[0] = '\0';
		sx_xunlock(&sysctlstringlock);
	} else if (req->newfunc == sysctl_new_kernel) {
		/*
		 * Kernel-originated request: read the new value directly
		 * into the destination while holding the exclusive lock.
		 */
		arg2 = req->newlen - req->newidx;
		sx_xlock(&sysctlstringlock);
		error = SYSCTL_IN(req, arg1, arg2);
		if (error == 0) {
			((char *)arg1)[arg2] = '\0';
			req->newidx += arg2;
		}
		sx_xunlock(&sysctlstringlock);
	} else {
		/*
		 * Otherwise stage the new value in a temporary buffer first,
		 * so SYSCTL_IN() runs without sysctlstringlock held, and
		 * install it under the exclusive lock afterwards.
		 */
		arg2 = req->newlen - req->newidx;
		tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK);

		error = SYSCTL_IN(req, tmparg, arg2);
		if (error) {
			free(tmparg, M_SYSCTLTMP);
			return (error);
		}

		sx_xlock(&sysctlstringlock);
		memcpy(arg1, tmparg, arg2);
		((char *)arg1)[arg2] = '\0';
		sx_xunlock(&sysctlstringlock);
		free(tmparg, M_SYSCTLTMP);
		req->newidx += arg2;
	}
	return (error);
}
1967
1968/*
1969 * Handle any kind of opaque data.
1970 * arg1 points to it, arg2 is the size.
1971 */
1972
1973int
1974sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
1975{
1976	int error, tries;
1977	u_int generation;
1978	struct sysctl_req req2;
1979
1980	/*
1981	 * Attempt to get a coherent snapshot, by using the thread
1982	 * pre-emption counter updated from within mi_switch() to
1983	 * determine if we were pre-empted during a bcopy() or
1984	 * copyout(). Make 3 attempts at doing this before giving up.
1985	 * If we encounter an error, stop immediately.
1986	 */
1987	tries = 0;
1988	req2 = *req;
1989retry:
1990	generation = curthread->td_generation;
1991	error = SYSCTL_OUT(req, arg1, arg2);
1992	if (error)
1993		return (error);
1994	tries++;
1995	if (generation != curthread->td_generation && tries < 3) {
1996		*req = req2;
1997		goto retry;
1998	}
1999
2000	error = SYSCTL_IN(req, arg1, arg2);
2001
2002	return (error);
2003}
2004
2005/*
2006 * Based on sysctl_handle_64() convert microseconds to a sbintime.
2007 */
2008int
2009sysctl_usec_to_sbintime(SYSCTL_HANDLER_ARGS)
2010{
2011	int error;
2012	int64_t usec;
2013
2014	usec = sbttous(*(sbintime_t *)arg1);
2015
2016	error = sysctl_handle_64(oidp, &usec, 0, req);
2017	if (error || !req->newptr)
2018		return (error);
2019
2020	*(sbintime_t *)arg1 = ustosbt(usec);
2021
2022	return (0);
2023}
2024
2025/*
2026 * Based on sysctl_handle_64() convert milliseconds to a sbintime.
2027 */
2028int
2029sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS)
2030{
2031	int error;
2032	int64_t msec;
2033
2034	msec = sbttoms(*(sbintime_t *)arg1);
2035
2036	error = sysctl_handle_64(oidp, &msec, 0, req);
2037	if (error || !req->newptr)
2038		return (error);
2039
2040	*(sbintime_t *)arg1 = mstosbt(msec);
2041
2042	return (0);
2043}
2044
2045/*
2046 * Convert seconds to a struct timeval.  Intended for use with
2047 * intervals and thus does not permit negative seconds.
2048 */
2049int
2050sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS)
2051{
2052	struct timeval *tv;
2053	int error, secs;
2054
2055	tv = arg1;
2056	secs = tv->tv_sec;
2057
2058	error = sysctl_handle_int(oidp, &secs, 0, req);
2059	if (error || req->newptr == NULL)
2060		return (error);
2061
2062	if (secs < 0)
2063		return (EINVAL);
2064	tv->tv_sec = secs;
2065
2066	return (0);
2067}
2068
2069/*
2070 * Transfer functions to/from kernel space.
2071 * XXX: rather untested at this point
2072 */
2073static int
2074sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
2075{
2076	size_t i = 0;
2077
2078	if (req->oldptr) {
2079		i = l;
2080		if (req->oldlen <= req->oldidx)
2081			i = 0;
2082		else
2083			if (i > req->oldlen - req->oldidx)
2084				i = req->oldlen - req->oldidx;
2085		if (i > 0)
2086			bcopy(p, (char *)req->oldptr + req->oldidx, i);
2087	}
2088	req->oldidx += l;
2089	if (req->oldptr && i != l)
2090		return (ENOMEM);
2091	return (0);
2092}
2093
2094static int
2095sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
2096{
2097	if (!req->newptr)
2098		return (0);
2099	if (req->newlen - req->newidx < l)
2100		return (EINVAL);
2101	bcopy((const char *)req->newptr + req->newidx, p, l);
2102	req->newidx += l;
2103	return (0);
2104}
2105
2106int
2107kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
2108    size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags)
2109{
2110	int error = 0;
2111	struct sysctl_req req;
2112
2113	bzero(&req, sizeof req);
2114
2115	req.td = td;
2116	req.flags = flags;
2117
2118	if (oldlenp) {
2119		req.oldlen = *oldlenp;
2120	}
2121	req.validlen = req.oldlen;
2122
2123	if (old) {
2124		req.oldptr= old;
2125	}
2126
2127	if (new != NULL) {
2128		req.newlen = newlen;
2129		req.newptr = new;
2130	}
2131
2132	req.oldfunc = sysctl_old_kernel;
2133	req.newfunc = sysctl_new_kernel;
2134	req.lock = REQ_UNWIRED;
2135
2136	error = sysctl_root(0, name, namelen, &req);
2137
2138	if (req.lock == REQ_WIRED && req.validlen > 0)
2139		vsunlock(req.oldptr, req.validlen);
2140
2141	if (error && error != ENOMEM)
2142		return (error);
2143
2144	if (retval) {
2145		if (req.oldptr && req.oldidx > req.validlen)
2146			*retval = req.validlen;
2147		else
2148			*retval = req.oldidx;
2149	}
2150	return (error);
2151}
2152
2153int
2154kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
2155    void *new, size_t newlen, size_t *retval, int flags)
2156{
2157        int oid[CTL_MAXNAME];
2158        size_t oidlen, plen;
2159	int error;
2160
2161	oid[0] = CTL_SYSCTL;
2162	oid[1] = CTL_SYSCTL_NAME2OID;
2163	oidlen = sizeof(oid);
2164
2165	error = kernel_sysctl(td, oid, 2, oid, &oidlen,
2166	    (void *)name, strlen(name), &plen, flags);
2167	if (error)
2168		return (error);
2169
2170	error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp,
2171	    new, newlen, retval, flags);
2172	return (error);
2173}
2174
2175/*
2176 * Transfer function to/from user space.
2177 */
2178static int
2179sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
2180{
2181	size_t i, len, origidx;
2182	int error;
2183
2184	origidx = req->oldidx;
2185	req->oldidx += l;
2186	if (req->oldptr == NULL)
2187		return (0);
2188	/*
2189	 * If we have not wired the user supplied buffer and we are currently
2190	 * holding locks, drop a witness warning, as it's possible that
2191	 * write operations to the user page can sleep.
2192	 */
2193	if (req->lock != REQ_WIRED)
2194		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2195		    "sysctl_old_user()");
2196	i = l;
2197	len = req->validlen;
2198	if (len <= origidx)
2199		i = 0;
2200	else {
2201		if (i > len - origidx)
2202			i = len - origidx;
2203		if (req->lock == REQ_WIRED) {
2204			error = copyout_nofault(p, (char *)req->oldptr +
2205			    origidx, i);
2206		} else
2207			error = copyout(p, (char *)req->oldptr + origidx, i);
2208		if (error != 0)
2209			return (error);
2210	}
2211	if (i < l)
2212		return (ENOMEM);
2213	return (0);
2214}
2215
2216static int
2217sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
2218{
2219	int error;
2220
2221	if (!req->newptr)
2222		return (0);
2223	if (req->newlen - req->newidx < l)
2224		return (EINVAL);
2225	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2226	    "sysctl_new_user()");
2227	error = copyin((const char *)req->newptr + req->newidx, p, l);
2228	req->newidx += l;
2229	return (error);
2230}
2231
2232/*
2233 * Wire the user space destination buffer.  If set to a value greater than
2234 * zero, the len parameter limits the maximum amount of wired memory.
2235 */
2236int
2237sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
2238{
2239	int ret;
2240	size_t wiredlen;
2241
2242	wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen;
2243	ret = 0;
2244	if (req->lock != REQ_WIRED && req->oldptr &&
2245	    req->oldfunc == sysctl_old_user) {
2246		if (wiredlen != 0) {
2247			ret = vslock(req->oldptr, wiredlen);
2248			if (ret != 0) {
2249				if (ret != ENOMEM)
2250					return (ret);
2251				wiredlen = 0;
2252			}
2253		}
2254		req->lock = REQ_WIRED;
2255		req->validlen = wiredlen;
2256	}
2257	return (0);
2258}
2259
2260int
2261sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
2262    int *nindx, struct sysctl_req *req)
2263{
2264	struct sysctl_oid_list *lsp;
2265	struct sysctl_oid *oid;
2266	struct sysctl_oid key;
2267	int indx;
2268
2269	SYSCTL_ASSERT_LOCKED();
2270	lsp = &sysctl__children;
2271	indx = 0;
2272	while (indx < CTL_MAXNAME) {
2273		key.oid_number = name[indx];
2274		oid = RB_FIND(sysctl_oid_list, lsp, &key);
2275		if (oid == NULL)
2276			return (ENOENT);
2277
2278		indx++;
2279		if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
2280			if (oid->oid_handler != NULL || indx == namelen) {
2281				*noid = oid;
2282				if (nindx != NULL)
2283					*nindx = indx;
2284				KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
2285				    ("%s found DYING node %p", __func__, oid));
2286				return (0);
2287			}
2288			lsp = SYSCTL_CHILDREN(oid);
2289		} else if (indx == namelen) {
2290			if ((oid->oid_kind & CTLFLAG_DORMANT) != 0)
2291				return (ENOENT);
2292			*noid = oid;
2293			if (nindx != NULL)
2294				*nindx = indx;
2295			KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
2296			    ("%s found DYING node %p", __func__, oid));
2297			return (0);
2298		} else {
2299			return (ENOTDIR);
2300		}
2301	}
2302	return (ENOENT);
2303}
2304
2305/*
2306 * Traverse our tree, and find the right node, execute whatever it points
2307 * to, and return the resulting error code.
2308 */
2309
2310static int
2311sysctl_root(SYSCTL_HANDLER_ARGS)
2312{
2313	struct sysctl_oid *oid;
2314	struct rm_priotracker tracker;
2315	int error, indx, lvl;
2316
2317	SYSCTL_RLOCK(&tracker);
2318
2319	error = sysctl_find_oid(arg1, arg2, &oid, &indx, req);
2320	if (error)
2321		goto out;
2322
2323	if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
2324		/*
2325		 * You can't call a sysctl when it's a node, but has
2326		 * no handler.  Inform the user that it's a node.
2327		 * The indx may or may not be the same as namelen.
2328		 */
2329		if (oid->oid_handler == NULL) {
2330			error = EISDIR;
2331			goto out;
2332		}
2333	}
2334
2335	/* Is this sysctl writable? */
2336	if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) {
2337		error = EPERM;
2338		goto out;
2339	}
2340
2341	KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL"));
2342
2343#ifdef CAPABILITY_MODE
2344	/*
2345	 * If the process is in capability mode, then don't permit reading or
2346	 * writing unless specifically granted for the node.
2347	 */
2348	if (IN_CAPABILITY_MODE(req->td)) {
2349		if ((req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD)) ||
2350		    (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))) {
2351			error = EPERM;
2352			goto out;
2353		}
2354	}
2355#endif
2356
2357	/* Is this sysctl sensitive to securelevels? */
2358	if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) {
2359		lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE;
2360		error = securelevel_gt(req->td->td_ucred, lvl);
2361		if (error)
2362			goto out;
2363	}
2364
2365	/* Is this sysctl writable by only privileged users? */
2366	if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
2367		int priv;
2368
2369		if (oid->oid_kind & CTLFLAG_PRISON)
2370			priv = PRIV_SYSCTL_WRITEJAIL;
2371#ifdef VIMAGE
2372		else if ((oid->oid_kind & CTLFLAG_VNET) &&
2373		     prison_owns_vnet(req->td->td_ucred))
2374			priv = PRIV_SYSCTL_WRITEJAIL;
2375#endif
2376		else
2377			priv = PRIV_SYSCTL_WRITE;
2378		error = priv_check(req->td, priv);
2379		if (error)
2380			goto out;
2381	}
2382
2383	if (!oid->oid_handler) {
2384		error = EINVAL;
2385		goto out;
2386	}
2387
2388	if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
2389		arg1 = (int *)arg1 + indx;
2390		arg2 -= indx;
2391	} else {
2392		arg1 = oid->oid_arg1;
2393		arg2 = oid->oid_arg2;
2394	}
2395#ifdef MAC
2396	error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2,
2397	    req);
2398	if (error != 0)
2399		goto out;
2400#endif
2401#ifdef VIMAGE
2402	if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL)
2403		arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1);
2404#endif
2405	error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker);
2406
2407out:
2408	SYSCTL_RUNLOCK(&tracker);
2409	return (error);
2410}
2411
2412#ifndef _SYS_SYSPROTO_H_
2413struct __sysctl_args {
2414	int	*name;
2415	u_int	namelen;
2416	void	*old;
2417	size_t	*oldlenp;
2418	void	*new;
2419	size_t	newlen;
2420};
2421#endif
2422int
2423sys___sysctl(struct thread *td, struct __sysctl_args *uap)
2424{
2425	int error, i, name[CTL_MAXNAME];
2426	size_t j;
2427
2428	if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
2429		return (EINVAL);
2430
2431 	error = copyin(uap->name, &name, uap->namelen * sizeof(int));
2432 	if (error)
2433		return (error);
2434
2435	error = userland_sysctl(td, name, uap->namelen,
2436		uap->old, uap->oldlenp, 0,
2437		uap->new, uap->newlen, &j, 0);
2438	if (error && error != ENOMEM)
2439		return (error);
2440	if (uap->oldlenp) {
2441		i = copyout(&j, uap->oldlenp, sizeof(j));
2442		if (i)
2443			return (i);
2444	}
2445	return (error);
2446}
2447
2448int
2449kern___sysctlbyname(struct thread *td, const char *oname, size_t namelen,
2450    void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval,
2451    int flags, bool inkernel)
2452{
2453	int oid[CTL_MAXNAME];
2454	char namebuf[16];
2455	char *name;
2456	size_t oidlen;
2457	int error;
2458
2459	if (namelen > MAXPATHLEN || namelen == 0)
2460		return (EINVAL);
2461	name = namebuf;
2462	if (namelen > sizeof(namebuf))
2463		name = malloc(namelen, M_SYSCTL, M_WAITOK);
2464	error = copyin(oname, name, namelen);
2465	if (error != 0)
2466		goto out;
2467
2468	oid[0] = CTL_SYSCTL;
2469	oid[1] = CTL_SYSCTL_NAME2OID;
2470	oidlen = sizeof(oid);
2471	error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, namelen,
2472	    retval, flags);
2473	if (error != 0)
2474		goto out;
2475	error = userland_sysctl(td, oid, *retval / sizeof(int), old, oldlenp,
2476	    inkernel, new, newlen, retval, flags);
2477
2478out:
2479	if (namelen > sizeof(namebuf))
2480		free(name, M_SYSCTL);
2481	return (error);
2482}
2483
#ifndef	_SYS_SYSPROTO_H_
struct __sysctlbyname_args {
	const char	*name;
	size_t	namelen;
	void	*old;
	size_t	*oldlenp;
	void	*new;
	size_t	newlen;
};
#endif
/*
 * __sysctlbyname system call: forward the request to
 * kern___sysctlbyname() and, on success, copy the resulting length out
 * through uap->oldlenp if the caller supplied it.
 */
int
sys___sysctlbyname(struct thread *td, struct __sysctlbyname_args *uap)
{
	size_t outlen;
	int error;

	error = kern___sysctlbyname(td, uap->name, uap->namelen, uap->old,
	    uap->oldlenp, uap->new, uap->newlen, &outlen, 0, false);
	if (error == 0 && uap->oldlenp != NULL)
		error = copyout(&outlen, uap->oldlenp, sizeof(outlen));

	return (error);
}
2509
2510/*
2511 * This is used from various compatibility syscalls too.  That's why name
2512 * must be in kernel space.
2513 */
2514int
2515userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
2516    size_t *oldlenp, int inkernel, const void *new, size_t newlen,
2517    size_t *retval, int flags)
2518{
2519	int error = 0, memlocked;
2520	struct sysctl_req req;
2521
2522	bzero(&req, sizeof req);
2523
2524	req.td = td;
2525	req.flags = flags;
2526
2527	if (oldlenp) {
2528		if (inkernel) {
2529			req.oldlen = *oldlenp;
2530		} else {
2531			error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
2532			if (error)
2533				return (error);
2534		}
2535	}
2536	req.validlen = req.oldlen;
2537	req.oldptr = old;
2538
2539	if (new != NULL) {
2540		req.newlen = newlen;
2541		req.newptr = new;
2542	}
2543
2544	req.oldfunc = sysctl_old_user;
2545	req.newfunc = sysctl_new_user;
2546	req.lock = REQ_UNWIRED;
2547
2548#ifdef KTRACE
2549	if (KTRPOINT(curthread, KTR_SYSCTL))
2550		ktrsysctl(name, namelen);
2551#endif
2552	memlocked = 0;
2553	if (req.oldptr && req.oldlen > 4 * PAGE_SIZE) {
2554		memlocked = 1;
2555		sx_xlock(&sysctlmemlock);
2556	}
2557	CURVNET_SET(TD_TO_VNET(td));
2558
2559	for (;;) {
2560		req.oldidx = 0;
2561		req.newidx = 0;
2562		error = sysctl_root(0, name, namelen, &req);
2563		if (error != EAGAIN)
2564			break;
2565		kern_yield(PRI_USER);
2566	}
2567
2568	CURVNET_RESTORE();
2569
2570	if (req.lock == REQ_WIRED && req.validlen > 0)
2571		vsunlock(req.oldptr, req.validlen);
2572	if (memlocked)
2573		sx_xunlock(&sysctlmemlock);
2574
2575	if (error && error != ENOMEM)
2576		return (error);
2577
2578	if (retval) {
2579		if (req.oldptr && req.oldidx > req.validlen)
2580			*retval = req.validlen;
2581		else
2582			*retval = req.oldidx;
2583	}
2584	return (error);
2585}
2586
/*
 * sbuf drain callback that pushes the buffered bytes into a sysctl
 * request with SYSCTL_OUT.  Returns len when the copy succeeded and the
 * negated error otherwise.  The user buffer should be wired if a page
 * fault would cause issue.
 */
static int
sbuf_sysctl_drain(void *arg, const char *data, int len)
{
	struct sysctl_req *req;
	int error;

	req = arg;
	error = SYSCTL_OUT(req, data, len);
	KASSERT(error >= 0, ("Got unexpected negative value %d", error));
	if (error != 0)
		return (-error);
	return (len);
}
2601
2602struct sbuf *
2603sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length,
2604    struct sysctl_req *req)
2605{
2606
2607	/* Supply a default buffer size if none given. */
2608	if (buf == NULL && length == 0)
2609		length = 64;
2610	s = sbuf_new(s, buf, length, SBUF_FIXEDLEN | SBUF_INCLUDENUL);
2611	sbuf_set_drain(s, sbuf_sysctl_drain, req);
2612	return (s);
2613}
2614
2615#ifdef DDB
2616
/*
 * The current OID the debugger is working with.  Set by db_sysctl() for
 * the duration of a request and read by sysctl_old_ddb() to pick the
 * output format; reset to NULL afterwards.
 */
static struct sysctl_oid *g_ddb_oid;

/*
 * The current flags specified by the user (DB_SYSCTL_* bits).  Set and
 * cleared by db_sysctl() around each request.
 */
static int g_ddb_sysctl_flags;

/*
 * Check to see if the last sysctl printed.  Cleared by db_sysctl() before
 * a request and set to 1 by sysctl_old_ddb() once it is handed data.
 */
static int g_ddb_sysctl_printed;
2625
/*
 * Signedness table indexed by (oid_kind & CTLTYPE): 1 marks the integer
 * types that sysctl_old_ddb() prints as signed.  Entries not listed are
 * zero-initialized and therefore treated as unsigned.
 */
static const int ctl_sign[CTLTYPE+1] = {
	[CTLTYPE_INT] = 1,
	[CTLTYPE_LONG] = 1,
	[CTLTYPE_S8] = 1,
	[CTLTYPE_S16] = 1,
	[CTLTYPE_S32] = 1,
	[CTLTYPE_S64] = 1,
};
2634
/*
 * Element size in bytes, indexed by (oid_kind & CTLTYPE), for each integer
 * type.  sysctl_old_ddb() uses it as the stride when walking a buffer of
 * integers.  Entries not listed are zero-initialized.
 */
static const int ctl_size[CTLTYPE+1] = {
	[CTLTYPE_INT] = sizeof(int),
	[CTLTYPE_UINT] = sizeof(u_int),
	[CTLTYPE_LONG] = sizeof(long),
	[CTLTYPE_ULONG] = sizeof(u_long),
	[CTLTYPE_S8] = sizeof(int8_t),
	[CTLTYPE_S16] = sizeof(int16_t),
	[CTLTYPE_S32] = sizeof(int32_t),
	[CTLTYPE_S64] = sizeof(int64_t),
	[CTLTYPE_U8] = sizeof(uint8_t),
	[CTLTYPE_U16] = sizeof(uint16_t),
	[CTLTYPE_U32] = sizeof(uint32_t),
	[CTLTYPE_U64] = sizeof(uint64_t),
};
2649
/* Flags selectable by the user through command modifiers. */
#define DB_SYSCTL_NAME_ONLY	0x001	/* Compare with -N */
#define DB_SYSCTL_VALUE_ONLY	0x002	/* Compare with -n */
#define DB_SYSCTL_OPAQUE	0x004	/* Compare with -o */
#define DB_SYSCTL_HEX		0x008	/* Compare with -x */

/* Internal flag, added by db_show_sysctl_all() when walking a subtree. */
#define DB_SYSCTL_SAFE_ONLY	0x100	/* Only simple types */

/*
 * Modifier characters accepted by the ddb sysctl command.  This table is
 * parallel to db_sysctl_modif_values[]: entry i here selects flag i there.
 */
static const char db_sysctl_modifs[] = {
	'N', 'n', 'o', 'x',
};

/* Flag bit for each character in db_sysctl_modifs[], in the same order. */
static const int db_sysctl_modif_values[] = {
	DB_SYSCTL_NAME_ONLY, DB_SYSCTL_VALUE_ONLY,
	DB_SYSCTL_OPAQUE, DB_SYSCTL_HEX,
};
2665
/*
 * Handlers considered safe to print while recursing.  db_oid_safe()
 * compares an OID's handler against this list; OIDs with any other handler
 * are skipped when DB_SYSCTL_SAFE_ONLY is in effect.
 */
static int (* const db_safe_handlers[])(SYSCTL_HANDLER_ARGS) = {
	sysctl_handle_bool,
	sysctl_handle_8,
	sysctl_handle_16,
	sysctl_handle_32,
	sysctl_handle_64,
	sysctl_handle_int,
	sysctl_handle_long,
	sysctl_handle_string,
	sysctl_handle_opaque,
};
2678
2679/*
2680 * Use in place of sysctl_old_kernel to print sysctl values.
2681 *
2682 * Compare to the output handling in show_var from sbin/sysctl/sysctl.c
2683 */
2684static int
2685sysctl_old_ddb(struct sysctl_req *req, const void *ptr, size_t len)
2686{
2687	const u_char *val, *p;
2688	const char *sep1;
2689	size_t intlen, slen;
2690	uintmax_t umv;
2691	intmax_t mv;
2692	int sign, ctltype, hexlen, xflag, error;
2693
2694	/* Suppress false-positive GCC uninitialized variable warnings */
2695	mv = 0;
2696	umv = 0;
2697
2698	slen = len;
2699	val = p = ptr;
2700
2701	if (ptr == NULL) {
2702		error = 0;
2703		goto out;
2704	}
2705
2706	/* We are going to print */
2707	g_ddb_sysctl_printed = 1;
2708
2709	xflag = g_ddb_sysctl_flags & DB_SYSCTL_HEX;
2710
2711	ctltype = (g_ddb_oid->oid_kind & CTLTYPE);
2712	sign = ctl_sign[ctltype];
2713	intlen = ctl_size[ctltype];
2714
2715	switch (ctltype) {
2716	case CTLTYPE_NODE:
2717	case CTLTYPE_STRING:
2718		db_printf("%.*s", (int) len, (const char *) p);
2719		error = 0;
2720		goto out;
2721
2722	case CTLTYPE_INT:
2723	case CTLTYPE_UINT:
2724	case CTLTYPE_LONG:
2725	case CTLTYPE_ULONG:
2726	case CTLTYPE_S8:
2727	case CTLTYPE_S16:
2728	case CTLTYPE_S32:
2729	case CTLTYPE_S64:
2730	case CTLTYPE_U8:
2731	case CTLTYPE_U16:
2732	case CTLTYPE_U32:
2733	case CTLTYPE_U64:
2734		hexlen = 2 + (intlen * CHAR_BIT + 3) / 4;
2735		sep1 = "";
2736		while (len >= intlen) {
2737			switch (ctltype) {
2738			case CTLTYPE_INT:
2739			case CTLTYPE_UINT:
2740				umv = *(const u_int *)p;
2741				mv = *(const int *)p;
2742				break;
2743			case CTLTYPE_LONG:
2744			case CTLTYPE_ULONG:
2745				umv = *(const u_long *)p;
2746				mv = *(const long *)p;
2747				break;
2748			case CTLTYPE_S8:
2749			case CTLTYPE_U8:
2750				umv = *(const uint8_t *)p;
2751				mv = *(const int8_t *)p;
2752				break;
2753			case CTLTYPE_S16:
2754			case CTLTYPE_U16:
2755				umv = *(const uint16_t *)p;
2756				mv = *(const int16_t *)p;
2757				break;
2758			case CTLTYPE_S32:
2759			case CTLTYPE_U32:
2760				umv = *(const uint32_t *)p;
2761				mv = *(const int32_t *)p;
2762				break;
2763			case CTLTYPE_S64:
2764			case CTLTYPE_U64:
2765				umv = *(const uint64_t *)p;
2766				mv = *(const int64_t *)p;
2767				break;
2768			}
2769
2770			db_printf("%s", sep1);
2771			if (xflag)
2772				db_printf("%#0*jx", hexlen, umv);
2773			else if (!sign)
2774				db_printf("%ju", umv);
2775			else if (g_ddb_oid->oid_fmt[1] == 'K') {
2776				/* Kelvins are currently unsupported. */
2777				error = EOPNOTSUPP;
2778				goto out;
2779			} else
2780				db_printf("%jd", mv);
2781
2782			sep1 = " ";
2783			len -= intlen;
2784			p += intlen;
2785		}
2786		error = 0;
2787		goto out;
2788
2789	case CTLTYPE_OPAQUE:
2790		/* TODO: Support struct functions. */
2791
2792		/* FALLTHROUGH */
2793	default:
2794		db_printf("Format:%s Length:%zu Dump:0x",
2795		    g_ddb_oid->oid_fmt, len);
2796		while (len-- && (xflag || p < val + 16))
2797			db_printf("%02x", *p++);
2798		if (!xflag && len > 16)
2799			db_printf("...");
2800		error = 0;
2801		goto out;
2802	}
2803
2804out:
2805	req->oldidx += slen;
2806	return (error);
2807}
2808
2809/*
2810 * Avoid setting new sysctl values from the debugger
2811 */
2812static int
2813sysctl_new_ddb(struct sysctl_req *req, void *p, size_t l)
2814{
2815
2816	if (!req->newptr)
2817		return (0);
2818
2819	/* Changing sysctls from the debugger is currently unsupported */
2820	return (EPERM);
2821}
2822
2823/*
2824 * Run a sysctl handler with the DDB oldfunc and newfunc attached.
2825 * Instead of copying any output to a buffer we'll dump it right to
2826 * the console.
2827 */
2828static int
2829db_sysctl(struct sysctl_oid *oidp, int *name, u_int namelen,
2830    void *old, size_t *oldlenp, size_t *retval, int flags)
2831{
2832	struct sysctl_req req;
2833	int error;
2834
2835	/* Setup the request */
2836	bzero(&req, sizeof req);
2837	req.td = kdb_thread;
2838	req.oldfunc = sysctl_old_ddb;
2839	req.newfunc = sysctl_new_ddb;
2840	req.lock = REQ_UNWIRED;
2841	if (oldlenp) {
2842		req.oldlen = *oldlenp;
2843	}
2844	req.validlen = req.oldlen;
2845	if (old) {
2846		req.oldptr = old;
2847	}
2848
2849	/* Setup our globals for sysctl_old_ddb */
2850	g_ddb_oid = oidp;
2851	g_ddb_sysctl_flags = flags;
2852	g_ddb_sysctl_printed = 0;
2853
2854	error = sysctl_root(0, name, namelen, &req);
2855
2856	/* Reset globals */
2857	g_ddb_oid = NULL;
2858	g_ddb_sysctl_flags = 0;
2859
2860	if (retval) {
2861		if (req.oldptr && req.oldidx > req.validlen)
2862			*retval = req.validlen;
2863		else
2864			*retval = req.oldidx;
2865	}
2866	return (error);
2867}
2868
2869/*
2870 * Show a sysctl's name
2871 */
2872static void
2873db_show_oid_name(int *oid, size_t nlen)
2874{
2875	struct sysctl_oid *oidp;
2876	int qoid[CTL_MAXNAME + 2];
2877	int error;
2878
2879	qoid[0] = CTL_SYSCTL;
2880	qoid[1] = CTL_SYSCTL_NAME;
2881	memcpy(qoid + 2, oid, nlen * sizeof(int));
2882
2883	error = sysctl_find_oid(qoid, nlen + 2, &oidp, NULL, NULL);
2884	if (error)
2885		db_error("sysctl name oid");
2886
2887	error = db_sysctl(oidp, qoid, nlen + 2, NULL, NULL, NULL, 0);
2888	if (error)
2889		db_error("sysctl name");
2890}
2891
2892/*
2893 * Check to see if an OID is safe to print from ddb.
2894 */
2895static bool
2896db_oid_safe(const struct sysctl_oid *oidp)
2897{
2898	for (unsigned int i = 0; i < nitems(db_safe_handlers); ++i) {
2899		if (oidp->oid_handler == db_safe_handlers[i])
2900			return (true);
2901	}
2902
2903	return (false);
2904}
2905
2906/*
2907 * Show a sysctl at a specific OID
2908 * Compare to the input handling in show_var from sbin/sysctl/sysctl.c
2909 */
2910static int
2911db_show_oid(struct sysctl_oid *oidp, int *oid, size_t nlen, int flags)
2912{
2913	int error, xflag, oflag, Nflag, nflag;
2914	size_t len;
2915
2916	xflag = flags & DB_SYSCTL_HEX;
2917	oflag = flags & DB_SYSCTL_OPAQUE;
2918	nflag = flags & DB_SYSCTL_VALUE_ONLY;
2919	Nflag = flags & DB_SYSCTL_NAME_ONLY;
2920
2921	if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_OPAQUE &&
2922	    (!xflag && !oflag))
2923		return (0);
2924
2925	if (Nflag) {
2926		db_show_oid_name(oid, nlen);
2927		error = 0;
2928		goto out;
2929	}
2930
2931	if (!nflag) {
2932		db_show_oid_name(oid, nlen);
2933		db_printf(": ");
2934	}
2935
2936	if ((flags & DB_SYSCTL_SAFE_ONLY) && !db_oid_safe(oidp)) {
2937		db_printf("Skipping, unsafe to print while recursing.");
2938		error = 0;
2939		goto out;
2940	}
2941
2942	/* Try once, and ask about the size */
2943	len = 0;
2944	error = db_sysctl(oidp, oid, nlen,
2945	    NULL, NULL, &len, flags);
2946	if (error)
2947		goto out;
2948
2949	if (!g_ddb_sysctl_printed)
2950		/* Lie about the size */
2951		error = db_sysctl(oidp, oid, nlen,
2952		    (void *) 1, &len, NULL, flags);
2953
2954out:
2955	db_printf("\n");
2956	return (error);
2957}
2958
2959/*
2960 * Show all sysctls under a specific OID
2961 * Compare to sysctl_all from sbin/sysctl/sysctl.c
2962 */
2963static int
2964db_show_sysctl_all(int *oid, size_t len, int flags)
2965{
2966	struct sysctl_oid *oidp;
2967	int qoid[CTL_MAXNAME + 2], next[CTL_MAXNAME];
2968	size_t nlen;
2969
2970	qoid[0] = CTL_SYSCTL;
2971	qoid[1] = CTL_SYSCTL_NEXT;
2972	if (len) {
2973		nlen = len;
2974		memcpy(&qoid[2], oid, nlen * sizeof(int));
2975	} else {
2976		nlen = 1;
2977		qoid[2] = CTL_KERN;
2978	}
2979	for (;;) {
2980		int error;
2981		size_t nextsize = sizeof(next);
2982
2983		error = kernel_sysctl(kdb_thread, qoid, nlen + 2,
2984		    next, &nextsize, NULL, 0, &nlen, 0);
2985		if (error != 0) {
2986			if (error == ENOENT)
2987				return (0);
2988			else
2989				db_error("sysctl(next)");
2990		}
2991
2992		nlen /= sizeof(int);
2993
2994		if (nlen < (unsigned int)len)
2995			return (0);
2996
2997		if (memcmp(&oid[0], &next[0], len * sizeof(int)) != 0)
2998			return (0);
2999
3000		/* Find the OID in question */
3001		error = sysctl_find_oid(next, nlen, &oidp, NULL, NULL);
3002		if (error)
3003			return (error);
3004
3005		(void)db_show_oid(oidp, next, nlen, flags | DB_SYSCTL_SAFE_ONLY);
3006
3007		if (db_pager_quit)
3008			return (0);
3009
3010		memcpy(&qoid[2 + len], &next[len], (nlen - len) * sizeof(int));
3011	}
3012}
3013
3014/*
3015 * Show a sysctl by its user facing string
3016 */
3017static int
3018db_sysctlbyname(const char *name, int flags)
3019{
3020	struct sysctl_oid *oidp;
3021	int oid[CTL_MAXNAME];
3022	int error, nlen;
3023
3024	error = name2oid(name, oid, &nlen, &oidp);
3025	if (error) {
3026		return (error);
3027	}
3028
3029	if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
3030		db_show_sysctl_all(oid, nlen, flags);
3031	} else {
3032		error = db_show_oid(oidp, oid, nlen, flags);
3033	}
3034
3035	return (error);
3036}
3037
/*
 * Print the usage text of the ddb sysctl command.
 */
static void
db_sysctl_cmd_usage(void)
{
	db_printf(
	    " sysctl [/Nnox] <sysctl>					    \n"
	    "								    \n"
	    " <sysctl> The name of the sysctl to show.			    \n"
	    "								    \n"
	    " Show a sysctl by hooking into SYSCTL_IN and SYSCTL_OUT.	    \n"
	    " This will work for most sysctls, but should not be used	    \n"
	    " with sysctls that are known to malloc.			    \n"
	    "								    \n"
	    " While recursing any \"unsafe\" sysctls will be skipped.	    \n"
	    " Call sysctl directly on the sysctl to try printing the	    \n"
	    " skipped sysctl. This is unsafe and may make the ddb	    \n"
	    " session unusable.						    \n"
	    "								    \n"
	    " Arguments:						    \n"
	    "	/N	Display only the name of the sysctl.		    \n"
	    "	/n	Display only the value of the sysctl.		    \n"
	    "	/o	Display opaque values.				    \n"
	    "	/x	Display the sysctl in hex.			    \n"
	    "								    \n"
	    "For example:						    \n"
	    "sysctl vm.v_free_min					    \n"
	    /* The sample output names the same sysctl as the command. */
	    "vm.v_free_min: 12669					    \n"
	    );
}
3066
3067/*
3068 * Show a specific sysctl similar to sysctl (8).
3069 */
3070DB_COMMAND_FLAGS(sysctl, db_sysctl_cmd, CS_OWN)
3071{
3072	char name[TOK_STRING_SIZE];
3073	int error, i, t, flags;
3074
3075	/* Parse the modifiers */
3076	t = db_read_token();
3077	if (t == tSLASH || t == tMINUS) {
3078		t = db_read_token();
3079		if (t != tIDENT) {
3080			db_printf("Bad modifier\n");
3081			error = EINVAL;
3082			goto out;
3083		}
3084		db_strcpy(modif, db_tok_string);
3085	}
3086	else {
3087		db_unread_token(t);
3088		modif[0] = '\0';
3089	}
3090
3091	flags = 0;
3092	for (i = 0; i < nitems(db_sysctl_modifs); i++) {
3093		if (strchr(modif, db_sysctl_modifs[i])) {
3094			flags |= db_sysctl_modif_values[i];
3095		}
3096	}
3097
3098	/* Parse the sysctl names */
3099	t = db_read_token();
3100	if (t != tIDENT) {
3101		db_printf("Need sysctl name\n");
3102		error = EINVAL;
3103		goto out;
3104	}
3105
3106	/* Copy the name into a temporary buffer */
3107	db_strcpy(name, db_tok_string);
3108
3109	/* Ensure there is no trailing cruft */
3110	t = db_read_token();
3111	if (t != tEOL) {
3112		db_printf("Unexpected sysctl argument\n");
3113		error = EINVAL;
3114		goto out;
3115	}
3116
3117	error = db_sysctlbyname(name, flags);
3118	if (error == ENOENT) {
3119		db_printf("unknown oid: '%s'\n", db_tok_string);
3120		goto out;
3121	} else if (error) {
3122		db_printf("%s: error: %d\n", db_tok_string, error);
3123		goto out;
3124	}
3125
3126out:
3127	/* Ensure we eat all of our text */
3128	db_flush_lex();
3129
3130	if (error == EINVAL) {
3131		db_sysctl_cmd_usage();
3132	}
3133}
3134
3135#endif /* DDB */
3136