1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 1999 Poul-Henning Kamp.
5 * Copyright (c) 2008 Bjoern A. Zeeb.
6 * Copyright (c) 2009 James Gritton.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD$");
33
34#include "opt_ddb.h"
35#include "opt_inet.h"
36#include "opt_inet6.h"
37
38#include <sys/param.h>
39#include <sys/types.h>
40#include <sys/kernel.h>
41#include <sys/systm.h>
42#include <sys/errno.h>
43#include <sys/sysproto.h>
44#include <sys/malloc.h>
45#include <sys/osd.h>
46#include <sys/priv.h>
47#include <sys/proc.h>
48#include <sys/taskqueue.h>
49#include <sys/fcntl.h>
50#include <sys/jail.h>
51#include <sys/linker.h>
52#include <sys/lock.h>
53#include <sys/mutex.h>
54#include <sys/racct.h>
55#include <sys/rctl.h>
56#include <sys/refcount.h>
57#include <sys/sx.h>
58#include <sys/sysent.h>
59#include <sys/namei.h>
60#include <sys/mount.h>
61#include <sys/queue.h>
62#include <sys/socket.h>
63#include <sys/syscallsubr.h>
64#include <sys/sysctl.h>
65#include <sys/uuid.h>
66#include <sys/vnode.h>
67
68#include <net/if.h>
69#include <net/vnet.h>
70
71#include <netinet/in.h>
72
73#ifdef DDB
74#include <ddb/ddb.h>
75#endif /* DDB */
76
77#include <security/mac/mac_framework.h>
78
79#define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
80#define	PRISON0_HOSTUUID_MODULE	"hostuuid"
81
82MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
83static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
84
/*
 * Source address selection flags for whichever address families are
 * compiled in.  This keeps struct prison prison0 and some code in
 * kern_jail_set() readable.
 *
 * The replacement list is parenthesized so that the macro always behaves
 * as a single operand, even next to operators that bind tighter than '|'
 * (e.g. "flags & _PR_IP_SADDRSEL").
 */
#if defined(INET) && defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#elif defined(INET)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL)
#elif defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	0
#endif
99
100/* prison0 describes what is "real" about the system. */
101struct prison prison0 = {
102	.pr_id		= 0,
103	.pr_name	= "0",
104	.pr_ref		= 1,
105	.pr_uref	= 1,
106	.pr_path	= "/",
107	.pr_securelevel	= -1,
108	.pr_devfs_rsnum = 0,
109	.pr_state	= PRISON_STATE_ALIVE,
110	.pr_childmax	= JAIL_MAX,
111	.pr_hostuuid	= DEFAULT_HOSTUUID,
112	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
113#ifdef VIMAGE
114	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
115#else
116	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
117#endif
118	.pr_allow	= PR_ALLOW_ALL_STATIC,
119};
120MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
121
122struct bool_flags {
123	const char	*name;
124	const char	*noname;
125	volatile u_int	 flag;
126};
127struct jailsys_flags {
128	const char	*name;
129	unsigned	 disable;
130	unsigned	 new;
131};
132
133/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
134struct	sx allprison_lock;
135SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
136struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
137LIST_HEAD(, prison_racct) allprison_racct;
138int	lastprid = 0;
139
/* Prototypes for file-local helpers. */
static int	do_jail_attach(struct thread *td, struct prison *pr,
		    int drflags);
static int	get_next_prid(struct prison **insprp);
static int	prison_lock_xlock(struct prison *pr, int flags);
static char	*prison_path(struct prison *pr1, struct prison *pr2);
static void	prison_complete(void *context, int pending);
static void	prison_deref(struct prison *pr, int flags);
static void	prison_deref_kill(struct prison *pr,
		    struct prisonlist *freeprison);
static void	prison_free_not_last(struct prison *pr);
static void	prison_proc_free_not_last(struct prison *pr);
static void	prison_set_allow_locked(struct prison *pr, unsigned flag,
		    int enable);
#ifdef RACCT
static void	prison_racct_attach(struct prison *pr);
static void	prison_racct_detach(struct prison *pr);
static void	prison_racct_modify(struct prison *pr);
#endif
156
/*
 * Flags for prison_deref.  The low bits say what to do, the next group
 * says which locks the caller already holds.
 */
/* Operation flags */
#define	PD_DEREF	0x01	/* Decrement pr_ref */
#define	PD_DEUREF	0x02	/* Decrement pr_uref */
#define	PD_KILL		0x04	/* Remove jail, kill processes, etc */
#define	PD_OP_FLAGS	(PD_DEREF | PD_DEUREF | PD_KILL)

/* Lock status flags */
#define	PD_LOCKED	0x10	/* pr_mtx is held */
#define	PD_LIST_SLOCKED	0x20	/* allprison_lock is held shared */
#define	PD_LIST_XLOCKED	0x40	/* allprison_lock is held exclusive */
#define	PD_LOCK_FLAGS	(PD_LOCKED | PD_LIST_SLOCKED | PD_LIST_XLOCKED)
166
167/*
168 * Parameter names corresponding to PR_* flag values.  Size values are for kvm
169 * as we cannot figure out the size of a sparse array, or an array without a
170 * terminating entry.
171 */
172static struct bool_flags pr_flag_bool[] = {
173	{"persist", "nopersist", PR_PERSIST},
174#ifdef INET
175	{"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
176#endif
177#ifdef INET6
178	{"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
179#endif
180};
181const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
182
183static struct jailsys_flags pr_flag_jailsys[] = {
184	{"host", 0, PR_HOST},
185#ifdef VIMAGE
186	{"vnet", 0, PR_VNET},
187#endif
188#ifdef INET
189	{"ip4", PR_IP4_USER, PR_IP4_USER},
190#endif
191#ifdef INET6
192	{"ip6", PR_IP6_USER, PR_IP6_USER},
193#endif
194};
195const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
196
197/*
198 * Make this array full-size so dynamic parameters can be added.
199 * It is protected by prison0.mtx, but lockless reading is allowed
200 * with an atomic check of the flag values.
201 */
202static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
203	{"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
204	{"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
205	{"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
206	{"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
207	{"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
208	{"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
209	{"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
210	{"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
211	{"allow.reserved_ports", "allow.noreserved_ports",
212	 PR_ALLOW_RESERVED_PORTS},
213	{"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
214	{"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
215	 PR_ALLOW_UNPRIV_DEBUG},
216	{"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
217};
218static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
219const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
220
221#define	JAIL_DEFAULT_ALLOW		(PR_ALLOW_SET_HOSTNAME | \
222					 PR_ALLOW_RESERVED_PORTS | \
223					 PR_ALLOW_UNPRIV_DEBUG | \
224					 PR_ALLOW_SUSER)
225#define	JAIL_DEFAULT_ENFORCE_STATFS	2
226#define	JAIL_DEFAULT_DEVFS_RSNUM	0
227static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
228static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
229static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
230#if defined(INET) || defined(INET6)
231static unsigned jail_max_af_ips = 255;
232#endif
233
234/*
235 * Initialize the parts of prison0 that can't be static-initialized with
236 * constants.  This is called from proc0_init() after creating thread0 cpuset.
237 */
238void
239prison0_init(void)
240{
241	uint8_t *file, *data;
242	size_t size;
243
244	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
245	prison0.pr_osreldate = osreldate;
246	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
247
248	/* If we have a preloaded hostuuid, use it. */
249	file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
250	if (file != NULL) {
251		data = preload_fetch_addr(file);
252		size = preload_fetch_size(file);
253		if (data != NULL) {
254			/*
255			 * The preloaded data may include trailing whitespace, almost
256			 * certainly a newline; skip over any whitespace or
257			 * non-printable characters to be safe.
258			 */
259			while (size > 0 && data[size - 1] <= 0x20) {
260				size--;
261			}
262			if (validate_uuid(data, size, NULL, 0) == 0) {
263				(void)strlcpy(prison0.pr_hostuuid, data,
264				    size + 1);
265			} else if (bootverbose) {
266				printf("hostuuid: preload data malformed: '%.*s'\n",
267				    (int)size, data);
268			}
269		}
270	}
271	if (bootverbose)
272		printf("hostuuid: using %s\n", prison0.pr_hostuuid);
273}
274
275/*
276 * struct jail_args {
277 *	struct jail *jail;
278 * };
279 */
280int
281sys_jail(struct thread *td, struct jail_args *uap)
282{
283	uint32_t version;
284	int error;
285	struct jail j;
286
287	error = copyin(uap->jail, &version, sizeof(uint32_t));
288	if (error)
289		return (error);
290
291	switch (version) {
292	case 0:
293	{
294		struct jail_v0 j0;
295
296		/* FreeBSD single IPv4 jails. */
297		bzero(&j, sizeof(struct jail));
298		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
299		if (error)
300			return (error);
301		j.version = j0.version;
302		j.path = j0.path;
303		j.hostname = j0.hostname;
304		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
305		break;
306	}
307
308	case 1:
309		/*
310		 * Version 1 was used by multi-IPv4 jail implementations
311		 * that never made it into the official kernel.
312		 */
313		return (EINVAL);
314
315	case 2:	/* JAIL_API_VERSION */
316		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
317		error = copyin(uap->jail, &j, sizeof(struct jail));
318		if (error)
319			return (error);
320		break;
321
322	default:
323		/* Sci-Fi jails are not supported, sorry. */
324		return (EINVAL);
325	}
326	return (kern_jail(td, &j));
327}
328
329int
330kern_jail(struct thread *td, struct jail *j)
331{
332	struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
333#ifdef INET
334			    + 1
335#endif
336#ifdef INET6
337			    + 1
338#endif
339			    )];
340	struct uio opt;
341	char *u_path, *u_hostname, *u_name;
342	struct bool_flags *bf;
343#ifdef INET
344	uint32_t ip4s;
345	struct in_addr *u_ip4;
346#endif
347#ifdef INET6
348	struct in6_addr *u_ip6;
349#endif
350	size_t tmplen;
351	int error, enforce_statfs;
352
353	bzero(&optiov, sizeof(optiov));
354	opt.uio_iov = optiov;
355	opt.uio_iovcnt = 0;
356	opt.uio_offset = -1;
357	opt.uio_resid = -1;
358	opt.uio_segflg = UIO_SYSSPACE;
359	opt.uio_rw = UIO_READ;
360	opt.uio_td = td;
361
362	/* Set permissions for top-level jails from sysctls. */
363	if (!jailed(td->td_ucred)) {
364		for (bf = pr_flag_allow;
365		     bf < pr_flag_allow + nitems(pr_flag_allow) &&
366			atomic_load_int(&bf->flag) != 0;
367		     bf++) {
368			optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
369			    (jail_default_allow & bf->flag)
370			    ? bf->name : bf->noname);
371			optiov[opt.uio_iovcnt].iov_len =
372			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
373			opt.uio_iovcnt += 2;
374		}
375		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
376		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
377		opt.uio_iovcnt++;
378		enforce_statfs = jail_default_enforce_statfs;
379		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
380		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
381		opt.uio_iovcnt++;
382	}
383
384	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
385#ifdef INET
386	ip4s = (j->version == 0) ? 1 : j->ip4s;
387	if (ip4s > jail_max_af_ips)
388		return (EINVAL);
389	tmplen += ip4s * sizeof(struct in_addr);
390#else
391	if (j->ip4s > 0)
392		return (EINVAL);
393#endif
394#ifdef INET6
395	if (j->ip6s > jail_max_af_ips)
396		return (EINVAL);
397	tmplen += j->ip6s * sizeof(struct in6_addr);
398#else
399	if (j->ip6s > 0)
400		return (EINVAL);
401#endif
402	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
403	u_hostname = u_path + MAXPATHLEN;
404	u_name = u_hostname + MAXHOSTNAMELEN;
405#ifdef INET
406	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
407#endif
408#ifdef INET6
409#ifdef INET
410	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
411#else
412	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
413#endif
414#endif
415	optiov[opt.uio_iovcnt].iov_base = "path";
416	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
417	opt.uio_iovcnt++;
418	optiov[opt.uio_iovcnt].iov_base = u_path;
419	error = copyinstr(j->path, u_path, MAXPATHLEN,
420	    &optiov[opt.uio_iovcnt].iov_len);
421	if (error) {
422		free(u_path, M_TEMP);
423		return (error);
424	}
425	opt.uio_iovcnt++;
426	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
427	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
428	opt.uio_iovcnt++;
429	optiov[opt.uio_iovcnt].iov_base = u_hostname;
430	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
431	    &optiov[opt.uio_iovcnt].iov_len);
432	if (error) {
433		free(u_path, M_TEMP);
434		return (error);
435	}
436	opt.uio_iovcnt++;
437	if (j->jailname != NULL) {
438		optiov[opt.uio_iovcnt].iov_base = "name";
439		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
440		opt.uio_iovcnt++;
441		optiov[opt.uio_iovcnt].iov_base = u_name;
442		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
443		    &optiov[opt.uio_iovcnt].iov_len);
444		if (error) {
445			free(u_path, M_TEMP);
446			return (error);
447		}
448		opt.uio_iovcnt++;
449	}
450#ifdef INET
451	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
452	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
453	opt.uio_iovcnt++;
454	optiov[opt.uio_iovcnt].iov_base = u_ip4;
455	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
456	if (j->version == 0)
457		u_ip4->s_addr = j->ip4s;
458	else {
459		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
460		if (error) {
461			free(u_path, M_TEMP);
462			return (error);
463		}
464	}
465	opt.uio_iovcnt++;
466#endif
467#ifdef INET6
468	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
469	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
470	opt.uio_iovcnt++;
471	optiov[opt.uio_iovcnt].iov_base = u_ip6;
472	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
473	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
474	if (error) {
475		free(u_path, M_TEMP);
476		return (error);
477	}
478	opt.uio_iovcnt++;
479#endif
480	KASSERT(opt.uio_iovcnt <= nitems(optiov),
481		("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
482	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
483	free(u_path, M_TEMP);
484	return (error);
485}
486
487/*
488 * struct jail_set_args {
489 *	struct iovec *iovp;
490 *	unsigned int iovcnt;
491 *	int flags;
492 * };
493 */
494int
495sys_jail_set(struct thread *td, struct jail_set_args *uap)
496{
497	struct uio *auio;
498	int error;
499
500	/* Check that we have an even number of iovecs. */
501	if (uap->iovcnt & 1)
502		return (EINVAL);
503
504	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
505	if (error)
506		return (error);
507	error = kern_jail_set(td, auio, uap->flags);
508	free(auio, M_IOV);
509	return (error);
510}
511
512int
513kern_jail_set(struct thread *td, struct uio *optuio, int flags)
514{
515	struct nameidata nd;
516#ifdef INET
517	struct in_addr *ip4;
518#endif
519#ifdef INET6
520	struct in6_addr *ip6;
521#endif
522	struct vfsopt *opt;
523	struct vfsoptlist *opts;
524	struct prison *pr, *deadpr, *inspr, *mypr, *ppr, *tpr;
525	struct vnode *root;
526	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
527	char *g_path, *osrelstr;
528	struct bool_flags *bf;
529	struct jailsys_flags *jsf;
530#if defined(INET) || defined(INET6)
531	struct prison *tppr;
532	void *op;
533#endif
534	unsigned long hid;
535	size_t namelen, onamelen, pnamelen;
536	int born, created, cuflags, descend, drflags, enforce;
537	int error, errmsg_len, errmsg_pos;
538	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
539	int jid, jsys, len, level;
540	int childmax, osreldt, rsnum, slevel;
541#if defined(INET) || defined(INET6)
542	int ii, ij;
543#endif
544#ifdef INET
545	int ip4s, redo_ip4;
546#endif
547#ifdef INET6
548	int ip6s, redo_ip6;
549#endif
550	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
551	uint64_t pr_allow_diff;
552	unsigned tallow;
553	char numbuf[12];
554
555	error = priv_check(td, PRIV_JAIL_SET);
556	if (!error && (flags & JAIL_ATTACH))
557		error = priv_check(td, PRIV_JAIL_ATTACH);
558	if (error)
559		return (error);
560	mypr = td->td_ucred->cr_prison;
561	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
562		return (EPERM);
563	if (flags & ~JAIL_SET_MASK)
564		return (EINVAL);
565
566	/*
567	 * Check all the parameters before committing to anything.  Not all
568	 * errors can be caught early, but we may as well try.  Also, this
569	 * takes care of some expensive stuff (path lookup) before getting
570	 * the allprison lock.
571	 *
572	 * XXX Jails are not filesystems, and jail parameters are not mount
573	 *     options.  But it makes more sense to re-use the vfsopt code
574	 *     than duplicate it under a different name.
575	 */
576	error = vfs_buildopts(optuio, &opts);
577	if (error)
578		return (error);
579#ifdef INET
580	ip4 = NULL;
581#endif
582#ifdef INET6
583	ip6 = NULL;
584#endif
585	g_path = NULL;
586
587	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
588	if (!cuflags) {
589		error = EINVAL;
590		vfs_opterror(opts, "no valid operation (create or update)");
591		goto done_errmsg;
592	}
593
594	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
595	if (error == ENOENT)
596		jid = 0;
597	else if (error != 0)
598		goto done_free;
599
600	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
601	if (error == ENOENT)
602		gotslevel = 0;
603	else if (error != 0)
604		goto done_free;
605	else
606		gotslevel = 1;
607
608	error =
609	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
610	if (error == ENOENT)
611		gotchildmax = 0;
612	else if (error != 0)
613		goto done_free;
614	else
615		gotchildmax = 1;
616
617	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
618	if (error == ENOENT)
619		gotenforce = 0;
620	else if (error != 0)
621		goto done_free;
622	else if (enforce < 0 || enforce > 2) {
623		error = EINVAL;
624		goto done_free;
625	} else
626		gotenforce = 1;
627
628	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
629	if (error == ENOENT)
630		gotrsnum = 0;
631	else if (error != 0)
632		goto done_free;
633	else
634		gotrsnum = 1;
635
636	pr_flags = ch_flags = 0;
637	for (bf = pr_flag_bool;
638	     bf < pr_flag_bool + nitems(pr_flag_bool);
639	     bf++) {
640		vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
641		vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
642	}
643	ch_flags |= pr_flags;
644	for (jsf = pr_flag_jailsys;
645	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
646	     jsf++) {
647		error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
648		if (error == ENOENT)
649			continue;
650		if (error != 0)
651			goto done_free;
652		switch (jsys) {
653		case JAIL_SYS_DISABLE:
654			if (!jsf->disable) {
655				error = EINVAL;
656				goto done_free;
657			}
658			pr_flags |= jsf->disable;
659			break;
660		case JAIL_SYS_NEW:
661			pr_flags |= jsf->new;
662			break;
663		case JAIL_SYS_INHERIT:
664			break;
665		default:
666			error = EINVAL;
667			goto done_free;
668		}
669		ch_flags |= jsf->new | jsf->disable;
670	}
671	if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE
672	    && !(pr_flags & PR_PERSIST)) {
673		error = EINVAL;
674		vfs_opterror(opts, "new jail must persist or attach");
675		goto done_errmsg;
676	}
677#ifdef VIMAGE
678	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
679		error = EINVAL;
680		vfs_opterror(opts, "vnet cannot be changed after creation");
681		goto done_errmsg;
682	}
683#endif
684#ifdef INET
685	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
686		error = EINVAL;
687		vfs_opterror(opts, "ip4 cannot be changed after creation");
688		goto done_errmsg;
689	}
690#endif
691#ifdef INET6
692	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
693		error = EINVAL;
694		vfs_opterror(opts, "ip6 cannot be changed after creation");
695		goto done_errmsg;
696	}
697#endif
698
699	pr_allow = ch_allow = 0;
700	for (bf = pr_flag_allow;
701	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
702		atomic_load_int(&bf->flag) != 0;
703	     bf++) {
704		vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
705		vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
706	}
707	ch_allow |= pr_allow;
708
709	error = vfs_getopt(opts, "name", (void **)&name, &len);
710	if (error == ENOENT)
711		name = NULL;
712	else if (error != 0)
713		goto done_free;
714	else {
715		if (len == 0 || name[len - 1] != '\0') {
716			error = EINVAL;
717			goto done_free;
718		}
719		if (len > MAXHOSTNAMELEN) {
720			error = ENAMETOOLONG;
721			goto done_free;
722		}
723	}
724
725	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
726	if (error == ENOENT)
727		host = NULL;
728	else if (error != 0)
729		goto done_free;
730	else {
731		ch_flags |= PR_HOST;
732		pr_flags |= PR_HOST;
733		if (len == 0 || host[len - 1] != '\0') {
734			error = EINVAL;
735			goto done_free;
736		}
737		if (len > MAXHOSTNAMELEN) {
738			error = ENAMETOOLONG;
739			goto done_free;
740		}
741	}
742
743	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
744	if (error == ENOENT)
745		domain = NULL;
746	else if (error != 0)
747		goto done_free;
748	else {
749		ch_flags |= PR_HOST;
750		pr_flags |= PR_HOST;
751		if (len == 0 || domain[len - 1] != '\0') {
752			error = EINVAL;
753			goto done_free;
754		}
755		if (len > MAXHOSTNAMELEN) {
756			error = ENAMETOOLONG;
757			goto done_free;
758		}
759	}
760
761	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
762	if (error == ENOENT)
763		uuid = NULL;
764	else if (error != 0)
765		goto done_free;
766	else {
767		ch_flags |= PR_HOST;
768		pr_flags |= PR_HOST;
769		if (len == 0 || uuid[len - 1] != '\0') {
770			error = EINVAL;
771			goto done_free;
772		}
773		if (len > HOSTUUIDLEN) {
774			error = ENAMETOOLONG;
775			goto done_free;
776		}
777	}
778
779#ifdef COMPAT_FREEBSD32
780	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
781		uint32_t hid32;
782
783		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
784		hid = hid32;
785	} else
786#endif
787		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
788	if (error == ENOENT)
789		gothid = 0;
790	else if (error != 0)
791		goto done_free;
792	else {
793		gothid = 1;
794		ch_flags |= PR_HOST;
795		pr_flags |= PR_HOST;
796	}
797
798#ifdef INET
799	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
800	if (error == ENOENT)
801		ip4s = 0;
802	else if (error != 0)
803		goto done_free;
804	else if (ip4s & (sizeof(*ip4) - 1)) {
805		error = EINVAL;
806		goto done_free;
807	} else {
808		ch_flags |= PR_IP4_USER;
809		pr_flags |= PR_IP4_USER;
810		if (ip4s > 0) {
811			ip4s /= sizeof(*ip4);
812			if (ip4s > jail_max_af_ips) {
813				error = EINVAL;
814				vfs_opterror(opts, "too many IPv4 addresses");
815				goto done_errmsg;
816			}
817			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
818			bcopy(op, ip4, ip4s * sizeof(*ip4));
819			/*
820			 * IP addresses are all sorted but ip[0] to preserve
821			 * the primary IP address as given from userland.
822			 * This special IP is used for unbound outgoing
823			 * connections as well for "loopback" traffic in case
824			 * source address selection cannot find any more fitting
825			 * address to connect from.
826			 */
827			if (ip4s > 1)
828				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4),
829				    prison_qcmp_v4);
830			/*
831			 * Check for duplicate addresses and do some simple
832			 * zero and broadcast checks. If users give other bogus
833			 * addresses it is their problem.
834			 *
835			 * We do not have to care about byte order for these
836			 * checks so we will do them in NBO.
837			 */
838			for (ii = 0; ii < ip4s; ii++) {
839				if (ip4[ii].s_addr == INADDR_ANY ||
840				    ip4[ii].s_addr == INADDR_BROADCAST) {
841					error = EINVAL;
842					goto done_free;
843				}
844				if ((ii+1) < ip4s &&
845				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
846				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
847					error = EINVAL;
848					goto done_free;
849				}
850			}
851		}
852	}
853#endif
854
855#ifdef INET6
856	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
857	if (error == ENOENT)
858		ip6s = 0;
859	else if (error != 0)
860		goto done_free;
861	else if (ip6s & (sizeof(*ip6) - 1)) {
862		error = EINVAL;
863		goto done_free;
864	} else {
865		ch_flags |= PR_IP6_USER;
866		pr_flags |= PR_IP6_USER;
867		if (ip6s > 0) {
868			ip6s /= sizeof(*ip6);
869			if (ip6s > jail_max_af_ips) {
870				error = EINVAL;
871				vfs_opterror(opts, "too many IPv6 addresses");
872				goto done_errmsg;
873			}
874			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
875			bcopy(op, ip6, ip6s * sizeof(*ip6));
876			if (ip6s > 1)
877				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6),
878				    prison_qcmp_v6);
879			for (ii = 0; ii < ip6s; ii++) {
880				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
881					error = EINVAL;
882					goto done_free;
883				}
884				if ((ii+1) < ip6s &&
885				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
886				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
887				{
888					error = EINVAL;
889					goto done_free;
890				}
891			}
892		}
893	}
894#endif
895
896#if defined(VIMAGE) && (defined(INET) || defined(INET6))
897	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
898		error = EINVAL;
899		vfs_opterror(opts,
900		    "vnet jails cannot have IP address restrictions");
901		goto done_errmsg;
902	}
903#endif
904
905	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
906	if (error == ENOENT)
907		osrelstr = NULL;
908	else if (error != 0)
909		goto done_free;
910	else {
911		if (flags & JAIL_UPDATE) {
912			error = EINVAL;
913			vfs_opterror(opts,
914			    "osrelease cannot be changed after creation");
915			goto done_errmsg;
916		}
917		if (len == 0 || osrelstr[len - 1] != '\0') {
918			error = EINVAL;
919			goto done_free;
920		}
921		if (len >= OSRELEASELEN) {
922			error = ENAMETOOLONG;
923			vfs_opterror(opts,
924			    "osrelease string must be 1-%d bytes long",
925			    OSRELEASELEN - 1);
926			goto done_errmsg;
927		}
928	}
929
930	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
931	if (error == ENOENT)
932		osreldt = 0;
933	else if (error != 0)
934		goto done_free;
935	else {
936		if (flags & JAIL_UPDATE) {
937			error = EINVAL;
938			vfs_opterror(opts,
939			    "osreldate cannot be changed after creation");
940			goto done_errmsg;
941		}
942		if (osreldt == 0) {
943			error = EINVAL;
944			vfs_opterror(opts, "osreldate cannot be 0");
945			goto done_errmsg;
946		}
947	}
948
949	root = NULL;
950	error = vfs_getopt(opts, "path", (void **)&path, &len);
951	if (error == ENOENT)
952		path = NULL;
953	else if (error != 0)
954		goto done_free;
955	else {
956		if (flags & JAIL_UPDATE) {
957			error = EINVAL;
958			vfs_opterror(opts,
959			    "path cannot be changed after creation");
960			goto done_errmsg;
961		}
962		if (len == 0 || path[len - 1] != '\0') {
963			error = EINVAL;
964			goto done_free;
965		}
966		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
967		    path, td);
968		error = namei(&nd);
969		if (error)
970			goto done_free;
971		root = nd.ni_vp;
972		NDFREE(&nd, NDF_ONLY_PNBUF);
973		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
974		strlcpy(g_path, path, MAXPATHLEN);
975		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
976		if (error == 0) {
977			path = g_path;
978		} else {
979			/* exit on other errors */
980			goto done_free;
981		}
982		if (root->v_type != VDIR) {
983			error = ENOTDIR;
984			vput(root);
985			goto done_free;
986		}
987		VOP_UNLOCK(root);
988	}
989
990	/*
991	 * Find the specified jail, or at least its parent.
992	 * This abuses the file error codes ENOENT and EEXIST.
993	 */
994	pr = NULL;
995	inspr = NULL;
996	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
997		namelc = strrchr(name, '.');
998		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
999		if (*p != '\0')
1000			jid = 0;
1001	}
1002	sx_xlock(&allprison_lock);
1003	drflags = PD_LIST_XLOCKED;
1004	ppr = mypr;
1005	if (!prison_isalive(ppr)) {
1006		/* This jail is dying.  This process will surely follow. */
1007		error = EAGAIN;
1008		goto done_deref;
1009	}
1010	if (jid != 0) {
1011		if (jid < 0) {
1012			error = EINVAL;
1013			vfs_opterror(opts, "negative jid");
1014			goto done_deref;
1015		}
1016		/*
1017		 * See if a requested jid already exists.  Keep track of
1018		 * where it can be inserted later.
1019		 */
1020		TAILQ_FOREACH(inspr, &allprison, pr_list) {
1021			if (inspr->pr_id < jid)
1022				continue;
1023			if (inspr->pr_id > jid)
1024				break;
1025			pr = inspr;
1026			mtx_lock(&pr->pr_mtx);
1027			drflags |= PD_LOCKED;
1028			inspr = NULL;
1029			break;
1030		}
1031		if (pr != NULL) {
1032			/* Create: jid must not exist. */
1033			if (cuflags == JAIL_CREATE) {
1034				/*
1035				 * Even creators that cannot see the jail will
1036				 * get EEXIST.
1037				 */
1038				error = EEXIST;
1039				vfs_opterror(opts, "jail %d already exists",
1040				    jid);
1041				goto done_deref;
1042			}
1043			if (!prison_ischild(mypr, pr)) {
1044				/*
1045				 * Updaters get ENOENT if they cannot see the
1046				 * jail.  This is true even for CREATE | UPDATE,
1047				 * which normally cannot give this error.
1048				 */
1049				error = ENOENT;
1050				vfs_opterror(opts, "jail %d not found", jid);
1051				goto done_deref;
1052			}
1053			ppr = pr->pr_parent;
1054			if (!prison_isalive(ppr)) {
1055				error = ENOENT;
1056				vfs_opterror(opts, "jail %d is dying",
1057				    ppr->pr_id);
1058				goto done_deref;
1059			}
1060			if (!prison_isalive(pr)) {
1061				if (!(flags & JAIL_DYING)) {
1062					error = ENOENT;
1063					vfs_opterror(opts, "jail %d is dying",
1064					    jid);
1065					goto done_deref;
1066				}
1067				if ((flags & JAIL_ATTACH) ||
1068				    (pr_flags & PR_PERSIST)) {
1069					/*
1070					 * A dying jail might be resurrected
1071					 * (via attach or persist), but first
1072					 * it must determine if another jail
1073					 * has claimed its name.  Accomplish
1074					 * this by implicitly re-setting the
1075					 * name.
1076					 */
1077					if (name == NULL)
1078						name = prison_name(mypr, pr);
1079				}
1080			}
1081		} else {
1082			/* Update: jid must exist. */
1083			if (cuflags == JAIL_UPDATE) {
1084				error = ENOENT;
1085				vfs_opterror(opts, "jail %d not found", jid);
1086				goto done_deref;
1087			}
1088		}
1089	}
1090	/*
1091	 * If the caller provided a name, look for a jail by that name.
1092	 * This has different semantics for creates and updates keyed by jid
1093	 * (where the name must not already exist in a different jail),
1094	 * and updates keyed by the name itself (where the name must exist
1095	 * because that is the jail being updated).
1096	 */
1097	namelc = NULL;
1098	if (name != NULL) {
1099		namelc = strrchr(name, '.');
1100		if (namelc == NULL)
1101			namelc = name;
1102		else {
1103			/*
1104			 * This is a hierarchical name.  Split it into the
1105			 * parent and child names, and make sure the parent
1106			 * exists or matches an already found jail.
1107			 */
1108			if (pr != NULL) {
1109				if (strncmp(name, ppr->pr_name, namelc - name)
1110				    || ppr->pr_name[namelc - name] != '\0') {
1111					error = EINVAL;
1112					vfs_opterror(opts,
1113					    "cannot change jail's parent");
1114					goto done_deref;
1115				}
1116			} else {
1117				*namelc = '\0';
1118				ppr = prison_find_name(mypr, name);
1119				if (ppr == NULL) {
1120					error = ENOENT;
1121					vfs_opterror(opts,
1122					    "jail \"%s\" not found", name);
1123					goto done_deref;
1124				}
1125				mtx_unlock(&ppr->pr_mtx);
1126				if (!prison_isalive(ppr)) {
1127					error = ENOENT;
1128					vfs_opterror(opts,
1129					    "jail \"%s\" is dying", name);
1130					goto done_deref;
1131				}
1132				*namelc = '.';
1133			}
1134			namelc++;
1135		}
1136		if (namelc[0] != '\0') {
1137			pnamelen =
1138			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1139			deadpr = NULL;
1140			FOREACH_PRISON_CHILD(ppr, tpr) {
1141				if (tpr != pr &&
1142				    !strcmp(tpr->pr_name + pnamelen, namelc)) {
1143					if (prison_isalive(tpr)) {
1144						if (pr == NULL &&
1145						    cuflags != JAIL_CREATE) {
1146							/*
1147							 * Use this jail
1148							 * for updates.
1149							 */
1150							pr = tpr;
1151							mtx_lock(&pr->pr_mtx);
1152							drflags |= PD_LOCKED;
1153							break;
1154						}
1155						/*
1156						 * Create, or update(jid):
1157						 * name must not exist in an
1158						 * active sibling jail.
1159						 */
1160						error = EEXIST;
1161						vfs_opterror(opts,
1162						   "jail \"%s\" already exists",
1163						   name);
1164						goto done_deref;
1165					}
1166					if (pr == NULL &&
1167					    cuflags != JAIL_CREATE) {
1168						deadpr = tpr;
1169					}
1170				}
1171			}
1172			/* If no active jail is found, use a dying one. */
1173			if (deadpr != NULL && pr == NULL) {
1174				if (flags & JAIL_DYING) {
1175					pr = deadpr;
1176					mtx_lock(&pr->pr_mtx);
1177					drflags |= PD_LOCKED;
1178				} else if (cuflags == JAIL_UPDATE) {
1179					error = ENOENT;
1180					vfs_opterror(opts,
1181					    "jail \"%s\" is dying", name);
1182					goto done_deref;
1183				}
1184			}
1185			/* Update: name must exist if no jid. */
1186			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1187				error = ENOENT;
1188				vfs_opterror(opts, "jail \"%s\" not found",
1189				    name);
1190				goto done_deref;
1191			}
1192		}
1193	}
1194	/* Update: must provide a jid or name. */
1195	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1196		error = ENOENT;
1197		vfs_opterror(opts, "update specified no jail");
1198		goto done_deref;
1199	}
1200
1201	/* If there's no prison to update, create a new one and link it in. */
1202	created = pr == NULL;
1203	if (created) {
1204		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1205			if (tpr->pr_childcount >= tpr->pr_childmax) {
1206				error = EPERM;
1207				vfs_opterror(opts, "prison limit exceeded");
1208				goto done_deref;
1209			}
1210		if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
1211			error = EAGAIN;
1212			vfs_opterror(opts, "no available jail IDs");
1213			goto done_deref;
1214		}
1215
1216		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1217		pr->pr_state = PRISON_STATE_INVALID;
1218		refcount_init(&pr->pr_ref, 1);
1219		refcount_init(&pr->pr_uref, 0);
1220		drflags |= PD_DEREF;
1221		LIST_INIT(&pr->pr_children);
1222		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1223		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1224
1225		pr->pr_id = jid;
1226		if (inspr != NULL)
1227			TAILQ_INSERT_BEFORE(inspr, pr, pr_list);
1228		else
1229			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1230
1231		pr->pr_parent = ppr;
1232		prison_hold(ppr);
1233		prison_proc_hold(ppr);
1234		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1235		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1236			tpr->pr_childcount++;
1237
1238		/* Set some default values, and inherit some from the parent. */
1239		if (namelc == NULL)
1240			namelc = "";
1241		if (path == NULL) {
1242			path = "/";
1243			root = mypr->pr_root;
1244			vref(root);
1245		}
1246		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1247		pr->pr_flags |= PR_HOST;
1248#if defined(INET) || defined(INET6)
1249#ifdef VIMAGE
1250		if (!(pr_flags & PR_VNET))
1251#endif
1252		{
1253#ifdef INET
1254			if (!(ch_flags & PR_IP4_USER))
1255				pr->pr_flags |= PR_IP4 | PR_IP4_USER;
1256			else if (!(pr_flags & PR_IP4_USER)) {
1257				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1258				if (ppr->pr_ip4 != NULL) {
1259					pr->pr_ip4s = ppr->pr_ip4s;
1260					pr->pr_ip4 = malloc(pr->pr_ip4s *
1261					    sizeof(struct in_addr), M_PRISON,
1262					    M_WAITOK);
1263					bcopy(ppr->pr_ip4, pr->pr_ip4,
1264					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1265				}
1266			}
1267#endif
1268#ifdef INET6
1269			if (!(ch_flags & PR_IP6_USER))
1270				pr->pr_flags |= PR_IP6 | PR_IP6_USER;
1271			else if (!(pr_flags & PR_IP6_USER)) {
1272				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1273				if (ppr->pr_ip6 != NULL) {
1274					pr->pr_ip6s = ppr->pr_ip6s;
1275					pr->pr_ip6 = malloc(pr->pr_ip6s *
1276					    sizeof(struct in6_addr), M_PRISON,
1277					    M_WAITOK);
1278					bcopy(ppr->pr_ip6, pr->pr_ip6,
1279					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1280				}
1281			}
1282#endif
1283		}
1284#endif
1285		/* Source address selection is always on by default. */
1286		pr->pr_flags |= _PR_IP_SADDRSEL;
1287
1288		pr->pr_securelevel = ppr->pr_securelevel;
1289		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1290		pr->pr_enforce_statfs = jail_default_enforce_statfs;
1291		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1292
1293		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1294		if (osrelstr == NULL)
1295			strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
1296			    sizeof(pr->pr_osrelease));
1297		else
1298			strlcpy(pr->pr_osrelease, osrelstr,
1299			    sizeof(pr->pr_osrelease));
1300
1301#ifdef VIMAGE
1302		/* Allocate a new vnet if specified. */
1303		pr->pr_vnet = (pr_flags & PR_VNET)
1304		    ? vnet_alloc() : ppr->pr_vnet;
1305#endif
1306		/*
1307		 * Allocate a dedicated cpuset for each jail.
		 * Unlike other initial settings, this may return an error.
1309		 */
1310		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1311		if (error)
1312			goto done_deref;
1313
1314		mtx_lock(&pr->pr_mtx);
1315		drflags |= PD_LOCKED;
1316	} else {
1317		/*
1318		 * Grab a reference for existing prisons, to ensure they
1319		 * continue to exist for the duration of the call.
1320		 */
1321		prison_hold(pr);
1322		drflags |= PD_DEREF;
1323#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1324		if ((pr->pr_flags & PR_VNET) &&
1325		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1326			error = EINVAL;
1327			vfs_opterror(opts,
1328			    "vnet jails cannot have IP address restrictions");
1329			goto done_deref;
1330		}
1331#endif
1332#ifdef INET
1333		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1334			error = EINVAL;
1335			vfs_opterror(opts,
1336			    "ip4 cannot be changed after creation");
1337			goto done_deref;
1338		}
1339#endif
1340#ifdef INET6
1341		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1342			error = EINVAL;
1343			vfs_opterror(opts,
1344			    "ip6 cannot be changed after creation");
1345			goto done_deref;
1346		}
1347#endif
1348	}
1349
1350	/* Do final error checking before setting anything. */
1351	if (gotslevel) {
1352		if (slevel < ppr->pr_securelevel) {
1353			error = EPERM;
1354			goto done_deref;
1355		}
1356	}
1357	if (gotchildmax) {
1358		if (childmax >= ppr->pr_childmax) {
1359			error = EPERM;
1360			goto done_deref;
1361		}
1362	}
1363	if (gotenforce) {
1364		if (enforce < ppr->pr_enforce_statfs) {
1365			error = EPERM;
1366			goto done_deref;
1367		}
1368	}
1369	if (gotrsnum) {
1370		/*
1371		 * devfs_rsnum is a uint16_t
1372		 */
1373		if (rsnum < 0 || rsnum > 65535) {
1374			error = EINVAL;
1375			goto done_deref;
1376		}
1377		/*
1378		 * Nested jails always inherit parent's devfs ruleset
1379		 */
1380		if (jailed(td->td_ucred)) {
1381			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1382				error = EPERM;
1383				goto done_deref;
1384			} else
1385				rsnum = ppr->pr_devfs_rsnum;
1386		}
1387	}
1388#ifdef INET
1389	if (ip4s > 0) {
1390		if (ppr->pr_flags & PR_IP4) {
1391			/*
1392			 * Make sure the new set of IP addresses is a
1393			 * subset of the parent's list.  Don't worry
1394			 * about the parent being unlocked, as any
1395			 * setting is done with allprison_lock held.
1396			 */
1397			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1398				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1399					break;
1400			if (ij == ppr->pr_ip4s) {
1401				error = EPERM;
1402				goto done_deref;
1403			}
1404			if (ip4s > 1) {
1405				for (ii = ij = 1; ii < ip4s; ii++) {
1406					if (ip4[ii].s_addr ==
1407					    ppr->pr_ip4[0].s_addr)
1408						continue;
1409					for (; ij < ppr->pr_ip4s; ij++)
1410						if (ip4[ii].s_addr ==
1411						    ppr->pr_ip4[ij].s_addr)
1412							break;
1413					if (ij == ppr->pr_ip4s)
1414						break;
1415				}
1416				if (ij == ppr->pr_ip4s) {
1417					error = EPERM;
1418					goto done_deref;
1419				}
1420			}
1421		}
1422		/*
1423		 * Check for conflicting IP addresses.  We permit them
1424		 * if there is no more than one IP on each jail.  If
1425		 * there is a duplicate on a jail with more than one
1426		 * IP stop checking and return error.
1427		 */
1428#ifdef VIMAGE
1429		for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
1430			if (tppr->pr_flags & PR_VNET)
1431				break;
1432#else
1433		tppr = &prison0;
1434#endif
1435		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1436			if (tpr == pr ||
1437#ifdef VIMAGE
1438			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1439#endif
1440			    !prison_isalive(tpr)) {
1441				descend = 0;
1442				continue;
1443			}
1444			if (!(tpr->pr_flags & PR_IP4_USER))
1445				continue;
1446			descend = 0;
1447			if (tpr->pr_ip4 == NULL ||
1448			    (ip4s == 1 && tpr->pr_ip4s == 1))
1449				continue;
1450			for (ii = 0; ii < ip4s; ii++) {
1451				if (prison_check_ip4_locked(tpr, &ip4[ii]) ==
1452				    0) {
1453					error = EADDRINUSE;
1454					vfs_opterror(opts,
1455					    "IPv4 addresses clash");
1456					goto done_deref;
1457				}
1458			}
1459		}
1460	}
1461#endif
1462#ifdef INET6
1463	if (ip6s > 0) {
1464		if (ppr->pr_flags & PR_IP6) {
1465			/*
1466			 * Make sure the new set of IP addresses is a
1467			 * subset of the parent's list.
1468			 */
1469			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1470				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1471				    &ppr->pr_ip6[ij]))
1472					break;
1473			if (ij == ppr->pr_ip6s) {
1474				error = EPERM;
1475				goto done_deref;
1476			}
1477			if (ip6s > 1) {
1478				for (ii = ij = 1; ii < ip6s; ii++) {
1479					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1480					     &ppr->pr_ip6[0]))
1481						continue;
1482					for (; ij < ppr->pr_ip6s; ij++)
1483						if (IN6_ARE_ADDR_EQUAL(
1484						    &ip6[ii], &ppr->pr_ip6[ij]))
1485							break;
1486					if (ij == ppr->pr_ip6s)
1487						break;
1488				}
1489				if (ij == ppr->pr_ip6s) {
1490					error = EPERM;
1491					goto done_deref;
1492				}
1493			}
1494		}
1495		/* Check for conflicting IP addresses. */
1496#ifdef VIMAGE
1497		for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
1498			if (tppr->pr_flags & PR_VNET)
1499				break;
1500#else
1501		tppr = &prison0;
1502#endif
1503		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1504			if (tpr == pr ||
1505#ifdef VIMAGE
1506			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1507#endif
1508			    !prison_isalive(tpr)) {
1509				descend = 0;
1510				continue;
1511			}
1512			if (!(tpr->pr_flags & PR_IP6_USER))
1513				continue;
1514			descend = 0;
1515			if (tpr->pr_ip6 == NULL ||
1516			    (ip6s == 1 && tpr->pr_ip6s == 1))
1517				continue;
1518			for (ii = 0; ii < ip6s; ii++) {
1519				if (prison_check_ip6_locked(tpr, &ip6[ii]) ==
1520				    0) {
1521					error = EADDRINUSE;
1522					vfs_opterror(opts,
1523					    "IPv6 addresses clash");
1524					goto done_deref;
1525				}
1526			}
1527		}
1528	}
1529#endif
1530	onamelen = namelen = 0;
1531	if (namelc != NULL) {
1532		/* Give a default name of the jid.  Also allow the name to be
1533		 * explicitly the jid - but not any other number, and only in
1534		 * normal form (no leading zero/etc).
1535		 */
1536		if (namelc[0] == '\0')
1537			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1538		else if ((strtoul(namelc, &p, 10) != jid ||
1539			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1540			error = EINVAL;
1541			vfs_opterror(opts,
1542			    "name cannot be numeric (unless it is the jid)");
1543			goto done_deref;
1544		}
1545		/*
1546		 * Make sure the name isn't too long for the prison or its
1547		 * children.
1548		 */
1549		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1550		onamelen = strlen(pr->pr_name + pnamelen);
1551		namelen = strlen(namelc);
1552		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1553			error = ENAMETOOLONG;
1554			goto done_deref;
1555		}
1556		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1557			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1558			    sizeof(pr->pr_name)) {
1559				error = ENAMETOOLONG;
1560				goto done_deref;
1561			}
1562		}
1563	}
1564	pr_allow_diff = pr_allow & ~ppr->pr_allow;
1565	if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
1566		error = EPERM;
1567		goto done_deref;
1568	}
1569
1570	/*
1571	 * Let modules check their parameters.  This requires unlocking and
1572	 * then re-locking the prison, but this is still a valid state as long
1573	 * as allprison_lock remains xlocked.
1574	 */
1575	mtx_unlock(&pr->pr_mtx);
1576	drflags &= ~PD_LOCKED;
1577	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1578	if (error != 0)
1579		goto done_deref;
1580	mtx_lock(&pr->pr_mtx);
1581	drflags |= PD_LOCKED;
1582
1583	/* At this point, all valid parameters should have been noted. */
1584	TAILQ_FOREACH(opt, opts, link) {
1585		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1586			error = EINVAL;
1587			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1588			goto done_deref;
1589		}
1590	}
1591
1592	/* Set the parameters of the prison. */
1593#ifdef INET
1594	redo_ip4 = 0;
1595	if (pr_flags & PR_IP4_USER) {
1596		pr->pr_flags |= PR_IP4;
1597		free(pr->pr_ip4, M_PRISON);
1598		pr->pr_ip4s = ip4s;
1599		pr->pr_ip4 = ip4;
1600		ip4 = NULL;
1601		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1602#ifdef VIMAGE
1603			if (tpr->pr_flags & PR_VNET) {
1604				descend = 0;
1605				continue;
1606			}
1607#endif
1608			if (prison_restrict_ip4(tpr, NULL)) {
1609				redo_ip4 = 1;
1610				descend = 0;
1611			}
1612		}
1613	}
1614#endif
1615#ifdef INET6
1616	redo_ip6 = 0;
1617	if (pr_flags & PR_IP6_USER) {
1618		pr->pr_flags |= PR_IP6;
1619		free(pr->pr_ip6, M_PRISON);
1620		pr->pr_ip6s = ip6s;
1621		pr->pr_ip6 = ip6;
1622		ip6 = NULL;
1623		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1624#ifdef VIMAGE
1625			if (tpr->pr_flags & PR_VNET) {
1626				descend = 0;
1627				continue;
1628			}
1629#endif
1630			if (prison_restrict_ip6(tpr, NULL)) {
1631				redo_ip6 = 1;
1632				descend = 0;
1633			}
1634		}
1635	}
1636#endif
1637	if (gotslevel) {
1638		pr->pr_securelevel = slevel;
1639		/* Set all child jails to be at least this level. */
1640		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1641			if (tpr->pr_securelevel < slevel)
1642				tpr->pr_securelevel = slevel;
1643	}
1644	if (gotchildmax) {
1645		pr->pr_childmax = childmax;
1646		/* Set all child jails to under this limit. */
1647		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1648			if (tpr->pr_childmax > childmax - level)
1649				tpr->pr_childmax = childmax > level
1650				    ? childmax - level : 0;
1651	}
1652	if (gotenforce) {
1653		pr->pr_enforce_statfs = enforce;
1654		/* Pass this restriction on to the children. */
1655		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1656			if (tpr->pr_enforce_statfs < enforce)
1657				tpr->pr_enforce_statfs = enforce;
1658	}
1659	if (gotrsnum) {
1660		pr->pr_devfs_rsnum = rsnum;
1661		/* Pass this restriction on to the children. */
1662		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1663			tpr->pr_devfs_rsnum = rsnum;
1664	}
1665	if (namelc != NULL) {
1666		if (ppr == &prison0)
1667			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
1668		else
1669			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1670			    ppr->pr_name, namelc);
1671		/* Change this component of child names. */
1672		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1673			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1674			    strlen(tpr->pr_name + onamelen) + 1);
1675			bcopy(pr->pr_name, tpr->pr_name, namelen);
1676		}
1677	}
1678	if (path != NULL) {
1679		/* Try to keep a real-rooted full pathname. */
1680		strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1681		pr->pr_root = root;
1682		root = NULL;
1683	}
1684	if (PR_HOST & ch_flags & ~pr_flags) {
1685		if (pr->pr_flags & PR_HOST) {
1686			/*
1687			 * Copy the parent's host info.  As with pr_ip4 above,
1688			 * the lack of a lock on the parent is not a problem;
1689			 * it is always set with allprison_lock at least
1690			 * shared, and is held exclusively here.
1691			 */
1692			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1693			    sizeof(pr->pr_hostname));
1694			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1695			    sizeof(pr->pr_domainname));
1696			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1697			    sizeof(pr->pr_hostuuid));
1698			pr->pr_hostid = pr->pr_parent->pr_hostid;
1699		}
1700	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1701		/* Set this prison, and any descendants without PR_HOST. */
1702		if (host != NULL)
1703			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1704		if (domain != NULL)
1705			strlcpy(pr->pr_domainname, domain,
1706			    sizeof(pr->pr_domainname));
1707		if (uuid != NULL)
1708			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1709		if (gothid)
1710			pr->pr_hostid = hid;
1711		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1712			if (tpr->pr_flags & PR_HOST)
1713				descend = 0;
1714			else {
1715				if (host != NULL)
1716					strlcpy(tpr->pr_hostname,
1717					    pr->pr_hostname,
1718					    sizeof(tpr->pr_hostname));
1719				if (domain != NULL)
1720					strlcpy(tpr->pr_domainname,
1721					    pr->pr_domainname,
1722					    sizeof(tpr->pr_domainname));
1723				if (uuid != NULL)
1724					strlcpy(tpr->pr_hostuuid,
1725					    pr->pr_hostuuid,
1726					    sizeof(tpr->pr_hostuuid));
1727				if (gothid)
1728					tpr->pr_hostid = hid;
1729			}
1730		}
1731	}
1732	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1733	if ((tallow = ch_allow & ~pr_allow))
1734		prison_set_allow_locked(pr, tallow, 0);
1735	/*
1736	 * Persistent prisons get an extra reference, and prisons losing their
1737	 * persist flag lose that reference.
1738	 */
1739	born = !prison_isalive(pr);
1740	if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) {
1741		if (pr_flags & PR_PERSIST) {
1742			prison_hold(pr);
1743			/*
1744			 * This may make a dead prison alive again, but wait
1745			 * to label it as such until after OSD calls have had
1746			 * a chance to run (and perhaps to fail).
1747			 */
1748			refcount_acquire(&pr->pr_uref);
1749		} else {
1750			drflags |= PD_DEUREF;
1751			prison_free_not_last(pr);
1752		}
1753	}
1754	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1755	mtx_unlock(&pr->pr_mtx);
1756	drflags &= ~PD_LOCKED;
1757	/*
1758	 * Any errors past this point will need to de-persist newly created
1759	 * prisons, as well as call remove methods.
1760	 */
1761	if (born)
1762		drflags |= PD_KILL;
1763
1764#ifdef RACCT
1765	if (racct_enable && created)
1766		prison_racct_attach(pr);
1767#endif
1768
1769	/* Locks may have prevented a complete restriction of child IP
1770	 * addresses.  If so, allocate some more memory and try again.
1771	 */
1772#ifdef INET
1773	while (redo_ip4) {
1774		ip4s = pr->pr_ip4s;
1775		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1776		mtx_lock(&pr->pr_mtx);
1777		redo_ip4 = 0;
1778		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1779#ifdef VIMAGE
1780			if (tpr->pr_flags & PR_VNET) {
1781				descend = 0;
1782				continue;
1783			}
1784#endif
1785			if (prison_restrict_ip4(tpr, ip4)) {
1786				if (ip4 != NULL)
1787					ip4 = NULL;
1788				else
1789					redo_ip4 = 1;
1790			}
1791		}
1792		mtx_unlock(&pr->pr_mtx);
1793	}
1794#endif
1795#ifdef INET6
1796	while (redo_ip6) {
1797		ip6s = pr->pr_ip6s;
1798		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1799		mtx_lock(&pr->pr_mtx);
1800		redo_ip6 = 0;
1801		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1802#ifdef VIMAGE
1803			if (tpr->pr_flags & PR_VNET) {
1804				descend = 0;
1805				continue;
1806			}
1807#endif
1808			if (prison_restrict_ip6(tpr, ip6)) {
1809				if (ip6 != NULL)
1810					ip6 = NULL;
1811				else
1812					redo_ip6 = 1;
1813			}
1814		}
1815		mtx_unlock(&pr->pr_mtx);
1816	}
1817#endif
1818
1819	/* Let the modules do their work. */
1820	if (born) {
1821		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1822		if (error)
1823			goto done_deref;
1824	}
1825	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1826	if (error)
1827		goto done_deref;
1828
1829	/*
1830	 * A new prison is now ready to be seen; either it has gained a user
1831	 * reference via persistence, or is about to gain one via attachment.
1832	 */
1833	if (born) {
1834		drflags = prison_lock_xlock(pr, drflags);
1835		pr->pr_state = PRISON_STATE_ALIVE;
1836	}
1837
1838	/* Attach this process to the prison if requested. */
1839	if (flags & JAIL_ATTACH) {
1840		error = do_jail_attach(td, pr,
1841		    prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS));
1842		drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
1843		if (error) {
1844			vfs_opterror(opts, "attach failed");
1845			goto done_deref;
1846		}
1847	}
1848
1849#ifdef RACCT
1850	if (racct_enable && !created) {
1851		if (drflags & PD_LOCKED) {
1852			mtx_unlock(&pr->pr_mtx);
1853			drflags &= ~PD_LOCKED;
1854		}
1855		if (drflags & PD_LIST_XLOCKED) {
1856			sx_xunlock(&allprison_lock);
1857			drflags &= ~PD_LIST_XLOCKED;
1858		}
1859		prison_racct_modify(pr);
1860	}
1861#endif
1862
1863	drflags &= ~PD_KILL;
1864	td->td_retval[0] = pr->pr_id;
1865
1866 done_deref:
1867	/* Release any temporary prison holds and/or locks. */
1868	if (pr != NULL)
1869		prison_deref(pr, drflags);
1870	else if (drflags & PD_LIST_SLOCKED)
1871		sx_sunlock(&allprison_lock);
1872	else if (drflags & PD_LIST_XLOCKED)
1873		sx_xunlock(&allprison_lock);
1874	if (root != NULL)
1875		vrele(root);
1876 done_errmsg:
1877	if (error) {
1878		/* Write the error message back to userspace. */
1879		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
1880		    &errmsg_len) == 0 && errmsg_len > 0) {
1881			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1882			if (optuio->uio_segflg == UIO_SYSSPACE)
1883				bcopy(errmsg,
1884				    optuio->uio_iov[errmsg_pos].iov_base,
1885				    errmsg_len);
1886			else
1887				copyout(errmsg,
1888				    optuio->uio_iov[errmsg_pos].iov_base,
1889				    errmsg_len);
1890		}
1891	}
1892 done_free:
1893#ifdef INET
1894	free(ip4, M_PRISON);
1895#endif
1896#ifdef INET6
1897	free(ip6, M_PRISON);
1898#endif
1899	if (g_path != NULL)
1900		free(g_path, M_TEMP);
1901	vfs_freeopts(opts);
1902	return (error);
1903}
1904
1905/*
1906 * Find the next available prison ID.  Return the ID on success, or zero
1907 * on failure.  Also set a pointer to the allprison list entry the prison
1908 * should be inserted before.
1909 */
1910static int
1911get_next_prid(struct prison **insprp)
1912{
1913	struct prison *inspr;
1914	int jid, maxid;
1915
1916	jid = lastprid % JAIL_MAX + 1;
1917	if (TAILQ_EMPTY(&allprison) ||
1918	    TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) {
1919		/*
1920		 * A common case is for all jails to be implicitly numbered,
1921		 * which means they'll go on the end of the list, at least
1922		 * for the first JAIL_MAX times.
1923		 */
1924		inspr = NULL;
1925	} else {
1926		/*
1927		 * Take two passes through the allprison list: first starting
1928		 * with the proposed jid, then ending with it.
1929		 */
1930		for (maxid = JAIL_MAX; maxid != 0; ) {
1931			TAILQ_FOREACH(inspr, &allprison, pr_list) {
1932				if (inspr->pr_id < jid)
1933					continue;
1934				if (inspr->pr_id > jid) {
1935					/* Found an opening. */
1936					maxid = 0;
1937					break;
1938				}
1939				if (++jid > maxid) {
1940					if (lastprid == maxid || lastprid == 0)
1941					{
1942						/*
1943						 * The entire legal range
1944						 * has been traversed
1945						 */
1946						return 0;
1947					}
1948					/* Try again from the start. */
1949					jid = 1;
1950					maxid = lastprid;
1951					break;
1952				}
1953			}
1954			if (inspr == NULL) {
1955				/* Found room at the end of the list. */
1956				break;
1957			}
1958		}
1959	}
1960	*insprp = inspr;
1961	lastprid = jid;
1962	return (jid);
1963}
1964
1965/*
1966 * struct jail_get_args {
1967 *	struct iovec *iovp;
1968 *	unsigned int iovcnt;
1969 *	int flags;
1970 * };
1971 */
1972int
1973sys_jail_get(struct thread *td, struct jail_get_args *uap)
1974{
1975	struct uio *auio;
1976	int error;
1977
1978	/* Check that we have an even number of iovecs. */
1979	if (uap->iovcnt & 1)
1980		return (EINVAL);
1981
1982	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1983	if (error)
1984		return (error);
1985	error = kern_jail_get(td, auio, uap->flags);
1986	if (error == 0)
1987		error = copyout(auio->uio_iov, uap->iovp,
1988		    uap->iovcnt * sizeof (struct iovec));
1989	free(auio, M_IOV);
1990	return (error);
1991}
1992
1993int
1994kern_jail_get(struct thread *td, struct uio *optuio, int flags)
1995{
1996	struct bool_flags *bf;
1997	struct jailsys_flags *jsf;
1998	struct prison *pr, *mypr;
1999	struct vfsopt *opt;
2000	struct vfsoptlist *opts;
2001	char *errmsg, *name;
2002	int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
2003	unsigned f;
2004
2005	if (flags & ~JAIL_GET_MASK)
2006		return (EINVAL);
2007
2008	/* Get the parameter list. */
2009	error = vfs_buildopts(optuio, &opts);
2010	if (error)
2011		return (error);
2012	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2013	mypr = td->td_ucred->cr_prison;
2014	pr = NULL;
2015
2016	/*
2017	 * Find the prison specified by one of: lastjid, jid, name.
2018	 */
2019	sx_slock(&allprison_lock);
2020	drflags = PD_LIST_SLOCKED;
2021	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2022	if (error == 0) {
2023		TAILQ_FOREACH(pr, &allprison, pr_list) {
2024			if (pr->pr_id > jid &&
2025			    ((flags & JAIL_DYING) || prison_isalive(pr)) &&
2026			    prison_ischild(mypr, pr)) {
2027				mtx_lock(&pr->pr_mtx);
2028				drflags |= PD_LOCKED;
2029				goto found_prison;
2030			}
2031		}
2032		error = ENOENT;
2033		vfs_opterror(opts, "no jail after %d", jid);
2034		goto done;
2035	} else if (error != ENOENT)
2036		goto done;
2037
2038	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2039	if (error == 0) {
2040		if (jid != 0) {
2041			pr = prison_find_child(mypr, jid);
2042			if (pr != NULL) {
2043				drflags |= PD_LOCKED;
2044				if (!(prison_isalive(pr) ||
2045				    (flags & JAIL_DYING))) {
2046					error = ENOENT;
2047					vfs_opterror(opts, "jail %d is dying",
2048					    jid);
2049					goto done;
2050				}
2051				goto found_prison;
2052			}
2053			error = ENOENT;
2054			vfs_opterror(opts, "jail %d not found", jid);
2055			goto done;
2056		}
2057	} else if (error != ENOENT)
2058		goto done;
2059
2060	error = vfs_getopt(opts, "name", (void **)&name, &len);
2061	if (error == 0) {
2062		if (len == 0 || name[len - 1] != '\0') {
2063			error = EINVAL;
2064			goto done;
2065		}
2066		pr = prison_find_name(mypr, name);
2067		if (pr != NULL) {
2068			drflags |= PD_LOCKED;
2069			if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
2070				error = ENOENT;
2071				vfs_opterror(opts, "jail \"%s\" is dying",
2072				    name);
2073				goto done;
2074			}
2075			goto found_prison;
2076		}
2077		error = ENOENT;
2078		vfs_opterror(opts, "jail \"%s\" not found", name);
2079		goto done;
2080	} else if (error != ENOENT)
2081		goto done;
2082
2083	vfs_opterror(opts, "no jail specified");
2084	error = ENOENT;
2085	goto done;
2086
2087 found_prison:
2088	/* Get the parameters of the prison. */
2089	prison_hold(pr);
2090	drflags |= PD_DEREF;
2091	td->td_retval[0] = pr->pr_id;
2092	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2093	if (error != 0 && error != ENOENT)
2094		goto done;
2095	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2096	error = vfs_setopt(opts, "parent", &i, sizeof(i));
2097	if (error != 0 && error != ENOENT)
2098		goto done;
2099	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2100	if (error != 0 && error != ENOENT)
2101		goto done;
2102	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2103	    sizeof(pr->pr_cpuset->cs_id));
2104	if (error != 0 && error != ENOENT)
2105		goto done;
2106	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2107	if (error != 0 && error != ENOENT)
2108		goto done;
2109#ifdef INET
2110	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
2111	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
2112	if (error != 0 && error != ENOENT)
2113		goto done;
2114#endif
2115#ifdef INET6
2116	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
2117	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
2118	if (error != 0 && error != ENOENT)
2119		goto done;
2120#endif
2121	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2122	    sizeof(pr->pr_securelevel));
2123	if (error != 0 && error != ENOENT)
2124		goto done;
2125	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2126	    sizeof(pr->pr_childcount));
2127	if (error != 0 && error != ENOENT)
2128		goto done;
2129	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2130	    sizeof(pr->pr_childmax));
2131	if (error != 0 && error != ENOENT)
2132		goto done;
2133	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2134	if (error != 0 && error != ENOENT)
2135		goto done;
2136	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2137	if (error != 0 && error != ENOENT)
2138		goto done;
2139	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2140	if (error != 0 && error != ENOENT)
2141		goto done;
2142#ifdef COMPAT_FREEBSD32
2143	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2144		uint32_t hid32 = pr->pr_hostid;
2145
2146		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2147	} else
2148#endif
2149	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2150	    sizeof(pr->pr_hostid));
2151	if (error != 0 && error != ENOENT)
2152		goto done;
2153	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2154	    sizeof(pr->pr_enforce_statfs));
2155	if (error != 0 && error != ENOENT)
2156		goto done;
2157	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2158	    sizeof(pr->pr_devfs_rsnum));
2159	if (error != 0 && error != ENOENT)
2160		goto done;
2161	for (bf = pr_flag_bool;
2162	     bf < pr_flag_bool + nitems(pr_flag_bool);
2163	     bf++) {
2164		i = (pr->pr_flags & bf->flag) ? 1 : 0;
2165		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2166		if (error != 0 && error != ENOENT)
2167			goto done;
2168		i = !i;
2169		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2170		if (error != 0 && error != ENOENT)
2171			goto done;
2172	}
2173	for (jsf = pr_flag_jailsys;
2174	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
2175	     jsf++) {
2176		f = pr->pr_flags & (jsf->disable | jsf->new);
2177		i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
2178		    : (f == jsf->new) ? JAIL_SYS_NEW
2179		    : JAIL_SYS_INHERIT;
2180		error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
2181		if (error != 0 && error != ENOENT)
2182			goto done;
2183	}
2184	for (bf = pr_flag_allow;
2185	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
2186		atomic_load_int(&bf->flag) != 0;
2187	     bf++) {
2188		i = (pr->pr_allow & bf->flag) ? 1 : 0;
2189		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2190		if (error != 0 && error != ENOENT)
2191			goto done;
2192		i = !i;
2193		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2194		if (error != 0 && error != ENOENT)
2195			goto done;
2196	}
2197	i = !prison_isalive(pr);
2198	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2199	if (error != 0 && error != ENOENT)
2200		goto done;
2201	i = !i;
2202	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2203	if (error != 0 && error != ENOENT)
2204		goto done;
2205	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2206	    sizeof(pr->pr_osreldate));
2207	if (error != 0 && error != ENOENT)
2208		goto done;
2209	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2210	if (error != 0 && error != ENOENT)
2211		goto done;
2212
2213	/* Get the module parameters. */
2214	mtx_unlock(&pr->pr_mtx);
2215	drflags &= ~PD_LOCKED;
2216	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2217	if (error)
2218		goto done;
2219	prison_deref(pr, drflags);
2220	pr = NULL;
2221	drflags = 0;
2222
2223	/* By now, all parameters should have been noted. */
2224	TAILQ_FOREACH(opt, opts, link) {
2225		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2226			error = EINVAL;
2227			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2228			goto done;
2229		}
2230	}
2231
2232	/* Write the fetched parameters back to userspace. */
2233	error = 0;
2234	TAILQ_FOREACH(opt, opts, link) {
2235		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2236			pos = 2 * opt->pos + 1;
2237			optuio->uio_iov[pos].iov_len = opt->len;
2238			if (opt->value != NULL) {
2239				if (optuio->uio_segflg == UIO_SYSSPACE) {
2240					bcopy(opt->value,
2241					    optuio->uio_iov[pos].iov_base,
2242					    opt->len);
2243				} else {
2244					error = copyout(opt->value,
2245					    optuio->uio_iov[pos].iov_base,
2246					    opt->len);
2247					if (error)
2248						break;
2249				}
2250			}
2251		}
2252	}
2253
2254 done:
2255	/* Release any temporary prison holds and/or locks. */
2256	if (pr != NULL)
2257		prison_deref(pr, drflags);
2258	else if (drflags & PD_LIST_SLOCKED)
2259		sx_sunlock(&allprison_lock);
2260	if (error && errmsg_pos >= 0) {
2261		/* Write the error message back to userspace. */
2262		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2263		errmsg_pos = 2 * errmsg_pos + 1;
2264		if (errmsg_len > 0) {
2265			if (optuio->uio_segflg == UIO_SYSSPACE)
2266				bcopy(errmsg,
2267				    optuio->uio_iov[errmsg_pos].iov_base,
2268				    errmsg_len);
2269			else
2270				copyout(errmsg,
2271				    optuio->uio_iov[errmsg_pos].iov_base,
2272				    errmsg_len);
2273		}
2274	}
2275	vfs_freeopts(opts);
2276	return (error);
2277}
2278
2279/*
2280 * struct jail_remove_args {
2281 *	int jid;
2282 * };
2283 */
2284int
2285sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2286{
2287	struct prison *pr;
2288	int error;
2289
2290	error = priv_check(td, PRIV_JAIL_REMOVE);
2291	if (error)
2292		return (error);
2293
2294	sx_xlock(&allprison_lock);
2295	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2296	if (pr == NULL) {
2297		sx_xunlock(&allprison_lock);
2298		return (EINVAL);
2299	}
2300	if (!prison_isalive(pr)) {
2301		/* Silently ignore already-dying prisons. */
2302		mtx_unlock(&pr->pr_mtx);
2303		sx_xunlock(&allprison_lock);
2304		return (0);
2305	}
2306	prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED);
2307	return (0);
2308}
2309
2310/*
2311 * struct jail_attach_args {
2312 *	int jid;
2313 * };
2314 */
2315int
2316sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2317{
2318	struct prison *pr;
2319	int error;
2320
2321	error = priv_check(td, PRIV_JAIL_ATTACH);
2322	if (error)
2323		return (error);
2324
2325	sx_slock(&allprison_lock);
2326	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2327	if (pr == NULL) {
2328		sx_sunlock(&allprison_lock);
2329		return (EINVAL);
2330	}
2331
2332	/* Do not allow a process to attach to a prison that is not alive. */
2333	if (!prison_isalive(pr)) {
2334		mtx_unlock(&pr->pr_mtx);
2335		sx_sunlock(&allprison_lock);
2336		return (EINVAL);
2337	}
2338
2339	return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
2340}
2341
/*
 * Attach the calling thread's process to the prison pr.
 *
 * The caller must hold the prison mutex and allprison_lock (both are
 * asserted) and describes those locks with PD_* bits in drflags; bits
 * outside PD_LOCK_FLAGS are ignored.  Both locks have been released by
 * the time this function returns.
 *
 * Returns 0 on success.  Otherwise the error from the failing step
 * (osd_jail_call(), cpuset_setproc_update_set(), change_dir(),
 * mac_vnode_check_chroot() or pwd_chroot_chdir()) is returned, and the
 * reference and user reference taken on pr here are dropped again.
 */
static int
do_jail_attach(struct thread *td, struct prison *pr, int drflags)
{
	struct proc *p;
	struct ucred *newcred, *oldcred;
	int error;

	mtx_assert(&pr->pr_mtx, MA_OWNED);
	sx_assert(&allprison_lock, SX_LOCKED);
	/* Keep only the lock-state bits supplied by the caller. */
	drflags &= PD_LOCK_FLAGS;
	/*
	 * XXX: Note that there is a slight race here if two threads
	 * in the same privileged process attempt to attach to two
	 * different jails at the same time.  It is important for
	 * user processes not to do this, or they might end up with
	 * a process root from one prison, but attached to the jail
	 * of another.
	 */
	/*
	 * Take a reference and a user reference on the prison, and note
	 * them in drflags so that prison_deref() releases them.
	 */
	prison_hold(pr);
	refcount_acquire(&pr->pr_uref);
	drflags |= PD_DEREF | PD_DEUREF;
	mtx_unlock(&pr->pr_mtx);
	drflags &= ~PD_LOCKED;

	/* Let modules do whatever they need to prepare for attaching. */
	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
	if (error) {
		/* Drops both references and the allprison lock. */
		prison_deref(pr, drflags);
		return (error);
	}
	sx_unlock(&allprison_lock);
	drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);

	/*
	 * Reparent the newly attached process to this jail.
	 */
	p = td->td_proc;
	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
	if (error)
		goto e_revert_osd;

	/* The prison root vnode stays locked for the directory checks. */
	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
	if ((error = change_dir(pr->pr_root, td)) != 0)
		goto e_unlock;
#ifdef MAC
	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
		goto e_unlock;
#endif
	VOP_UNLOCK(pr->pr_root);
	if ((error = pwd_chroot_chdir(td, pr->pr_root)))
		goto e_revert_osd;

	/* Install a copy of the process credential that points at pr. */
	newcred = crget();
	PROC_LOCK(p);
	oldcred = crcopysafe(p, newcred);
	newcred->cr_prison = pr;
	proc_set_cred(p, newcred);
	setsugid(p);
	/*
	 * NOTE(review): the crhold() below is compiled under RACCT while
	 * the matching crfree() is compiled under RCTL; presumably RCTL
	 * is never configured without RACCT -- confirm.
	 */
#ifdef RACCT
	racct_proc_ucred_changed(p, oldcred, newcred);
	crhold(newcred);
#endif
	PROC_UNLOCK(p);
#ifdef RCTL
	rctl_proc_ucred_changed(p, newcred);
	crfree(newcred);
#endif
	/*
	 * drflags still carries PD_DEREF | PD_DEUREF; here they are
	 * applied to the prison the process has just left.
	 */
	prison_deref(oldcred->cr_prison, drflags);
	crfree(oldcred);

	/*
	 * If the prison was killed while changing credentials, die along
	 * with it.
	 */
	if (!prison_isalive(pr)) {
		PROC_LOCK(p);
		kern_psignal(p, SIGKILL);
		PROC_UNLOCK(p);
	}

	return (0);

 e_unlock:
	VOP_UNLOCK(pr->pr_root);
 e_revert_osd:
	/* Tell modules this thread is still in its old jail after all. */
	sx_slock(&allprison_lock);
	drflags |= PD_LIST_SLOCKED;
	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
	/* Drops the references taken above and the list lock. */
	prison_deref(pr, drflags);
	return (error);
}
2434
2435/*
2436 * Returns a locked prison instance, or NULL on failure.
2437 */
2438struct prison *
2439prison_find(int prid)
2440{
2441	struct prison *pr;
2442
2443	sx_assert(&allprison_lock, SX_LOCKED);
2444	TAILQ_FOREACH(pr, &allprison, pr_list) {
2445		if (pr->pr_id < prid)
2446			continue;
2447		if (pr->pr_id > prid)
2448			break;
2449		KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
2450		mtx_lock(&pr->pr_mtx);
2451		return (pr);
2452	}
2453	return (NULL);
2454}
2455
2456/*
2457 * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2458 */
2459struct prison *
2460prison_find_child(struct prison *mypr, int prid)
2461{
2462	struct prison *pr;
2463	int descend;
2464
2465	sx_assert(&allprison_lock, SX_LOCKED);
2466	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2467		if (pr->pr_id == prid) {
2468			KASSERT(prison_isvalid(pr),
2469			    ("Found invalid prison %p", pr));
2470			mtx_lock(&pr->pr_mtx);
2471			return (pr);
2472		}
2473	}
2474	return (NULL);
2475}
2476
2477/*
2478 * Look for the name relative to mypr.  Returns a locked prison or NULL.
2479 */
2480struct prison *
2481prison_find_name(struct prison *mypr, const char *name)
2482{
2483	struct prison *pr, *deadpr;
2484	size_t mylen;
2485	int descend;
2486
2487	sx_assert(&allprison_lock, SX_LOCKED);
2488	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2489	deadpr = NULL;
2490	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2491		if (!strcmp(pr->pr_name + mylen, name)) {
2492			KASSERT(prison_isvalid(pr),
2493			    ("Found invalid prison %p", pr));
2494			if (prison_isalive(pr)) {
2495				mtx_lock(&pr->pr_mtx);
2496				return (pr);
2497			}
2498			deadpr = pr;
2499		}
2500	}
2501	/* There was no valid prison - perhaps there was a dying one. */
2502	if (deadpr != NULL)
2503		mtx_lock(&deadpr->pr_mtx);
2504	return (deadpr);
2505}
2506
2507/*
2508 * See if a prison has the specific flag set.  The prison should be locked,
2509 * unless checking for flags that are only set at jail creation (such as
2510 * PR_IP4 and PR_IP6), or only the single bit is examined, without regard
2511 * to any other prison data.
2512 */
2513int
2514prison_flag(struct ucred *cred, unsigned flag)
2515{
2516
2517	return (cred->cr_prison->pr_flags & flag);
2518}
2519
2520int
2521prison_allow(struct ucred *cred, unsigned flag)
2522{
2523
2524	return ((cred->cr_prison->pr_allow & flag) != 0);
2525}
2526
2527/*
2528 * Hold a prison reference, by incrementing pr_ref.  It is generally
2529 * an error to hold a prison that does not already have a reference.
2530 * A prison record will remain valid as long as it has at least one
2531 * reference, and will not be removed as long as either the prison
2532 * mutex or the allprison lock is held (allprison_lock may be shared).
2533 */
2534void
2535prison_hold_locked(struct prison *pr)
2536{
2537
2538	/* Locking is no longer required. */
2539	prison_hold(pr);
2540}
2541
2542void
2543prison_hold(struct prison *pr)
2544{
2545#ifdef INVARIANTS
2546	int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref);
2547
2548	KASSERT(was_valid,
2549	    ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
2550#else
2551	refcount_acquire(&pr->pr_ref);
2552#endif
2553}
2554
2555/*
2556 * Remove a prison reference.  If that was the last reference, the
2557 * prison will be removed (at a later time).
2558 */
2559void
2560prison_free_locked(struct prison *pr)
2561{
2562
2563	mtx_assert(&pr->pr_mtx, MA_OWNED);
2564	/*
2565	 * Locking is no longer required, but unlock because the caller
2566	 * expects it.
2567	 */
2568	mtx_unlock(&pr->pr_mtx);
2569	prison_free(pr);
2570}
2571
2572void
2573prison_free(struct prison *pr)
2574{
2575
2576	KASSERT(refcount_load(&pr->pr_ref) > 0,
2577	    ("Trying to free dead prison %p (jid=%d).",
2578	     pr, pr->pr_id));
2579	if (!refcount_release_if_not_last(&pr->pr_ref)) {
2580		/*
2581		 * Don't remove the last reference in this context,
2582		 * in case there are locks held.
2583		 */
2584		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2585	}
2586}
2587
2588static void
2589prison_free_not_last(struct prison *pr)
2590{
2591#ifdef INVARIANTS
2592	int lastref;
2593
2594	KASSERT(refcount_load(&pr->pr_ref) > 0,
2595	    ("Trying to free dead prison %p (jid=%d).",
2596	     pr, pr->pr_id));
2597	lastref = refcount_release(&pr->pr_ref);
2598	KASSERT(!lastref,
2599	    ("prison_free_not_last freed last ref on prison %p (jid=%d).",
2600	     pr, pr->pr_id));
2601#else
2602	refcount_release(&pr->pr_ref);
2603#endif
2604}
2605
2606/*
2607 * Hold a a prison for user visibility, by incrementing pr_uref.
2608 * It is generally an error to hold a prison that isn't already
2609 * user-visible, except through the the jail system calls.  It is also
2610 * an error to hold an invalid prison.  A prison record will remain
2611 * alive as long as it has at least one user reference, and will not
2612 * be set to the dying state until the prison mutex and allprison_lock
2613 * are both freed.
2614 */
2615void
2616prison_proc_hold(struct prison *pr)
2617{
2618#ifdef INVARIANTS
2619	int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref);
2620
2621	KASSERT(was_alive,
2622	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2623#else
2624	refcount_acquire(&pr->pr_uref);
2625#endif
2626}
2627
2628/*
2629 * Remove a prison user reference.  If it was the last reference, the
2630 * prison will be considered "dying", and may be removed once all of
2631 * its references are dropped.
2632 */
2633void
2634prison_proc_free(struct prison *pr)
2635{
2636
2637	/*
2638	 * Locking is only required when releasing the last reference.
2639	 * This allows assurance that a locked prison will remain alive
2640	 * until it is unlocked.
2641	 */
2642	KASSERT(refcount_load(&pr->pr_uref) > 0,
2643	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2644	if (!refcount_release_if_not_last(&pr->pr_uref)) {
2645		/*
2646		 * Don't remove the last user reference in this context,
2647		 * which is expected to be a process that is not only locked,
2648		 * but also half dead.  Add a reference so any calls to
2649		 * prison_free() won't re-submit the task.
2650		 */
2651		prison_hold(pr);
2652		mtx_lock(&pr->pr_mtx);
2653		KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC),
2654		    ("Redundant last reference in prison_proc_free (jid=%d)",
2655		     pr->pr_id));
2656		pr->pr_flags |= PR_COMPLETE_PROC;
2657		mtx_unlock(&pr->pr_mtx);
2658		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2659	}
2660}
2661
2662static void
2663prison_proc_free_not_last(struct prison *pr)
2664{
2665#ifdef INVARIANTS
2666	int lastref;
2667
2668	KASSERT(refcount_load(&pr->pr_uref) > 0,
2669	    ("Trying to free dead prison %p (jid=%d).",
2670	     pr, pr->pr_id));
2671	lastref = refcount_release(&pr->pr_uref);
2672	KASSERT(!lastref,
2673	    ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).",
2674	     pr, pr->pr_id));
2675#else
2676	refcount_release(&pr->pr_uref);
2677#endif
2678}
2679
2680/*
2681 * Complete a call to either prison_free or prison_proc_free.
2682 */
2683static void
2684prison_complete(void *context, int pending)
2685{
2686	struct prison *pr = context;
2687	int drflags;
2688
2689	/*
2690	 * This could be called to release the last reference, or the last
2691	 * user reference (plus the reference held in prison_proc_free).
2692	 */
2693	drflags = prison_lock_xlock(pr, PD_DEREF);
2694	if (pr->pr_flags & PR_COMPLETE_PROC) {
2695		pr->pr_flags &= ~PR_COMPLETE_PROC;
2696		drflags |= PD_DEUREF;
2697	}
2698	prison_deref(pr, drflags);
2699}
2700
2701/*
2702 * Remove a prison reference and/or user reference (usually).
2703 * This assumes context that allows sleeping (for allprison_lock),
2704 * with no non-sleeping locks held, except perhaps the prison itself.
2705 * If there are no more references, release and delist the prison.
2706 * On completion, the prison lock and the allprison lock are both
2707 * unlocked.
2708 */
2709static void
2710prison_deref(struct prison *pr, int flags)
2711{
2712	struct prisonlist freeprison;
2713	struct prison *killpr, *rpr, *ppr, *tpr;
2714	struct proc *p;
2715
2716	killpr = NULL;
2717	TAILQ_INIT(&freeprison);
2718	/*
2719	 * Release this prison as requested, which may cause its parent
2720	 * to be released, and then maybe its grandparent, etc.
2721	 */
2722	for (;;) {
2723		if (flags & PD_KILL) {
2724			/* Kill the prison and its descendents. */
2725			KASSERT(pr != &prison0,
2726			    ("prison_deref trying to kill prison0"));
2727			if (!(flags & PD_DEREF)) {
2728				prison_hold(pr);
2729				flags |= PD_DEREF;
2730			}
2731			flags = prison_lock_xlock(pr, flags);
2732			prison_deref_kill(pr, &freeprison);
2733		}
2734		if (flags & PD_DEUREF) {
2735			/* Drop a user reference. */
2736			KASSERT(refcount_load(&pr->pr_uref) > 0,
2737			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
2738			     pr->pr_id));
2739			if (!refcount_release_if_not_last(&pr->pr_uref)) {
2740				if (!(flags & PD_DEREF)) {
2741					prison_hold(pr);
2742					flags |= PD_DEREF;
2743				}
2744				flags = prison_lock_xlock(pr, flags);
2745				if (refcount_release(&pr->pr_uref) &&
2746				    pr->pr_state == PRISON_STATE_ALIVE) {
2747					/*
2748					 * When the last user references goes,
2749					 * this becomes a dying prison.
2750					 */
2751					KASSERT(
2752					    refcount_load(&prison0.pr_uref) > 0,
2753					    ("prison0 pr_uref=0"));
2754					pr->pr_state = PRISON_STATE_DYING;
2755					mtx_unlock(&pr->pr_mtx);
2756					flags &= ~PD_LOCKED;
2757					(void)osd_jail_call(pr,
2758					    PR_METHOD_REMOVE, NULL);
2759				}
2760			}
2761		}
2762		if (flags & PD_KILL) {
2763			/*
2764			 * Any remaining user references are probably processes
2765			 * that need to be killed, either in this prison or its
2766			 * descendants.
2767			 */
2768			if (refcount_load(&pr->pr_uref) > 0)
2769				killpr = pr;
2770			/* Make sure the parent prison doesn't get killed. */
2771			flags &= ~PD_KILL;
2772		}
2773		if (flags & PD_DEREF) {
2774			/* Drop a reference. */
2775			KASSERT(refcount_load(&pr->pr_ref) > 0,
2776			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
2777			     pr->pr_id));
2778			if (!refcount_release_if_not_last(&pr->pr_ref)) {
2779				flags = prison_lock_xlock(pr, flags);
2780				if (refcount_release(&pr->pr_ref)) {
2781					/*
2782					 * When the last reference goes,
2783					 * unlink the prison and set it aside.
2784					 */
2785					KASSERT(
2786					    refcount_load(&pr->pr_uref) == 0,
2787					    ("prison_deref: last ref, "
2788					     "but still has %d urefs (jid=%d)",
2789					     pr->pr_uref, pr->pr_id));
2790					KASSERT(
2791					    refcount_load(&prison0.pr_ref) != 0,
2792					    ("prison0 pr_ref=0"));
2793					pr->pr_state = PRISON_STATE_INVALID;
2794					TAILQ_REMOVE(&allprison, pr, pr_list);
2795					LIST_REMOVE(pr, pr_sibling);
2796					TAILQ_INSERT_TAIL(&freeprison, pr,
2797					    pr_list);
2798					for (ppr = pr->pr_parent;
2799					     ppr != NULL;
2800					     ppr = ppr->pr_parent)
2801						ppr->pr_childcount--;
2802					/*
2803					 * Removing a prison frees references
2804					 * from its parent.
2805					 */
2806					mtx_unlock(&pr->pr_mtx);
2807					flags &= ~PD_LOCKED;
2808					pr = pr->pr_parent;
2809					flags |= PD_DEREF | PD_DEUREF;
2810					continue;
2811				}
2812			}
2813		}
2814		break;
2815	}
2816
2817	/* Release all the prison locks. */
2818	if (flags & PD_LOCKED)
2819		mtx_unlock(&pr->pr_mtx);
2820	if (flags & PD_LIST_SLOCKED)
2821		sx_sunlock(&allprison_lock);
2822	else if (flags & PD_LIST_XLOCKED)
2823		sx_xunlock(&allprison_lock);
2824
2825	/* Kill any processes attached to a killed prison. */
2826	if (killpr != NULL) {
2827		sx_slock(&allproc_lock);
2828		FOREACH_PROC_IN_SYSTEM(p) {
2829			PROC_LOCK(p);
2830			if (p->p_state != PRS_NEW && p->p_ucred != NULL) {
2831				for (ppr = p->p_ucred->cr_prison;
2832				     ppr != &prison0;
2833				     ppr = ppr->pr_parent)
2834					if (ppr == killpr) {
2835						kern_psignal(p, SIGKILL);
2836						break;
2837					}
2838			}
2839			PROC_UNLOCK(p);
2840		}
2841		sx_sunlock(&allproc_lock);
2842	}
2843
2844	/*
2845	 * Finish removing any unreferenced prisons, which couldn't happen
2846	 * while allprison_lock was held (to avoid a LOR on vrele).
2847	 */
2848	TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) {
2849#ifdef VIMAGE
2850		if (rpr->pr_vnet != rpr->pr_parent->pr_vnet)
2851			vnet_destroy(rpr->pr_vnet);
2852#endif
2853		if (rpr->pr_root != NULL)
2854			vrele(rpr->pr_root);
2855		mtx_destroy(&rpr->pr_mtx);
2856#ifdef INET
2857		free(rpr->pr_ip4, M_PRISON);
2858#endif
2859#ifdef INET6
2860		free(rpr->pr_ip6, M_PRISON);
2861#endif
2862		if (rpr->pr_cpuset != NULL)
2863			cpuset_rel(rpr->pr_cpuset);
2864		osd_jail_exit(rpr);
2865#ifdef RACCT
2866		if (racct_enable)
2867			prison_racct_detach(rpr);
2868#endif
2869		TAILQ_REMOVE(&freeprison, rpr, pr_list);
2870		free(rpr, M_PRISON);
2871	}
2872}
2873
2874/*
2875 * Kill the prison and its descendants.  Mark them as dying, clear the
2876 * persist flag, and call module remove methods.
2877 */
2878static void
2879prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
2880{
2881	struct prison *cpr, *ppr, *rpr;
2882	bool descend;
2883
2884	/*
2885	 * Unlike the descendants, the target prison can be killed
2886	 * even if it is currently dying.  This is useful for failed
2887	 * creation in jail_set(2).
2888	 */
2889	KASSERT(refcount_load(&pr->pr_ref) > 0,
2890	    ("Trying to kill dead prison %p (jid=%d).",
2891	     pr, pr->pr_id));
2892	refcount_acquire(&pr->pr_uref);
2893	pr->pr_state = PRISON_STATE_DYING;
2894	mtx_unlock(&pr->pr_mtx);
2895
2896	rpr = NULL;
2897	FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) {
2898		if (descend) {
2899			if (!prison_isalive(cpr)) {
2900				descend = false;
2901				continue;
2902			}
2903			prison_hold(cpr);
2904			prison_proc_hold(cpr);
2905			mtx_lock(&cpr->pr_mtx);
2906			cpr->pr_state = PRISON_STATE_DYING;
2907			cpr->pr_flags |= PR_REMOVE;
2908			mtx_unlock(&cpr->pr_mtx);
2909			continue;
2910		}
2911		if (!(cpr->pr_flags & PR_REMOVE))
2912			continue;
2913		(void)osd_jail_call(cpr, PR_METHOD_REMOVE, NULL);
2914		mtx_lock(&cpr->pr_mtx);
2915		cpr->pr_flags &= ~PR_REMOVE;
2916		if (cpr->pr_flags & PR_PERSIST) {
2917			cpr->pr_flags &= ~PR_PERSIST;
2918			prison_proc_free_not_last(cpr);
2919			prison_free_not_last(cpr);
2920		}
2921		(void)refcount_release(&cpr->pr_uref);
2922		if (refcount_release(&cpr->pr_ref)) {
2923			/*
2924			 * When the last reference goes, unlink the prison
2925			 * and set it aside for prison_deref() to handle.
2926			 * Delay unlinking the sibling list to keep the loop
2927			 * safe.
2928			 */
2929			if (rpr != NULL)
2930				LIST_REMOVE(rpr, pr_sibling);
2931			rpr = cpr;
2932			rpr->pr_state = PRISON_STATE_INVALID;
2933			TAILQ_REMOVE(&allprison, rpr, pr_list);
2934			TAILQ_INSERT_TAIL(freeprison, rpr, pr_list);
2935			/*
2936			 * Removing a prison frees references from its parent.
2937			 */
2938			ppr = rpr->pr_parent;
2939			prison_proc_free_not_last(ppr);
2940			prison_free_not_last(ppr);
2941			for (; ppr != NULL; ppr = ppr->pr_parent)
2942				ppr->pr_childcount--;
2943		}
2944		mtx_unlock(&cpr->pr_mtx);
2945	}
2946	if (rpr != NULL)
2947		LIST_REMOVE(rpr, pr_sibling);
2948
2949	(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
2950	mtx_lock(&pr->pr_mtx);
2951	if (pr->pr_flags & PR_PERSIST) {
2952		pr->pr_flags &= ~PR_PERSIST;
2953		prison_proc_free_not_last(pr);
2954		prison_free_not_last(pr);
2955	}
2956	(void)refcount_release(&pr->pr_uref);
2957}
2958
2959/*
2960 * Given the current locking state in the flags, make sure allprison_lock
2961 * is held exclusive, and the prison is locked.  Return flags indicating
2962 * the new state.
2963 */
2964static int
2965prison_lock_xlock(struct prison *pr, int flags)
2966{
2967
2968	if (!(flags & PD_LIST_XLOCKED)) {
2969		/*
2970		 * Get allprison_lock, which may be an upgrade,
2971		 * and may require unlocking the prison.
2972		 */
2973		if (flags & PD_LOCKED) {
2974			mtx_unlock(&pr->pr_mtx);
2975			flags &= ~PD_LOCKED;
2976		}
2977		if (flags & PD_LIST_SLOCKED) {
2978			if (!sx_try_upgrade(&allprison_lock)) {
2979				sx_sunlock(&allprison_lock);
2980				sx_xlock(&allprison_lock);
2981			}
2982			flags &= ~PD_LIST_SLOCKED;
2983		} else
2984			sx_xlock(&allprison_lock);
2985		flags |= PD_LIST_XLOCKED;
2986	}
2987	if (!(flags & PD_LOCKED)) {
2988		/* Lock the prison mutex. */
2989		mtx_lock(&pr->pr_mtx);
2990		flags |= PD_LOCKED;
2991	}
2992	return flags;
2993}
2994
2995/*
2996 * Set or clear a permission bit in the pr_allow field, passing restrictions
2997 * (cleared permission) down to child jails.
2998 */
2999void
3000prison_set_allow(struct ucred *cred, unsigned flag, int enable)
3001{
3002	struct prison *pr;
3003
3004	pr = cred->cr_prison;
3005	sx_slock(&allprison_lock);
3006	mtx_lock(&pr->pr_mtx);
3007	prison_set_allow_locked(pr, flag, enable);
3008	mtx_unlock(&pr->pr_mtx);
3009	sx_sunlock(&allprison_lock);
3010}
3011
3012static void
3013prison_set_allow_locked(struct prison *pr, unsigned flag, int enable)
3014{
3015	struct prison *cpr;
3016	int descend;
3017
3018	if (enable != 0)
3019		pr->pr_allow |= flag;
3020	else {
3021		pr->pr_allow &= ~flag;
3022		FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
3023			cpr->pr_allow &= ~flag;
3024	}
3025}
3026
3027/*
3028 * Check if a jail supports the given address family.
3029 *
3030 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3031 * if not.
3032 */
3033int
3034prison_check_af(struct ucred *cred, int af)
3035{
3036	struct prison *pr;
3037	int error;
3038
3039	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3040
3041	pr = cred->cr_prison;
3042#ifdef VIMAGE
3043	/* Prisons with their own network stack are not limited. */
3044	if (prison_owns_vnet(cred))
3045		return (0);
3046#endif
3047
3048	error = 0;
3049	switch (af)
3050	{
3051#ifdef INET
3052	case AF_INET:
3053		if (pr->pr_flags & PR_IP4)
3054		{
3055			mtx_lock(&pr->pr_mtx);
3056			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3057				error = EAFNOSUPPORT;
3058			mtx_unlock(&pr->pr_mtx);
3059		}
3060		break;
3061#endif
3062#ifdef INET6
3063	case AF_INET6:
3064		if (pr->pr_flags & PR_IP6)
3065		{
3066			mtx_lock(&pr->pr_mtx);
3067			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3068				error = EAFNOSUPPORT;
3069			mtx_unlock(&pr->pr_mtx);
3070		}
3071		break;
3072#endif
3073	case AF_LOCAL:
3074	case AF_ROUTE:
3075		break;
3076	default:
3077		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3078			error = EAFNOSUPPORT;
3079	}
3080	return (error);
3081}
3082
3083/*
3084 * Check if given address belongs to the jail referenced by cred (wrapper to
3085 * prison_check_ip[46]).
3086 *
3087 * Returns 0 if jail doesn't restrict the address family or if address belongs
3088 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3089 * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3090 */
3091int
3092prison_if(struct ucred *cred, const struct sockaddr *sa)
3093{
3094#ifdef INET
3095	const struct sockaddr_in *sai;
3096#endif
3097#ifdef INET6
3098	const struct sockaddr_in6 *sai6;
3099#endif
3100	int error;
3101
3102	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3103	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3104
3105#ifdef VIMAGE
3106	if (prison_owns_vnet(cred))
3107		return (0);
3108#endif
3109
3110	error = 0;
3111	switch (sa->sa_family)
3112	{
3113#ifdef INET
3114	case AF_INET:
3115		sai = (const struct sockaddr_in *)sa;
3116		error = prison_check_ip4(cred, &sai->sin_addr);
3117		break;
3118#endif
3119#ifdef INET6
3120	case AF_INET6:
3121		sai6 = (const struct sockaddr_in6 *)sa;
3122		error = prison_check_ip6(cred, &sai6->sin6_addr);
3123		break;
3124#endif
3125	default:
3126		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3127			error = EAFNOSUPPORT;
3128	}
3129	return (error);
3130}
3131
3132/*
3133 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3134 */
3135int
3136prison_check(struct ucred *cred1, struct ucred *cred2)
3137{
3138
3139	return ((cred1->cr_prison == cred2->cr_prison ||
3140	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3141}
3142
3143/*
3144 * Return 1 if p2 is a child of p1, otherwise 0.
3145 */
3146int
3147prison_ischild(struct prison *pr1, struct prison *pr2)
3148{
3149
3150	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3151		if (pr1 == pr2)
3152			return (1);
3153	return (0);
3154}
3155
3156/*
3157 * Return true if the prison is currently alive.  A prison is alive if it
3158 * holds user references and it isn't being removed.
3159 */
3160bool
3161prison_isalive(struct prison *pr)
3162{
3163
3164	if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE))
3165		return (false);
3166	return (true);
3167}
3168
3169/*
3170 * Return true if the prison is currently valid.  A prison is valid if it has
3171 * been fully created, and is not being destroyed.  Note that dying prisons
3172 * are still considered valid.  Invalid prisons won't be found under normal
3173 * circumstances, as they're only put in that state by functions that have
3174 * an exclusive hold on allprison_lock.
3175 */
3176bool
3177prison_isvalid(struct prison *pr)
3178{
3179
3180	if (__predict_false(pr->pr_state == PRISON_STATE_INVALID))
3181		return (false);
3182	if (__predict_false(refcount_load(&pr->pr_ref) == 0))
3183		return (false);
3184	return (true);
3185}
3186
3187/*
3188 * Return 1 if the passed credential is in a jail and that jail does not
3189 * have its own virtual network stack, otherwise 0.
3190 */
3191int
3192jailed_without_vnet(struct ucred *cred)
3193{
3194
3195	if (!jailed(cred))
3196		return (0);
3197#ifdef VIMAGE
3198	if (prison_owns_vnet(cred))
3199		return (0);
3200#endif
3201
3202	return (1);
3203}
3204
3205/*
3206 * Return the correct hostname (domainname, et al) for the passed credential.
3207 */
3208void
3209getcredhostname(struct ucred *cred, char *buf, size_t size)
3210{
3211	struct prison *pr;
3212
3213	/*
3214	 * A NULL credential can be used to shortcut to the physical
3215	 * system's hostname.
3216	 */
3217	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3218	mtx_lock(&pr->pr_mtx);
3219	strlcpy(buf, pr->pr_hostname, size);
3220	mtx_unlock(&pr->pr_mtx);
3221}
3222
3223void
3224getcreddomainname(struct ucred *cred, char *buf, size_t size)
3225{
3226
3227	mtx_lock(&cred->cr_prison->pr_mtx);
3228	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3229	mtx_unlock(&cred->cr_prison->pr_mtx);
3230}
3231
3232void
3233getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3234{
3235
3236	mtx_lock(&cred->cr_prison->pr_mtx);
3237	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3238	mtx_unlock(&cred->cr_prison->pr_mtx);
3239}
3240
3241void
3242getcredhostid(struct ucred *cred, unsigned long *hostid)
3243{
3244
3245	mtx_lock(&cred->cr_prison->pr_mtx);
3246	*hostid = cred->cr_prison->pr_hostid;
3247	mtx_unlock(&cred->cr_prison->pr_mtx);
3248}
3249
3250void
3251getjailname(struct ucred *cred, char *name, size_t len)
3252{
3253
3254	mtx_lock(&cred->cr_prison->pr_mtx);
3255	strlcpy(name, cred->cr_prison->pr_name, len);
3256	mtx_unlock(&cred->cr_prison->pr_mtx);
3257}
3258
#ifdef VIMAGE
/*
 * Tell whether the prison represented by cred has a vnet of its own
 * (PR_VNET set in pr_flags) instead of an inherited one.
 *
 * Returns 1 if the prison owns the vnet, 0 otherwise.
 */
int
prison_owns_vnet(struct ucred *cred)
{

	/*
	 * No locking is needed: vnets cannot be added or removed after
	 * jail creation.
	 */
	return ((cred->cr_prison->pr_flags & PR_VNET) != 0);
}
#endif
3277
3278/*
3279 * Determine whether the subject represented by cred can "see"
3280 * status of a mount point.
3281 * Returns: 0 for permitted, ENOENT otherwise.
3282 * XXX: This function should be called cr_canseemount() and should be
3283 *      placed in kern_prot.c.
3284 */
3285int
3286prison_canseemount(struct ucred *cred, struct mount *mp)
3287{
3288	struct prison *pr;
3289	struct statfs *sp;
3290	size_t len;
3291
3292	pr = cred->cr_prison;
3293	if (pr->pr_enforce_statfs == 0)
3294		return (0);
3295	if (pr->pr_root->v_mount == mp)
3296		return (0);
3297	if (pr->pr_enforce_statfs == 2)
3298		return (ENOENT);
3299	/*
3300	 * If jail's chroot directory is set to "/" we should be able to see
3301	 * all mount-points from inside a jail.
3302	 * This is ugly check, but this is the only situation when jail's
3303	 * directory ends with '/'.
3304	 */
3305	if (strcmp(pr->pr_path, "/") == 0)
3306		return (0);
3307	len = strlen(pr->pr_path);
3308	sp = &mp->mnt_stat;
3309	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3310		return (ENOENT);
3311	/*
3312	 * Be sure that we don't have situation where jail's root directory
3313	 * is "/some/path" and mount point is "/some/pathpath".
3314	 */
3315	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3316		return (ENOENT);
3317	return (0);
3318}
3319
3320void
3321prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3322{
3323	char jpath[MAXPATHLEN];
3324	struct prison *pr;
3325	size_t len;
3326
3327	pr = cred->cr_prison;
3328	if (pr->pr_enforce_statfs == 0)
3329		return;
3330	if (prison_canseemount(cred, mp) != 0) {
3331		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3332		strlcpy(sp->f_mntonname, "[restricted]",
3333		    sizeof(sp->f_mntonname));
3334		return;
3335	}
3336	if (pr->pr_root->v_mount == mp) {
3337		/*
3338		 * Clear current buffer data, so we are sure nothing from
3339		 * the valid path left there.
3340		 */
3341		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3342		*sp->f_mntonname = '/';
3343		return;
3344	}
3345	/*
3346	 * If jail's chroot directory is set to "/" we should be able to see
3347	 * all mount-points from inside a jail.
3348	 */
3349	if (strcmp(pr->pr_path, "/") == 0)
3350		return;
3351	len = strlen(pr->pr_path);
3352	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3353	/*
3354	 * Clear current buffer data, so we are sure nothing from
3355	 * the valid path left there.
3356	 */
3357	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3358	if (*jpath == '\0') {
3359		/* Should never happen. */
3360		*sp->f_mntonname = '/';
3361	} else {
3362		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3363	}
3364}
3365
/*
 * Check whether permission for a specific privilege is granted within jail.
 * We have a specific list of accepted privileges; the rest are denied.
 *
 * cred is the credential being checked and priv is the PRIV_* value
 * requested.  Returns 0 when the privilege is permitted (always the case
 * for a credential that is not jailed) and EPERM when it is denied.
 */
int
prison_priv_check(struct ucred *cred, int priv)
{
	struct prison *pr;
	int error;

	/*
	 * Some policies have custom handlers. This routine should not be
	 * called for them. See priv_check_cred().
	 */
	switch (priv) {
	case PRIV_VFS_LOOKUP:
	case PRIV_VFS_GENERATION:
		KASSERT(0, ("prison_priv_check instead of a custom handler "
		    "called for %d\n", priv));
	}

	/* Credentials outside any jail are not restricted here. */
	if (!jailed(cred))
		return (0);

#ifdef VIMAGE
	/*
	 * Privileges specific to prisons with a virtual network stack.
	 * There might be a duplicate entry here in case the privilege
	 * is only granted conditionally in the legacy jail case.
	 */
	switch (priv) {
#ifdef notyet
		/*
		 * NFS-specific privileges.
		 */
	case PRIV_NFS_DAEMON:
	case PRIV_NFS_LOCKD:
#endif
		/*
		 * Network stack privileges.
		 */
	case PRIV_NET_BRIDGE:
	case PRIV_NET_GRE:
	case PRIV_NET_BPF:
	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
	case PRIV_NET_ROUTE:
	case PRIV_NET_TAP:
	case PRIV_NET_SETIFMTU:
	case PRIV_NET_SETIFFLAGS:
	case PRIV_NET_SETIFCAP:
	case PRIV_NET_SETIFDESCR:
	case PRIV_NET_SETIFNAME	:
	case PRIV_NET_SETIFMETRIC:
	case PRIV_NET_SETIFPHYS:
	case PRIV_NET_SETIFMAC:
	case PRIV_NET_SETLANPCP:
	case PRIV_NET_ADDMULTI:
	case PRIV_NET_DELMULTI:
	case PRIV_NET_HWIOCTL:
	case PRIV_NET_SETLLADDR:
	case PRIV_NET_ADDIFGROUP:
	case PRIV_NET_DELIFGROUP:
	case PRIV_NET_IFCREATE:
	case PRIV_NET_IFDESTROY:
	case PRIV_NET_ADDIFADDR:
	case PRIV_NET_DELIFADDR:
	case PRIV_NET_LAGG:
	case PRIV_NET_GIF:
	case PRIV_NET_SETIFVNET:
	case PRIV_NET_SETIFFIB:

		/*
		 * 802.11-related privileges.
		 */
	case PRIV_NET80211_VAP_GETKEY:
	case PRIV_NET80211_VAP_MANAGE:

#ifdef notyet
		/*
		 * ATM privileges.
		 */
	case PRIV_NETATM_CFG:
	case PRIV_NETATM_ADD:
	case PRIV_NETATM_DEL:
	case PRIV_NETATM_SET:

		/*
		 * Bluetooth privileges.
		 */
	case PRIV_NETBLUETOOTH_RAW:
#endif

		/*
		 * Netgraph and netgraph module privileges.
		 */
	case PRIV_NETGRAPH_CONTROL:
#ifdef notyet
	case PRIV_NETGRAPH_TTY:
#endif

		/*
		 * IPv4 and IPv6 privileges.
		 */
	case PRIV_NETINET_IPFW:
	case PRIV_NETINET_DIVERT:
	case PRIV_NETINET_PF:
	case PRIV_NETINET_DUMMYNET:
	case PRIV_NETINET_CARP:
	case PRIV_NETINET_MROUTE:
	case PRIV_NETINET_RAW:
	case PRIV_NETINET_ADDRCTRL6:
	case PRIV_NETINET_ND6:
	case PRIV_NETINET_SCOPE6:
	case PRIV_NETINET_ALIFETIME6:
	case PRIV_NETINET_IPSEC:
	case PRIV_NETINET_BINDANY:

#ifdef notyet
		/*
		 * NCP privileges.
		 */
	case PRIV_NETNCP:

		/*
		 * SMB privileges.
		 */
	case PRIV_NETSMB:
#endif

	/*
	 * No default: or deny here.
	 * In case of no permit fall through to next switch().
	 */
		if (cred->cr_prison->pr_flags & PR_VNET)
			return (0);
	}
#endif /* VIMAGE */

	switch (priv) {
		/*
		 * Allow ktrace privileges for root in jail.
		 */
	case PRIV_KTRACE:

#if 0
		/*
		 * Allow jailed processes to configure audit identity and
		 * submit audit records (login, etc).  In the future we may
		 * want to further refine the relationship between audit and
		 * jail.
		 */
	case PRIV_AUDIT_GETAUDIT:
	case PRIV_AUDIT_SETAUDIT:
	case PRIV_AUDIT_SUBMIT:
#endif

		/*
		 * Allow jailed processes to manipulate process UNIX
		 * credentials in any way they see fit.
		 */
	case PRIV_CRED_SETUID:
	case PRIV_CRED_SETEUID:
	case PRIV_CRED_SETGID:
	case PRIV_CRED_SETEGID:
	case PRIV_CRED_SETGROUPS:
	case PRIV_CRED_SETREUID:
	case PRIV_CRED_SETREGID:
	case PRIV_CRED_SETRESUID:
	case PRIV_CRED_SETRESGID:

		/*
		 * Jail implements visibility constraints already, so allow
		 * jailed root to override uid/gid-based constraints.
		 */
	case PRIV_SEEOTHERGIDS:
	case PRIV_SEEOTHERUIDS:

		/*
		 * Jail implements inter-process debugging limits already, so
		 * allow jailed root various debugging privileges.
		 */
	case PRIV_DEBUG_DIFFCRED:
	case PRIV_DEBUG_SUGID:
	case PRIV_DEBUG_UNPRIV:

		/*
		 * Allow jail to set various resource limits and login
		 * properties, and for now, exceed process resource limits.
		 */
	case PRIV_PROC_LIMIT:
	case PRIV_PROC_SETLOGIN:
	case PRIV_PROC_SETRLIMIT:

		/*
		 * System V and POSIX IPC privileges are granted in jail.
		 */
	case PRIV_IPC_READ:
	case PRIV_IPC_WRITE:
	case PRIV_IPC_ADMIN:
	case PRIV_IPC_MSGSIZE:
	case PRIV_MQ_ADMIN:

		/*
		 * Jail operations within a jail work on child jails.
		 */
	case PRIV_JAIL_ATTACH:
	case PRIV_JAIL_SET:
	case PRIV_JAIL_REMOVE:

		/*
		 * Jail implements its own inter-process limits, so allow
		 * root processes in jail to change scheduling on other
		 * processes in the same jail.  Likewise for signalling.
		 */
	case PRIV_SCHED_DIFFCRED:
	case PRIV_SCHED_CPUSET:
	case PRIV_SIGNAL_DIFFCRED:
	case PRIV_SIGNAL_SUGID:

		/*
		 * Allow jailed processes to write to sysctls marked as jail
		 * writable.
		 */
	case PRIV_SYSCTL_WRITEJAIL:

		/*
		 * Allow root in jail to manage a variety of quota
		 * properties.  These should likely be conditional on a
		 * configuration option.
		 */
	case PRIV_VFS_GETQUOTA:
	case PRIV_VFS_SETQUOTA:

		/*
		 * Since Jail relies on chroot() to implement file system
		 * protections, grant many VFS privileges to root in jail.
		 * Be careful to exclude mount-related and NFS-related
		 * privileges.
		 */
	case PRIV_VFS_READ:
	case PRIV_VFS_WRITE:
	case PRIV_VFS_ADMIN:
	case PRIV_VFS_EXEC:
	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
	case PRIV_VFS_CHFLAGS_DEV:
	case PRIV_VFS_CHOWN:
	case PRIV_VFS_CHROOT:
	case PRIV_VFS_RETAINSUGID:
	case PRIV_VFS_FCHROOT:
	case PRIV_VFS_LINK:
	case PRIV_VFS_SETGID:
	case PRIV_VFS_STAT:
	case PRIV_VFS_STICKYFILE:

		/*
		 * As in the non-jail case, non-root users are expected to be
		 * able to read kernel/physical memory (provided /dev/[k]mem
		 * exists in the jail and they have permission to access it).
		 */
	case PRIV_KMEM_READ:
		/* Every case label above falls through to this grant. */
		return (0);

		/*
		 * Depending on the global setting, allow privilege of
		 * setting system flags.
		 */
	case PRIV_VFS_SYSFLAGS:
		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
			return (0);
		else
			return (EPERM);

		/*
		 * Depending on the global setting, allow privilege of
		 * mounting/unmounting file systems.
		 */
	case PRIV_VFS_MOUNT:
	case PRIV_VFS_UNMOUNT:
	case PRIV_VFS_MOUNT_NONUSER:
	case PRIV_VFS_MOUNT_OWNER:
		/*
		 * Both pr_allow and pr_enforce_statfs are examined with the
		 * prison locked; mounting also requires enforce_statfs < 2.
		 */
		pr = cred->cr_prison;
		prison_lock(pr);
		if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2)
			error = 0;
		else
			error = EPERM;
		prison_unlock(pr);
		return (error);

		/*
		 * Jails should hold no disposition on the PRIV_VFS_READ_DIR
		 * policy.  priv_check_cred will not specifically allow it, and
		 * we may want a MAC policy to allow it.
		 */
	case PRIV_VFS_READ_DIR:
		return (0);

		/*
		 * Conditionally allow locking (unlocking) physical pages
		 * in memory.
		 */
	case PRIV_VM_MLOCK:
	case PRIV_VM_MUNLOCK:
		if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
			return (0);
		else
			return (EPERM);

		/*
		 * Conditionally allow jailed root to bind reserved ports.
		 */
	case PRIV_NETINET_RESERVEDPORT:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
			return (0);
		else
			return (EPERM);

		/*
		 * Allow jailed root to reuse in-use ports.
		 */
	case PRIV_NETINET_REUSEPORT:
		return (0);

		/*
		 * Allow jailed root to set certain IPv4/6 (option) headers.
		 */
	case PRIV_NETINET_SETHDROPTS:
		return (0);

		/*
		 * Conditionally allow creating raw sockets in jail.
		 */
	case PRIV_NETINET_RAW:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
			return (0);
		else
			return (EPERM);

		/*
		 * Since jail implements its own visibility limits on netstat
		 * sysctls, allow getcred.  This allows identd to work in
		 * jail.
		 */
	case PRIV_NETINET_GETCRED:
		return (0);

		/*
		 * Allow jailed root to set loginclass.
		 */
	case PRIV_PROC_SETLOGINCLASS:
		return (0);

		/*
		 * Do not allow a process inside a jail to read the kernel
		 * message buffer unless explicitly permitted.
		 */
	case PRIV_MSGBUF:
		if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
			return (0);
		return (EPERM);

	default:
		/*
		 * In all remaining cases, deny the privilege request.  This
		 * includes almost all network privileges, many system
		 * configuration privileges.
		 */
		return (EPERM);
	}
}
3736
3737/*
3738 * Return the part of pr2's name that is relative to pr1, or the whole name
3739 * if it does not directly follow.
3740 */
3741
3742char *
3743prison_name(struct prison *pr1, struct prison *pr2)
3744{
3745	char *name;
3746
3747	/* Jails see themselves as "0" (if they see themselves at all). */
3748	if (pr1 == pr2)
3749		return "0";
3750	name = pr2->pr_name;
3751	if (prison_ischild(pr1, pr2)) {
3752		/*
3753		 * pr1 isn't locked (and allprison_lock may not be either)
3754		 * so its length can't be counted on.  But the number of dots
3755		 * can be counted on - and counted.
3756		 */
3757		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
3758			name = strchr(name, '.') + 1;
3759	}
3760	return (name);
3761}
3762
3763/*
3764 * Return the part of pr2's path that is relative to pr1, or the whole path
3765 * if it does not directly follow.
3766 */
3767static char *
3768prison_path(struct prison *pr1, struct prison *pr2)
3769{
3770	char *path1, *path2;
3771	int len1;
3772
3773	path1 = pr1->pr_path;
3774	path2 = pr2->pr_path;
3775	if (!strcmp(path1, "/"))
3776		return (path2);
3777	len1 = strlen(path1);
3778	if (strncmp(path1, path2, len1))
3779		return (path2);
3780	if (path2[len1] == '\0')
3781		return "/";
3782	if (path2[len1] == '/')
3783		return (path2 + len1);
3784	return (path2);
3785}
3786
/*
 * Jail-related sysctls.
 *
 * This declares the "jail" node under _security; the sysctls that follow
 * are attached to it through the _security_jail parent.
 */
static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Jails");
3792
/*
 * Handler for the "list" sysctl.  For every prison visited by
 * FOREACH_PRISON_DESCENDANT() starting from the caller's prison, it
 * writes one struct xprison, followed by that prison's IPv4 addresses
 * (if INET) and then its IPv6 addresses (if INET6).  Paths and names
 * are reported relative to the caller's prison via prison_path() and
 * prison_name().
 *
 * Returns 0 on success, or the first error returned by SYSCTL_OUT().
 */
static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)
{
	struct xprison *xp;
	struct prison *pr, *cpr;
#ifdef INET
	/* Scratch copy of a prison's IPv4 list and its capacity in entries. */
	struct in_addr *ip4 = NULL;
	int ip4s = 0;
#endif
#ifdef INET6
	/* Scratch copy of a prison's IPv6 list and its capacity in entries. */
	struct in6_addr *ip6 = NULL;
	int ip6s = 0;
#endif
	int descend, error;

	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
	pr = req->td->td_ucred->cr_prison;
	error = 0;
	sx_slock(&allprison_lock);
	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
#if defined(INET) || defined(INET6)
 again:
#endif
		mtx_lock(&cpr->pr_mtx);
#ifdef INET
		if (cpr->pr_ip4s > 0) {
			if (ip4s < cpr->pr_ip4s) {
				/*
				 * Scratch buffer too small: drop the prison
				 * mutex, grow it, and start over for this
				 * prison (presumably because an M_WAITOK
				 * allocation may sleep -- TODO confirm).
				 */
				ip4s = cpr->pr_ip4s;
				mtx_unlock(&cpr->pr_mtx);
				ip4 = realloc(ip4, ip4s *
				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
				goto again;
			}
			bcopy(cpr->pr_ip4, ip4,
			    cpr->pr_ip4s * sizeof(struct in_addr));
		}
#endif
#ifdef INET6
		if (cpr->pr_ip6s > 0) {
			if (ip6s < cpr->pr_ip6s) {
				/* Same grow-and-retry scheme as for IPv4. */
				ip6s = cpr->pr_ip6s;
				mtx_unlock(&cpr->pr_mtx);
				ip6 = realloc(ip6, ip6s *
				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
				goto again;
			}
			bcopy(cpr->pr_ip6, ip6,
			    cpr->pr_ip6s * sizeof(struct in6_addr));
		}
#endif
		/* Snapshot the prison into xp while its mutex is held. */
		bzero(xp, sizeof(*xp));
		xp->pr_version = XPRISON_VERSION;
		xp->pr_id = cpr->pr_id;
		xp->pr_state = cpr->pr_state;
		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
#ifdef INET
		xp->pr_ip4s = cpr->pr_ip4s;
#endif
#ifdef INET6
		xp->pr_ip6s = cpr->pr_ip6s;
#endif
		mtx_unlock(&cpr->pr_mtx);
		/* Output uses only the snapshot taken above. */
		error = SYSCTL_OUT(req, xp, sizeof(*xp));
		if (error)
			break;
#ifdef INET
		if (xp->pr_ip4s > 0) {
			error = SYSCTL_OUT(req, ip4,
			    xp->pr_ip4s * sizeof(struct in_addr));
			if (error)
				break;
		}
#endif
#ifdef INET6
		if (xp->pr_ip6s > 0) {
			error = SYSCTL_OUT(req, ip6,
			    xp->pr_ip6s * sizeof(struct in6_addr));
			if (error)
				break;
		}
#endif
	}
	sx_sunlock(&allprison_lock);
	free(xp, M_TEMP);
#ifdef INET
	free(ip4, M_TEMP);
#endif
#ifdef INET6
	free(ip6, M_TEMP);
#endif
	return (error);
}

/* Read-only "list" sysctl under _security_jail, served by sysctl_jail_list(). */
SYSCTL_OID(_security_jail, OID_AUTO, list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_list, "S", "List of active jails");
3891
3892static int
3893sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
3894{
3895	int error, injail;
3896
3897	injail = jailed(req->td->td_ucred);
3898	error = SYSCTL_OUT(req, &injail, sizeof(injail));
3899
3900	return (error);
3901}
3902
3903SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
3904    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3905    sysctl_jail_jailed, "I", "Process in jail?");
3906
3907static int
3908sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
3909{
3910	int error, havevnet;
3911#ifdef VIMAGE
3912	struct ucred *cred = req->td->td_ucred;
3913
3914	havevnet = jailed(cred) && prison_owns_vnet(cred);
3915#else
3916	havevnet = 0;
3917#endif
3918	error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
3919
3920	return (error);
3921}
3922
3923SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
3924    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3925    sysctl_jail_vnet, "I", "Jail owns vnet?");
3926
#if defined(INET) || defined(INET6)
/*
 * Read-write sysctl exposing jail_max_af_ips directly.  It is only
 * compiled in when INET or INET6 is configured.
 */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
    &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
3932
3933/*
3934 * Default parameters for jail(2) compatibility.  For historical reasons,
3935 * the sysctl names have varying similarity to the parameter names.  Prisons
3936 * just see their own parameters, and can't change them.
3937 */
3938static int
3939sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
3940{
3941	int error, i;
3942
3943	/* Get the current flag value, and convert it to a boolean. */
3944	if (req->td->td_ucred->cr_prison == &prison0) {
3945		mtx_lock(&prison0.pr_mtx);
3946		i = (jail_default_allow & arg2) != 0;
3947		mtx_unlock(&prison0.pr_mtx);
3948	} else
3949		i = prison_allow(req->td->td_ucred, arg2);
3950
3951	if (arg1 != NULL)
3952		i = !i;
3953	error = sysctl_handle_int(oidp, &i, 0, req);
3954	if (error || !req->newptr)
3955		return (error);
3956	i = i ? arg2 : 0;
3957	if (arg1 != NULL)
3958		i ^= arg2;
3959	/*
3960	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
3961	 * for writing.
3962	 */
3963	mtx_lock(&prison0.pr_mtx);
3964	jail_default_allow = (jail_default_allow & ~arg2) | i;
3965	mtx_unlock(&prison0.pr_mtx);
3966	return (0);
3967}
3968
/*
 * Deprecated global sysctls served by sysctl_jail_default_allow().
 * Each passes the PR_ALLOW_* bit it controls as arg2.  Only
 * socket_unixiproute_only passes a non-NULL arg1 ((void *)1), which
 * makes the handler report and accept the inverse of the bit.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
    "Processes in jail can set their hostnames (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
    "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
    "Processes in jail can use System V IPC primitives (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
    "Prison root can create raw sockets (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
    "Processes in jail can alter system file flags (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
    "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
3993
3994static int
3995sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
3996{
3997	struct prison *pr;
3998	int level, error;
3999
4000	pr = req->td->td_ucred->cr_prison;
4001	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4002	error = sysctl_handle_int(oidp, &level, 0, req);
4003	if (error || !req->newptr)
4004		return (error);
4005	*(int *)arg1 = level;
4006	return (0);
4007}
4008
/*
 * Deprecated global sysctls served by sysctl_jail_default_level().
 * arg1 is the address of the global default and arg2 the offset of the
 * corresponding field in struct prison.  enforce_statfs is read-write;
 * devfs_ruleset is registered read-only.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
    sysctl_jail_default_level, "I",
    "Processes in jail cannot see all mounted file systems (deprecated)");

SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
    &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
    sysctl_jail_default_level, "I",
    "Ruleset for the devfs filesystem in jail (deprecated)");
4020
/*
 * Nodes to describe jail parameters.  The maximum length of a string
 * parameter is returned as the value of its sysctl string; the other
 * parameters exist merely to make themselves and their types known.
 */
SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Jail parameters");
4028
/*
 * Shared handler for the jail parameter description sysctls.  What it
 * writes depends only on the type recorded in oidp->oid_kind:
 *  - long/ulong: a zero long (or a zero int, see below);
 *  - int/uint:   a zero int;
 *  - string:     arg2 formatted as a decimal string;
 *  - struct:     arg2 as a size_t.
 * Any other type writes nothing and returns 0.  Otherwise the return
 * value is that of SYSCTL_OUT() or sysctl_handle_string().
 */
int
sysctl_jail_param(SYSCTL_HANDLER_ARGS)
{
	int i;
	long l;
	size_t s;
	char numbuf[12];

	switch (oidp->oid_kind & CTLTYPE)
	{
	case CTLTYPE_LONG:
	case CTLTYPE_ULONG:
		l = 0;
#ifdef SCTL_MASK32
		if (!(req->flags & SCTL_MASK32))
#endif
			return (SYSCTL_OUT(req, &l, sizeof(l)));
		/*
		 * FALLTHROUGH: only reached when SCTL_MASK32 is defined and
		 * set in req->flags; the value is then written as an int.
		 */
	case CTLTYPE_INT:
	case CTLTYPE_UINT:
		i = 0;
		return (SYSCTL_OUT(req, &i, sizeof(i)));
	case CTLTYPE_STRING:
		/* snprintf() bounds the output to sizeof(numbuf). */
		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
		return
		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
	case CTLTYPE_STRUCT:
		s = (size_t)arg2;
		return (SYSCTL_OUT(req, &s, sizeof(s)));
	}
	return (0);
}
4060
/*
 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
 * jail creation time but cannot be changed in an existing jail.
 */

/* Top-level parameters. */
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail secure level");
SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
    "Jail value for kern.osreldate and uname -K");
SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
    "Jail value for kern.osrelease and uname -r");
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail cannot see all mounted file systems");
SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Ruleset for in-jail devfs mounts");
SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail persistence");
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
    "E,jailsys", "Virtual network stack");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
    "B", "Jail is in the process of shutting down");

/* children.*: child jail counts. */
SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
    "I", "Current number of child jails");
SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Maximum number of child jails");

/* host.*: per-jail host identity. */
SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail hostname");
SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail NIS domainname");
SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
    "Jail host UUID");
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
    "LU", "Jail host ID");

/* cpuset.*: the jail's cpuset. */
SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");

/* ip4.* / ip6.*: per-family address parameters, built only when configured. */
#ifdef INET
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
    "Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
    "S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv4 source address selection rather than the "
    "primary jail IPv4 address.");
#endif
#ifdef INET6
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
    "Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
    "S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv6 source address selection rather than the "
    "primary jail IPv6 address.");
#endif

/* allow.*: statically declared boolean permission flags. */
SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set hostname");
SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may use SYSV IPC");
SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create raw sockets");
SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may alter system file flags");
SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set file quotas");
SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may lock (unlock) physical pages in memory");
SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may bind sockets to reserved ports");
SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may read the kernel message buffer");
SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Unprivileged processes may use process debugging facilities");
SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Processes in jail with uid 0 have privilege");

/* allow.mount.*: the general mount permission. */
SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount/unmount jail-friendly file systems in general");
4153
/*
 * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>.  Return
 * its associated bit in the pr_allow bitmask, or zero if the parameter was
 * not created.
 *
 * prefix may be NULL, in which case the parameter is created directly
 * under "allow".  prefix_descr and descr are the sysctl descriptions for
 * the prefix node and for the parameter.  If a parameter of the same name
 * is already recorded in pr_flag_allow, its existing bit is returned and
 * nothing new is created.
 */
unsigned
prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
    const char *descr)
{
	struct bool_flags *bf;
	struct sysctl_oid *parent;
	char *allow_name, *allow_noname, *allowed;
#ifndef NO_SYSCTL_DESCR
	char *descr_deprecated;
#endif
	u_int allow_flag;

	/*
	 * Build the "allow..." and "allow...no..." parameter names.  The
	 * second asprintf() of a pair only runs if the first succeeded.
	 */
	if (prefix
	    ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
		< 0 ||
	      asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
		< 0
	    : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
	      asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
		free(allow_name, M_PRISON);
		return 0;
	}

	/*
	 * See if this parameter has already been added, i.e. a module was
	 * previously loaded/unloaded.
	 */
	mtx_lock(&prison0.pr_mtx);
	for (bf = pr_flag_allow;
	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
		atomic_load_int(&bf->flag) != 0;
	     bf++) {
		if (strcmp(bf->name, allow_name) == 0) {
			allow_flag = bf->flag;
			goto no_add;
		}
	}

	/*
	 * Find a free bit in pr_allow_all, failing if there are none
	 * (which shouldn't happen as long as we keep track of how many
	 * potential dynamic flags exist).
	 */
	for (allow_flag = 1;; allow_flag <<= 1) {
		/* The bit was shifted out of the word: none are free. */
		if (allow_flag == 0)
			goto no_add;
		if ((pr_allow_all & allow_flag) == 0)
			break;
	}

	/* Note the parameter in the next open slot in pr_flag_allow. */
	for (bf = pr_flag_allow; ; bf++) {
		if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
			/* This should never happen, but is not fatal. */
			allow_flag = 0;
			goto no_add;
		}
		if (atomic_load_int(&bf->flag) == 0)
			break;
	}
	/* The table entry now owns both name strings. */
	bf->name = allow_name;
	bf->noname = allow_noname;
	pr_allow_all |= allow_flag;
	/*
	 * prison0 always has permission for the new parameter.
	 * Other jails must have it granted to them.
	 */
	prison0.pr_allow |= allow_flag;
	/* The flag indicates a valid entry, so make sure it is set last. */
	atomic_store_rel_int(&bf->flag, allow_flag);
	mtx_unlock(&prison0.pr_mtx);

	/*
	 * Create sysctls for the parameter, and the back-compat global
	 * permission.
	 */
	parent = prefix
	    ? SYSCTL_ADD_NODE(NULL,
		  SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
		  OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
	    : &sysctl___security_jail_param_allow;
	(void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
	    name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    NULL, 0, sysctl_jail_param, "B", descr);
	/*
	 * The global "<prefix>_<name>_allowed" / "<name>_allowed" sysctl is
	 * best-effort: it is skipped if its name cannot be allocated.
	 */
	if ((prefix
	     ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
	     : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
#ifndef NO_SYSCTL_DESCR
		(void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
		    descr);
#endif
		(void)SYSCTL_ADD_PROC(NULL,
		    SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
		    sysctl_jail_default_allow, "I", descr_deprecated);
#ifndef NO_SYSCTL_DESCR
		free(descr_deprecated, M_TEMP);
#endif
		free(allowed, M_TEMP);
	}
	return allow_flag;

 no_add:
	/*
	 * Nothing was recorded: drop the lock and the name strings.
	 * allow_flag is either the bit of an existing entry or zero.
	 */
	mtx_unlock(&prison0.pr_mtx);
	free(allow_name, M_PRISON);
	free(allow_noname, M_PRISON);
	return allow_flag;
}
4267
4268/*
4269 * The VFS system will register jail-aware filesystems here.  They each get
4270 * a parameter allow.mount.xxxfs and a flag to check when a jailed user
4271 * attempts to mount.
4272 */
4273void
4274prison_add_vfs(struct vfsconf *vfsp)
4275{
4276#ifdef NO_SYSCTL_DESCR
4277
4278	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4279	    NULL, NULL);
4280#else
4281	char *descr;
4282
4283	(void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
4284	    vfsp->vfc_name);
4285	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4286	    NULL, descr);
4287	free(descr, M_TEMP);
4288#endif
4289}
4290
4291#ifdef RACCT
4292void
4293prison_racct_foreach(void (*callback)(struct racct *racct,
4294    void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
4295    void *arg2, void *arg3)
4296{
4297	struct prison_racct *prr;
4298
4299	ASSERT_RACCT_ENABLED();
4300
4301	sx_slock(&allprison_lock);
4302	if (pre != NULL)
4303		(pre)();
4304	LIST_FOREACH(prr, &allprison_racct, prr_next)
4305		(callback)(prr->prr_racct, arg2, arg3);
4306	if (post != NULL)
4307		(post)();
4308	sx_sunlock(&allprison_lock);
4309}
4310
4311static struct prison_racct *
4312prison_racct_find_locked(const char *name)
4313{
4314	struct prison_racct *prr;
4315
4316	ASSERT_RACCT_ENABLED();
4317	sx_assert(&allprison_lock, SA_XLOCKED);
4318
4319	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4320		return (NULL);
4321
4322	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4323		if (strcmp(name, prr->prr_name) != 0)
4324			continue;
4325
4326		/* Found prison_racct with a matching name? */
4327		prison_racct_hold(prr);
4328		return (prr);
4329	}
4330
4331	/* Add new prison_racct. */
4332	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4333	racct_create(&prr->prr_racct);
4334
4335	strcpy(prr->prr_name, name);
4336	refcount_init(&prr->prr_refcount, 1);
4337	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4338
4339	return (prr);
4340}
4341
4342struct prison_racct *
4343prison_racct_find(const char *name)
4344{
4345	struct prison_racct *prr;
4346
4347	ASSERT_RACCT_ENABLED();
4348
4349	sx_xlock(&allprison_lock);
4350	prr = prison_racct_find_locked(name);
4351	sx_xunlock(&allprison_lock);
4352	return (prr);
4353}
4354
4355void
4356prison_racct_hold(struct prison_racct *prr)
4357{
4358
4359	ASSERT_RACCT_ENABLED();
4360
4361	refcount_acquire(&prr->prr_refcount);
4362}
4363
4364static void
4365prison_racct_free_locked(struct prison_racct *prr)
4366{
4367
4368	ASSERT_RACCT_ENABLED();
4369	sx_assert(&allprison_lock, SA_XLOCKED);
4370
4371	if (refcount_release(&prr->prr_refcount)) {
4372		racct_destroy(&prr->prr_racct);
4373		LIST_REMOVE(prr, prr_next);
4374		free(prr, M_PRISON_RACCT);
4375	}
4376}
4377
4378void
4379prison_racct_free(struct prison_racct *prr)
4380{
4381
4382	ASSERT_RACCT_ENABLED();
4383	sx_assert(&allprison_lock, SA_UNLOCKED);
4384
4385	if (refcount_release_if_not_last(&prr->prr_refcount))
4386		return;
4387
4388	sx_xlock(&allprison_lock);
4389	prison_racct_free_locked(prr);
4390	sx_xunlock(&allprison_lock);
4391}
4392
4393static void
4394prison_racct_attach(struct prison *pr)
4395{
4396	struct prison_racct *prr;
4397
4398	ASSERT_RACCT_ENABLED();
4399	sx_assert(&allprison_lock, SA_XLOCKED);
4400
4401	prr = prison_racct_find_locked(pr->pr_name);
4402	KASSERT(prr != NULL, ("cannot find prison_racct"));
4403
4404	pr->pr_prison_racct = prr;
4405}
4406
4407/*
4408 * Handle jail renaming.  From the racct point of view, renaming means
4409 * moving from one prison_racct to another.
4410 */
4411static void
4412prison_racct_modify(struct prison *pr)
4413{
4414#ifdef RCTL
4415	struct proc *p;
4416	struct ucred *cred;
4417#endif
4418	struct prison_racct *oldprr;
4419
4420	ASSERT_RACCT_ENABLED();
4421
4422	sx_slock(&allproc_lock);
4423	sx_xlock(&allprison_lock);
4424
4425	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4426		sx_xunlock(&allprison_lock);
4427		sx_sunlock(&allproc_lock);
4428		return;
4429	}
4430
4431	oldprr = pr->pr_prison_racct;
4432	pr->pr_prison_racct = NULL;
4433
4434	prison_racct_attach(pr);
4435
4436	/*
4437	 * Move resource utilisation records.
4438	 */
4439	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4440
4441#ifdef RCTL
4442	/*
4443	 * Force rctl to reattach rules to processes.
4444	 */
4445	FOREACH_PROC_IN_SYSTEM(p) {
4446		PROC_LOCK(p);
4447		cred = crhold(p->p_ucred);
4448		PROC_UNLOCK(p);
4449		rctl_proc_ucred_changed(p, cred);
4450		crfree(cred);
4451	}
4452#endif
4453
4454	sx_sunlock(&allproc_lock);
4455	prison_racct_free_locked(oldprr);
4456	sx_xunlock(&allprison_lock);
4457}
4458
4459static void
4460prison_racct_detach(struct prison *pr)
4461{
4462
4463	ASSERT_RACCT_ENABLED();
4464	sx_assert(&allprison_lock, SA_UNLOCKED);
4465
4466	if (pr->pr_prison_racct == NULL)
4467		return;
4468	prison_racct_free(pr->pr_prison_racct);
4469	pr->pr_prison_racct = NULL;
4470}
4471#endif /* RACCT */
4472
4473#ifdef DDB
4474
4475static void
4476db_show_prison(struct prison *pr)
4477{
4478	struct bool_flags *bf;
4479	struct jailsys_flags *jsf;
4480#if defined(INET) || defined(INET6)
4481	int ii;
4482#endif
4483	unsigned f;
4484#ifdef INET
4485	char ip4buf[INET_ADDRSTRLEN];
4486#endif
4487#ifdef INET6
4488	char ip6buf[INET6_ADDRSTRLEN];
4489#endif
4490
4491	db_printf("prison %p:\n", pr);
4492	db_printf(" jid             = %d\n", pr->pr_id);
4493	db_printf(" name            = %s\n", pr->pr_name);
4494	db_printf(" parent          = %p\n", pr->pr_parent);
4495	db_printf(" ref             = %d\n", pr->pr_ref);
4496	db_printf(" uref            = %d\n", pr->pr_uref);
4497	db_printf(" state           = %s\n",
4498	    pr->pr_state == PRISON_STATE_ALIVE ? "alive" :
4499	    pr->pr_state == PRISON_STATE_DYING ? "dying" :
4500	    "invalid");
4501	db_printf(" path            = %s\n", pr->pr_path);
4502	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4503	    ? pr->pr_cpuset->cs_id : -1);
4504#ifdef VIMAGE
4505	db_printf(" vnet            = %p\n", pr->pr_vnet);
4506#endif
4507	db_printf(" root            = %p\n", pr->pr_root);
4508	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4509	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
4510	db_printf(" children.max    = %d\n", pr->pr_childmax);
4511	db_printf(" children.cur    = %d\n", pr->pr_childcount);
4512	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4513	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4514	db_printf(" flags           = 0x%x", pr->pr_flags);
4515	for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
4516		if (pr->pr_flags & bf->flag)
4517			db_printf(" %s", bf->name);
4518	for (jsf = pr_flag_jailsys;
4519	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
4520	     jsf++) {
4521		f = pr->pr_flags & (jsf->disable | jsf->new);
4522		db_printf(" %-16s= %s\n", jsf->name,
4523		    (f != 0 && f == jsf->disable) ? "disable"
4524		    : (f == jsf->new) ? "new"
4525		    : "inherit");
4526	}
4527	db_printf(" allow           = 0x%x", pr->pr_allow);
4528	for (bf = pr_flag_allow;
4529	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
4530		atomic_load_int(&bf->flag) != 0;
4531	     bf++)
4532		if (pr->pr_allow & bf->flag)
4533			db_printf(" %s", bf->name);
4534	db_printf("\n");
4535	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4536	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4537	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4538	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4539	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4540#ifdef INET
4541	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4542	for (ii = 0; ii < pr->pr_ip4s; ii++)
4543		db_printf(" %s %s\n",
4544		    ii == 0 ? "ip4.addr        =" : "                 ",
4545		    inet_ntoa_r(pr->pr_ip4[ii], ip4buf));
4546#endif
4547#ifdef INET6
4548	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4549	for (ii = 0; ii < pr->pr_ip6s; ii++)
4550		db_printf(" %s %s\n",
4551		    ii == 0 ? "ip6.addr        =" : "                 ",
4552		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4553#endif
4554}
4555
4556DB_SHOW_COMMAND(prison, db_show_prison_command)
4557{
4558	struct prison *pr;
4559
4560	if (!have_addr) {
4561		/*
4562		 * Show all prisons in the list, and prison0 which is not
4563		 * listed.
4564		 */
4565		db_show_prison(&prison0);
4566		if (!db_pager_quit) {
4567			TAILQ_FOREACH(pr, &allprison, pr_list) {
4568				db_show_prison(pr);
4569				if (db_pager_quit)
4570					break;
4571			}
4572		}
4573		return;
4574	}
4575
4576	if (addr == 0)
4577		pr = &prison0;
4578	else {
4579		/* Look for a prison with the ID and with references. */
4580		TAILQ_FOREACH(pr, &allprison, pr_list)
4581			if (pr->pr_id == addr && pr->pr_ref > 0)
4582				break;
4583		if (pr == NULL)
4584			/* Look again, without requiring a reference. */
4585			TAILQ_FOREACH(pr, &allprison, pr_list)
4586				if (pr->pr_id == addr)
4587					break;
4588		if (pr == NULL)
4589			/* Assume address points to a valid prison. */
4590			pr = (struct prison *)addr;
4591	}
4592	db_show_prison(pr);
4593}
4594
4595#endif /* DDB */
4596