kern_jail.c revision 295951
1/*-
2 * Copyright (c) 1999 Poul-Henning Kamp.
3 * Copyright (c) 2008 Bjoern A. Zeeb.
4 * Copyright (c) 2009 James Gritton.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/kern/kern_jail.c 295951 2016-02-24 02:34:11Z araujo $");
31
32#include "opt_compat.h"
33#include "opt_ddb.h"
34#include "opt_inet.h"
35#include "opt_inet6.h"
36
37#include <sys/param.h>
38#include <sys/types.h>
39#include <sys/kernel.h>
40#include <sys/systm.h>
41#include <sys/errno.h>
42#include <sys/sysproto.h>
43#include <sys/malloc.h>
44#include <sys/osd.h>
45#include <sys/priv.h>
46#include <sys/proc.h>
47#include <sys/taskqueue.h>
48#include <sys/fcntl.h>
49#include <sys/jail.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/racct.h>
53#include <sys/refcount.h>
54#include <sys/sx.h>
55#include <sys/sysent.h>
56#include <sys/namei.h>
57#include <sys/mount.h>
58#include <sys/queue.h>
59#include <sys/socket.h>
60#include <sys/syscallsubr.h>
61#include <sys/sysctl.h>
62#include <sys/vnode.h>
63
64#include <net/if.h>
65#include <net/vnet.h>
66
67#include <netinet/in.h>
68
69#ifdef DDB
70#include <ddb/ddb.h>
71#ifdef INET6
72#include <netinet6/in6_var.h>
73#endif /* INET6 */
74#endif /* DDB */
75
76#include <security/mac/mac_framework.h>
77
78#define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
79
80MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
81static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
82
/*
 * Source-address-selection flag bits for whichever address families are
 * compiled in.  Keeps struct prison prison0 and some code in kern_jail_set()
 * readable.
 *
 * The expansion is parenthesized so that it remains a single operand when
 * it is written next to an operator that binds tighter than '|' (such as
 * '&' or '~'); without the parentheses only the first flag would be
 * affected by such an operator.
 */
#if defined(INET) && defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#elif defined(INET)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL)
#elif defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	0
#endif
97
98/* prison0 describes what is "real" about the system. */
99struct prison prison0 = {
100	.pr_id		= 0,
101	.pr_name	= "0",
102	.pr_ref		= 1,
103	.pr_uref	= 1,
104	.pr_path	= "/",
105	.pr_securelevel	= -1,
106	.pr_devfs_rsnum = 0,
107	.pr_childmax	= JAIL_MAX,
108	.pr_hostuuid	= DEFAULT_HOSTUUID,
109	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
110#ifdef VIMAGE
111	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
112#else
113	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
114#endif
115	.pr_allow	= PR_ALLOW_ALL,
116};
117MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
118
119/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
120struct	sx allprison_lock;
121SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
122struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
123LIST_HEAD(, prison_racct) allprison_racct;
124int	lastprid = 0;
125
/* Prototypes for file-local (static) helpers, always compiled in. */
static int do_jail_attach(struct thread *td, struct prison *pr);
static char *prison_path(struct prison *pr1, struct prison *pr2);
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static void prison_remove_one(struct prison *pr);

/* Resource-accounting helpers, only with RACCT. */
#ifdef RACCT
static void prison_racct_attach(struct prison *pr);
static void prison_racct_modify(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif

/* Per-address-family helpers. */
#ifdef INET
static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
#endif
#ifdef INET6
static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
#endif
144
/*
 * Flags for prison_deref().  Each flag occupies its own bit, so several
 * may be combined in one flags argument.
 */
#define	PD_DEREF	(1 << 0)
#define	PD_DEUREF	(1 << 1)
#define	PD_LOCKED	(1 << 2)
#define	PD_LIST_SLOCKED	(1 << 3)
#define	PD_LIST_XLOCKED	(1 << 4)
151
/*
 * Parameter names corresponding to PR_* flag values: the entry at index i
 * names the flag bit (1 << i), which is how kern_jail_set() walks this
 * table.  The array is sparse; indices without an initializer are NULL and
 * are skipped by that loop.
 *
 * The companion *_size constants exist for kvm, as we cannot figure out
 * the size of a sparse array, or an array without a terminating entry.
 */
static char *pr_flag_names[] = {
	[0] = "persist",		/* bit 1 << 0 */
#ifdef INET
	[7] = "ip4.saddrsel",		/* bit 1 << 7 */
#endif
#ifdef INET6
	[8] = "ip6.saddrsel",		/* bit 1 << 8 */
#endif
};
const size_t pr_flag_names_size = sizeof(pr_flag_names);
167
/*
 * Negated ("no...") spellings of the flag parameters.  Indexed exactly
 * like pr_flag_names: kern_jail_set() reads both tables with the same
 * index, so the two must keep the same set of populated slots.
 */
static char *pr_flag_nonames[] = {
	[0] = "nopersist",		/* bit 1 << 0 */
#ifdef INET
	[7] = "ip4.nosaddrsel",		/* bit 1 << 7 */
#endif
#ifdef INET6
	[8] = "ip6.nosaddrsel",		/* bit 1 << 8 */
#endif
};
const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames);
178
179struct jailsys_flags {
180	const char	*name;
181	unsigned	 disable;
182	unsigned	 new;
183} pr_flag_jailsys[] = {
184	{ "host", 0, PR_HOST },
185#ifdef VIMAGE
186	{ "vnet", 0, PR_VNET },
187#endif
188#ifdef INET
189	{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
190#endif
191#ifdef INET6
192	{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
193#endif
194};
195const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
196
/*
 * Names of the "allow" permission parameters.  The entry at index i names
 * the permission bit (1 << i); kern_jail() and kern_jail_set() both test
 * and set bits using the table index as the shift count, so the order of
 * entries is significant.
 */
static char *pr_allow_names[] = {
	[0]  = "allow.set_hostname",
	[1]  = "allow.sysvipc",
	[2]  = "allow.raw_sockets",
	[3]  = "allow.chflags",
	[4]  = "allow.mount",
	[5]  = "allow.quotas",
	[6]  = "allow.socket_af",
	[7]  = "allow.mount.devfs",
	[8]  = "allow.mount.nullfs",
	[9]  = "allow.mount.zfs",
	[10] = "allow.mount.procfs",
	[11] = "allow.mount.tmpfs",
	[12] = "allow.mount.fdescfs",
	[13] = "allow.mount.linprocfs",
	[14] = "allow.mount.linsysfs",
};
const size_t pr_allow_names_size = sizeof(pr_allow_names);
215
/*
 * Negated spellings of the "allow" permission parameters.  Indexed
 * exactly like pr_allow_names: the entry at index i is the "no" form of
 * the permission bit (1 << i).
 */
static char *pr_allow_nonames[] = {
	[0]  = "allow.noset_hostname",
	[1]  = "allow.nosysvipc",
	[2]  = "allow.noraw_sockets",
	[3]  = "allow.nochflags",
	[4]  = "allow.nomount",
	[5]  = "allow.noquotas",
	[6]  = "allow.nosocket_af",
	[7]  = "allow.mount.nodevfs",
	[8]  = "allow.mount.nonullfs",
	[9]  = "allow.mount.nozfs",
	[10] = "allow.mount.noprocfs",
	[11] = "allow.mount.notmpfs",
	[12] = "allow.mount.nofdescfs",
	[13] = "allow.mount.nolinprocfs",
	[14] = "allow.mount.nolinsysfs",
};
const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
234
235#define	JAIL_DEFAULT_ALLOW		PR_ALLOW_SET_HOSTNAME
236#define	JAIL_DEFAULT_ENFORCE_STATFS	2
237#define	JAIL_DEFAULT_DEVFS_RSNUM	0
238static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
239static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
240static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
241#if defined(INET) || defined(INET6)
242static unsigned jail_max_af_ips = 255;
243#endif
244
245/*
246 * Initialize the parts of prison0 that can't be static-initialized with
247 * constants.  This is called from proc0_init() after creating thread0 cpuset.
248 */
249void
250prison0_init(void)
251{
252
253	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
254	prison0.pr_osreldate = osreldate;
255	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
256}
257
258#ifdef INET
/*
 * qsort() comparison callback for struct in_addr elements.
 *
 * Both addresses are converted from network to host byte order before
 * being compared, so the resulting order is numeric; comparing the raw
 * network-order words would sort differently on little-endian machines.
 *
 * Returns -1, 0 or 1.  The result is built from two comparisons rather
 * than a subtraction because the difference of two 32-bit unsigned values
 * does not fit in an int.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	const struct in_addr *lhs = ip1;
	const struct in_addr *rhs = ip2;
	in_addr_t l, r;

	l = ntohl(lhs->s_addr);
	r = ntohl(rhs->s_addr);

	return ((l > r) - (l < r));
}
283#endif
284
285#ifdef INET6
/*
 * qsort() comparison callback for struct in6_addr elements.
 *
 * The addresses are compared byte by byte starting at s6_addr[0]; the
 * first differing byte decides the order.  Returns -1, 0 or 1.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const struct in6_addr *lhs = ip1;
	const struct in6_addr *rhs = ip2;
	size_t idx;

	for (idx = 0; idx < sizeof(struct in6_addr); idx++) {
		if (lhs->s6_addr[idx] == rhs->s6_addr[idx])
			continue;
		return (lhs->s6_addr[idx] > rhs->s6_addr[idx] ? 1 : -1);
	}
	return (0);
}
304#endif
305
306/*
307 * struct jail_args {
308 *	struct jail *jail;
309 * };
310 */
311int
312sys_jail(struct thread *td, struct jail_args *uap)
313{
314	uint32_t version;
315	int error;
316	struct jail j;
317
318	error = copyin(uap->jail, &version, sizeof(uint32_t));
319	if (error)
320		return (error);
321
322	switch (version) {
323	case 0:
324	{
325		struct jail_v0 j0;
326
327		/* FreeBSD single IPv4 jails. */
328		bzero(&j, sizeof(struct jail));
329		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
330		if (error)
331			return (error);
332		j.version = j0.version;
333		j.path = j0.path;
334		j.hostname = j0.hostname;
335		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
336		break;
337	}
338
339	case 1:
340		/*
341		 * Version 1 was used by multi-IPv4 jail implementations
342		 * that never made it into the official kernel.
343		 */
344		return (EINVAL);
345
346	case 2:	/* JAIL_API_VERSION */
347		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
348		error = copyin(uap->jail, &j, sizeof(struct jail));
349		if (error)
350			return (error);
351		break;
352
353	default:
354		/* Sci-Fi jails are not supported, sorry. */
355		return (EINVAL);
356	}
357	return (kern_jail(td, &j));
358}
359
360int
361kern_jail(struct thread *td, struct jail *j)
362{
363	struct iovec optiov[2 * (4
364			    + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
365#ifdef INET
366			    + 1
367#endif
368#ifdef INET6
369			    + 1
370#endif
371			    )];
372	struct uio opt;
373	char *u_path, *u_hostname, *u_name;
374#ifdef INET
375	uint32_t ip4s;
376	struct in_addr *u_ip4;
377#endif
378#ifdef INET6
379	struct in6_addr *u_ip6;
380#endif
381	size_t tmplen;
382	int error, enforce_statfs, fi;
383
384	bzero(&optiov, sizeof(optiov));
385	opt.uio_iov = optiov;
386	opt.uio_iovcnt = 0;
387	opt.uio_offset = -1;
388	opt.uio_resid = -1;
389	opt.uio_segflg = UIO_SYSSPACE;
390	opt.uio_rw = UIO_READ;
391	opt.uio_td = td;
392
393	/* Set permissions for top-level jails from sysctls. */
394	if (!jailed(td->td_ucred)) {
395		for (fi = 0; fi < sizeof(pr_allow_names) /
396		     sizeof(pr_allow_names[0]); fi++) {
397			optiov[opt.uio_iovcnt].iov_base =
398			    (jail_default_allow & (1 << fi))
399			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
400			optiov[opt.uio_iovcnt].iov_len =
401			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
402			opt.uio_iovcnt += 2;
403		}
404		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
405		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
406		opt.uio_iovcnt++;
407		enforce_statfs = jail_default_enforce_statfs;
408		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
409		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
410		opt.uio_iovcnt++;
411	}
412
413	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
414#ifdef INET
415	ip4s = (j->version == 0) ? 1 : j->ip4s;
416	if (ip4s > jail_max_af_ips)
417		return (EINVAL);
418	tmplen += ip4s * sizeof(struct in_addr);
419#else
420	if (j->ip4s > 0)
421		return (EINVAL);
422#endif
423#ifdef INET6
424	if (j->ip6s > jail_max_af_ips)
425		return (EINVAL);
426	tmplen += j->ip6s * sizeof(struct in6_addr);
427#else
428	if (j->ip6s > 0)
429		return (EINVAL);
430#endif
431	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
432	u_hostname = u_path + MAXPATHLEN;
433	u_name = u_hostname + MAXHOSTNAMELEN;
434#ifdef INET
435	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
436#endif
437#ifdef INET6
438#ifdef INET
439	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
440#else
441	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
442#endif
443#endif
444	optiov[opt.uio_iovcnt].iov_base = "path";
445	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
446	opt.uio_iovcnt++;
447	optiov[opt.uio_iovcnt].iov_base = u_path;
448	error = copyinstr(j->path, u_path, MAXPATHLEN,
449	    &optiov[opt.uio_iovcnt].iov_len);
450	if (error) {
451		free(u_path, M_TEMP);
452		return (error);
453	}
454	opt.uio_iovcnt++;
455	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
456	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
457	opt.uio_iovcnt++;
458	optiov[opt.uio_iovcnt].iov_base = u_hostname;
459	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
460	    &optiov[opt.uio_iovcnt].iov_len);
461	if (error) {
462		free(u_path, M_TEMP);
463		return (error);
464	}
465	opt.uio_iovcnt++;
466	if (j->jailname != NULL) {
467		optiov[opt.uio_iovcnt].iov_base = "name";
468		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
469		opt.uio_iovcnt++;
470		optiov[opt.uio_iovcnt].iov_base = u_name;
471		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
472		    &optiov[opt.uio_iovcnt].iov_len);
473		if (error) {
474			free(u_path, M_TEMP);
475			return (error);
476		}
477		opt.uio_iovcnt++;
478	}
479#ifdef INET
480	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
481	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
482	opt.uio_iovcnt++;
483	optiov[opt.uio_iovcnt].iov_base = u_ip4;
484	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
485	if (j->version == 0)
486		u_ip4->s_addr = j->ip4s;
487	else {
488		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
489		if (error) {
490			free(u_path, M_TEMP);
491			return (error);
492		}
493	}
494	opt.uio_iovcnt++;
495#endif
496#ifdef INET6
497	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
498	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
499	opt.uio_iovcnt++;
500	optiov[opt.uio_iovcnt].iov_base = u_ip6;
501	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
502	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
503	if (error) {
504		free(u_path, M_TEMP);
505		return (error);
506	}
507	opt.uio_iovcnt++;
508#endif
509	KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
510	    ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
511	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
512	free(u_path, M_TEMP);
513	return (error);
514}
515
516
517/*
518 * struct jail_set_args {
519 *	struct iovec *iovp;
520 *	unsigned int iovcnt;
521 *	int flags;
522 * };
523 */
524int
525sys_jail_set(struct thread *td, struct jail_set_args *uap)
526{
527	struct uio *auio;
528	int error;
529
530	/* Check that we have an even number of iovecs. */
531	if (uap->iovcnt & 1)
532		return (EINVAL);
533
534	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
535	if (error)
536		return (error);
537	error = kern_jail_set(td, auio, uap->flags);
538	free(auio, M_IOV);
539	return (error);
540}
541
542int
543kern_jail_set(struct thread *td, struct uio *optuio, int flags)
544{
545	struct nameidata nd;
546#ifdef INET
547	struct in_addr *ip4;
548#endif
549#ifdef INET6
550	struct in6_addr *ip6;
551#endif
552	struct vfsopt *opt;
553	struct vfsoptlist *opts;
554	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
555	struct vnode *root;
556	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
557	char *g_path, *osrelstr;
558#if defined(INET) || defined(INET6)
559	struct prison *tppr;
560	void *op;
561#endif
562	unsigned long hid;
563	size_t namelen, onamelen;
564	int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
565	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
566	int fi, jid, jsys, len, level;
567	int childmax, osreldt, rsnum, slevel;
568	int fullpath_disabled;
569#if defined(INET) || defined(INET6)
570	int ii, ij;
571#endif
572#ifdef INET
573	int ip4s, redo_ip4;
574#endif
575#ifdef INET6
576	int ip6s, redo_ip6;
577#endif
578	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
579	unsigned tallow;
580	char numbuf[12];
581
582	error = priv_check(td, PRIV_JAIL_SET);
583	if (!error && (flags & JAIL_ATTACH))
584		error = priv_check(td, PRIV_JAIL_ATTACH);
585	if (error)
586		return (error);
587	mypr = ppr = td->td_ucred->cr_prison;
588	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
589		return (EPERM);
590	if (flags & ~JAIL_SET_MASK)
591		return (EINVAL);
592
593	/*
594	 * Check all the parameters before committing to anything.  Not all
595	 * errors can be caught early, but we may as well try.  Also, this
596	 * takes care of some expensive stuff (path lookup) before getting
597	 * the allprison lock.
598	 *
599	 * XXX Jails are not filesystems, and jail parameters are not mount
600	 *     options.  But it makes more sense to re-use the vfsopt code
601	 *     than duplicate it under a different name.
602	 */
603	error = vfs_buildopts(optuio, &opts);
604	if (error)
605		return (error);
606#ifdef INET
607	ip4 = NULL;
608#endif
609#ifdef INET6
610	ip6 = NULL;
611#endif
612	g_path = NULL;
613
614	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
615	if (error == ENOENT)
616		jid = 0;
617	else if (error != 0)
618		goto done_free;
619
620	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
621	if (error == ENOENT)
622		gotslevel = 0;
623	else if (error != 0)
624		goto done_free;
625	else
626		gotslevel = 1;
627
628	error =
629	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
630	if (error == ENOENT)
631		gotchildmax = 0;
632	else if (error != 0)
633		goto done_free;
634	else
635		gotchildmax = 1;
636
637	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
638	if (error == ENOENT)
639		gotenforce = 0;
640	else if (error != 0)
641		goto done_free;
642	else if (enforce < 0 || enforce > 2) {
643		error = EINVAL;
644		goto done_free;
645	} else
646		gotenforce = 1;
647
648	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
649	if (error == ENOENT)
650		gotrsnum = 0;
651	else if (error != 0)
652		goto done_free;
653	else
654		gotrsnum = 1;
655
656	pr_flags = ch_flags = 0;
657	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
658	    fi++) {
659		if (pr_flag_names[fi] == NULL)
660			continue;
661		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
662		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
663	}
664	ch_flags |= pr_flags;
665	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
666	    fi++) {
667		error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
668		    sizeof(jsys));
669		if (error == ENOENT)
670			continue;
671		if (error != 0)
672			goto done_free;
673		switch (jsys) {
674		case JAIL_SYS_DISABLE:
675			if (!pr_flag_jailsys[fi].disable) {
676				error = EINVAL;
677				goto done_free;
678			}
679			pr_flags |= pr_flag_jailsys[fi].disable;
680			break;
681		case JAIL_SYS_NEW:
682			pr_flags |= pr_flag_jailsys[fi].new;
683			break;
684		case JAIL_SYS_INHERIT:
685			break;
686		default:
687			error = EINVAL;
688			goto done_free;
689		}
690		ch_flags |=
691		    pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
692	}
693	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
694	    && !(pr_flags & PR_PERSIST)) {
695		error = EINVAL;
696		vfs_opterror(opts, "new jail must persist or attach");
697		goto done_errmsg;
698	}
699#ifdef VIMAGE
700	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
701		error = EINVAL;
702		vfs_opterror(opts, "vnet cannot be changed after creation");
703		goto done_errmsg;
704	}
705#endif
706#ifdef INET
707	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
708		error = EINVAL;
709		vfs_opterror(opts, "ip4 cannot be changed after creation");
710		goto done_errmsg;
711	}
712#endif
713#ifdef INET6
714	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
715		error = EINVAL;
716		vfs_opterror(opts, "ip6 cannot be changed after creation");
717		goto done_errmsg;
718	}
719#endif
720
721	pr_allow = ch_allow = 0;
722	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
723	    fi++) {
724		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
725		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
726	}
727	ch_allow |= pr_allow;
728
729	error = vfs_getopt(opts, "name", (void **)&name, &len);
730	if (error == ENOENT)
731		name = NULL;
732	else if (error != 0)
733		goto done_free;
734	else {
735		if (len == 0 || name[len - 1] != '\0') {
736			error = EINVAL;
737			goto done_free;
738		}
739		if (len > MAXHOSTNAMELEN) {
740			error = ENAMETOOLONG;
741			goto done_free;
742		}
743	}
744
745	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
746	if (error == ENOENT)
747		host = NULL;
748	else if (error != 0)
749		goto done_free;
750	else {
751		ch_flags |= PR_HOST;
752		pr_flags |= PR_HOST;
753		if (len == 0 || host[len - 1] != '\0') {
754			error = EINVAL;
755			goto done_free;
756		}
757		if (len > MAXHOSTNAMELEN) {
758			error = ENAMETOOLONG;
759			goto done_free;
760		}
761	}
762
763	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
764	if (error == ENOENT)
765		domain = NULL;
766	else if (error != 0)
767		goto done_free;
768	else {
769		ch_flags |= PR_HOST;
770		pr_flags |= PR_HOST;
771		if (len == 0 || domain[len - 1] != '\0') {
772			error = EINVAL;
773			goto done_free;
774		}
775		if (len > MAXHOSTNAMELEN) {
776			error = ENAMETOOLONG;
777			goto done_free;
778		}
779	}
780
781	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
782	if (error == ENOENT)
783		uuid = NULL;
784	else if (error != 0)
785		goto done_free;
786	else {
787		ch_flags |= PR_HOST;
788		pr_flags |= PR_HOST;
789		if (len == 0 || uuid[len - 1] != '\0') {
790			error = EINVAL;
791			goto done_free;
792		}
793		if (len > HOSTUUIDLEN) {
794			error = ENAMETOOLONG;
795			goto done_free;
796		}
797	}
798
799#ifdef COMPAT_FREEBSD32
800	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
801		uint32_t hid32;
802
803		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
804		hid = hid32;
805	} else
806#endif
807		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
808	if (error == ENOENT)
809		gothid = 0;
810	else if (error != 0)
811		goto done_free;
812	else {
813		gothid = 1;
814		ch_flags |= PR_HOST;
815		pr_flags |= PR_HOST;
816	}
817
818#ifdef INET
819	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
820	if (error == ENOENT)
821		ip4s = 0;
822	else if (error != 0)
823		goto done_free;
824	else if (ip4s & (sizeof(*ip4) - 1)) {
825		error = EINVAL;
826		goto done_free;
827	} else {
828		ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
829		if (ip4s == 0)
830			pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
831		else {
832			pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
833			ip4s /= sizeof(*ip4);
834			if (ip4s > jail_max_af_ips) {
835				error = EINVAL;
836				vfs_opterror(opts, "too many IPv4 addresses");
837				goto done_errmsg;
838			}
839			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
840			bcopy(op, ip4, ip4s * sizeof(*ip4));
841			/*
842			 * IP addresses are all sorted but ip[0] to preserve
843			 * the primary IP address as given from userland.
844			 * This special IP is used for unbound outgoing
845			 * connections as well for "loopback" traffic in case
846			 * source address selection cannot find any more fitting
847			 * address to connect from.
848			 */
849			if (ip4s > 1)
850				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
851			/*
852			 * Check for duplicate addresses and do some simple
853			 * zero and broadcast checks. If users give other bogus
854			 * addresses it is their problem.
855			 *
856			 * We do not have to care about byte order for these
857			 * checks so we will do them in NBO.
858			 */
859			for (ii = 0; ii < ip4s; ii++) {
860				if (ip4[ii].s_addr == INADDR_ANY ||
861				    ip4[ii].s_addr == INADDR_BROADCAST) {
862					error = EINVAL;
863					goto done_free;
864				}
865				if ((ii+1) < ip4s &&
866				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
867				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
868					error = EINVAL;
869					goto done_free;
870				}
871			}
872		}
873	}
874#endif
875
876#ifdef INET6
877	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
878	if (error == ENOENT)
879		ip6s = 0;
880	else if (error != 0)
881		goto done_free;
882	else if (ip6s & (sizeof(*ip6) - 1)) {
883		error = EINVAL;
884		goto done_free;
885	} else {
886		ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
887		if (ip6s == 0)
888			pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
889		else {
890			pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
891			ip6s /= sizeof(*ip6);
892			if (ip6s > jail_max_af_ips) {
893				error = EINVAL;
894				vfs_opterror(opts, "too many IPv6 addresses");
895				goto done_errmsg;
896			}
897			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
898			bcopy(op, ip6, ip6s * sizeof(*ip6));
899			if (ip6s > 1)
900				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
901			for (ii = 0; ii < ip6s; ii++) {
902				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
903					error = EINVAL;
904					goto done_free;
905				}
906				if ((ii+1) < ip6s &&
907				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
908				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
909				{
910					error = EINVAL;
911					goto done_free;
912				}
913			}
914		}
915	}
916#endif
917
918#if defined(VIMAGE) && (defined(INET) || defined(INET6))
919	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
920		error = EINVAL;
921		vfs_opterror(opts,
922		    "vnet jails cannot have IP address restrictions");
923		goto done_errmsg;
924	}
925#endif
926
927	fullpath_disabled = 0;
928	root = NULL;
929	error = vfs_getopt(opts, "path", (void **)&path, &len);
930	if (error == ENOENT)
931		path = NULL;
932	else if (error != 0)
933		goto done_free;
934	else {
935		if (flags & JAIL_UPDATE) {
936			error = EINVAL;
937			vfs_opterror(opts,
938			    "path cannot be changed after creation");
939			goto done_errmsg;
940		}
941		if (len == 0 || path[len - 1] != '\0') {
942			error = EINVAL;
943			goto done_free;
944		}
945		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
946		    path, td);
947		error = namei(&nd);
948		if (error)
949			goto done_free;
950		root = nd.ni_vp;
951		NDFREE(&nd, NDF_ONLY_PNBUF);
952		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
953		strlcpy(g_path, path, MAXPATHLEN);
954		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
955		if (error == 0)
956			path = g_path;
957		else if (error == ENODEV) {
958			/* proceed if sysctl debug.disablefullpath == 1 */
959			fullpath_disabled = 1;
960			if (len < 2 || (len == 2 && path[0] == '/'))
961				path = NULL;
962		} else {
963			/* exit on other errors */
964			goto done_free;
965		}
966		if (root->v_type != VDIR) {
967			error = ENOTDIR;
968			vput(root);
969			goto done_free;
970		}
971		VOP_UNLOCK(root, 0);
972		if (fullpath_disabled) {
973			/* Leave room for a real-root full pathname. */
974			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
975			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
976				error = ENAMETOOLONG;
977				goto done_free;
978			}
979		}
980	}
981
982	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
983	if (error == ENOENT)
984		osrelstr = NULL;
985	else if (error != 0)
986		goto done_free;
987	else {
988		if (flags & JAIL_UPDATE) {
989			error = EINVAL;
990			vfs_opterror(opts,
991			    "osrelease cannot be changed after creation");
992			goto done_errmsg;
993		}
994		if (len == 0 || len >= OSRELEASELEN) {
995			error = EINVAL;
996			vfs_opterror(opts,
997			    "osrelease string must be 1-%d bytes long",
998			    OSRELEASELEN - 1);
999			goto done_errmsg;
1000		}
1001	}
1002
1003	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
1004	if (error == ENOENT)
1005		osreldt = 0;
1006	else if (error != 0)
1007		goto done_free;
1008	else {
1009		if (flags & JAIL_UPDATE) {
1010			error = EINVAL;
1011			vfs_opterror(opts,
1012			    "osreldate cannot be changed after creation");
1013			goto done_errmsg;
1014		}
1015		if (osreldt == 0) {
1016			error = EINVAL;
1017			vfs_opterror(opts, "osreldate cannot be 0");
1018			goto done_errmsg;
1019		}
1020	}
1021
1022	/*
1023	 * Grab the allprison lock before letting modules check their
1024	 * parameters.  Once we have it, do not let go so we'll have a
1025	 * consistent view of the OSD list.
1026	 */
1027	sx_xlock(&allprison_lock);
1028	error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
1029	if (error)
1030		goto done_unlock_list;
1031
1032	/* By now, all parameters should have been noted. */
1033	TAILQ_FOREACH(opt, opts, link) {
1034		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1035			error = EINVAL;
1036			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1037			goto done_unlock_list;
1038		}
1039	}
1040
1041	/*
1042	 * See if we are creating a new record or updating an existing one.
1043	 * This abuses the file error codes ENOENT and EEXIST.
1044	 */
1045	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
1046	if (!cuflags) {
1047		error = EINVAL;
1048		vfs_opterror(opts, "no valid operation (create or update)");
1049		goto done_unlock_list;
1050	}
1051	pr = NULL;
1052	namelc = NULL;
1053	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1054		namelc = strrchr(name, '.');
1055		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1056		if (*p != '\0')
1057			jid = 0;
1058	}
1059	if (jid != 0) {
1060		/*
1061		 * See if a requested jid already exists.  There is an
1062		 * information leak here if the jid exists but is not within
1063		 * the caller's jail hierarchy.  Jail creators will get EEXIST
1064		 * even though they cannot see the jail, and CREATE | UPDATE
1065		 * will return ENOENT which is not normally a valid error.
1066		 */
1067		if (jid < 0) {
1068			error = EINVAL;
1069			vfs_opterror(opts, "negative jid");
1070			goto done_unlock_list;
1071		}
1072		pr = prison_find(jid);
1073		if (pr != NULL) {
1074			ppr = pr->pr_parent;
1075			/* Create: jid must not exist. */
1076			if (cuflags == JAIL_CREATE) {
1077				mtx_unlock(&pr->pr_mtx);
1078				error = EEXIST;
1079				vfs_opterror(opts, "jail %d already exists",
1080				    jid);
1081				goto done_unlock_list;
1082			}
1083			if (!prison_ischild(mypr, pr)) {
1084				mtx_unlock(&pr->pr_mtx);
1085				pr = NULL;
1086			} else if (pr->pr_uref == 0) {
1087				if (!(flags & JAIL_DYING)) {
1088					mtx_unlock(&pr->pr_mtx);
1089					error = ENOENT;
1090					vfs_opterror(opts, "jail %d is dying",
1091					    jid);
1092					goto done_unlock_list;
1093				} else if ((flags & JAIL_ATTACH) ||
1094				    (pr_flags & PR_PERSIST)) {
1095					/*
1096					 * A dying jail might be resurrected
1097					 * (via attach or persist), but first
1098					 * it must determine if another jail
1099					 * has claimed its name.  Accomplish
1100					 * this by implicitly re-setting the
1101					 * name.
1102					 */
1103					if (name == NULL)
1104						name = prison_name(mypr, pr);
1105				}
1106			}
1107		}
1108		if (pr == NULL) {
1109			/* Update: jid must exist. */
1110			if (cuflags == JAIL_UPDATE) {
1111				error = ENOENT;
1112				vfs_opterror(opts, "jail %d not found", jid);
1113				goto done_unlock_list;
1114			}
1115		}
1116	}
1117	/*
1118	 * If the caller provided a name, look for a jail by that name.
1119	 * This has different semantics for creates and updates keyed by jid
1120	 * (where the name must not already exist in a different jail),
1121	 * and updates keyed by the name itself (where the name must exist
1122	 * because that is the jail being updated).
1123	 */
1124	if (name != NULL) {
1125		namelc = strrchr(name, '.');
1126		if (namelc == NULL)
1127			namelc = name;
1128		else {
1129			/*
1130			 * This is a hierarchical name.  Split it into the
1131			 * parent and child names, and make sure the parent
1132			 * exists or matches an already found jail.
1133			 */
1134			*namelc = '\0';
1135			if (pr != NULL) {
1136				if (strncmp(name, ppr->pr_name, namelc - name)
1137				    || ppr->pr_name[namelc - name] != '\0') {
1138					mtx_unlock(&pr->pr_mtx);
1139					error = EINVAL;
1140					vfs_opterror(opts,
1141					    "cannot change jail's parent");
1142					goto done_unlock_list;
1143				}
1144			} else {
1145				ppr = prison_find_name(mypr, name);
1146				if (ppr == NULL) {
1147					error = ENOENT;
1148					vfs_opterror(opts,
1149					    "jail \"%s\" not found", name);
1150					goto done_unlock_list;
1151				}
1152				mtx_unlock(&ppr->pr_mtx);
1153			}
1154			name = ++namelc;
1155		}
1156		if (name[0] != '\0') {
1157			namelen =
1158			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1159 name_again:
1160			deadpr = NULL;
1161			FOREACH_PRISON_CHILD(ppr, tpr) {
1162				if (tpr != pr && tpr->pr_ref > 0 &&
1163				    !strcmp(tpr->pr_name + namelen, name)) {
1164					if (pr == NULL &&
1165					    cuflags != JAIL_CREATE) {
1166						mtx_lock(&tpr->pr_mtx);
1167						if (tpr->pr_ref > 0) {
1168							/*
1169							 * Use this jail
1170							 * for updates.
1171							 */
1172							if (tpr->pr_uref > 0) {
1173								pr = tpr;
1174								break;
1175							}
1176							deadpr = tpr;
1177						}
1178						mtx_unlock(&tpr->pr_mtx);
1179					} else if (tpr->pr_uref > 0) {
1180						/*
1181						 * Create, or update(jid):
1182						 * name must not exist in an
1183						 * active sibling jail.
1184						 */
1185						error = EEXIST;
1186						if (pr != NULL)
1187							mtx_unlock(&pr->pr_mtx);
1188						vfs_opterror(opts,
1189						   "jail \"%s\" already exists",
1190						   name);
1191						goto done_unlock_list;
1192					}
1193				}
1194			}
1195			/* If no active jail is found, use a dying one. */
1196			if (deadpr != NULL && pr == NULL) {
1197				if (flags & JAIL_DYING) {
1198					mtx_lock(&deadpr->pr_mtx);
1199					if (deadpr->pr_ref == 0) {
1200						mtx_unlock(&deadpr->pr_mtx);
1201						goto name_again;
1202					}
1203					pr = deadpr;
1204				} else if (cuflags == JAIL_UPDATE) {
1205					error = ENOENT;
1206					vfs_opterror(opts,
1207					    "jail \"%s\" is dying", name);
1208					goto done_unlock_list;
1209				}
1210			}
1211			/* Update: name must exist if no jid. */
1212			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1213				error = ENOENT;
1214				vfs_opterror(opts, "jail \"%s\" not found",
1215				    name);
1216				goto done_unlock_list;
1217			}
1218		}
1219	}
1220	/* Update: must provide a jid or name. */
1221	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1222		error = ENOENT;
1223		vfs_opterror(opts, "update specified no jail");
1224		goto done_unlock_list;
1225	}
1226
1227	/* If there's no prison to update, create a new one and link it in. */
1228	if (pr == NULL) {
1229		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1230			if (tpr->pr_childcount >= tpr->pr_childmax) {
1231				error = EPERM;
1232				vfs_opterror(opts, "prison limit exceeded");
1233				goto done_unlock_list;
1234			}
1235		created = 1;
1236		mtx_lock(&ppr->pr_mtx);
1237		if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
1238			mtx_unlock(&ppr->pr_mtx);
1239			error = ENOENT;
1240			vfs_opterror(opts, "parent jail went away!");
1241			goto done_unlock_list;
1242		}
1243		ppr->pr_ref++;
1244		ppr->pr_uref++;
1245		mtx_unlock(&ppr->pr_mtx);
1246		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1247		if (jid == 0) {
1248			/* Find the next free jid. */
1249			jid = lastprid + 1;
1250 findnext:
1251			if (jid == JAIL_MAX)
1252				jid = 1;
1253			TAILQ_FOREACH(tpr, &allprison, pr_list) {
1254				if (tpr->pr_id < jid)
1255					continue;
1256				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1257					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1258					break;
1259				}
1260				if (jid == lastprid) {
1261					error = EAGAIN;
1262					vfs_opterror(opts,
1263					    "no available jail IDs");
1264					free(pr, M_PRISON);
1265					prison_deref(ppr, PD_DEREF |
1266					    PD_DEUREF | PD_LIST_XLOCKED);
1267					goto done_releroot;
1268				}
1269				jid++;
1270				goto findnext;
1271			}
1272			lastprid = jid;
1273		} else {
1274			/*
1275			 * The jail already has a jid (that did not yet exist),
1276			 * so just find where to insert it.
1277			 */
1278			TAILQ_FOREACH(tpr, &allprison, pr_list)
1279				if (tpr->pr_id >= jid) {
1280					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1281					break;
1282				}
1283		}
1284		if (tpr == NULL)
1285			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1286		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1287		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1288			tpr->pr_childcount++;
1289
1290		pr->pr_parent = ppr;
1291		pr->pr_id = jid;
1292
1293		/* Set some default values, and inherit some from the parent. */
1294		if (name == NULL)
1295			name = "";
1296		if (path == NULL) {
1297			path = "/";
1298			root = mypr->pr_root;
1299			vref(root);
1300		}
1301		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1302		pr->pr_flags |= PR_HOST;
1303#if defined(INET) || defined(INET6)
1304#ifdef VIMAGE
1305		if (!(pr_flags & PR_VNET))
1306#endif
1307		{
1308#ifdef INET
1309			if (!(ch_flags & PR_IP4_USER))
1310				pr->pr_flags |=
1311				    PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
1312			else if (!(pr_flags & PR_IP4_USER)) {
1313				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1314				if (ppr->pr_ip4 != NULL) {
1315					pr->pr_ip4s = ppr->pr_ip4s;
1316					pr->pr_ip4 = malloc(pr->pr_ip4s *
1317					    sizeof(struct in_addr), M_PRISON,
1318					    M_WAITOK);
1319					bcopy(ppr->pr_ip4, pr->pr_ip4,
1320					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1321				}
1322			}
1323#endif
1324#ifdef INET6
1325			if (!(ch_flags & PR_IP6_USER))
1326				pr->pr_flags |=
1327				    PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
1328			else if (!(pr_flags & PR_IP6_USER)) {
1329				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1330				if (ppr->pr_ip6 != NULL) {
1331					pr->pr_ip6s = ppr->pr_ip6s;
1332					pr->pr_ip6 = malloc(pr->pr_ip6s *
1333					    sizeof(struct in6_addr), M_PRISON,
1334					    M_WAITOK);
1335					bcopy(ppr->pr_ip6, pr->pr_ip6,
1336					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1337				}
1338			}
1339#endif
1340		}
1341#endif
1342		/* Source address selection is always on by default. */
1343		pr->pr_flags |= _PR_IP_SADDRSEL;
1344
1345		pr->pr_securelevel = ppr->pr_securelevel;
1346		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1347		pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
1348		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1349
1350		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1351		if (osrelstr == NULL)
1352		    strcpy(pr->pr_osrelease, ppr->pr_osrelease);
1353		else
1354		    strcpy(pr->pr_osrelease, osrelstr);
1355
1356		LIST_INIT(&pr->pr_children);
1357		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1358
1359#ifdef VIMAGE
1360		/* Allocate a new vnet if specified. */
1361		pr->pr_vnet = (pr_flags & PR_VNET)
1362		    ? vnet_alloc() : ppr->pr_vnet;
1363#endif
1364		/*
1365		 * Allocate a dedicated cpuset for each jail.
		 * Unlike other initial settings, this may return an error.
1367		 */
1368		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1369		if (error) {
1370			prison_deref(pr, PD_LIST_XLOCKED);
1371			goto done_releroot;
1372		}
1373
1374		mtx_lock(&pr->pr_mtx);
1375		/*
1376		 * New prisons do not yet have a reference, because we do not
		 * want others to see the incomplete prison once the
1378		 * allprison_lock is downgraded.
1379		 */
1380	} else {
1381		created = 0;
1382		/*
1383		 * Grab a reference for existing prisons, to ensure they
1384		 * continue to exist for the duration of the call.
1385		 */
1386		pr->pr_ref++;
1387#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1388		if ((pr->pr_flags & PR_VNET) &&
1389		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1390			error = EINVAL;
1391			vfs_opterror(opts,
1392			    "vnet jails cannot have IP address restrictions");
1393			goto done_deref_locked;
1394		}
1395#endif
1396#ifdef INET
1397		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1398			error = EINVAL;
1399			vfs_opterror(opts,
1400			    "ip4 cannot be changed after creation");
1401			goto done_deref_locked;
1402		}
1403#endif
1404#ifdef INET6
1405		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1406			error = EINVAL;
1407			vfs_opterror(opts,
1408			    "ip6 cannot be changed after creation");
1409			goto done_deref_locked;
1410		}
1411#endif
1412	}
1413
1414	/* Do final error checking before setting anything. */
1415	if (gotslevel) {
1416		if (slevel < ppr->pr_securelevel) {
1417			error = EPERM;
1418			goto done_deref_locked;
1419		}
1420	}
1421	if (gotchildmax) {
1422		if (childmax >= ppr->pr_childmax) {
1423			error = EPERM;
1424			goto done_deref_locked;
1425		}
1426	}
1427	if (gotenforce) {
1428		if (enforce < ppr->pr_enforce_statfs) {
1429			error = EPERM;
1430			goto done_deref_locked;
1431		}
1432	}
1433	if (gotrsnum) {
1434		/*
1435		 * devfs_rsnum is a uint16_t
1436		 */
1437		if (rsnum < 0 || rsnum > 65535) {
1438			error = EINVAL;
1439			goto done_deref_locked;
1440		}
1441		/*
1442		 * Nested jails always inherit parent's devfs ruleset
1443		 */
1444		if (jailed(td->td_ucred)) {
1445			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1446				error = EPERM;
1447				goto done_deref_locked;
1448			} else
1449				rsnum = ppr->pr_devfs_rsnum;
1450		}
1451	}
1452#ifdef INET
1453	if (ip4s > 0) {
1454		if (ppr->pr_flags & PR_IP4) {
1455			/*
1456			 * Make sure the new set of IP addresses is a
1457			 * subset of the parent's list.  Don't worry
1458			 * about the parent being unlocked, as any
1459			 * setting is done with allprison_lock held.
1460			 */
1461			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1462				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1463					break;
1464			if (ij == ppr->pr_ip4s) {
1465				error = EPERM;
1466				goto done_deref_locked;
1467			}
1468			if (ip4s > 1) {
1469				for (ii = ij = 1; ii < ip4s; ii++) {
1470					if (ip4[ii].s_addr ==
1471					    ppr->pr_ip4[0].s_addr)
1472						continue;
1473					for (; ij < ppr->pr_ip4s; ij++)
1474						if (ip4[ii].s_addr ==
1475						    ppr->pr_ip4[ij].s_addr)
1476							break;
1477					if (ij == ppr->pr_ip4s)
1478						break;
1479				}
1480				if (ij == ppr->pr_ip4s) {
1481					error = EPERM;
1482					goto done_deref_locked;
1483				}
1484			}
1485		}
1486		/*
1487		 * Check for conflicting IP addresses.  We permit them
1488		 * if there is no more than one IP on each jail.  If
1489		 * there is a duplicate on a jail with more than one
1490		 * IP stop checking and return error.
1491		 */
1492		tppr = ppr;
1493#ifdef VIMAGE
1494		for (; tppr != &prison0; tppr = tppr->pr_parent)
1495			if (tppr->pr_flags & PR_VNET)
1496				break;
1497#endif
1498		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1499			if (tpr == pr ||
1500#ifdef VIMAGE
1501			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1502#endif
1503			    tpr->pr_uref == 0) {
1504				descend = 0;
1505				continue;
1506			}
1507			if (!(tpr->pr_flags & PR_IP4_USER))
1508				continue;
1509			descend = 0;
1510			if (tpr->pr_ip4 == NULL ||
1511			    (ip4s == 1 && tpr->pr_ip4s == 1))
1512				continue;
1513			for (ii = 0; ii < ip4s; ii++) {
1514				if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
1515					error = EADDRINUSE;
1516					vfs_opterror(opts,
1517					    "IPv4 addresses clash");
1518					goto done_deref_locked;
1519				}
1520			}
1521		}
1522	}
1523#endif
1524#ifdef INET6
1525	if (ip6s > 0) {
1526		if (ppr->pr_flags & PR_IP6) {
1527			/*
1528			 * Make sure the new set of IP addresses is a
1529			 * subset of the parent's list.
1530			 */
1531			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1532				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1533				    &ppr->pr_ip6[ij]))
1534					break;
1535			if (ij == ppr->pr_ip6s) {
1536				error = EPERM;
1537				goto done_deref_locked;
1538			}
1539			if (ip6s > 1) {
1540				for (ii = ij = 1; ii < ip6s; ii++) {
1541					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1542					     &ppr->pr_ip6[0]))
1543						continue;
1544					for (; ij < ppr->pr_ip6s; ij++)
1545						if (IN6_ARE_ADDR_EQUAL(
1546						    &ip6[ii], &ppr->pr_ip6[ij]))
1547							break;
1548					if (ij == ppr->pr_ip6s)
1549						break;
1550				}
1551				if (ij == ppr->pr_ip6s) {
1552					error = EPERM;
1553					goto done_deref_locked;
1554				}
1555			}
1556		}
1557		/* Check for conflicting IP addresses. */
1558		tppr = ppr;
1559#ifdef VIMAGE
1560		for (; tppr != &prison0; tppr = tppr->pr_parent)
1561			if (tppr->pr_flags & PR_VNET)
1562				break;
1563#endif
1564		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1565			if (tpr == pr ||
1566#ifdef VIMAGE
1567			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1568#endif
1569			    tpr->pr_uref == 0) {
1570				descend = 0;
1571				continue;
1572			}
1573			if (!(tpr->pr_flags & PR_IP6_USER))
1574				continue;
1575			descend = 0;
1576			if (tpr->pr_ip6 == NULL ||
1577			    (ip6s == 1 && tpr->pr_ip6s == 1))
1578				continue;
1579			for (ii = 0; ii < ip6s; ii++) {
1580				if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
1581					error = EADDRINUSE;
1582					vfs_opterror(opts,
1583					    "IPv6 addresses clash");
1584					goto done_deref_locked;
1585				}
1586			}
1587		}
1588	}
1589#endif
1590	onamelen = namelen = 0;
1591	if (name != NULL) {
1592		/* Give a default name of the jid.  Also allow the name to be
1593		 * explicitly the jid - but not any other number, and only in
1594		 * normal form (no leading zero/etc).
1595		 */
1596		if (name[0] == '\0')
1597			snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
1598		else if ((strtoul(namelc, &p, 10) != jid ||
1599			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1600			error = EINVAL;
1601			vfs_opterror(opts,
1602			    "name cannot be numeric (unless it is the jid)");
1603			goto done_deref_locked;
1604		}
1605		/*
1606		 * Make sure the name isn't too long for the prison or its
1607		 * children.
1608		 */
1609		onamelen = strlen(pr->pr_name);
1610		namelen = strlen(name);
1611		if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
1612			error = ENAMETOOLONG;
1613			goto done_deref_locked;
1614		}
1615		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1616			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1617			    sizeof(pr->pr_name)) {
1618				error = ENAMETOOLONG;
1619				goto done_deref_locked;
1620			}
1621		}
1622	}
1623	if (pr_allow & ~ppr->pr_allow) {
1624		error = EPERM;
1625		goto done_deref_locked;
1626	}
1627
1628	/* Set the parameters of the prison. */
1629#ifdef INET
1630	redo_ip4 = 0;
1631	if (pr_flags & PR_IP4_USER) {
1632		pr->pr_flags |= PR_IP4;
1633		free(pr->pr_ip4, M_PRISON);
1634		pr->pr_ip4s = ip4s;
1635		pr->pr_ip4 = ip4;
1636		ip4 = NULL;
1637		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1638#ifdef VIMAGE
1639			if (tpr->pr_flags & PR_VNET) {
1640				descend = 0;
1641				continue;
1642			}
1643#endif
1644			if (prison_restrict_ip4(tpr, NULL)) {
1645				redo_ip4 = 1;
1646				descend = 0;
1647			}
1648		}
1649	}
1650#endif
1651#ifdef INET6
1652	redo_ip6 = 0;
1653	if (pr_flags & PR_IP6_USER) {
1654		pr->pr_flags |= PR_IP6;
1655		free(pr->pr_ip6, M_PRISON);
1656		pr->pr_ip6s = ip6s;
1657		pr->pr_ip6 = ip6;
1658		ip6 = NULL;
1659		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1660#ifdef VIMAGE
1661			if (tpr->pr_flags & PR_VNET) {
1662				descend = 0;
1663				continue;
1664			}
1665#endif
1666			if (prison_restrict_ip6(tpr, NULL)) {
1667				redo_ip6 = 1;
1668				descend = 0;
1669			}
1670		}
1671	}
1672#endif
1673	if (gotslevel) {
1674		pr->pr_securelevel = slevel;
1675		/* Set all child jails to be at least this level. */
1676		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1677			if (tpr->pr_securelevel < slevel)
1678				tpr->pr_securelevel = slevel;
1679	}
1680	if (gotchildmax) {
1681		pr->pr_childmax = childmax;
1682		/* Set all child jails to under this limit. */
1683		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1684			if (tpr->pr_childmax > childmax - level)
1685				tpr->pr_childmax = childmax > level
1686				    ? childmax - level : 0;
1687	}
1688	if (gotenforce) {
1689		pr->pr_enforce_statfs = enforce;
1690		/* Pass this restriction on to the children. */
1691		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1692			if (tpr->pr_enforce_statfs < enforce)
1693				tpr->pr_enforce_statfs = enforce;
1694	}
1695	if (gotrsnum) {
1696		pr->pr_devfs_rsnum = rsnum;
1697		/* Pass this restriction on to the children. */
1698		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1699			tpr->pr_devfs_rsnum = rsnum;
1700	}
1701	if (name != NULL) {
1702		if (ppr == &prison0)
1703			strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
1704		else
1705			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1706			    ppr->pr_name, name);
1707		/* Change this component of child names. */
1708		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1709			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1710			    strlen(tpr->pr_name + onamelen) + 1);
1711			bcopy(pr->pr_name, tpr->pr_name, namelen);
1712		}
1713	}
1714	if (path != NULL) {
1715		/* Try to keep a real-rooted full pathname. */
1716		if (fullpath_disabled && path[0] == '/' &&
1717		    strcmp(mypr->pr_path, "/"))
1718			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1719			    mypr->pr_path, path);
1720		else
1721			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1722		pr->pr_root = root;
1723	}
1724	if (PR_HOST & ch_flags & ~pr_flags) {
1725		if (pr->pr_flags & PR_HOST) {
1726			/*
1727			 * Copy the parent's host info.  As with pr_ip4 above,
1728			 * the lack of a lock on the parent is not a problem;
1729			 * it is always set with allprison_lock at least
1730			 * shared, and is held exclusively here.
1731			 */
1732			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1733			    sizeof(pr->pr_hostname));
1734			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1735			    sizeof(pr->pr_domainname));
1736			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1737			    sizeof(pr->pr_hostuuid));
1738			pr->pr_hostid = pr->pr_parent->pr_hostid;
1739		}
1740	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1741		/* Set this prison, and any descendants without PR_HOST. */
1742		if (host != NULL)
1743			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1744		if (domain != NULL)
1745			strlcpy(pr->pr_domainname, domain,
1746			    sizeof(pr->pr_domainname));
1747		if (uuid != NULL)
1748			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1749		if (gothid)
1750			pr->pr_hostid = hid;
1751		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1752			if (tpr->pr_flags & PR_HOST)
1753				descend = 0;
1754			else {
1755				if (host != NULL)
1756					strlcpy(tpr->pr_hostname,
1757					    pr->pr_hostname,
1758					    sizeof(tpr->pr_hostname));
1759				if (domain != NULL)
1760					strlcpy(tpr->pr_domainname,
1761					    pr->pr_domainname,
1762					    sizeof(tpr->pr_domainname));
1763				if (uuid != NULL)
1764					strlcpy(tpr->pr_hostuuid,
1765					    pr->pr_hostuuid,
1766					    sizeof(tpr->pr_hostuuid));
1767				if (gothid)
1768					tpr->pr_hostid = hid;
1769			}
1770		}
1771	}
1772	if ((tallow = ch_allow & ~pr_allow)) {
1773		/* Clear allow bits in all children. */
1774		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1775			tpr->pr_allow &= ~tallow;
1776	}
1777	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1778	/*
1779	 * Persistent prisons get an extra reference, and prisons losing their
1780	 * persist flag lose that reference.  Only do this for existing prisons
1781	 * for now, so new ones will remain unseen until after the module
1782	 * handlers have completed.
1783	 */
1784	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1785		if (pr_flags & PR_PERSIST) {
1786			pr->pr_ref++;
1787			pr->pr_uref++;
1788		} else {
1789			pr->pr_ref--;
1790			pr->pr_uref--;
1791		}
1792	}
1793	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1794	mtx_unlock(&pr->pr_mtx);
1795
1796#ifdef RACCT
1797	if (racct_enable && created)
1798		prison_racct_attach(pr);
1799#endif
1800
1801	/* Locks may have prevented a complete restriction of child IP
1802	 * addresses.  If so, allocate some more memory and try again.
1803	 */
1804#ifdef INET
1805	while (redo_ip4) {
1806		ip4s = pr->pr_ip4s;
1807		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1808		mtx_lock(&pr->pr_mtx);
1809		redo_ip4 = 0;
1810		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1811#ifdef VIMAGE
1812			if (tpr->pr_flags & PR_VNET) {
1813				descend = 0;
1814				continue;
1815			}
1816#endif
1817			if (prison_restrict_ip4(tpr, ip4)) {
1818				if (ip4 != NULL)
1819					ip4 = NULL;
1820				else
1821					redo_ip4 = 1;
1822			}
1823		}
1824		mtx_unlock(&pr->pr_mtx);
1825	}
1826#endif
1827#ifdef INET6
1828	while (redo_ip6) {
1829		ip6s = pr->pr_ip6s;
1830		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1831		mtx_lock(&pr->pr_mtx);
1832		redo_ip6 = 0;
1833		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1834#ifdef VIMAGE
1835			if (tpr->pr_flags & PR_VNET) {
1836				descend = 0;
1837				continue;
1838			}
1839#endif
1840			if (prison_restrict_ip6(tpr, ip6)) {
1841				if (ip6 != NULL)
1842					ip6 = NULL;
1843				else
1844					redo_ip6 = 1;
1845			}
1846		}
1847		mtx_unlock(&pr->pr_mtx);
1848	}
1849#endif
1850
1851	/* Let the modules do their work. */
1852	sx_downgrade(&allprison_lock);
1853	if (created) {
1854		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1855		if (error) {
1856			prison_deref(pr, PD_LIST_SLOCKED);
1857			goto done_errmsg;
1858		}
1859	}
1860	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1861	if (error) {
1862		prison_deref(pr, created
1863		    ? PD_LIST_SLOCKED
1864		    : PD_DEREF | PD_LIST_SLOCKED);
1865		goto done_errmsg;
1866	}
1867
1868	/* Attach this process to the prison if requested. */
1869	if (flags & JAIL_ATTACH) {
1870		mtx_lock(&pr->pr_mtx);
1871		error = do_jail_attach(td, pr);
1872		if (error) {
1873			vfs_opterror(opts, "attach failed");
1874			if (!created)
1875				prison_deref(pr, PD_DEREF);
1876			goto done_errmsg;
1877		}
1878	}
1879
1880#ifdef RACCT
1881	if (racct_enable && !created) {
1882		if (!(flags & JAIL_ATTACH))
1883			sx_sunlock(&allprison_lock);
1884		prison_racct_modify(pr);
1885		if (!(flags & JAIL_ATTACH))
1886			sx_slock(&allprison_lock);
1887	}
1888#endif
1889
1890	td->td_retval[0] = pr->pr_id;
1891
1892	/*
1893	 * Now that it is all there, drop the temporary reference from existing
1894	 * prisons.  Or add a reference to newly created persistent prisons
1895	 * (which was not done earlier so that the prison would not be publicly
1896	 * visible).
1897	 */
1898	if (!created) {
1899		prison_deref(pr, (flags & JAIL_ATTACH)
1900		    ? PD_DEREF
1901		    : PD_DEREF | PD_LIST_SLOCKED);
1902	} else {
1903		if (pr_flags & PR_PERSIST) {
1904			mtx_lock(&pr->pr_mtx);
1905			pr->pr_ref++;
1906			pr->pr_uref++;
1907			mtx_unlock(&pr->pr_mtx);
1908		}
1909		if (!(flags & JAIL_ATTACH))
1910			sx_sunlock(&allprison_lock);
1911	}
1912
1913	goto done_errmsg;
1914
1915 done_deref_locked:
1916	prison_deref(pr, created
1917	    ? PD_LOCKED | PD_LIST_XLOCKED
1918	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1919	goto done_releroot;
1920 done_unlock_list:
1921	sx_xunlock(&allprison_lock);
1922 done_releroot:
1923	if (root != NULL)
1924		vrele(root);
1925 done_errmsg:
1926	if (error) {
1927		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1928		if (errmsg_len > 0) {
1929			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1930			if (errmsg_pos > 0) {
1931				if (optuio->uio_segflg == UIO_SYSSPACE)
1932					bcopy(errmsg,
1933					   optuio->uio_iov[errmsg_pos].iov_base,
1934					   errmsg_len);
1935				else
1936					copyout(errmsg,
1937					   optuio->uio_iov[errmsg_pos].iov_base,
1938					   errmsg_len);
1939			}
1940		}
1941	}
1942 done_free:
1943#ifdef INET
1944	free(ip4, M_PRISON);
1945#endif
1946#ifdef INET6
1947	free(ip6, M_PRISON);
1948#endif
1949	if (g_path != NULL)
1950		free(g_path, M_TEMP);
1951	vfs_freeopts(opts);
1952	return (error);
1953}
1954
1955
1956/*
1957 * struct jail_get_args {
1958 *	struct iovec *iovp;
1959 *	unsigned int iovcnt;
1960 *	int flags;
1961 * };
1962 */
1963int
1964sys_jail_get(struct thread *td, struct jail_get_args *uap)
1965{
1966	struct uio *auio;
1967	int error;
1968
1969	/* Check that we have an even number of iovecs. */
1970	if (uap->iovcnt & 1)
1971		return (EINVAL);
1972
1973	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1974	if (error)
1975		return (error);
1976	error = kern_jail_get(td, auio, uap->flags);
1977	if (error == 0)
1978		error = copyout(auio->uio_iov, uap->iovp,
1979		    uap->iovcnt * sizeof (struct iovec));
1980	free(auio, M_IOV);
1981	return (error);
1982}
1983
1984int
1985kern_jail_get(struct thread *td, struct uio *optuio, int flags)
1986{
1987	struct prison *pr, *mypr;
1988	struct vfsopt *opt;
1989	struct vfsoptlist *opts;
1990	char *errmsg, *name;
1991	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
1992
1993	if (flags & ~JAIL_GET_MASK)
1994		return (EINVAL);
1995
1996	/* Get the parameter list. */
1997	error = vfs_buildopts(optuio, &opts);
1998	if (error)
1999		return (error);
2000	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2001	mypr = td->td_ucred->cr_prison;
2002
2003	/*
2004	 * Find the prison specified by one of: lastjid, jid, name.
2005	 */
2006	sx_slock(&allprison_lock);
2007	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2008	if (error == 0) {
2009		TAILQ_FOREACH(pr, &allprison, pr_list) {
2010			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
2011				mtx_lock(&pr->pr_mtx);
2012				if (pr->pr_ref > 0 &&
2013				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
2014					break;
2015				mtx_unlock(&pr->pr_mtx);
2016			}
2017		}
2018		if (pr != NULL)
2019			goto found_prison;
2020		error = ENOENT;
2021		vfs_opterror(opts, "no jail after %d", jid);
2022		goto done_unlock_list;
2023	} else if (error != ENOENT)
2024		goto done_unlock_list;
2025
2026	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2027	if (error == 0) {
2028		if (jid != 0) {
2029			pr = prison_find_child(mypr, jid);
2030			if (pr != NULL) {
2031				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2032					mtx_unlock(&pr->pr_mtx);
2033					error = ENOENT;
2034					vfs_opterror(opts, "jail %d is dying",
2035					    jid);
2036					goto done_unlock_list;
2037				}
2038				goto found_prison;
2039			}
2040			error = ENOENT;
2041			vfs_opterror(opts, "jail %d not found", jid);
2042			goto done_unlock_list;
2043		}
2044	} else if (error != ENOENT)
2045		goto done_unlock_list;
2046
2047	error = vfs_getopt(opts, "name", (void **)&name, &len);
2048	if (error == 0) {
2049		if (len == 0 || name[len - 1] != '\0') {
2050			error = EINVAL;
2051			goto done_unlock_list;
2052		}
2053		pr = prison_find_name(mypr, name);
2054		if (pr != NULL) {
2055			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2056				mtx_unlock(&pr->pr_mtx);
2057				error = ENOENT;
2058				vfs_opterror(opts, "jail \"%s\" is dying",
2059				    name);
2060				goto done_unlock_list;
2061			}
2062			goto found_prison;
2063		}
2064		error = ENOENT;
2065		vfs_opterror(opts, "jail \"%s\" not found", name);
2066		goto done_unlock_list;
2067	} else if (error != ENOENT)
2068		goto done_unlock_list;
2069
2070	vfs_opterror(opts, "no jail specified");
2071	error = ENOENT;
2072	goto done_unlock_list;
2073
2074 found_prison:
2075	/* Get the parameters of the prison. */
2076	pr->pr_ref++;
2077	locked = PD_LOCKED;
2078	td->td_retval[0] = pr->pr_id;
2079	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2080	if (error != 0 && error != ENOENT)
2081		goto done_deref;
2082	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2083	error = vfs_setopt(opts, "parent", &i, sizeof(i));
2084	if (error != 0 && error != ENOENT)
2085		goto done_deref;
2086	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2087	if (error != 0 && error != ENOENT)
2088		goto done_deref;
2089	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2090	    sizeof(pr->pr_cpuset->cs_id));
2091	if (error != 0 && error != ENOENT)
2092		goto done_deref;
2093	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2094	if (error != 0 && error != ENOENT)
2095		goto done_deref;
2096#ifdef INET
2097	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
2098	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
2099	if (error != 0 && error != ENOENT)
2100		goto done_deref;
2101#endif
2102#ifdef INET6
2103	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
2104	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
2105	if (error != 0 && error != ENOENT)
2106		goto done_deref;
2107#endif
2108	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2109	    sizeof(pr->pr_securelevel));
2110	if (error != 0 && error != ENOENT)
2111		goto done_deref;
2112	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2113	    sizeof(pr->pr_childcount));
2114	if (error != 0 && error != ENOENT)
2115		goto done_deref;
2116	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2117	    sizeof(pr->pr_childmax));
2118	if (error != 0 && error != ENOENT)
2119		goto done_deref;
2120	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2121	if (error != 0 && error != ENOENT)
2122		goto done_deref;
2123	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2124	if (error != 0 && error != ENOENT)
2125		goto done_deref;
2126	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2127	if (error != 0 && error != ENOENT)
2128		goto done_deref;
2129#ifdef COMPAT_FREEBSD32
2130	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2131		uint32_t hid32 = pr->pr_hostid;
2132
2133		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2134	} else
2135#endif
2136	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2137	    sizeof(pr->pr_hostid));
2138	if (error != 0 && error != ENOENT)
2139		goto done_deref;
2140	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2141	    sizeof(pr->pr_enforce_statfs));
2142	if (error != 0 && error != ENOENT)
2143		goto done_deref;
2144	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2145	    sizeof(pr->pr_devfs_rsnum));
2146	if (error != 0 && error != ENOENT)
2147		goto done_deref;
2148	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
2149	    fi++) {
2150		if (pr_flag_names[fi] == NULL)
2151			continue;
2152		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
2153		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
2154		if (error != 0 && error != ENOENT)
2155			goto done_deref;
2156		i = !i;
2157		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
2158		if (error != 0 && error != ENOENT)
2159			goto done_deref;
2160	}
2161	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
2162	    fi++) {
2163		i = pr->pr_flags &
2164		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
2165		i = pr_flag_jailsys[fi].disable &&
2166		      (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
2167		    : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
2168		    : JAIL_SYS_INHERIT;
2169		error =
2170		    vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
2171		if (error != 0 && error != ENOENT)
2172			goto done_deref;
2173	}
2174	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
2175	    fi++) {
2176		if (pr_allow_names[fi] == NULL)
2177			continue;
2178		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
2179		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
2180		if (error != 0 && error != ENOENT)
2181			goto done_deref;
2182		i = !i;
2183		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
2184		if (error != 0 && error != ENOENT)
2185			goto done_deref;
2186	}
2187	i = (pr->pr_uref == 0);
2188	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2189	if (error != 0 && error != ENOENT)
2190		goto done_deref;
2191	i = !i;
2192	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2193	if (error != 0 && error != ENOENT)
2194		goto done_deref;
2195	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2196	    sizeof(pr->pr_osreldate));
2197	if (error != 0 && error != ENOENT)
2198		goto done_deref;
2199	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2200	if (error != 0 && error != ENOENT)
2201		goto done_deref;
2202
2203	/* Get the module parameters. */
2204	mtx_unlock(&pr->pr_mtx);
2205	locked = 0;
2206	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2207	if (error)
2208		goto done_deref;
2209	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
2210
2211	/* By now, all parameters should have been noted. */
2212	TAILQ_FOREACH(opt, opts, link) {
2213		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2214			error = EINVAL;
2215			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2216			goto done_errmsg;
2217		}
2218	}
2219
2220	/* Write the fetched parameters back to userspace. */
2221	error = 0;
2222	TAILQ_FOREACH(opt, opts, link) {
2223		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2224			pos = 2 * opt->pos + 1;
2225			optuio->uio_iov[pos].iov_len = opt->len;
2226			if (opt->value != NULL) {
2227				if (optuio->uio_segflg == UIO_SYSSPACE) {
2228					bcopy(opt->value,
2229					    optuio->uio_iov[pos].iov_base,
2230					    opt->len);
2231				} else {
2232					error = copyout(opt->value,
2233					    optuio->uio_iov[pos].iov_base,
2234					    opt->len);
2235					if (error)
2236						break;
2237				}
2238			}
2239		}
2240	}
2241	goto done_errmsg;
2242
2243 done_deref:
2244	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
2245	goto done_errmsg;
2246
2247 done_unlock_list:
2248	sx_sunlock(&allprison_lock);
2249 done_errmsg:
2250	if (error && errmsg_pos >= 0) {
2251		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2252		errmsg_pos = 2 * errmsg_pos + 1;
2253		if (errmsg_len > 0) {
2254			if (optuio->uio_segflg == UIO_SYSSPACE)
2255				bcopy(errmsg,
2256				    optuio->uio_iov[errmsg_pos].iov_base,
2257				    errmsg_len);
2258			else
2259				copyout(errmsg,
2260				    optuio->uio_iov[errmsg_pos].iov_base,
2261				    errmsg_len);
2262		}
2263	}
2264	vfs_freeopts(opts);
2265	return (error);
2266}
2267
2268
2269/*
2270 * struct jail_remove_args {
2271 *	int jid;
2272 * };
2273 */
2274int
2275sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2276{
2277	struct prison *pr, *cpr, *lpr, *tpr;
2278	int descend, error;
2279
2280	error = priv_check(td, PRIV_JAIL_REMOVE);
2281	if (error)
2282		return (error);
2283
2284	sx_xlock(&allprison_lock);
2285	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2286	if (pr == NULL) {
2287		sx_xunlock(&allprison_lock);
2288		return (EINVAL);
2289	}
2290
2291	/* Remove all descendants of this prison, then remove this prison. */
2292	pr->pr_ref++;
2293	pr->pr_flags |= PR_REMOVE;
2294	if (!LIST_EMPTY(&pr->pr_children)) {
2295		mtx_unlock(&pr->pr_mtx);
2296		lpr = NULL;
2297		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
2298			mtx_lock(&cpr->pr_mtx);
2299			if (cpr->pr_ref > 0) {
2300				tpr = cpr;
2301				cpr->pr_ref++;
2302				cpr->pr_flags |= PR_REMOVE;
2303			} else {
2304				/* Already removed - do not do it again. */
2305				tpr = NULL;
2306			}
2307			mtx_unlock(&cpr->pr_mtx);
2308			if (lpr != NULL) {
2309				mtx_lock(&lpr->pr_mtx);
2310				prison_remove_one(lpr);
2311				sx_xlock(&allprison_lock);
2312			}
2313			lpr = tpr;
2314		}
2315		if (lpr != NULL) {
2316			mtx_lock(&lpr->pr_mtx);
2317			prison_remove_one(lpr);
2318			sx_xlock(&allprison_lock);
2319		}
2320		mtx_lock(&pr->pr_mtx);
2321	}
2322	prison_remove_one(pr);
2323	return (0);
2324}
2325
2326static void
2327prison_remove_one(struct prison *pr)
2328{
2329	struct proc *p;
2330	int deuref;
2331
2332	/* If the prison was persistent, it is not anymore. */
2333	deuref = 0;
2334	if (pr->pr_flags & PR_PERSIST) {
2335		pr->pr_ref--;
2336		deuref = PD_DEUREF;
2337		pr->pr_flags &= ~PR_PERSIST;
2338	}
2339
2340	/*
2341	 * jail_remove added a reference.  If that's the only one, remove
2342	 * the prison now.
2343	 */
2344	KASSERT(pr->pr_ref > 0,
2345	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
2346	if (pr->pr_ref == 1) {
2347		prison_deref(pr,
2348		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2349		return;
2350	}
2351
2352	mtx_unlock(&pr->pr_mtx);
2353	sx_xunlock(&allprison_lock);
2354	/*
2355	 * Kill all processes unfortunate enough to be attached to this prison.
2356	 */
2357	sx_slock(&allproc_lock);
2358	LIST_FOREACH(p, &allproc, p_list) {
2359		PROC_LOCK(p);
2360		if (p->p_state != PRS_NEW && p->p_ucred &&
2361		    p->p_ucred->cr_prison == pr)
2362			kern_psignal(p, SIGKILL);
2363		PROC_UNLOCK(p);
2364	}
2365	sx_sunlock(&allproc_lock);
2366	/* Remove the temporary reference added by jail_remove. */
2367	prison_deref(pr, deuref | PD_DEREF);
2368}
2369
2370
2371/*
2372 * struct jail_attach_args {
2373 *	int jid;
2374 * };
2375 */
2376int
2377sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2378{
2379	struct prison *pr;
2380	int error;
2381
2382	error = priv_check(td, PRIV_JAIL_ATTACH);
2383	if (error)
2384		return (error);
2385
2386	sx_slock(&allprison_lock);
2387	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2388	if (pr == NULL) {
2389		sx_sunlock(&allprison_lock);
2390		return (EINVAL);
2391	}
2392
2393	/*
2394	 * Do not allow a process to attach to a prison that is not
2395	 * considered to be "alive".
2396	 */
2397	if (pr->pr_uref == 0) {
2398		mtx_unlock(&pr->pr_mtx);
2399		sx_sunlock(&allprison_lock);
2400		return (EINVAL);
2401	}
2402
2403	return (do_jail_attach(td, pr));
2404}
2405
2406static int
2407do_jail_attach(struct thread *td, struct prison *pr)
2408{
2409	struct prison *ppr;
2410	struct proc *p;
2411	struct ucred *newcred, *oldcred;
2412	int error;
2413
2414	/*
2415	 * XXX: Note that there is a slight race here if two threads
2416	 * in the same privileged process attempt to attach to two
2417	 * different jails at the same time.  It is important for
2418	 * user processes not to do this, or they might end up with
2419	 * a process root from one prison, but attached to the jail
2420	 * of another.
2421	 */
2422	pr->pr_ref++;
2423	pr->pr_uref++;
2424	mtx_unlock(&pr->pr_mtx);
2425
2426	/* Let modules do whatever they need to prepare for attaching. */
2427	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2428	if (error) {
2429		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
2430		return (error);
2431	}
2432	sx_sunlock(&allprison_lock);
2433
2434	/*
2435	 * Reparent the newly attached process to this jail.
2436	 */
2437	ppr = td->td_ucred->cr_prison;
2438	p = td->td_proc;
2439	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2440	if (error)
2441		goto e_revert_osd;
2442
2443	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2444	if ((error = change_dir(pr->pr_root, td)) != 0)
2445		goto e_unlock;
2446#ifdef MAC
2447	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2448		goto e_unlock;
2449#endif
2450	VOP_UNLOCK(pr->pr_root, 0);
2451	if ((error = change_root(pr->pr_root, td)))
2452		goto e_revert_osd;
2453
2454	newcred = crget();
2455	PROC_LOCK(p);
2456	oldcred = p->p_ucred;
2457	setsugid(p);
2458	crcopy(newcred, oldcred);
2459	newcred->cr_prison = pr;
2460	p->p_ucred = newcred;
2461	PROC_UNLOCK(p);
2462#ifdef RACCT
2463	racct_proc_ucred_changed(p, oldcred, newcred);
2464#endif
2465	crfree(oldcred);
2466	prison_deref(ppr, PD_DEREF | PD_DEUREF);
2467	return (0);
2468 e_unlock:
2469	VOP_UNLOCK(pr->pr_root, 0);
2470 e_revert_osd:
2471	/* Tell modules this thread is still in its old jail after all. */
2472	(void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
2473	prison_deref(pr, PD_DEREF | PD_DEUREF);
2474	return (error);
2475}
2476
2477
2478/*
2479 * Returns a locked prison instance, or NULL on failure.
2480 */
2481struct prison *
2482prison_find(int prid)
2483{
2484	struct prison *pr;
2485
2486	sx_assert(&allprison_lock, SX_LOCKED);
2487	TAILQ_FOREACH(pr, &allprison, pr_list) {
2488		if (pr->pr_id == prid) {
2489			mtx_lock(&pr->pr_mtx);
2490			if (pr->pr_ref > 0)
2491				return (pr);
2492			mtx_unlock(&pr->pr_mtx);
2493		}
2494	}
2495	return (NULL);
2496}
2497
2498/*
2499 * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2500 */
2501struct prison *
2502prison_find_child(struct prison *mypr, int prid)
2503{
2504	struct prison *pr;
2505	int descend;
2506
2507	sx_assert(&allprison_lock, SX_LOCKED);
2508	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2509		if (pr->pr_id == prid) {
2510			mtx_lock(&pr->pr_mtx);
2511			if (pr->pr_ref > 0)
2512				return (pr);
2513			mtx_unlock(&pr->pr_mtx);
2514		}
2515	}
2516	return (NULL);
2517}
2518
2519/*
2520 * Look for the name relative to mypr.  Returns a locked prison or NULL.
2521 */
2522struct prison *
2523prison_find_name(struct prison *mypr, const char *name)
2524{
2525	struct prison *pr, *deadpr;
2526	size_t mylen;
2527	int descend;
2528
2529	sx_assert(&allprison_lock, SX_LOCKED);
2530	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2531 again:
2532	deadpr = NULL;
2533	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2534		if (!strcmp(pr->pr_name + mylen, name)) {
2535			mtx_lock(&pr->pr_mtx);
2536			if (pr->pr_ref > 0) {
2537				if (pr->pr_uref > 0)
2538					return (pr);
2539				deadpr = pr;
2540			}
2541			mtx_unlock(&pr->pr_mtx);
2542		}
2543	}
2544	/* There was no valid prison - perhaps there was a dying one. */
2545	if (deadpr != NULL) {
2546		mtx_lock(&deadpr->pr_mtx);
2547		if (deadpr->pr_ref == 0) {
2548			mtx_unlock(&deadpr->pr_mtx);
2549			goto again;
2550		}
2551	}
2552	return (deadpr);
2553}
2554
2555/*
2556 * See if a prison has the specific flag set.
2557 */
2558int
2559prison_flag(struct ucred *cred, unsigned flag)
2560{
2561
2562	/* This is an atomic read, so no locking is necessary. */
2563	return (cred->cr_prison->pr_flags & flag);
2564}
2565
2566int
2567prison_allow(struct ucred *cred, unsigned flag)
2568{
2569
2570	/* This is an atomic read, so no locking is necessary. */
2571	return (cred->cr_prison->pr_allow & flag);
2572}
2573
2574/*
2575 * Remove a prison reference.  If that was the last reference, remove the
2576 * prison itself - but not in this context in case there are locks held.
2577 */
2578void
2579prison_free_locked(struct prison *pr)
2580{
2581
2582	mtx_assert(&pr->pr_mtx, MA_OWNED);
2583	pr->pr_ref--;
2584	if (pr->pr_ref == 0) {
2585		mtx_unlock(&pr->pr_mtx);
2586		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
2587		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2588		return;
2589	}
2590	mtx_unlock(&pr->pr_mtx);
2591}
2592
2593void
2594prison_free(struct prison *pr)
2595{
2596
2597	mtx_lock(&pr->pr_mtx);
2598	prison_free_locked(pr);
2599}
2600
/*
 * Task handler queued by prison_free_locked() once the last reference is
 * gone.  context is the prison; pending is not used.
 */
static void
prison_complete(void *context, int pending)
{
	struct prison *pr;

	/* No PD_* flags: nothing more to drop and no locks are held. */
	pr = (struct prison *)context;
	prison_deref(pr, 0);
}
2607
2608/*
2609 * Remove a prison reference (usually).  This internal version assumes no
2610 * mutexes are held, except perhaps the prison itself.  If there are no more
2611 * references, release and delist the prison.  On completion, the prison lock
2612 * and the allprison lock are both unlocked.
2613 */
2614static void
2615prison_deref(struct prison *pr, int flags)
2616{
2617	struct prison *ppr, *tpr;
2618
2619	if (!(flags & PD_LOCKED))
2620		mtx_lock(&pr->pr_mtx);
2621	for (;;) {
2622		if (flags & PD_DEUREF) {
2623			pr->pr_uref--;
2624			KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
2625		}
2626		if (flags & PD_DEREF)
2627			pr->pr_ref--;
2628		/* If the prison still has references, nothing else to do. */
2629		if (pr->pr_ref > 0) {
2630			mtx_unlock(&pr->pr_mtx);
2631			if (flags & PD_LIST_SLOCKED)
2632				sx_sunlock(&allprison_lock);
2633			else if (flags & PD_LIST_XLOCKED)
2634				sx_xunlock(&allprison_lock);
2635			return;
2636		}
2637
2638		mtx_unlock(&pr->pr_mtx);
2639		if (flags & PD_LIST_SLOCKED) {
2640			if (!sx_try_upgrade(&allprison_lock)) {
2641				sx_sunlock(&allprison_lock);
2642				sx_xlock(&allprison_lock);
2643			}
2644		} else if (!(flags & PD_LIST_XLOCKED))
2645			sx_xlock(&allprison_lock);
2646
2647		TAILQ_REMOVE(&allprison, pr, pr_list);
2648		LIST_REMOVE(pr, pr_sibling);
2649		ppr = pr->pr_parent;
2650		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
2651			tpr->pr_childcount--;
2652		sx_xunlock(&allprison_lock);
2653
2654#ifdef VIMAGE
2655		if (pr->pr_vnet != ppr->pr_vnet)
2656			vnet_destroy(pr->pr_vnet);
2657#endif
2658		if (pr->pr_root != NULL)
2659			vrele(pr->pr_root);
2660		mtx_destroy(&pr->pr_mtx);
2661#ifdef INET
2662		free(pr->pr_ip4, M_PRISON);
2663#endif
2664#ifdef INET6
2665		free(pr->pr_ip6, M_PRISON);
2666#endif
2667		if (pr->pr_cpuset != NULL)
2668			cpuset_rel(pr->pr_cpuset);
2669		osd_jail_exit(pr);
2670#ifdef RACCT
2671		if (racct_enable)
2672			prison_racct_detach(pr);
2673#endif
2674		free(pr, M_PRISON);
2675
2676		/* Removing a prison frees a reference on its parent. */
2677		pr = ppr;
2678		mtx_lock(&pr->pr_mtx);
2679		flags = PD_DEREF | PD_DEUREF;
2680	}
2681}
2682
2683void
2684prison_hold_locked(struct prison *pr)
2685{
2686
2687	mtx_assert(&pr->pr_mtx, MA_OWNED);
2688	KASSERT(pr->pr_ref > 0,
2689	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
2690	pr->pr_ref++;
2691}
2692
2693void
2694prison_hold(struct prison *pr)
2695{
2696
2697	mtx_lock(&pr->pr_mtx);
2698	prison_hold_locked(pr);
2699	mtx_unlock(&pr->pr_mtx);
2700}
2701
2702void
2703prison_proc_hold(struct prison *pr)
2704{
2705
2706	mtx_lock(&pr->pr_mtx);
2707	KASSERT(pr->pr_uref > 0,
2708	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2709	pr->pr_uref++;
2710	mtx_unlock(&pr->pr_mtx);
2711}
2712
2713void
2714prison_proc_free(struct prison *pr)
2715{
2716
2717	mtx_lock(&pr->pr_mtx);
2718	KASSERT(pr->pr_uref > 0,
2719	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2720	prison_deref(pr, PD_DEUREF | PD_LOCKED);
2721}
2722
2723
2724#ifdef INET
2725/*
2726 * Restrict a prison's IP address list with its parent's, possibly replacing
2727 * it.  Return true if the replacement buffer was used (or would have been).
2728 */
2729static int
2730prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
2731{
2732	int ii, ij, used;
2733	struct prison *ppr;
2734
2735	ppr = pr->pr_parent;
2736	if (!(pr->pr_flags & PR_IP4_USER)) {
2737		/* This has no user settings, so just copy the parent's list. */
2738		if (pr->pr_ip4s < ppr->pr_ip4s) {
2739			/*
2740			 * There's no room for the parent's list.  Use the
2741			 * new list buffer, which is assumed to be big enough
2742			 * (if it was passed).  If there's no buffer, try to
2743			 * allocate one.
2744			 */
2745			used = 1;
2746			if (newip4 == NULL) {
2747				newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
2748				    M_PRISON, M_NOWAIT);
2749				if (newip4 != NULL)
2750					used = 0;
2751			}
2752			if (newip4 != NULL) {
2753				bcopy(ppr->pr_ip4, newip4,
2754				    ppr->pr_ip4s * sizeof(*newip4));
2755				free(pr->pr_ip4, M_PRISON);
2756				pr->pr_ip4 = newip4;
2757				pr->pr_ip4s = ppr->pr_ip4s;
2758			}
2759			return (used);
2760		}
2761		pr->pr_ip4s = ppr->pr_ip4s;
2762		if (pr->pr_ip4s > 0)
2763			bcopy(ppr->pr_ip4, pr->pr_ip4,
2764			    pr->pr_ip4s * sizeof(*newip4));
2765		else if (pr->pr_ip4 != NULL) {
2766			free(pr->pr_ip4, M_PRISON);
2767			pr->pr_ip4 = NULL;
2768		}
2769	} else if (pr->pr_ip4s > 0) {
2770		/* Remove addresses that aren't in the parent. */
2771		for (ij = 0; ij < ppr->pr_ip4s; ij++)
2772			if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
2773				break;
2774		if (ij < ppr->pr_ip4s)
2775			ii = 1;
2776		else {
2777			bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
2778			    --pr->pr_ip4s * sizeof(*pr->pr_ip4));
2779			ii = 0;
2780		}
2781		for (ij = 1; ii < pr->pr_ip4s; ) {
2782			if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
2783				ii++;
2784				continue;
2785			}
2786			switch (ij >= ppr->pr_ip4s ? -1 :
2787				qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
2788			case -1:
2789				bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
2790				    (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
2791				break;
2792			case 0:
2793				ii++;
2794				ij++;
2795				break;
2796			case 1:
2797				ij++;
2798				break;
2799			}
2800		}
2801		if (pr->pr_ip4s == 0) {
2802			pr->pr_flags |= PR_IP4_DISABLE;
2803			free(pr->pr_ip4, M_PRISON);
2804			pr->pr_ip4 = NULL;
2805		}
2806	}
2807	return (0);
2808}
2809
2810/*
2811 * Pass back primary IPv4 address of this jail.
2812 *
2813 * If not restricted return success but do not alter the address.  Caller has
2814 * to make sure to initialize it correctly (e.g. INADDR_ANY).
2815 *
2816 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2817 * Address returned in NBO.
2818 */
2819int
2820prison_get_ip4(struct ucred *cred, struct in_addr *ia)
2821{
2822	struct prison *pr;
2823
2824	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2825	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2826
2827	pr = cred->cr_prison;
2828	if (!(pr->pr_flags & PR_IP4))
2829		return (0);
2830	mtx_lock(&pr->pr_mtx);
2831	if (!(pr->pr_flags & PR_IP4)) {
2832		mtx_unlock(&pr->pr_mtx);
2833		return (0);
2834	}
2835	if (pr->pr_ip4 == NULL) {
2836		mtx_unlock(&pr->pr_mtx);
2837		return (EAFNOSUPPORT);
2838	}
2839
2840	ia->s_addr = pr->pr_ip4[0].s_addr;
2841	mtx_unlock(&pr->pr_mtx);
2842	return (0);
2843}
2844
2845/*
2846 * Return 1 if we should do proper source address selection or are not jailed.
2847 * We will return 0 if we should bypass source address selection in favour
2848 * of the primary jail IPv4 address. Only in this case *ia will be updated and
2849 * returned in NBO.
2850 * Return EAFNOSUPPORT, in case this jail does not allow IPv4.
2851 */
2852int
2853prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia)
2854{
2855	struct prison *pr;
2856	struct in_addr lia;
2857	int error;
2858
2859	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2860	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2861
2862	if (!jailed(cred))
2863		return (1);
2864
2865	pr = cred->cr_prison;
2866	if (pr->pr_flags & PR_IP4_SADDRSEL)
2867		return (1);
2868
2869	lia.s_addr = INADDR_ANY;
2870	error = prison_get_ip4(cred, &lia);
2871	if (error)
2872		return (error);
2873	if (lia.s_addr == INADDR_ANY)
2874		return (1);
2875
2876	ia->s_addr = lia.s_addr;
2877	return (0);
2878}
2879
2880/*
2881 * Return true if pr1 and pr2 have the same IPv4 address restrictions.
2882 */
2883int
2884prison_equal_ip4(struct prison *pr1, struct prison *pr2)
2885{
2886
2887	if (pr1 == pr2)
2888		return (1);
2889
2890	/*
2891	 * No need to lock since the PR_IP4_USER flag can't be altered for
2892	 * existing prisons.
2893	 */
2894	while (pr1 != &prison0 &&
2895#ifdef VIMAGE
2896	       !(pr1->pr_flags & PR_VNET) &&
2897#endif
2898	       !(pr1->pr_flags & PR_IP4_USER))
2899		pr1 = pr1->pr_parent;
2900	while (pr2 != &prison0 &&
2901#ifdef VIMAGE
2902	       !(pr2->pr_flags & PR_VNET) &&
2903#endif
2904	       !(pr2->pr_flags & PR_IP4_USER))
2905		pr2 = pr2->pr_parent;
2906	return (pr1 == pr2);
2907}
2908
2909/*
2910 * Make sure our (source) address is set to something meaningful to this
2911 * jail.
2912 *
2913 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2914 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2915 * doesn't allow IPv4.  Address passed in in NBO and returned in NBO.
2916 */
2917int
2918prison_local_ip4(struct ucred *cred, struct in_addr *ia)
2919{
2920	struct prison *pr;
2921	struct in_addr ia0;
2922	int error;
2923
2924	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2925	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2926
2927	pr = cred->cr_prison;
2928	if (!(pr->pr_flags & PR_IP4))
2929		return (0);
2930	mtx_lock(&pr->pr_mtx);
2931	if (!(pr->pr_flags & PR_IP4)) {
2932		mtx_unlock(&pr->pr_mtx);
2933		return (0);
2934	}
2935	if (pr->pr_ip4 == NULL) {
2936		mtx_unlock(&pr->pr_mtx);
2937		return (EAFNOSUPPORT);
2938	}
2939
2940	ia0.s_addr = ntohl(ia->s_addr);
2941	if (ia0.s_addr == INADDR_LOOPBACK) {
2942		ia->s_addr = pr->pr_ip4[0].s_addr;
2943		mtx_unlock(&pr->pr_mtx);
2944		return (0);
2945	}
2946
2947	if (ia0.s_addr == INADDR_ANY) {
2948		/*
2949		 * In case there is only 1 IPv4 address, bind directly.
2950		 */
2951		if (pr->pr_ip4s == 1)
2952			ia->s_addr = pr->pr_ip4[0].s_addr;
2953		mtx_unlock(&pr->pr_mtx);
2954		return (0);
2955	}
2956
2957	error = _prison_check_ip4(pr, ia);
2958	mtx_unlock(&pr->pr_mtx);
2959	return (error);
2960}
2961
2962/*
2963 * Rewrite destination address in case we will connect to loopback address.
2964 *
2965 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2966 * Address passed in in NBO and returned in NBO.
2967 */
2968int
2969prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
2970{
2971	struct prison *pr;
2972
2973	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2974	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2975
2976	pr = cred->cr_prison;
2977	if (!(pr->pr_flags & PR_IP4))
2978		return (0);
2979	mtx_lock(&pr->pr_mtx);
2980	if (!(pr->pr_flags & PR_IP4)) {
2981		mtx_unlock(&pr->pr_mtx);
2982		return (0);
2983	}
2984	if (pr->pr_ip4 == NULL) {
2985		mtx_unlock(&pr->pr_mtx);
2986		return (EAFNOSUPPORT);
2987	}
2988
2989	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
2990		ia->s_addr = pr->pr_ip4[0].s_addr;
2991		mtx_unlock(&pr->pr_mtx);
2992		return (0);
2993	}
2994
2995	/*
2996	 * Return success because nothing had to be changed.
2997	 */
2998	mtx_unlock(&pr->pr_mtx);
2999	return (0);
3000}
3001
3002/*
3003 * Check if given address belongs to the jail referenced by cred/prison.
3004 *
3005 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
3006 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3007 * doesn't allow IPv4.  Address passed in in NBO.
3008 */
3009static int
3010_prison_check_ip4(struct prison *pr, struct in_addr *ia)
3011{
3012	int i, a, z, d;
3013
3014	/*
3015	 * Check the primary IP.
3016	 */
3017	if (pr->pr_ip4[0].s_addr == ia->s_addr)
3018		return (0);
3019
3020	/*
3021	 * All the other IPs are sorted so we can do a binary search.
3022	 */
3023	a = 0;
3024	z = pr->pr_ip4s - 2;
3025	while (a <= z) {
3026		i = (a + z) / 2;
3027		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
3028		if (d > 0)
3029			z = i - 1;
3030		else if (d < 0)
3031			a = i + 1;
3032		else
3033			return (0);
3034	}
3035
3036	return (EADDRNOTAVAIL);
3037}
3038
3039int
3040prison_check_ip4(struct ucred *cred, struct in_addr *ia)
3041{
3042	struct prison *pr;
3043	int error;
3044
3045	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3046	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
3047
3048	pr = cred->cr_prison;
3049	if (!(pr->pr_flags & PR_IP4))
3050		return (0);
3051	mtx_lock(&pr->pr_mtx);
3052	if (!(pr->pr_flags & PR_IP4)) {
3053		mtx_unlock(&pr->pr_mtx);
3054		return (0);
3055	}
3056	if (pr->pr_ip4 == NULL) {
3057		mtx_unlock(&pr->pr_mtx);
3058		return (EAFNOSUPPORT);
3059	}
3060
3061	error = _prison_check_ip4(pr, ia);
3062	mtx_unlock(&pr->pr_mtx);
3063	return (error);
3064}
3065#endif
3066
3067#ifdef INET6
3068static int
3069prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
3070{
3071	int ii, ij, used;
3072	struct prison *ppr;
3073
3074	ppr = pr->pr_parent;
3075	if (!(pr->pr_flags & PR_IP6_USER)) {
3076		/* This has no user settings, so just copy the parent's list. */
3077		if (pr->pr_ip6s < ppr->pr_ip6s) {
3078			/*
3079			 * There's no room for the parent's list.  Use the
3080			 * new list buffer, which is assumed to be big enough
3081			 * (if it was passed).  If there's no buffer, try to
3082			 * allocate one.
3083			 */
3084			used = 1;
3085			if (newip6 == NULL) {
3086				newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
3087				    M_PRISON, M_NOWAIT);
3088				if (newip6 != NULL)
3089					used = 0;
3090			}
3091			if (newip6 != NULL) {
3092				bcopy(ppr->pr_ip6, newip6,
3093				    ppr->pr_ip6s * sizeof(*newip6));
3094				free(pr->pr_ip6, M_PRISON);
3095				pr->pr_ip6 = newip6;
3096				pr->pr_ip6s = ppr->pr_ip6s;
3097			}
3098			return (used);
3099		}
3100		pr->pr_ip6s = ppr->pr_ip6s;
3101		if (pr->pr_ip6s > 0)
3102			bcopy(ppr->pr_ip6, pr->pr_ip6,
3103			    pr->pr_ip6s * sizeof(*newip6));
3104		else if (pr->pr_ip6 != NULL) {
3105			free(pr->pr_ip6, M_PRISON);
3106			pr->pr_ip6 = NULL;
3107		}
3108	} else if (pr->pr_ip6s > 0) {
3109		/* Remove addresses that aren't in the parent. */
3110		for (ij = 0; ij < ppr->pr_ip6s; ij++)
3111			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
3112			    &ppr->pr_ip6[ij]))
3113				break;
3114		if (ij < ppr->pr_ip6s)
3115			ii = 1;
3116		else {
3117			bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
3118			    --pr->pr_ip6s * sizeof(*pr->pr_ip6));
3119			ii = 0;
3120		}
3121		for (ij = 1; ii < pr->pr_ip6s; ) {
3122			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
3123			    &ppr->pr_ip6[0])) {
3124				ii++;
3125				continue;
3126			}
3127			switch (ij >= ppr->pr_ip6s ? -1 :
3128				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
3129			case -1:
3130				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
3131				    (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
3132				break;
3133			case 0:
3134				ii++;
3135				ij++;
3136				break;
3137			case 1:
3138				ij++;
3139				break;
3140			}
3141		}
3142		if (pr->pr_ip6s == 0) {
3143			pr->pr_flags |= PR_IP6_DISABLE;
3144			free(pr->pr_ip6, M_PRISON);
3145			pr->pr_ip6 = NULL;
3146		}
3147	}
3148	return 0;
3149}
3150
3151/*
3152 * Pass back primary IPv6 address for this jail.
3153 *
3154 * If not restricted return success but do not alter the address.  Caller has
3155 * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
3156 *
3157 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3158 */
3159int
3160prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
3161{
3162	struct prison *pr;
3163
3164	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3165	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3166
3167	pr = cred->cr_prison;
3168	if (!(pr->pr_flags & PR_IP6))
3169		return (0);
3170	mtx_lock(&pr->pr_mtx);
3171	if (!(pr->pr_flags & PR_IP6)) {
3172		mtx_unlock(&pr->pr_mtx);
3173		return (0);
3174	}
3175	if (pr->pr_ip6 == NULL) {
3176		mtx_unlock(&pr->pr_mtx);
3177		return (EAFNOSUPPORT);
3178	}
3179
3180	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3181	mtx_unlock(&pr->pr_mtx);
3182	return (0);
3183}
3184
3185/*
3186 * Return 1 if we should do proper source address selection or are not jailed.
3187 * We will return 0 if we should bypass source address selection in favour
 * of the primary jail IPv6 address. Only in this case *ia6 will be updated
 * and returned in NBO.
3190 * Return EAFNOSUPPORT, in case this jail does not allow IPv6.
3191 */
3192int
3193prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6)
3194{
3195	struct prison *pr;
3196	struct in6_addr lia6;
3197	int error;
3198
3199	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3200	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3201
3202	if (!jailed(cred))
3203		return (1);
3204
3205	pr = cred->cr_prison;
3206	if (pr->pr_flags & PR_IP6_SADDRSEL)
3207		return (1);
3208
3209	lia6 = in6addr_any;
3210	error = prison_get_ip6(cred, &lia6);
3211	if (error)
3212		return (error);
3213	if (IN6_IS_ADDR_UNSPECIFIED(&lia6))
3214		return (1);
3215
3216	bcopy(&lia6, ia6, sizeof(struct in6_addr));
3217	return (0);
3218}
3219
3220/*
3221 * Return true if pr1 and pr2 have the same IPv6 address restrictions.
3222 */
3223int
3224prison_equal_ip6(struct prison *pr1, struct prison *pr2)
3225{
3226
3227	if (pr1 == pr2)
3228		return (1);
3229
3230	while (pr1 != &prison0 &&
3231#ifdef VIMAGE
3232	       !(pr1->pr_flags & PR_VNET) &&
3233#endif
3234	       !(pr1->pr_flags & PR_IP6_USER))
3235		pr1 = pr1->pr_parent;
3236	while (pr2 != &prison0 &&
3237#ifdef VIMAGE
3238	       !(pr2->pr_flags & PR_VNET) &&
3239#endif
3240	       !(pr2->pr_flags & PR_IP6_USER))
3241		pr2 = pr2->pr_parent;
3242	return (pr1 == pr2);
3243}
3244
3245/*
3246 * Make sure our (source) address is set to something meaningful to this jail.
3247 *
3248 * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
3249 * when needed while binding.
3250 *
3251 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3252 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3253 * doesn't allow IPv6.
3254 */
3255int
3256prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
3257{
3258	struct prison *pr;
3259	int error;
3260
3261	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3262	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3263
3264	pr = cred->cr_prison;
3265	if (!(pr->pr_flags & PR_IP6))
3266		return (0);
3267	mtx_lock(&pr->pr_mtx);
3268	if (!(pr->pr_flags & PR_IP6)) {
3269		mtx_unlock(&pr->pr_mtx);
3270		return (0);
3271	}
3272	if (pr->pr_ip6 == NULL) {
3273		mtx_unlock(&pr->pr_mtx);
3274		return (EAFNOSUPPORT);
3275	}
3276
3277	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3278		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3279		mtx_unlock(&pr->pr_mtx);
3280		return (0);
3281	}
3282
3283	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
3284		/*
3285		 * In case there is only 1 IPv6 address, and v6only is true,
3286		 * then bind directly.
3287		 */
3288		if (v6only != 0 && pr->pr_ip6s == 1)
3289			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3290		mtx_unlock(&pr->pr_mtx);
3291		return (0);
3292	}
3293
3294	error = _prison_check_ip6(pr, ia6);
3295	mtx_unlock(&pr->pr_mtx);
3296	return (error);
3297}
3298
3299/*
3300 * Rewrite destination address in case we will connect to loopback address.
3301 *
3302 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3303 */
3304int
3305prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
3306{
3307	struct prison *pr;
3308
3309	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3310	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3311
3312	pr = cred->cr_prison;
3313	if (!(pr->pr_flags & PR_IP6))
3314		return (0);
3315	mtx_lock(&pr->pr_mtx);
3316	if (!(pr->pr_flags & PR_IP6)) {
3317		mtx_unlock(&pr->pr_mtx);
3318		return (0);
3319	}
3320	if (pr->pr_ip6 == NULL) {
3321		mtx_unlock(&pr->pr_mtx);
3322		return (EAFNOSUPPORT);
3323	}
3324
3325	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3326		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3327		mtx_unlock(&pr->pr_mtx);
3328		return (0);
3329	}
3330
3331	/*
3332	 * Return success because nothing had to be changed.
3333	 */
3334	mtx_unlock(&pr->pr_mtx);
3335	return (0);
3336}
3337
3338/*
3339 * Check if given address belongs to the jail referenced by cred/prison.
3340 *
3341 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3342 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3343 * doesn't allow IPv6.
3344 */
3345static int
3346_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
3347{
3348	int i, a, z, d;
3349
3350	/*
3351	 * Check the primary IP.
3352	 */
3353	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
3354		return (0);
3355
3356	/*
3357	 * All the other IPs are sorted so we can do a binary search.
3358	 */
3359	a = 0;
3360	z = pr->pr_ip6s - 2;
3361	while (a <= z) {
3362		i = (a + z) / 2;
3363		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
3364		if (d > 0)
3365			z = i - 1;
3366		else if (d < 0)
3367			a = i + 1;
3368		else
3369			return (0);
3370	}
3371
3372	return (EADDRNOTAVAIL);
3373}
3374
3375int
3376prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
3377{
3378	struct prison *pr;
3379	int error;
3380
3381	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3382	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3383
3384	pr = cred->cr_prison;
3385	if (!(pr->pr_flags & PR_IP6))
3386		return (0);
3387	mtx_lock(&pr->pr_mtx);
3388	if (!(pr->pr_flags & PR_IP6)) {
3389		mtx_unlock(&pr->pr_mtx);
3390		return (0);
3391	}
3392	if (pr->pr_ip6 == NULL) {
3393		mtx_unlock(&pr->pr_mtx);
3394		return (EAFNOSUPPORT);
3395	}
3396
3397	error = _prison_check_ip6(pr, ia6);
3398	mtx_unlock(&pr->pr_mtx);
3399	return (error);
3400}
3401#endif
3402
3403/*
3404 * Check if a jail supports the given address family.
3405 *
3406 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3407 * if not.
3408 */
3409int
3410prison_check_af(struct ucred *cred, int af)
3411{
3412	struct prison *pr;
3413	int error;
3414
3415	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3416
3417	pr = cred->cr_prison;
3418#ifdef VIMAGE
3419	/* Prisons with their own network stack are not limited. */
3420	if (prison_owns_vnet(cred))
3421		return (0);
3422#endif
3423
3424	error = 0;
3425	switch (af)
3426	{
3427#ifdef INET
3428	case AF_INET:
3429		if (pr->pr_flags & PR_IP4)
3430		{
3431			mtx_lock(&pr->pr_mtx);
3432			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3433				error = EAFNOSUPPORT;
3434			mtx_unlock(&pr->pr_mtx);
3435		}
3436		break;
3437#endif
3438#ifdef INET6
3439	case AF_INET6:
3440		if (pr->pr_flags & PR_IP6)
3441		{
3442			mtx_lock(&pr->pr_mtx);
3443			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3444				error = EAFNOSUPPORT;
3445			mtx_unlock(&pr->pr_mtx);
3446		}
3447		break;
3448#endif
3449	case AF_LOCAL:
3450	case AF_ROUTE:
3451		break;
3452	default:
3453		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3454			error = EAFNOSUPPORT;
3455	}
3456	return (error);
3457}
3458
3459/*
3460 * Check if given address belongs to the jail referenced by cred (wrapper to
3461 * prison_check_ip[46]).
3462 *
3463 * Returns 0 if jail doesn't restrict the address family or if address belongs
3464 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3465 * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3466 */
3467int
3468prison_if(struct ucred *cred, struct sockaddr *sa)
3469{
3470#ifdef INET
3471	struct sockaddr_in *sai;
3472#endif
3473#ifdef INET6
3474	struct sockaddr_in6 *sai6;
3475#endif
3476	int error;
3477
3478	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3479	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3480
3481#ifdef VIMAGE
3482	if (prison_owns_vnet(cred))
3483		return (0);
3484#endif
3485
3486	error = 0;
3487	switch (sa->sa_family)
3488	{
3489#ifdef INET
3490	case AF_INET:
3491		sai = (struct sockaddr_in *)sa;
3492		error = prison_check_ip4(cred, &sai->sin_addr);
3493		break;
3494#endif
3495#ifdef INET6
3496	case AF_INET6:
3497		sai6 = (struct sockaddr_in6 *)sa;
3498		error = prison_check_ip6(cred, &sai6->sin6_addr);
3499		break;
3500#endif
3501	default:
3502		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3503			error = EAFNOSUPPORT;
3504	}
3505	return (error);
3506}
3507
3508/*
3509 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3510 */
3511int
3512prison_check(struct ucred *cred1, struct ucred *cred2)
3513{
3514
3515	return ((cred1->cr_prison == cred2->cr_prison ||
3516	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3517}
3518
3519/*
3520 * Return 1 if p2 is a child of p1, otherwise 0.
3521 */
3522int
3523prison_ischild(struct prison *pr1, struct prison *pr2)
3524{
3525
3526	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3527		if (pr1 == pr2)
3528			return (1);
3529	return (0);
3530}
3531
3532/*
3533 * Return 1 if the passed credential is in a jail, otherwise 0.
3534 */
3535int
3536jailed(struct ucred *cred)
3537{
3538
3539	return (cred->cr_prison != &prison0);
3540}
3541
/*
 * Return 1 if the credential is jailed and (on VIMAGE kernels) its jail
 * does not own a virtual network stack; otherwise return 0.
 */
int
jailed_without_vnet(struct ucred *cred)
{

#ifdef VIMAGE
	return (jailed(cred) && !prison_owns_vnet(cred));
#else
	return (jailed(cred) != 0);
#endif
}
3559
3560/*
3561 * Return the correct hostname (domainname, et al) for the passed credential.
3562 */
3563void
3564getcredhostname(struct ucred *cred, char *buf, size_t size)
3565{
3566	struct prison *pr;
3567
3568	/*
3569	 * A NULL credential can be used to shortcut to the physical
3570	 * system's hostname.
3571	 */
3572	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3573	mtx_lock(&pr->pr_mtx);
3574	strlcpy(buf, pr->pr_hostname, size);
3575	mtx_unlock(&pr->pr_mtx);
3576}
3577
3578void
3579getcreddomainname(struct ucred *cred, char *buf, size_t size)
3580{
3581
3582	mtx_lock(&cred->cr_prison->pr_mtx);
3583	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3584	mtx_unlock(&cred->cr_prison->pr_mtx);
3585}
3586
3587void
3588getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3589{
3590
3591	mtx_lock(&cred->cr_prison->pr_mtx);
3592	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3593	mtx_unlock(&cred->cr_prison->pr_mtx);
3594}
3595
3596void
3597getcredhostid(struct ucred *cred, unsigned long *hostid)
3598{
3599
3600	mtx_lock(&cred->cr_prison->pr_mtx);
3601	*hostid = cred->cr_prison->pr_hostid;
3602	mtx_unlock(&cred->cr_prison->pr_mtx);
3603}
3604
3605#ifdef VIMAGE
3606/*
3607 * Determine whether the prison represented by cred owns
3608 * its vnet rather than having it inherited.
3609 *
3610 * Returns 1 in case the prison owns the vnet, 0 otherwise.
3611 */
3612int
3613prison_owns_vnet(struct ucred *cred)
3614{
3615
3616	/*
3617	 * vnets cannot be added/removed after jail creation,
3618	 * so no need to lock here.
3619	 */
3620	return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
3621}
3622#endif
3623
3624/*
3625 * Determine whether the subject represented by cred can "see"
3626 * status of a mount point.
3627 * Returns: 0 for permitted, ENOENT otherwise.
3628 * XXX: This function should be called cr_canseemount() and should be
3629 *      placed in kern_prot.c.
3630 */
3631int
3632prison_canseemount(struct ucred *cred, struct mount *mp)
3633{
3634	struct prison *pr;
3635	struct statfs *sp;
3636	size_t len;
3637
3638	pr = cred->cr_prison;
3639	if (pr->pr_enforce_statfs == 0)
3640		return (0);
3641	if (pr->pr_root->v_mount == mp)
3642		return (0);
3643	if (pr->pr_enforce_statfs == 2)
3644		return (ENOENT);
3645	/*
3646	 * If jail's chroot directory is set to "/" we should be able to see
3647	 * all mount-points from inside a jail.
3648	 * This is ugly check, but this is the only situation when jail's
3649	 * directory ends with '/'.
3650	 */
3651	if (strcmp(pr->pr_path, "/") == 0)
3652		return (0);
3653	len = strlen(pr->pr_path);
3654	sp = &mp->mnt_stat;
3655	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3656		return (ENOENT);
3657	/*
3658	 * Be sure that we don't have situation where jail's root directory
3659	 * is "/some/path" and mount point is "/some/pathpath".
3660	 */
3661	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3662		return (ENOENT);
3663	return (0);
3664}
3665
3666void
3667prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3668{
3669	char jpath[MAXPATHLEN];
3670	struct prison *pr;
3671	size_t len;
3672
3673	pr = cred->cr_prison;
3674	if (pr->pr_enforce_statfs == 0)
3675		return;
3676	if (prison_canseemount(cred, mp) != 0) {
3677		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3678		strlcpy(sp->f_mntonname, "[restricted]",
3679		    sizeof(sp->f_mntonname));
3680		return;
3681	}
3682	if (pr->pr_root->v_mount == mp) {
3683		/*
3684		 * Clear current buffer data, so we are sure nothing from
3685		 * the valid path left there.
3686		 */
3687		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3688		*sp->f_mntonname = '/';
3689		return;
3690	}
3691	/*
3692	 * If jail's chroot directory is set to "/" we should be able to see
3693	 * all mount-points from inside a jail.
3694	 */
3695	if (strcmp(pr->pr_path, "/") == 0)
3696		return;
3697	len = strlen(pr->pr_path);
3698	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3699	/*
3700	 * Clear current buffer data, so we are sure nothing from
3701	 * the valid path left there.
3702	 */
3703	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3704	if (*jpath == '\0') {
3705		/* Should never happen. */
3706		*sp->f_mntonname = '/';
3707	} else {
3708		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3709	}
3710}
3711
/*
 * Check whether a specific privilege is granted within a jail.  We have a
 * specific list of accepted privileges; the rest are denied.
 *
 * Returns 0 if cred is not jailed or if the privilege is granted to the
 * jail, and EPERM otherwise.  On VIMAGE kernels a first switch grants the
 * network-stack privileges to jails that have PR_VNET set; anything not
 * granted there is then judged by the second, general switch.
 */
int
prison_priv_check(struct ucred *cred, int priv)
{

	if (!jailed(cred))
		return (0);

#ifdef VIMAGE
	/*
	 * Privileges specific to prisons with a virtual network stack.
	 * There might be a duplicate entry here in case the privilege
	 * is only granted conditionally in the legacy jail case.
	 */
	switch (priv) {
#ifdef notyet
		/*
		 * NFS-specific privileges.
		 */
	case PRIV_NFS_DAEMON:
	case PRIV_NFS_LOCKD:
#endif
		/*
		 * Network stack privileges.
		 */
	case PRIV_NET_BRIDGE:
	case PRIV_NET_GRE:
	case PRIV_NET_BPF:
	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
	case PRIV_NET_ROUTE:
	case PRIV_NET_TAP:
	case PRIV_NET_SETIFMTU:
	case PRIV_NET_SETIFFLAGS:
	case PRIV_NET_SETIFCAP:
	case PRIV_NET_SETIFDESCR:
	case PRIV_NET_SETIFNAME	:
	case PRIV_NET_SETIFMETRIC:
	case PRIV_NET_SETIFPHYS:
	case PRIV_NET_SETIFMAC:
	case PRIV_NET_ADDMULTI:
	case PRIV_NET_DELMULTI:
	case PRIV_NET_HWIOCTL:
	case PRIV_NET_SETLLADDR:
	case PRIV_NET_ADDIFGROUP:
	case PRIV_NET_DELIFGROUP:
	case PRIV_NET_IFCREATE:
	case PRIV_NET_IFDESTROY:
	case PRIV_NET_ADDIFADDR:
	case PRIV_NET_DELIFADDR:
	case PRIV_NET_LAGG:
	case PRIV_NET_GIF:
	case PRIV_NET_SETIFVNET:
	case PRIV_NET_SETIFFIB:

		/*
		 * 802.11-related privileges.
		 */
	case PRIV_NET80211_GETKEY:
#ifdef notyet
	case PRIV_NET80211_MANAGE:		/* XXX-BZ discuss with sam@ */
#endif

#ifdef notyet
		/*
		 * AppleTalk privileges.
		 */
	case PRIV_NETATALK_RESERVEDPORT:

		/*
		 * ATM privileges.
		 */
	case PRIV_NETATM_CFG:
	case PRIV_NETATM_ADD:
	case PRIV_NETATM_DEL:
	case PRIV_NETATM_SET:

		/*
		 * Bluetooth privileges.
		 */
	case PRIV_NETBLUETOOTH_RAW:
#endif

		/*
		 * Netgraph and netgraph module privileges.
		 */
	case PRIV_NETGRAPH_CONTROL:
#ifdef notyet
	case PRIV_NETGRAPH_TTY:
#endif

		/*
		 * IPv4 and IPv6 privileges.
		 */
	case PRIV_NETINET_IPFW:
	case PRIV_NETINET_DIVERT:
	case PRIV_NETINET_PF:
	case PRIV_NETINET_DUMMYNET:
	case PRIV_NETINET_CARP:
	case PRIV_NETINET_MROUTE:
	case PRIV_NETINET_RAW:
	case PRIV_NETINET_ADDRCTRL6:
	case PRIV_NETINET_ND6:
	case PRIV_NETINET_SCOPE6:
	case PRIV_NETINET_ALIFETIME6:
	case PRIV_NETINET_IPSEC:
	case PRIV_NETINET_BINDANY:

#ifdef notyet
		/*
		 * IPX/SPX privileges.
		 */
	case PRIV_NETIPX_RESERVEDPORT:
	case PRIV_NETIPX_RAW:

		/*
		 * NCP privileges.
		 */
	case PRIV_NETNCP:

		/*
		 * SMB privileges.
		 */
	case PRIV_NETSMB:
#endif

	/*
	 * No default: or deny here.
	 * In case of no permit fall through to next switch().
	 */
		/*
		 * Every case label above lands here: grant only if the jail
		 * has PR_VNET set, otherwise leave the decision to the
		 * general switch below.
		 */
		if (cred->cr_prison->pr_flags & PR_VNET)
			return (0);
	}
#endif /* VIMAGE */

	switch (priv) {

		/*
		 * Allow ktrace privileges for root in jail.
		 */
	case PRIV_KTRACE:

#if 0
		/*
		 * Allow jailed processes to configure audit identity and
		 * submit audit records (login, etc).  In the future we may
		 * want to further refine the relationship between audit and
		 * jail.
		 */
	case PRIV_AUDIT_GETAUDIT:
	case PRIV_AUDIT_SETAUDIT:
	case PRIV_AUDIT_SUBMIT:
#endif

		/*
		 * Allow jailed processes to manipulate process UNIX
		 * credentials in any way they see fit.
		 */
	case PRIV_CRED_SETUID:
	case PRIV_CRED_SETEUID:
	case PRIV_CRED_SETGID:
	case PRIV_CRED_SETEGID:
	case PRIV_CRED_SETGROUPS:
	case PRIV_CRED_SETREUID:
	case PRIV_CRED_SETREGID:
	case PRIV_CRED_SETRESUID:
	case PRIV_CRED_SETRESGID:

		/*
		 * Jail implements visibility constraints already, so allow
		 * jailed root to override uid/gid-based constraints.
		 */
	case PRIV_SEEOTHERGIDS:
	case PRIV_SEEOTHERUIDS:

		/*
		 * Jail implements inter-process debugging limits already, so
		 * allow jailed root various debugging privileges.
		 */
	case PRIV_DEBUG_DIFFCRED:
	case PRIV_DEBUG_SUGID:
	case PRIV_DEBUG_UNPRIV:

		/*
		 * Allow jail to set various resource limits and login
		 * properties, and for now, exceed process resource limits.
		 */
	case PRIV_PROC_LIMIT:
	case PRIV_PROC_SETLOGIN:
	case PRIV_PROC_SETRLIMIT:

		/*
		 * System V and POSIX IPC privileges are granted in jail.
		 */
	case PRIV_IPC_READ:
	case PRIV_IPC_WRITE:
	case PRIV_IPC_ADMIN:
	case PRIV_IPC_MSGSIZE:
	case PRIV_MQ_ADMIN:

		/*
		 * Jail operations within a jail work on child jails.
		 */
	case PRIV_JAIL_ATTACH:
	case PRIV_JAIL_SET:
	case PRIV_JAIL_REMOVE:

		/*
		 * Jail implements its own inter-process limits, so allow
		 * root processes in jail to change scheduling on other
		 * processes in the same jail.  Likewise for signalling.
		 */
	case PRIV_SCHED_DIFFCRED:
	case PRIV_SCHED_CPUSET:
	case PRIV_SIGNAL_DIFFCRED:
	case PRIV_SIGNAL_SUGID:

		/*
		 * Allow jailed processes to write to sysctls marked as jail
		 * writable.
		 */
	case PRIV_SYSCTL_WRITEJAIL:

		/*
		 * Allow root in jail to manage a variety of quota
		 * properties.  These should likely be conditional on a
		 * configuration option.
		 */
	case PRIV_VFS_GETQUOTA:
	case PRIV_VFS_SETQUOTA:

		/*
		 * Since Jail relies on chroot() to implement file system
		 * protections, grant many VFS privileges to root in jail.
		 * Be careful to exclude mount-related and NFS-related
		 * privileges.
		 */
	case PRIV_VFS_READ:
	case PRIV_VFS_WRITE:
	case PRIV_VFS_ADMIN:
	case PRIV_VFS_EXEC:
	case PRIV_VFS_LOOKUP:
	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
	case PRIV_VFS_CHFLAGS_DEV:
	case PRIV_VFS_CHOWN:
	case PRIV_VFS_CHROOT:
	case PRIV_VFS_RETAINSUGID:
	case PRIV_VFS_FCHROOT:
	case PRIV_VFS_LINK:
	case PRIV_VFS_SETGID:
	case PRIV_VFS_STAT:
	case PRIV_VFS_STICKYFILE:

		/*
		 * As in the non-jail case, non-root users are expected to be
		 * able to read kernel/physical memory (provided /dev/[k]mem
		 * exists in the jail and they have permission to access it).
		 */
	case PRIV_KMEM_READ:
		/* Every case label from PRIV_KTRACE down to here is granted. */
		return (0);

		/*
		 * Depending on the global setting, allow privilege of
		 * setting system flags.
		 */
	case PRIV_VFS_SYSFLAGS:
		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
			return (0);
		else
			return (EPERM);

		/*
		 * Depending on the global setting, allow privilege of
		 * mounting/unmounting file systems.  The jail must also have
		 * pr_enforce_statfs below 2.
		 */
	case PRIV_VFS_MOUNT:
	case PRIV_VFS_UNMOUNT:
	case PRIV_VFS_MOUNT_NONUSER:
	case PRIV_VFS_MOUNT_OWNER:
		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
		    cred->cr_prison->pr_enforce_statfs < 2)
			return (0);
		else
			return (EPERM);

		/*
		 * Allow jailed root to bind reserved ports and reuse in-use
		 * ports.
		 */
	case PRIV_NETINET_RESERVEDPORT:
	case PRIV_NETINET_REUSEPORT:
		return (0);

		/*
		 * Allow jailed root to set certain IPv4/6 (option) headers.
		 */
	case PRIV_NETINET_SETHDROPTS:
		return (0);

		/*
		 * Conditionally allow creating raw sockets in jail.
		 */
	case PRIV_NETINET_RAW:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
			return (0);
		else
			return (EPERM);

		/*
		 * Since jail implements its own visibility limits on netstat
		 * sysctls, allow getcred.  This allows identd to work in
		 * jail.
		 */
	case PRIV_NETINET_GETCRED:
		return (0);

		/*
		 * Allow jailed root to set loginclass.
		 */
	case PRIV_PROC_SETLOGINCLASS:
		return (0);

	default:
		/*
		 * In all remaining cases, deny the privilege request.  This
		 * includes almost all network privileges, many system
		 * configuration privileges.
		 */
		return (EPERM);
	}
}
4045
4046/*
4047 * Return the part of pr2's name that is relative to pr1, or the whole name
4048 * if it does not directly follow.
4049 */
4050
4051char *
4052prison_name(struct prison *pr1, struct prison *pr2)
4053{
4054	char *name;
4055
4056	/* Jails see themselves as "0" (if they see themselves at all). */
4057	if (pr1 == pr2)
4058		return "0";
4059	name = pr2->pr_name;
4060	if (prison_ischild(pr1, pr2)) {
4061		/*
4062		 * pr1 isn't locked (and allprison_lock may not be either)
4063		 * so its length can't be counted on.  But the number of dots
4064		 * can be counted on - and counted.
4065		 */
4066		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
4067			name = strchr(name, '.') + 1;
4068	}
4069	return (name);
4070}
4071
4072/*
4073 * Return the part of pr2's path that is relative to pr1, or the whole path
4074 * if it does not directly follow.
4075 */
4076static char *
4077prison_path(struct prison *pr1, struct prison *pr2)
4078{
4079	char *path1, *path2;
4080	int len1;
4081
4082	path1 = pr1->pr_path;
4083	path2 = pr2->pr_path;
4084	if (!strcmp(path1, "/"))
4085		return (path2);
4086	len1 = strlen(path1);
4087	if (strncmp(path1, path2, len1))
4088		return (path2);
4089	if (path2[len1] == '\0')
4090		return "/";
4091	if (path2[len1] == '/')
4092		return (path2 + len1);
4093	return (path2);
4094}
4095
4096
/*
 * Jail-related sysctls.
 */
static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
    "Jails");

/*
 * Handler for security.jail.list.  Walks the prisons visited by
 * FOREACH_PRISON_DESCENDANT() starting from the caller's prison and, for
 * each one with a non-zero pr_ref, copies out a struct xprison followed by
 * the jail's IPv4 and then IPv6 address arrays (when INET/INET6 are
 * compiled in and the jail has addresses).
 *
 * Returns 0, or the first error reported by SYSCTL_OUT().
 */
static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)
{
	struct xprison *xp;
	struct prison *pr, *cpr;
#ifdef INET
	/* Scratch copy of one jail's IPv4 list; grown on demand, reused. */
	struct in_addr *ip4 = NULL;
	int ip4s = 0;
#endif
#ifdef INET6
	/* Scratch copy of one jail's IPv6 list; grown on demand, reused. */
	struct in6_addr *ip6 = NULL;
	int ip6s = 0;
#endif
	int descend, error;

	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
	pr = req->td->td_ucred->cr_prison;
	error = 0;
	sx_slock(&allprison_lock);
	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
#if defined(INET) || defined(INET6)
		/*
		 * Restart point for the current jail after a scratch buffer
		 * was grown: the mutex was dropped, so everything is re-read.
		 */
 again:
#endif
		mtx_lock(&cpr->pr_mtx);
#ifdef INET
		if (cpr->pr_ip4s > 0) {
			if (ip4s < cpr->pr_ip4s) {
				/*
				 * Buffer too small: release the mutex before
				 * the M_WAITOK realloc, then start over.
				 */
				ip4s = cpr->pr_ip4s;
				mtx_unlock(&cpr->pr_mtx);
				ip4 = realloc(ip4, ip4s *
				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
				goto again;
			}
			bcopy(cpr->pr_ip4, ip4,
			    cpr->pr_ip4s * sizeof(struct in_addr));
		}
#endif
#ifdef INET6
		if (cpr->pr_ip6s > 0) {
			if (ip6s < cpr->pr_ip6s) {
				/* Same grow-and-retry dance as for IPv4. */
				ip6s = cpr->pr_ip6s;
				mtx_unlock(&cpr->pr_mtx);
				ip6 = realloc(ip6, ip6s *
				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
				goto again;
			}
			bcopy(cpr->pr_ip6, ip6,
			    cpr->pr_ip6s * sizeof(struct in6_addr));
		}
#endif
		/* Jails without any reference left are not reported. */
		if (cpr->pr_ref == 0) {
			mtx_unlock(&cpr->pr_mtx);
			continue;
		}
		bzero(xp, sizeof(*xp));
		xp->pr_version = XPRISON_VERSION;
		xp->pr_id = cpr->pr_id;
		xp->pr_state = cpr->pr_uref > 0
		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
		/* Path and name are reported relative to the caller's prison. */
		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
#ifdef INET
		xp->pr_ip4s = cpr->pr_ip4s;
#endif
#ifdef INET6
		xp->pr_ip6s = cpr->pr_ip6s;
#endif
		/*
		 * Everything needed was snapshotted above, so the copy-out
		 * happens with the jail's mutex released.
		 */
		mtx_unlock(&cpr->pr_mtx);
		error = SYSCTL_OUT(req, xp, sizeof(*xp));
		if (error)
			break;
#ifdef INET
		if (xp->pr_ip4s > 0) {
			error = SYSCTL_OUT(req, ip4,
			    xp->pr_ip4s * sizeof(struct in_addr));
			if (error)
				break;
		}
#endif
#ifdef INET6
		if (xp->pr_ip6s > 0) {
			error = SYSCTL_OUT(req, ip6,
			    xp->pr_ip6s * sizeof(struct in6_addr));
			if (error)
				break;
		}
#endif
	}
	sx_sunlock(&allprison_lock);
	free(xp, M_TEMP);
#ifdef INET
	free(ip4, M_TEMP);
#endif
#ifdef INET6
	free(ip6, M_TEMP);
#endif
	return (error);
}

SYSCTL_OID(_security_jail, OID_AUTO, list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_list, "S", "List of active jails");
4206
4207static int
4208sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4209{
4210	int error, injail;
4211
4212	injail = jailed(req->td->td_ucred);
4213	error = SYSCTL_OUT(req, &injail, sizeof(injail));
4214
4215	return (error);
4216}
4217
4218SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
4219    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4220    sysctl_jail_jailed, "I", "Process in jail?");
4221
4222static int
4223sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
4224{
4225	int error, havevnet;
4226#ifdef VIMAGE
4227	struct ucred *cred = req->td->td_ucred;
4228
4229	havevnet = jailed(cred) && prison_owns_vnet(cred);
4230#else
4231	havevnet = 0;
4232#endif
4233	error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
4234
4235	return (error);
4236}
4237
4238SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
4239    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4240    sysctl_jail_vnet, "I", "Jail owns VNET?");
4241
#if defined(INET) || defined(INET6)
/*
 * Export jail_max_af_ips as the read-write unsigned sysctl
 * security.jail.jail_max_af_ips.  Only present when at least one of
 * INET and INET6 is compiled in.
 */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
    &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family");
#endif
4247
4248/*
4249 * Default parameters for jail(2) compatability.  For historical reasons,
4250 * the sysctl names have varying similarity to the parameter names.  Prisons
4251 * just see their own parameters, and can't change them.
4252 */
4253static int
4254sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4255{
4256	struct prison *pr;
4257	int allow, error, i;
4258
4259	pr = req->td->td_ucred->cr_prison;
4260	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
4261
4262	/* Get the current flag value, and convert it to a boolean. */
4263	i = (allow & arg2) ? 1 : 0;
4264	if (arg1 != NULL)
4265		i = !i;
4266	error = sysctl_handle_int(oidp, &i, 0, req);
4267	if (error || !req->newptr)
4268		return (error);
4269	i = i ? arg2 : 0;
4270	if (arg1 != NULL)
4271		i ^= arg2;
4272	/*
4273	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4274	 * for writing.
4275	 */
4276	mtx_lock(&prison0.pr_mtx);
4277	jail_default_allow = (jail_default_allow & ~arg2) | i;
4278	mtx_unlock(&prison0.pr_mtx);
4279	return (0);
4280}
4281
/*
 * security.jail.* permission knobs, all served by
 * sysctl_jail_default_allow().  The second handler argument is the
 * PR_ALLOW_* bit the knob controls.  A non-NULL first argument (used only
 * by socket_unixiproute_only) makes the knob read and write as the inverse
 * of its bit.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
    "Processes in jail can set their hostnames");
/* Inverted knob: 1 here means PR_ALLOW_SOCKET_AF is clear. */
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
    "Processes in jail are limited to creating UNIX/IP/route sockets only");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
    "Processes in jail can use System V IPC primitives");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
    "Prison root can create raw sockets");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
    "Processes in jail can alter system file flags");
/* Mount permissions: one general knob, then one per file system type. */
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
    "Processes in jail can mount/unmount jail-friendly file systems");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the devfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the fdescfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the nullfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the procfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the linprocfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the linsysfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the tmpfs file system");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the zfs file system");
4338
4339static int
4340sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4341{
4342	struct prison *pr;
4343	int level, error;
4344
4345	pr = req->td->td_ucred->cr_prison;
4346	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4347	error = sysctl_handle_int(oidp, &level, 0, req);
4348	if (error || !req->newptr)
4349		return (error);
4350	*(int *)arg1 = level;
4351	return (0);
4352}
4353
4354SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
4355    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4356    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
4357    sysctl_jail_default_level, "I",
4358    "Processes in jail cannot see all mounted file systems");
4359
4360SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
4361    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4362    &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
4363    sysctl_jail_default_level, "I",
4364    "Ruleset for the devfs filesystem in jail");
4365
4366/*
4367 * Nodes to describe jail parameters.  Maximum length of string parameters
4368 * is returned in the string itself, and the other parameters exist merely
4369 * to make themselves and their types known.
4370 */
4371SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
4372    "Jail parameters");
4373
4374int
4375sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4376{
4377	int i;
4378	long l;
4379	size_t s;
4380	char numbuf[12];
4381
4382	switch (oidp->oid_kind & CTLTYPE)
4383	{
4384	case CTLTYPE_LONG:
4385	case CTLTYPE_ULONG:
4386		l = 0;
4387#ifdef SCTL_MASK32
4388		if (!(req->flags & SCTL_MASK32))
4389#endif
4390			return (SYSCTL_OUT(req, &l, sizeof(l)));
4391	case CTLTYPE_INT:
4392	case CTLTYPE_UINT:
4393		i = 0;
4394		return (SYSCTL_OUT(req, &i, sizeof(i)));
4395	case CTLTYPE_STRING:
4396		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4397		return
4398		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4399	case CTLTYPE_STRUCT:
4400		s = (size_t)arg2;
4401		return (SYSCTL_OUT(req, &s, sizeof(s)));
4402	}
4403	return (0);
4404}
4405
4406/*
4407 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
4408 * jail creation time but cannot be changed in an existing jail.
4409 */
4410SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4411SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4412SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4413SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4414SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4415    "I", "Jail secure level");
4416SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
4417    "Jail value for kern.osreldate and uname -K");
4418SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
4419    "Jail value for kern.osrelease and uname -r");
4420SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4421    "I", "Jail cannot see all mounted file systems");
4422SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
4423    "I", "Ruleset for in-jail devfs mounts");
4424SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4425    "B", "Jail persistence");
4426#ifdef VIMAGE
4427SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4428    "E,jailsys", "Virtual network stack");
4429#endif
4430SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4431    "B", "Jail is in the process of shutting down");
4432
4433SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4434SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4435    "I", "Current number of child jails");
4436SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4437    "I", "Maximum number of child jails");
4438
4439SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4440SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4441    "Jail hostname");
4442SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4443    "Jail NIS domainname");
4444SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4445    "Jail host UUID");
4446SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4447    "LU", "Jail host ID");
4448
4449SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4450SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4451
/*
 * Address-family parameter nodes.  "ip4" is only declared in INET kernels
 * and "ip6" only in INET6 kernels.  Each node is CTLFLAG_RDTUN and has a
 * read-write "addr" structure parameter whose element size is one
 * in_addr / in6_addr, plus a read-write "saddrsel" parameter.
 */
4452#ifdef INET
4453SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
4454    "Jail IPv4 address virtualization");
4455SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
4456    "S,in_addr,a", "Jail IPv4 addresses");
4457SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
4458    "B", "Do (not) use IPv4 source address selection rather than the "
4459    "primary jail IPv4 address.");
4460#endif
4461#ifdef INET6
4462SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
4463    "Jail IPv6 address virtualization");
4464SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
4465    "S,in6_addr,a", "Jail IPv6 addresses");
4466SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
4467    "B", "Do (not) use IPv6 source address selection rather than the "
4468    "primary jail IPv6 address.");
4469#endif
4470
/*
 * "allow" parameter node: per-jail permission flags, each declared as a
 * read-write parameter with the "B" format.
 */
4471SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
4472SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
4473    "B", "Jail may set hostname");
4474SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
4475    "B", "Jail may use SYSV IPC");
4476SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
4477    "B", "Jail may create raw sockets");
4478SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
4479    "B", "Jail may alter system file flags");
4480SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
4481    "B", "Jail may set file quotas");
4482SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
4483    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4484
/*
 * "allow.mount" sub-node: mount/unmount permissions, one read-write "B"
 * parameter per file system type.
 * NOTE(review): the first entry passes an empty name, which presumably
 * makes it the value of "allow.mount" itself (the general permission);
 * confirm against the SYSCTL_JAIL_PARAM macro definition.
 */
4485SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
4486SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
4487    "B", "Jail may mount/unmount jail-friendly file systems in general");
4488SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW,
4489    "B", "Jail may mount the devfs file system");
4490SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW,
4491    "B", "Jail may mount the fdescfs file system");
4492SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW,
4493    "B", "Jail may mount the nullfs file system");
4494SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW,
4495    "B", "Jail may mount the procfs file system");
4496SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW,
4497    "B", "Jail may mount the linprocfs file system");
4498SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW,
4499    "B", "Jail may mount the linsysfs file system");
4500SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW,
4501    "B", "Jail may mount the tmpfs file system");
4502SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW,
4503    "B", "Jail may mount the zfs file system");
4504
4505#ifdef RACCT
4506void
4507prison_racct_foreach(void (*callback)(struct racct *racct,
4508    void *arg2, void *arg3), void *arg2, void *arg3)
4509{
4510	struct prison_racct *prr;
4511
4512	ASSERT_RACCT_ENABLED();
4513
4514	sx_slock(&allprison_lock);
4515	LIST_FOREACH(prr, &allprison_racct, prr_next)
4516		(callback)(prr->prr_racct, arg2, arg3);
4517	sx_sunlock(&allprison_lock);
4518}
4519
4520static struct prison_racct *
4521prison_racct_find_locked(const char *name)
4522{
4523	struct prison_racct *prr;
4524
4525	ASSERT_RACCT_ENABLED();
4526	sx_assert(&allprison_lock, SA_XLOCKED);
4527
4528	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4529		return (NULL);
4530
4531	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4532		if (strcmp(name, prr->prr_name) != 0)
4533			continue;
4534
4535		/* Found prison_racct with a matching name? */
4536		prison_racct_hold(prr);
4537		return (prr);
4538	}
4539
4540	/* Add new prison_racct. */
4541	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4542	racct_create(&prr->prr_racct);
4543
4544	strcpy(prr->prr_name, name);
4545	refcount_init(&prr->prr_refcount, 1);
4546	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4547
4548	return (prr);
4549}
4550
4551struct prison_racct *
4552prison_racct_find(const char *name)
4553{
4554	struct prison_racct *prr;
4555
4556	ASSERT_RACCT_ENABLED();
4557
4558	sx_xlock(&allprison_lock);
4559	prr = prison_racct_find_locked(name);
4560	sx_xunlock(&allprison_lock);
4561	return (prr);
4562}
4563
4564void
4565prison_racct_hold(struct prison_racct *prr)
4566{
4567
4568	ASSERT_RACCT_ENABLED();
4569
4570	refcount_acquire(&prr->prr_refcount);
4571}
4572
4573static void
4574prison_racct_free_locked(struct prison_racct *prr)
4575{
4576
4577	ASSERT_RACCT_ENABLED();
4578	sx_assert(&allprison_lock, SA_XLOCKED);
4579
4580	if (refcount_release(&prr->prr_refcount)) {
4581		racct_destroy(&prr->prr_racct);
4582		LIST_REMOVE(prr, prr_next);
4583		free(prr, M_PRISON_RACCT);
4584	}
4585}
4586
4587void
4588prison_racct_free(struct prison_racct *prr)
4589{
4590	int old;
4591
4592	ASSERT_RACCT_ENABLED();
4593	sx_assert(&allprison_lock, SA_UNLOCKED);
4594
4595	old = prr->prr_refcount;
4596	if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
4597		return;
4598
4599	sx_xlock(&allprison_lock);
4600	prison_racct_free_locked(prr);
4601	sx_xunlock(&allprison_lock);
4602}
4603
4604static void
4605prison_racct_attach(struct prison *pr)
4606{
4607	struct prison_racct *prr;
4608
4609	ASSERT_RACCT_ENABLED();
4610	sx_assert(&allprison_lock, SA_XLOCKED);
4611
4612	prr = prison_racct_find_locked(pr->pr_name);
4613	KASSERT(prr != NULL, ("cannot find prison_racct"));
4614
4615	pr->pr_prison_racct = prr;
4616}
4617
4618/*
4619 * Handle jail renaming.  From the racct point of view, renaming means
4620 * moving from one prison_racct to another.
4621 */
4622static void
4623prison_racct_modify(struct prison *pr)
4624{
4625	struct proc *p;
4626	struct ucred *cred;
4627	struct prison_racct *oldprr;
4628
4629	ASSERT_RACCT_ENABLED();
4630
4631	sx_slock(&allproc_lock);
4632	sx_xlock(&allprison_lock);
4633
4634	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4635		sx_xunlock(&allprison_lock);
4636		sx_sunlock(&allproc_lock);
4637		return;
4638	}
4639
4640	oldprr = pr->pr_prison_racct;
4641	pr->pr_prison_racct = NULL;
4642
4643	prison_racct_attach(pr);
4644
4645	/*
4646	 * Move resource utilisation records.
4647	 */
4648	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4649
4650	/*
4651	 * Force rctl to reattach rules to processes.
4652	 */
4653	FOREACH_PROC_IN_SYSTEM(p) {
4654		PROC_LOCK(p);
4655		cred = crhold(p->p_ucred);
4656		PROC_UNLOCK(p);
4657		racct_proc_ucred_changed(p, cred, cred);
4658		crfree(cred);
4659	}
4660
4661	sx_sunlock(&allproc_lock);
4662	prison_racct_free_locked(oldprr);
4663	sx_xunlock(&allprison_lock);
4664}
4665
4666static void
4667prison_racct_detach(struct prison *pr)
4668{
4669
4670	ASSERT_RACCT_ENABLED();
4671	sx_assert(&allprison_lock, SA_UNLOCKED);
4672
4673	if (pr->pr_prison_racct == NULL)
4674		return;
4675	prison_racct_free(pr->pr_prison_racct);
4676	pr->pr_prison_racct = NULL;
4677}
4678#endif /* RACCT */
4679
4680#ifdef DDB
4681
4682static void
4683db_show_prison(struct prison *pr)
4684{
4685	int fi;
4686#if defined(INET) || defined(INET6)
4687	int ii;
4688#endif
4689	unsigned jsf;
4690#ifdef INET6
4691	char ip6buf[INET6_ADDRSTRLEN];
4692#endif
4693
4694	db_printf("prison %p:\n", pr);
4695	db_printf(" jid             = %d\n", pr->pr_id);
4696	db_printf(" name            = %s\n", pr->pr_name);
4697	db_printf(" parent          = %p\n", pr->pr_parent);
4698	db_printf(" ref             = %d\n", pr->pr_ref);
4699	db_printf(" uref            = %d\n", pr->pr_uref);
4700	db_printf(" path            = %s\n", pr->pr_path);
4701	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4702	    ? pr->pr_cpuset->cs_id : -1);
4703#ifdef VIMAGE
4704	db_printf(" vnet            = %p\n", pr->pr_vnet);
4705#endif
4706	db_printf(" root            = %p\n", pr->pr_root);
4707	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4708	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
4709	db_printf(" children.max    = %d\n", pr->pr_childmax);
4710	db_printf(" children.cur    = %d\n", pr->pr_childcount);
4711	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4712	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4713	db_printf(" flags           = 0x%x", pr->pr_flags);
4714	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
4715	    fi++)
4716		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
4717			db_printf(" %s", pr_flag_names[fi]);
4718	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
4719	    fi++) {
4720		jsf = pr->pr_flags &
4721		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
4722		db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
4723		    pr_flag_jailsys[fi].disable &&
4724		      (jsf == pr_flag_jailsys[fi].disable) ? "disable"
4725		    : (jsf == pr_flag_jailsys[fi].new) ? "new"
4726		    : "inherit");
4727	}
4728	db_printf(" allow           = 0x%x", pr->pr_allow);
4729	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
4730	    fi++)
4731		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
4732			db_printf(" %s", pr_allow_names[fi]);
4733	db_printf("\n");
4734	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4735	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4736	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4737	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4738	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4739#ifdef INET
4740	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4741	for (ii = 0; ii < pr->pr_ip4s; ii++)
4742		db_printf(" %s %s\n",
4743		    ii == 0 ? "ip4.addr        =" : "                 ",
4744		    inet_ntoa(pr->pr_ip4[ii]));
4745#endif
4746#ifdef INET6
4747	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4748	for (ii = 0; ii < pr->pr_ip6s; ii++)
4749		db_printf(" %s %s\n",
4750		    ii == 0 ? "ip6.addr        =" : "                 ",
4751		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4752#endif
4753}
4754
4755DB_SHOW_COMMAND(prison, db_show_prison_command)
4756{
4757	struct prison *pr;
4758
4759	if (!have_addr) {
4760		/*
4761		 * Show all prisons in the list, and prison0 which is not
4762		 * listed.
4763		 */
4764		db_show_prison(&prison0);
4765		if (!db_pager_quit) {
4766			TAILQ_FOREACH(pr, &allprison, pr_list) {
4767				db_show_prison(pr);
4768				if (db_pager_quit)
4769					break;
4770			}
4771		}
4772		return;
4773	}
4774
4775	if (addr == 0)
4776		pr = &prison0;
4777	else {
4778		/* Look for a prison with the ID and with references. */
4779		TAILQ_FOREACH(pr, &allprison, pr_list)
4780			if (pr->pr_id == addr && pr->pr_ref > 0)
4781				break;
4782		if (pr == NULL)
4783			/* Look again, without requiring a reference. */
4784			TAILQ_FOREACH(pr, &allprison, pr_list)
4785				if (pr->pr_id == addr)
4786					break;
4787		if (pr == NULL)
4788			/* Assume address points to a valid prison. */
4789			pr = (struct prison *)addr;
4790	}
4791	db_show_prison(pr);
4792}
4793
4794#endif /* DDB */
4795