kern_jail.c revision 224290
/*-
 * Copyright (c) 1999 Poul-Henning Kamp.
 * Copyright (c) 2008 Bjoern A. Zeeb.
 * Copyright (c) 2009 James Gritton.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 224290 2011-07-24 17:43:09Z mckusick $");
31
32#include "opt_compat.h"
33#include "opt_ddb.h"
34#include "opt_inet.h"
35#include "opt_inet6.h"
36
37#include <sys/param.h>
38#include <sys/types.h>
39#include <sys/kernel.h>
40#include <sys/systm.h>
41#include <sys/errno.h>
42#include <sys/sysproto.h>
43#include <sys/malloc.h>
44#include <sys/osd.h>
45#include <sys/priv.h>
46#include <sys/proc.h>
47#include <sys/taskqueue.h>
48#include <sys/fcntl.h>
49#include <sys/jail.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/racct.h>
53#include <sys/refcount.h>
54#include <sys/sx.h>
55#include <sys/sysent.h>
56#include <sys/namei.h>
57#include <sys/mount.h>
58#include <sys/queue.h>
59#include <sys/socket.h>
60#include <sys/syscallsubr.h>
61#include <sys/sysctl.h>
62#include <sys/vnode.h>
63
64#include <net/if.h>
65#include <net/vnet.h>
66
67#include <netinet/in.h>
68
69#ifdef DDB
70#include <ddb/ddb.h>
71#ifdef INET6
72#include <netinet6/in6_var.h>
73#endif /* INET6 */
74#endif /* DDB */
75
76#include <security/mac/mac_framework.h>
77
78#define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
79
80MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
81MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
82
/*
 * Keep struct prison prison0 and some code in kern_jail_set() readable.
 *
 * _PR_IP_SADDRSEL is the OR of the source address selection flags of every
 * address family compiled into the kernel, or 0 when neither INET nor INET6
 * is configured.  The two-flag expansion is parenthesized so the macro always
 * acts as a single operand, whatever operator it is placed next to.
 */
#ifdef INET
#ifdef INET6
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	PR_IP4_SADDRSEL
#endif
#else /* !INET */
#ifdef INET6
#define	_PR_IP_SADDRSEL	PR_IP6_SADDRSEL
#else
#define	_PR_IP_SADDRSEL	0
#endif
#endif /* INET */
97
98/* prison0 describes what is "real" about the system. */
99struct prison prison0 = {
100	.pr_id		= 0,
101	.pr_name	= "0",
102	.pr_ref		= 1,
103	.pr_uref	= 1,
104	.pr_path	= "/",
105	.pr_securelevel	= -1,
106	.pr_childmax	= JAIL_MAX,
107	.pr_hostuuid	= DEFAULT_HOSTUUID,
108	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
109#ifdef VIMAGE
110	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
111#else
112	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
113#endif
114	.pr_allow	= PR_ALLOW_ALL,
115};
116MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
117
118/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
119struct	sx allprison_lock;
120SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
121struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
122LIST_HEAD(, prison_racct) allprison_racct;
123int	lastprid = 0;
124
/*
 * Prototypes for the file-local helpers.  The racct and per-address-family
 * helpers only exist when the matching kernel option is configured.
 */
static int do_jail_attach(struct thread *td, struct prison *pr);
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static char *prison_path(struct prison *pr1, struct prison *pr2);
static void prison_remove_one(struct prison *pr);
#ifdef RACCT
static void prison_racct_attach(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif
#ifdef INET
static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
#endif
#ifdef INET6
static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
#endif
142
/*
 * Flags for prison_deref.  They are distinct bits and may be OR'ed together.
 * NOTE(review): judging by the names and by the one visible caller (which
 * passes PD_DEREF | PD_DEUREF | PD_LIST_XLOCKED while holding allprison_lock
 * exclusively, to undo a pr_ref++/pr_uref++), PD_DEREF/PD_DEUREF select which
 * reference count to drop and the remaining flags describe locks the caller
 * already holds -- confirm against prison_deref() itself.
 */
#define	PD_DEREF	0x01
#define	PD_DEUREF	0x02
#define	PD_LOCKED	0x04
#define	PD_LIST_SLOCKED	0x08
#define	PD_LIST_XLOCKED	0x10
149
/*
 * Parameter names corresponding to PR_* flag values.  Size values are for kvm
 * as we cannot figure out the size of a sparse array, or an array without a
 * terminating entry.
 *
 * The array index is the bit position of the flag: kern_jail_set() maps
 * entry [fi] to the flag (1 << fi).  The arrays are sparse, so unnamed
 * slots are NULL and must be skipped by anything walking them.
 */
static char *pr_flag_names[] = {
	[0] = "persist",
#ifdef INET
	[7] = "ip4.saddrsel",
#endif
#ifdef INET6
	[8] = "ip6.saddrsel",
#endif
};
const size_t pr_flag_names_size = sizeof(pr_flag_names);

/*
 * The "no" forms of the boolean flag parameters above, index for index:
 * naming pr_flag_nonames[fi] marks the flag (1 << fi) as changed without
 * setting it.
 */
static char *pr_flag_nonames[] = {
	[0] = "nopersist",
#ifdef INET
	[7] = "ip4.nosaddrsel",
#endif
#ifdef INET6
	[8] = "ip6.nosaddrsel",
#endif
};
const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames);
176
177struct jailsys_flags {
178	const char	*name;
179	unsigned	 disable;
180	unsigned	 new;
181} pr_flag_jailsys[] = {
182	{ "host", 0, PR_HOST },
183#ifdef VIMAGE
184	{ "vnet", 0, PR_VNET },
185#endif
186#ifdef INET
187	{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
188#endif
189#ifdef INET6
190	{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
191#endif
192};
193const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
194
/*
 * Names of the allow.* permission parameters.  The array is dense and its
 * index is the bit position of the permission: entry [fi] stands for the
 * pr_allow bit (1 << fi), so the order here must not change.
 */
static char *pr_allow_names[] = {
	"allow.set_hostname",
	"allow.sysvipc",
	"allow.raw_sockets",
	"allow.chflags",
	"allow.mount",
	"allow.quotas",
	"allow.socket_af",
};
const size_t pr_allow_names_size = sizeof(pr_allow_names);

/*
 * The "no" forms of the permissions above, index for index; both arrays
 * must stay the same length because they are walked with a shared index.
 */
static char *pr_allow_nonames[] = {
	"allow.noset_hostname",
	"allow.nosysvipc",
	"allow.noraw_sockets",
	"allow.nochflags",
	"allow.nomount",
	"allow.noquotas",
	"allow.nosocket_af",
};
const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
216
217#define	JAIL_DEFAULT_ALLOW		PR_ALLOW_SET_HOSTNAME
218#define	JAIL_DEFAULT_ENFORCE_STATFS	2
219static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
220static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
221#if defined(INET) || defined(INET6)
222static unsigned jail_max_af_ips = 255;
223#endif
224
225#ifdef INET
/*
 * qsort() comparison function for IPv4 addresses.
 *
 * ip1 and ip2 point to struct in_addr values in network byte order.
 * Returns -1, 0 or 1 when the first address is numerically lower than,
 * equal to, or higher than the second.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	in_addr_t iaa, iab;

	/*
	 * We need to compare in HBO here to get the list sorted as expected
	 * by the result of the code.  Sorting NBO addresses gives you
	 * interesting results.  If you do not understand, do not try.
	 */
	iaa = ntohl(((const struct in_addr *)ip1)->s_addr);
	iab = ntohl(((const struct in_addr *)ip2)->s_addr);

	/*
	 * Do not simply return the difference of the two numbers, the int is
	 * not wide enough.
	 */
	if (iaa > iab)
		return (1);
	else if (iaa < iab)
		return (-1);
	else
		return (0);
}
250#endif
251
252#ifdef INET6
/*
 * qsort() comparison function for IPv6 addresses.
 *
 * ip1 and ip2 point to struct in6_addr values.  The addresses are compared
 * byte by byte, as unsigned values, starting with the first byte; the first
 * differing byte decides.  Returns -1, 0 or 1.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const struct in6_addr *ia6a, *ia6b;
	size_t i;

	ia6a = (const struct in6_addr *)ip1;
	ia6b = (const struct in6_addr *)ip2;

	for (i = 0; i < sizeof(struct in6_addr); i++) {
		if (ia6a->s6_addr[i] > ia6b->s6_addr[i])
			return (1);
		if (ia6a->s6_addr[i] < ia6b->s6_addr[i])
			return (-1);
	}
	return (0);
}
271#endif
272
273/*
274 * struct jail_args {
275 *	struct jail *jail;
276 * };
277 */
278int
279jail(struct thread *td, struct jail_args *uap)
280{
281	uint32_t version;
282	int error;
283	struct jail j;
284
285	error = copyin(uap->jail, &version, sizeof(uint32_t));
286	if (error)
287		return (error);
288
289	switch (version) {
290	case 0:
291	{
292		struct jail_v0 j0;
293
294		/* FreeBSD single IPv4 jails. */
295		bzero(&j, sizeof(struct jail));
296		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
297		if (error)
298			return (error);
299		j.version = j0.version;
300		j.path = j0.path;
301		j.hostname = j0.hostname;
302		j.ip4s = j0.ip_number;
303		break;
304	}
305
306	case 1:
307		/*
308		 * Version 1 was used by multi-IPv4 jail implementations
309		 * that never made it into the official kernel.
310		 */
311		return (EINVAL);
312
313	case 2:	/* JAIL_API_VERSION */
314		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
315		error = copyin(uap->jail, &j, sizeof(struct jail));
316		if (error)
317			return (error);
318		break;
319
320	default:
321		/* Sci-Fi jails are not supported, sorry. */
322		return (EINVAL);
323	}
324	return (kern_jail(td, &j));
325}
326
327int
328kern_jail(struct thread *td, struct jail *j)
329{
330	struct iovec optiov[2 * (4
331			    + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
332#ifdef INET
333			    + 1
334#endif
335#ifdef INET6
336			    + 1
337#endif
338			    )];
339	struct uio opt;
340	char *u_path, *u_hostname, *u_name;
341#ifdef INET
342	uint32_t ip4s;
343	struct in_addr *u_ip4;
344#endif
345#ifdef INET6
346	struct in6_addr *u_ip6;
347#endif
348	size_t tmplen;
349	int error, enforce_statfs, fi;
350
351	bzero(&optiov, sizeof(optiov));
352	opt.uio_iov = optiov;
353	opt.uio_iovcnt = 0;
354	opt.uio_offset = -1;
355	opt.uio_resid = -1;
356	opt.uio_segflg = UIO_SYSSPACE;
357	opt.uio_rw = UIO_READ;
358	opt.uio_td = td;
359
360	/* Set permissions for top-level jails from sysctls. */
361	if (!jailed(td->td_ucred)) {
362		for (fi = 0; fi < sizeof(pr_allow_names) /
363		     sizeof(pr_allow_names[0]); fi++) {
364			optiov[opt.uio_iovcnt].iov_base =
365			    (jail_default_allow & (1 << fi))
366			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
367			optiov[opt.uio_iovcnt].iov_len =
368			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
369			opt.uio_iovcnt += 2;
370		}
371		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
372		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
373		opt.uio_iovcnt++;
374		enforce_statfs = jail_default_enforce_statfs;
375		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
376		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
377		opt.uio_iovcnt++;
378	}
379
380	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
381#ifdef INET
382	ip4s = (j->version == 0) ? 1 : j->ip4s;
383	if (ip4s > jail_max_af_ips)
384		return (EINVAL);
385	tmplen += ip4s * sizeof(struct in_addr);
386#else
387	if (j->ip4s > 0)
388		return (EINVAL);
389#endif
390#ifdef INET6
391	if (j->ip6s > jail_max_af_ips)
392		return (EINVAL);
393	tmplen += j->ip6s * sizeof(struct in6_addr);
394#else
395	if (j->ip6s > 0)
396		return (EINVAL);
397#endif
398	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
399	u_hostname = u_path + MAXPATHLEN;
400	u_name = u_hostname + MAXHOSTNAMELEN;
401#ifdef INET
402	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
403#endif
404#ifdef INET6
405#ifdef INET
406	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
407#else
408	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
409#endif
410#endif
411	optiov[opt.uio_iovcnt].iov_base = "path";
412	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
413	opt.uio_iovcnt++;
414	optiov[opt.uio_iovcnt].iov_base = u_path;
415	error = copyinstr(j->path, u_path, MAXPATHLEN,
416	    &optiov[opt.uio_iovcnt].iov_len);
417	if (error) {
418		free(u_path, M_TEMP);
419		return (error);
420	}
421	opt.uio_iovcnt++;
422	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
423	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
424	opt.uio_iovcnt++;
425	optiov[opt.uio_iovcnt].iov_base = u_hostname;
426	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
427	    &optiov[opt.uio_iovcnt].iov_len);
428	if (error) {
429		free(u_path, M_TEMP);
430		return (error);
431	}
432	opt.uio_iovcnt++;
433	if (j->jailname != NULL) {
434		optiov[opt.uio_iovcnt].iov_base = "name";
435		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
436		opt.uio_iovcnt++;
437		optiov[opt.uio_iovcnt].iov_base = u_name;
438		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
439		    &optiov[opt.uio_iovcnt].iov_len);
440		if (error) {
441			free(u_path, M_TEMP);
442			return (error);
443		}
444		opt.uio_iovcnt++;
445	}
446#ifdef INET
447	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
448	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
449	opt.uio_iovcnt++;
450	optiov[opt.uio_iovcnt].iov_base = u_ip4;
451	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
452	if (j->version == 0)
453		u_ip4->s_addr = j->ip4s;
454	else {
455		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
456		if (error) {
457			free(u_path, M_TEMP);
458			return (error);
459		}
460	}
461	opt.uio_iovcnt++;
462#endif
463#ifdef INET6
464	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
465	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
466	opt.uio_iovcnt++;
467	optiov[opt.uio_iovcnt].iov_base = u_ip6;
468	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
469	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
470	if (error) {
471		free(u_path, M_TEMP);
472		return (error);
473	}
474	opt.uio_iovcnt++;
475#endif
476	KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
477	    ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
478	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
479	free(u_path, M_TEMP);
480	return (error);
481}
482
483
484/*
485 * struct jail_set_args {
486 *	struct iovec *iovp;
487 *	unsigned int iovcnt;
488 *	int flags;
489 * };
490 */
491int
492jail_set(struct thread *td, struct jail_set_args *uap)
493{
494	struct uio *auio;
495	int error;
496
497	/* Check that we have an even number of iovecs. */
498	if (uap->iovcnt & 1)
499		return (EINVAL);
500
501	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
502	if (error)
503		return (error);
504	error = kern_jail_set(td, auio, uap->flags);
505	free(auio, M_IOV);
506	return (error);
507}
508
509int
510kern_jail_set(struct thread *td, struct uio *optuio, int flags)
511{
512	struct nameidata nd;
513#ifdef INET
514	struct in_addr *ip4;
515#endif
516#ifdef INET6
517	struct in6_addr *ip6;
518#endif
519	struct vfsopt *opt;
520	struct vfsoptlist *opts;
521	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
522	struct vnode *root;
523	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
524#if defined(INET) || defined(INET6)
525	struct prison *tppr;
526	void *op;
527#endif
528	unsigned long hid;
529	size_t namelen, onamelen;
530	int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
531	int gotchildmax, gotenforce, gothid, gotslevel;
532	int fi, jid, jsys, len, level;
533	int childmax, slevel, vfslocked;
534#if defined(INET) || defined(INET6)
535	int ii, ij;
536#endif
537#ifdef INET
538	int ip4s, redo_ip4;
539#endif
540#ifdef INET6
541	int ip6s, redo_ip6;
542#endif
543	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
544	unsigned tallow;
545	char numbuf[12];
546
547	error = priv_check(td, PRIV_JAIL_SET);
548	if (!error && (flags & JAIL_ATTACH))
549		error = priv_check(td, PRIV_JAIL_ATTACH);
550	if (error)
551		return (error);
552	mypr = ppr = td->td_ucred->cr_prison;
553	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
554		return (EPERM);
555	if (flags & ~JAIL_SET_MASK)
556		return (EINVAL);
557
558	/*
559	 * Check all the parameters before committing to anything.  Not all
560	 * errors can be caught early, but we may as well try.  Also, this
561	 * takes care of some expensive stuff (path lookup) before getting
562	 * the allprison lock.
563	 *
564	 * XXX Jails are not filesystems, and jail parameters are not mount
565	 *     options.  But it makes more sense to re-use the vfsopt code
566	 *     than duplicate it under a different name.
567	 */
568	error = vfs_buildopts(optuio, &opts);
569	if (error)
570		return (error);
571#ifdef INET
572	ip4 = NULL;
573#endif
574#ifdef INET6
575	ip6 = NULL;
576#endif
577
578	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
579	if (error == ENOENT)
580		jid = 0;
581	else if (error != 0)
582		goto done_free;
583
584	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
585	if (error == ENOENT)
586		gotslevel = 0;
587	else if (error != 0)
588		goto done_free;
589	else
590		gotslevel = 1;
591
592	error =
593	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
594	if (error == ENOENT)
595		gotchildmax = 0;
596	else if (error != 0)
597		goto done_free;
598	else
599		gotchildmax = 1;
600
601	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
602	if (error == ENOENT)
603		gotenforce = 0;
604	else if (error != 0)
605		goto done_free;
606	else if (enforce < 0 || enforce > 2) {
607		error = EINVAL;
608		goto done_free;
609	} else
610		gotenforce = 1;
611
612	pr_flags = ch_flags = 0;
613	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
614	    fi++) {
615		if (pr_flag_names[fi] == NULL)
616			continue;
617		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
618		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
619	}
620	ch_flags |= pr_flags;
621	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
622	    fi++) {
623		error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
624		    sizeof(jsys));
625		if (error == ENOENT)
626			continue;
627		if (error != 0)
628			goto done_free;
629		switch (jsys) {
630		case JAIL_SYS_DISABLE:
631			if (!pr_flag_jailsys[fi].disable) {
632				error = EINVAL;
633				goto done_free;
634			}
635			pr_flags |= pr_flag_jailsys[fi].disable;
636			break;
637		case JAIL_SYS_NEW:
638			pr_flags |= pr_flag_jailsys[fi].new;
639			break;
640		case JAIL_SYS_INHERIT:
641			break;
642		default:
643			error = EINVAL;
644			goto done_free;
645		}
646		ch_flags |=
647		    pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
648	}
649	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
650	    && !(pr_flags & PR_PERSIST)) {
651		error = EINVAL;
652		vfs_opterror(opts, "new jail must persist or attach");
653		goto done_errmsg;
654	}
655#ifdef VIMAGE
656	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
657		error = EINVAL;
658		vfs_opterror(opts, "vnet cannot be changed after creation");
659		goto done_errmsg;
660	}
661#endif
662#ifdef INET
663	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
664		error = EINVAL;
665		vfs_opterror(opts, "ip4 cannot be changed after creation");
666		goto done_errmsg;
667	}
668#endif
669#ifdef INET6
670	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
671		error = EINVAL;
672		vfs_opterror(opts, "ip6 cannot be changed after creation");
673		goto done_errmsg;
674	}
675#endif
676
677	pr_allow = ch_allow = 0;
678	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
679	    fi++) {
680		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
681		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
682	}
683	ch_allow |= pr_allow;
684
685	error = vfs_getopt(opts, "name", (void **)&name, &len);
686	if (error == ENOENT)
687		name = NULL;
688	else if (error != 0)
689		goto done_free;
690	else {
691		if (len == 0 || name[len - 1] != '\0') {
692			error = EINVAL;
693			goto done_free;
694		}
695		if (len > MAXHOSTNAMELEN) {
696			error = ENAMETOOLONG;
697			goto done_free;
698		}
699	}
700
701	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
702	if (error == ENOENT)
703		host = NULL;
704	else if (error != 0)
705		goto done_free;
706	else {
707		ch_flags |= PR_HOST;
708		pr_flags |= PR_HOST;
709		if (len == 0 || host[len - 1] != '\0') {
710			error = EINVAL;
711			goto done_free;
712		}
713		if (len > MAXHOSTNAMELEN) {
714			error = ENAMETOOLONG;
715			goto done_free;
716		}
717	}
718
719	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
720	if (error == ENOENT)
721		domain = NULL;
722	else if (error != 0)
723		goto done_free;
724	else {
725		ch_flags |= PR_HOST;
726		pr_flags |= PR_HOST;
727		if (len == 0 || domain[len - 1] != '\0') {
728			error = EINVAL;
729			goto done_free;
730		}
731		if (len > MAXHOSTNAMELEN) {
732			error = ENAMETOOLONG;
733			goto done_free;
734		}
735	}
736
737	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
738	if (error == ENOENT)
739		uuid = NULL;
740	else if (error != 0)
741		goto done_free;
742	else {
743		ch_flags |= PR_HOST;
744		pr_flags |= PR_HOST;
745		if (len == 0 || uuid[len - 1] != '\0') {
746			error = EINVAL;
747			goto done_free;
748		}
749		if (len > HOSTUUIDLEN) {
750			error = ENAMETOOLONG;
751			goto done_free;
752		}
753	}
754
755#ifdef COMPAT_FREEBSD32
756	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
757		uint32_t hid32;
758
759		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
760		hid = hid32;
761	} else
762#endif
763		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
764	if (error == ENOENT)
765		gothid = 0;
766	else if (error != 0)
767		goto done_free;
768	else {
769		gothid = 1;
770		ch_flags |= PR_HOST;
771		pr_flags |= PR_HOST;
772	}
773
774#ifdef INET
775	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
776	if (error == ENOENT)
777		ip4s = (pr_flags & PR_IP4_DISABLE) ? 0 : -1;
778	else if (error != 0)
779		goto done_free;
780	else if (ip4s & (sizeof(*ip4) - 1)) {
781		error = EINVAL;
782		goto done_free;
783	} else {
784		ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
785		if (ip4s == 0)
786			pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
787		else {
788			pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
789			ip4s /= sizeof(*ip4);
790			if (ip4s > jail_max_af_ips) {
791				error = EINVAL;
792				vfs_opterror(opts, "too many IPv4 addresses");
793				goto done_errmsg;
794			}
795			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
796			bcopy(op, ip4, ip4s * sizeof(*ip4));
797			/*
798			 * IP addresses are all sorted but ip[0] to preserve
799			 * the primary IP address as given from userland.
800			 * This special IP is used for unbound outgoing
801			 * connections as well for "loopback" traffic in case
802			 * source address selection cannot find any more fitting
803			 * address to connect from.
804			 */
805			if (ip4s > 1)
806				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
807			/*
808			 * Check for duplicate addresses and do some simple
809			 * zero and broadcast checks. If users give other bogus
810			 * addresses it is their problem.
811			 *
812			 * We do not have to care about byte order for these
813			 * checks so we will do them in NBO.
814			 */
815			for (ii = 0; ii < ip4s; ii++) {
816				if (ip4[ii].s_addr == INADDR_ANY ||
817				    ip4[ii].s_addr == INADDR_BROADCAST) {
818					error = EINVAL;
819					goto done_free;
820				}
821				if ((ii+1) < ip4s &&
822				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
823				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
824					error = EINVAL;
825					goto done_free;
826				}
827			}
828		}
829	}
830#endif
831
832#ifdef INET6
833	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
834	if (error == ENOENT)
835		ip6s = (pr_flags & PR_IP6_DISABLE) ? 0 : -1;
836	else if (error != 0)
837		goto done_free;
838	else if (ip6s & (sizeof(*ip6) - 1)) {
839		error = EINVAL;
840		goto done_free;
841	} else {
842		ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
843		if (ip6s == 0)
844			pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
845		else {
846			pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
847			ip6s /= sizeof(*ip6);
848			if (ip6s > jail_max_af_ips) {
849				error = EINVAL;
850				vfs_opterror(opts, "too many IPv6 addresses");
851				goto done_errmsg;
852			}
853			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
854			bcopy(op, ip6, ip6s * sizeof(*ip6));
855			if (ip6s > 1)
856				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
857			for (ii = 0; ii < ip6s; ii++) {
858				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
859					error = EINVAL;
860					goto done_free;
861				}
862				if ((ii+1) < ip6s &&
863				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
864				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
865				{
866					error = EINVAL;
867					goto done_free;
868				}
869			}
870		}
871	}
872#endif
873
874#if defined(VIMAGE) && (defined(INET) || defined(INET6))
875	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
876		error = EINVAL;
877		vfs_opterror(opts,
878		    "vnet jails cannot have IP address restrictions");
879		goto done_errmsg;
880	}
881#endif
882
883	root = NULL;
884	error = vfs_getopt(opts, "path", (void **)&path, &len);
885	if (error == ENOENT)
886		path = NULL;
887	else if (error != 0)
888		goto done_free;
889	else {
890		if (flags & JAIL_UPDATE) {
891			error = EINVAL;
892			vfs_opterror(opts,
893			    "path cannot be changed after creation");
894			goto done_errmsg;
895		}
896		if (len == 0 || path[len - 1] != '\0') {
897			error = EINVAL;
898			goto done_free;
899		}
900		if (len < 2 || (len == 2 && path[0] == '/'))
901			path = NULL;
902		else {
903			/* Leave room for a real-root full pathname. */
904			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
905			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
906				error = ENAMETOOLONG;
907				goto done_free;
908			}
909			NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_SYSSPACE,
910			    path, td);
911			error = namei(&nd);
912			if (error)
913				goto done_free;
914			vfslocked = NDHASGIANT(&nd);
915			root = nd.ni_vp;
916			NDFREE(&nd, NDF_ONLY_PNBUF);
917			if (root->v_type != VDIR) {
918				error = ENOTDIR;
919				vrele(root);
920				VFS_UNLOCK_GIANT(vfslocked);
921				goto done_free;
922			}
923			VFS_UNLOCK_GIANT(vfslocked);
924		}
925	}
926
927	/*
928	 * Grab the allprison lock before letting modules check their
929	 * parameters.  Once we have it, do not let go so we'll have a
930	 * consistent view of the OSD list.
931	 */
932	sx_xlock(&allprison_lock);
933	error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
934	if (error)
935		goto done_unlock_list;
936
937	/* By now, all parameters should have been noted. */
938	TAILQ_FOREACH(opt, opts, link) {
939		if (!opt->seen && strcmp(opt->name, "errmsg")) {
940			error = EINVAL;
941			vfs_opterror(opts, "unknown parameter: %s", opt->name);
942			goto done_unlock_list;
943		}
944	}
945
946	/*
947	 * See if we are creating a new record or updating an existing one.
948	 * This abuses the file error codes ENOENT and EEXIST.
949	 */
950	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
951	if (!cuflags) {
952		error = EINVAL;
953		vfs_opterror(opts, "no valid operation (create or update)");
954		goto done_unlock_list;
955	}
956	pr = NULL;
957	namelc = NULL;
958	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
959		namelc = strrchr(name, '.');
960		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
961		if (*p != '\0')
962			jid = 0;
963	}
964	if (jid != 0) {
965		/*
966		 * See if a requested jid already exists.  There is an
967		 * information leak here if the jid exists but is not within
968		 * the caller's jail hierarchy.  Jail creators will get EEXIST
969		 * even though they cannot see the jail, and CREATE | UPDATE
970		 * will return ENOENT which is not normally a valid error.
971		 */
972		if (jid < 0) {
973			error = EINVAL;
974			vfs_opterror(opts, "negative jid");
975			goto done_unlock_list;
976		}
977		pr = prison_find(jid);
978		if (pr != NULL) {
979			ppr = pr->pr_parent;
980			/* Create: jid must not exist. */
981			if (cuflags == JAIL_CREATE) {
982				mtx_unlock(&pr->pr_mtx);
983				error = EEXIST;
984				vfs_opterror(opts, "jail %d already exists",
985				    jid);
986				goto done_unlock_list;
987			}
988			if (!prison_ischild(mypr, pr)) {
989				mtx_unlock(&pr->pr_mtx);
990				pr = NULL;
991			} else if (pr->pr_uref == 0) {
992				if (!(flags & JAIL_DYING)) {
993					mtx_unlock(&pr->pr_mtx);
994					error = ENOENT;
995					vfs_opterror(opts, "jail %d is dying",
996					    jid);
997					goto done_unlock_list;
998				} else if ((flags & JAIL_ATTACH) ||
999				    (pr_flags & PR_PERSIST)) {
1000					/*
1001					 * A dying jail might be resurrected
1002					 * (via attach or persist), but first
1003					 * it must determine if another jail
1004					 * has claimed its name.  Accomplish
1005					 * this by implicitly re-setting the
1006					 * name.
1007					 */
1008					if (name == NULL)
1009						name = prison_name(mypr, pr);
1010				}
1011			}
1012		}
1013		if (pr == NULL) {
1014			/* Update: jid must exist. */
1015			if (cuflags == JAIL_UPDATE) {
1016				error = ENOENT;
1017				vfs_opterror(opts, "jail %d not found", jid);
1018				goto done_unlock_list;
1019			}
1020		}
1021	}
1022	/*
1023	 * If the caller provided a name, look for a jail by that name.
1024	 * This has different semantics for creates and updates keyed by jid
1025	 * (where the name must not already exist in a different jail),
1026	 * and updates keyed by the name itself (where the name must exist
1027	 * because that is the jail being updated).
1028	 */
1029	if (name != NULL) {
1030		namelc = strrchr(name, '.');
1031		if (namelc == NULL)
1032			namelc = name;
1033		else {
1034			/*
1035			 * This is a hierarchical name.  Split it into the
1036			 * parent and child names, and make sure the parent
1037			 * exists or matches an already found jail.
1038			 */
1039			*namelc = '\0';
1040			if (pr != NULL) {
1041				if (strncmp(name, ppr->pr_name, namelc - name)
1042				    || ppr->pr_name[namelc - name] != '\0') {
1043					mtx_unlock(&pr->pr_mtx);
1044					error = EINVAL;
1045					vfs_opterror(opts,
1046					    "cannot change jail's parent");
1047					goto done_unlock_list;
1048				}
1049			} else {
1050				ppr = prison_find_name(mypr, name);
1051				if (ppr == NULL) {
1052					error = ENOENT;
1053					vfs_opterror(opts,
1054					    "jail \"%s\" not found", name);
1055					goto done_unlock_list;
1056				}
1057				mtx_unlock(&ppr->pr_mtx);
1058			}
1059			name = ++namelc;
1060		}
1061		if (name[0] != '\0') {
1062			namelen =
1063			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1064 name_again:
1065			deadpr = NULL;
1066			FOREACH_PRISON_CHILD(ppr, tpr) {
1067				if (tpr != pr && tpr->pr_ref > 0 &&
1068				    !strcmp(tpr->pr_name + namelen, name)) {
1069					if (pr == NULL &&
1070					    cuflags != JAIL_CREATE) {
1071						mtx_lock(&tpr->pr_mtx);
1072						if (tpr->pr_ref > 0) {
1073							/*
1074							 * Use this jail
1075							 * for updates.
1076							 */
1077							if (tpr->pr_uref > 0) {
1078								pr = tpr;
1079								break;
1080							}
1081							deadpr = tpr;
1082						}
1083						mtx_unlock(&tpr->pr_mtx);
1084					} else if (tpr->pr_uref > 0) {
1085						/*
1086						 * Create, or update(jid):
1087						 * name must not exist in an
1088						 * active sibling jail.
1089						 */
1090						error = EEXIST;
1091						if (pr != NULL)
1092							mtx_unlock(&pr->pr_mtx);
1093						vfs_opterror(opts,
1094						   "jail \"%s\" already exists",
1095						   name);
1096						goto done_unlock_list;
1097					}
1098				}
1099			}
1100			/* If no active jail is found, use a dying one. */
1101			if (deadpr != NULL && pr == NULL) {
1102				if (flags & JAIL_DYING) {
1103					mtx_lock(&deadpr->pr_mtx);
1104					if (deadpr->pr_ref == 0) {
1105						mtx_unlock(&deadpr->pr_mtx);
1106						goto name_again;
1107					}
1108					pr = deadpr;
1109				} else if (cuflags == JAIL_UPDATE) {
1110					error = ENOENT;
1111					vfs_opterror(opts,
1112					    "jail \"%s\" is dying", name);
1113					goto done_unlock_list;
1114				}
1115			}
1116			/* Update: name must exist if no jid. */
1117			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1118				error = ENOENT;
1119				vfs_opterror(opts, "jail \"%s\" not found",
1120				    name);
1121				goto done_unlock_list;
1122			}
1123		}
1124	}
1125	/* Update: must provide a jid or name. */
1126	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1127		error = ENOENT;
1128		vfs_opterror(opts, "update specified no jail");
1129		goto done_unlock_list;
1130	}
1131
1132	/* If there's no prison to update, create a new one and link it in. */
1133	if (pr == NULL) {
1134		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1135			if (tpr->pr_childcount >= tpr->pr_childmax) {
1136				error = EPERM;
1137				vfs_opterror(opts, "prison limit exceeded");
1138				goto done_unlock_list;
1139			}
1140		created = 1;
1141		mtx_lock(&ppr->pr_mtx);
1142		if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
1143			mtx_unlock(&ppr->pr_mtx);
1144			error = ENOENT;
1145			vfs_opterror(opts, "parent jail went away!");
1146			goto done_unlock_list;
1147		}
1148		ppr->pr_ref++;
1149		ppr->pr_uref++;
1150		mtx_unlock(&ppr->pr_mtx);
1151		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1152		if (jid == 0) {
1153			/* Find the next free jid. */
1154			jid = lastprid + 1;
1155 findnext:
1156			if (jid == JAIL_MAX)
1157				jid = 1;
1158			TAILQ_FOREACH(tpr, &allprison, pr_list) {
1159				if (tpr->pr_id < jid)
1160					continue;
1161				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1162					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1163					break;
1164				}
1165				if (jid == lastprid) {
1166					error = EAGAIN;
1167					vfs_opterror(opts,
1168					    "no available jail IDs");
1169					free(pr, M_PRISON);
1170					prison_deref(ppr, PD_DEREF |
1171					    PD_DEUREF | PD_LIST_XLOCKED);
1172					goto done_releroot;
1173				}
1174				jid++;
1175				goto findnext;
1176			}
1177			lastprid = jid;
1178		} else {
1179			/*
1180			 * The jail already has a jid (that did not yet exist),
1181			 * so just find where to insert it.
1182			 */
1183			TAILQ_FOREACH(tpr, &allprison, pr_list)
1184				if (tpr->pr_id >= jid) {
1185					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1186					break;
1187				}
1188		}
1189		if (tpr == NULL)
1190			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1191		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1192		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1193			tpr->pr_childcount++;
1194
1195		pr->pr_parent = ppr;
1196		pr->pr_id = jid;
1197
1198		/* Set some default values, and inherit some from the parent. */
1199		if (name == NULL)
1200			name = "";
1201		if (path == NULL) {
1202			path = "/";
1203			root = mypr->pr_root;
1204			vref(root);
1205		}
1206		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1207		pr->pr_flags |= PR_HOST;
1208#if defined(INET) || defined(INET6)
1209#ifdef VIMAGE
1210		if (!(pr_flags & PR_VNET))
1211#endif
1212		{
1213#ifdef INET
1214			if (!(ch_flags & PR_IP4_USER))
1215				pr->pr_flags |=
1216				    PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
1217			else if (!(pr_flags & PR_IP4_USER)) {
1218				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1219				if (ppr->pr_ip4 != NULL) {
1220					pr->pr_ip4s = ppr->pr_ip4s;
1221					pr->pr_ip4 = malloc(pr->pr_ip4s *
1222					    sizeof(struct in_addr), M_PRISON,
1223					    M_WAITOK);
1224					bcopy(ppr->pr_ip4, pr->pr_ip4,
1225					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1226				}
1227			}
1228#endif
1229#ifdef INET6
1230			if (!(ch_flags & PR_IP6_USER))
1231				pr->pr_flags |=
1232				    PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
1233			else if (!(pr_flags & PR_IP6_USER)) {
1234				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1235				if (ppr->pr_ip6 != NULL) {
1236					pr->pr_ip6s = ppr->pr_ip6s;
1237					pr->pr_ip6 = malloc(pr->pr_ip6s *
1238					    sizeof(struct in6_addr), M_PRISON,
1239					    M_WAITOK);
1240					bcopy(ppr->pr_ip6, pr->pr_ip6,
1241					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1242				}
1243			}
1244#endif
1245		}
1246#endif
1247		/* Source address selection is always on by default. */
1248		pr->pr_flags |= _PR_IP_SADDRSEL;
1249
1250		pr->pr_securelevel = ppr->pr_securelevel;
1251		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1252		pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
1253
1254		LIST_INIT(&pr->pr_children);
1255		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1256
1257#ifdef VIMAGE
1258		/* Allocate a new vnet if specified. */
1259		pr->pr_vnet = (pr_flags & PR_VNET)
1260		    ? vnet_alloc() : ppr->pr_vnet;
1261#endif
1262		/*
1263		 * Allocate a dedicated cpuset for each jail.
1264		 * Unlike other initial settings, this may return an error.
1265		 */
1266		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1267		if (error) {
1268			prison_deref(pr, PD_LIST_XLOCKED);
1269			goto done_releroot;
1270		}
1271
1272		mtx_lock(&pr->pr_mtx);
1273		/*
1274		 * New prisons do not yet have a reference, because we do not
1275		 * want others to see the incomplete prison once the
1276		 * allprison_lock is downgraded.
1277		 */
1278	} else {
1279		created = 0;
1280		/*
1281		 * Grab a reference for existing prisons, to ensure they
1282		 * continue to exist for the duration of the call.
1283		 */
1284		pr->pr_ref++;
1285#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1286		if ((pr->pr_flags & PR_VNET) &&
1287		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1288			error = EINVAL;
1289			vfs_opterror(opts,
1290			    "vnet jails cannot have IP address restrictions");
1291			goto done_deref_locked;
1292		}
1293#endif
1294#ifdef INET
1295		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1296			error = EINVAL;
1297			vfs_opterror(opts,
1298			    "ip4 cannot be changed after creation");
1299			goto done_deref_locked;
1300		}
1301#endif
1302#ifdef INET6
1303		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1304			error = EINVAL;
1305			vfs_opterror(opts,
1306			    "ip6 cannot be changed after creation");
1307			goto done_deref_locked;
1308		}
1309#endif
1310	}
1311
1312	/* Do final error checking before setting anything. */
1313	if (gotslevel) {
1314		if (slevel < ppr->pr_securelevel) {
1315			error = EPERM;
1316			goto done_deref_locked;
1317		}
1318	}
1319	if (gotchildmax) {
1320		if (childmax >= ppr->pr_childmax) {
1321			error = EPERM;
1322			goto done_deref_locked;
1323		}
1324	}
1325	if (gotenforce) {
1326		if (enforce < ppr->pr_enforce_statfs) {
1327			error = EPERM;
1328			goto done_deref_locked;
1329		}
1330	}
1331#ifdef INET
1332	if (ip4s > 0) {
1333		if (ppr->pr_flags & PR_IP4) {
1334			/*
1335			 * Make sure the new set of IP addresses is a
1336			 * subset of the parent's list.  Don't worry
1337			 * about the parent being unlocked, as any
1338			 * setting is done with allprison_lock held.
1339			 */
1340			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1341				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1342					break;
1343			if (ij == ppr->pr_ip4s) {
1344				error = EPERM;
1345				goto done_deref_locked;
1346			}
1347			if (ip4s > 1) {
1348				for (ii = ij = 1; ii < ip4s; ii++) {
1349					if (ip4[ii].s_addr ==
1350					    ppr->pr_ip4[0].s_addr)
1351						continue;
1352					for (; ij < ppr->pr_ip4s; ij++)
1353						if (ip4[ii].s_addr ==
1354						    ppr->pr_ip4[ij].s_addr)
1355							break;
1356					if (ij == ppr->pr_ip4s)
1357						break;
1358				}
1359				if (ij == ppr->pr_ip4s) {
1360					error = EPERM;
1361					goto done_deref_locked;
1362				}
1363			}
1364		}
1365		/*
1366		 * Check for conflicting IP addresses.  We permit them
1367		 * if there is no more than one IP on each jail.  If
1368		 * there is a duplicate on a jail with more than one
1369		 * IP stop checking and return error.
1370		 */
1371		tppr = ppr;
1372#ifdef VIMAGE
1373		for (; tppr != &prison0; tppr = tppr->pr_parent)
1374			if (tppr->pr_flags & PR_VNET)
1375				break;
1376#endif
1377		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1378			if (tpr == pr ||
1379#ifdef VIMAGE
1380			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1381#endif
1382			    tpr->pr_uref == 0) {
1383				descend = 0;
1384				continue;
1385			}
1386			if (!(tpr->pr_flags & PR_IP4_USER))
1387				continue;
1388			descend = 0;
1389			if (tpr->pr_ip4 == NULL ||
1390			    (ip4s == 1 && tpr->pr_ip4s == 1))
1391				continue;
1392			for (ii = 0; ii < ip4s; ii++) {
1393				if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
1394					error = EADDRINUSE;
1395					vfs_opterror(opts,
1396					    "IPv4 addresses clash");
1397					goto done_deref_locked;
1398				}
1399			}
1400		}
1401	}
1402#endif
1403#ifdef INET6
1404	if (ip6s > 0) {
1405		if (ppr->pr_flags & PR_IP6) {
1406			/*
1407			 * Make sure the new set of IP addresses is a
1408			 * subset of the parent's list.
1409			 */
1410			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1411				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1412				    &ppr->pr_ip6[ij]))
1413					break;
1414			if (ij == ppr->pr_ip6s) {
1415				error = EPERM;
1416				goto done_deref_locked;
1417			}
1418			if (ip6s > 1) {
1419				for (ii = ij = 1; ii < ip6s; ii++) {
1420					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1421					     &ppr->pr_ip6[0]))
1422						continue;
1423					for (; ij < ppr->pr_ip6s; ij++)
1424						if (IN6_ARE_ADDR_EQUAL(
1425						    &ip6[ii], &ppr->pr_ip6[ij]))
1426							break;
1427					if (ij == ppr->pr_ip6s)
1428						break;
1429				}
1430				if (ij == ppr->pr_ip6s) {
1431					error = EPERM;
1432					goto done_deref_locked;
1433				}
1434			}
1435		}
1436		/* Check for conflicting IP addresses. */
1437		tppr = ppr;
1438#ifdef VIMAGE
1439		for (; tppr != &prison0; tppr = tppr->pr_parent)
1440			if (tppr->pr_flags & PR_VNET)
1441				break;
1442#endif
1443		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1444			if (tpr == pr ||
1445#ifdef VIMAGE
1446			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1447#endif
1448			    tpr->pr_uref == 0) {
1449				descend = 0;
1450				continue;
1451			}
1452			if (!(tpr->pr_flags & PR_IP6_USER))
1453				continue;
1454			descend = 0;
1455			if (tpr->pr_ip6 == NULL ||
1456			    (ip6s == 1 && tpr->pr_ip6s == 1))
1457				continue;
1458			for (ii = 0; ii < ip6s; ii++) {
1459				if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
1460					error = EADDRINUSE;
1461					vfs_opterror(opts,
1462					    "IPv6 addresses clash");
1463					goto done_deref_locked;
1464				}
1465			}
1466		}
1467	}
1468#endif
1469	onamelen = namelen = 0;
1470	if (name != NULL) {
1471		/* Give a default name of the jid. */
1472		if (name[0] == '\0')
1473			snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
1474		else if (*namelc == '0' || (strtoul(namelc, &p, 10) != jid &&
1475		    *p == '\0')) {
1476			error = EINVAL;
1477			vfs_opterror(opts,
1478			    "name cannot be numeric (unless it is the jid)");
1479			goto done_deref_locked;
1480		}
1481		/*
1482		 * Make sure the name isn't too long for the prison or its
1483		 * children.
1484		 */
1485		onamelen = strlen(pr->pr_name);
1486		namelen = strlen(name);
1487		if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
1488			error = ENAMETOOLONG;
1489			goto done_deref_locked;
1490		}
1491		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1492			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1493			    sizeof(pr->pr_name)) {
1494				error = ENAMETOOLONG;
1495				goto done_deref_locked;
1496			}
1497		}
1498	}
1499	if (pr_allow & ~ppr->pr_allow) {
1500		error = EPERM;
1501		goto done_deref_locked;
1502	}
1503
1504	/* Set the parameters of the prison. */
1505#ifdef INET
1506	redo_ip4 = 0;
1507	if (pr_flags & PR_IP4_USER) {
1508		pr->pr_flags |= PR_IP4;
1509		free(pr->pr_ip4, M_PRISON);
1510		pr->pr_ip4s = ip4s;
1511		pr->pr_ip4 = ip4;
1512		ip4 = NULL;
1513		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1514#ifdef VIMAGE
1515			if (tpr->pr_flags & PR_VNET) {
1516				descend = 0;
1517				continue;
1518			}
1519#endif
1520			if (prison_restrict_ip4(tpr, NULL)) {
1521				redo_ip4 = 1;
1522				descend = 0;
1523			}
1524		}
1525	}
1526#endif
1527#ifdef INET6
1528	redo_ip6 = 0;
1529	if (pr_flags & PR_IP6_USER) {
1530		pr->pr_flags |= PR_IP6;
1531		free(pr->pr_ip6, M_PRISON);
1532		pr->pr_ip6s = ip6s;
1533		pr->pr_ip6 = ip6;
1534		ip6 = NULL;
1535		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1536#ifdef VIMAGE
1537			if (tpr->pr_flags & PR_VNET) {
1538				descend = 0;
1539				continue;
1540			}
1541#endif
1542			if (prison_restrict_ip6(tpr, NULL)) {
1543				redo_ip6 = 1;
1544				descend = 0;
1545			}
1546		}
1547	}
1548#endif
1549	if (gotslevel) {
1550		pr->pr_securelevel = slevel;
1551		/* Set all child jails to be at least this level. */
1552		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1553			if (tpr->pr_securelevel < slevel)
1554				tpr->pr_securelevel = slevel;
1555	}
1556	if (gotchildmax) {
1557		pr->pr_childmax = childmax;
1558		/* Set all child jails to under this limit. */
1559		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1560			if (tpr->pr_childmax > childmax - level)
1561				tpr->pr_childmax = childmax > level
1562				    ? childmax - level : 0;
1563	}
1564	if (gotenforce) {
1565		pr->pr_enforce_statfs = enforce;
1566		/* Pass this restriction on to the children. */
1567		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1568			if (tpr->pr_enforce_statfs < enforce)
1569				tpr->pr_enforce_statfs = enforce;
1570	}
1571	if (name != NULL) {
1572		if (ppr == &prison0)
1573			strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
1574		else
1575			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1576			    ppr->pr_name, name);
1577		/* Change this component of child names. */
1578		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1579			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1580			    strlen(tpr->pr_name + onamelen) + 1);
1581			bcopy(pr->pr_name, tpr->pr_name, namelen);
1582		}
1583	}
1584	if (path != NULL) {
1585		/* Try to keep a real-rooted full pathname. */
1586		if (path[0] == '/' && strcmp(mypr->pr_path, "/"))
1587			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1588			    mypr->pr_path, path);
1589		else
1590			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1591		pr->pr_root = root;
1592	}
1593	if (PR_HOST & ch_flags & ~pr_flags) {
1594		if (pr->pr_flags & PR_HOST) {
1595			/*
1596			 * Copy the parent's host info.  As with pr_ip4 above,
1597			 * the lack of a lock on the parent is not a problem;
1598			 * it is always set with allprison_lock at least
1599			 * shared, and is held exclusively here.
1600			 */
1601			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1602			    sizeof(pr->pr_hostname));
1603			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1604			    sizeof(pr->pr_domainname));
1605			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1606			    sizeof(pr->pr_hostuuid));
1607			pr->pr_hostid = pr->pr_parent->pr_hostid;
1608		}
1609	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1610		/* Set this prison, and any descendants without PR_HOST. */
1611		if (host != NULL)
1612			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1613		if (domain != NULL)
1614			strlcpy(pr->pr_domainname, domain,
1615			    sizeof(pr->pr_domainname));
1616		if (uuid != NULL)
1617			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1618		if (gothid)
1619			pr->pr_hostid = hid;
1620		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1621			if (tpr->pr_flags & PR_HOST)
1622				descend = 0;
1623			else {
1624				if (host != NULL)
1625					strlcpy(tpr->pr_hostname,
1626					    pr->pr_hostname,
1627					    sizeof(tpr->pr_hostname));
1628				if (domain != NULL)
1629					strlcpy(tpr->pr_domainname,
1630					    pr->pr_domainname,
1631					    sizeof(tpr->pr_domainname));
1632				if (uuid != NULL)
1633					strlcpy(tpr->pr_hostuuid,
1634					    pr->pr_hostuuid,
1635					    sizeof(tpr->pr_hostuuid));
1636				if (gothid)
1637					tpr->pr_hostid = hid;
1638			}
1639		}
1640	}
1641	if ((tallow = ch_allow & ~pr_allow)) {
1642		/* Clear allow bits in all children. */
1643		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1644			tpr->pr_allow &= ~tallow;
1645	}
1646	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1647	/*
1648	 * Persistent prisons get an extra reference, and prisons losing their
1649	 * persist flag lose that reference.  Only do this for existing prisons
1650	 * for now, so new ones will remain unseen until after the module
1651	 * handlers have completed.
1652	 */
1653	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1654		if (pr_flags & PR_PERSIST) {
1655			pr->pr_ref++;
1656			pr->pr_uref++;
1657		} else {
1658			pr->pr_ref--;
1659			pr->pr_uref--;
1660		}
1661	}
1662	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1663	mtx_unlock(&pr->pr_mtx);
1664
1665#ifdef RACCT
1666	if (created)
1667		prison_racct_attach(pr);
1668#endif
1669
1670	/* Locks may have prevented a complete restriction of child IP
1671	 * addresses.  If so, allocate some more memory and try again.
1672	 */
1673#ifdef INET
1674	while (redo_ip4) {
1675		ip4s = pr->pr_ip4s;
1676		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1677		mtx_lock(&pr->pr_mtx);
1678		redo_ip4 = 0;
1679		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1680#ifdef VIMAGE
1681			if (tpr->pr_flags & PR_VNET) {
1682				descend = 0;
1683				continue;
1684			}
1685#endif
1686			if (prison_restrict_ip4(tpr, ip4)) {
1687				if (ip4 != NULL)
1688					ip4 = NULL;
1689				else
1690					redo_ip4 = 1;
1691			}
1692		}
1693		mtx_unlock(&pr->pr_mtx);
1694	}
1695#endif
1696#ifdef INET6
1697	while (redo_ip6) {
1698		ip6s = pr->pr_ip6s;
1699		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1700		mtx_lock(&pr->pr_mtx);
1701		redo_ip6 = 0;
1702		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1703#ifdef VIMAGE
1704			if (tpr->pr_flags & PR_VNET) {
1705				descend = 0;
1706				continue;
1707			}
1708#endif
1709			if (prison_restrict_ip6(tpr, ip6)) {
1710				if (ip6 != NULL)
1711					ip6 = NULL;
1712				else
1713					redo_ip6 = 1;
1714			}
1715		}
1716		mtx_unlock(&pr->pr_mtx);
1717	}
1718#endif
1719
1720	/* Let the modules do their work. */
1721	sx_downgrade(&allprison_lock);
1722	if (created) {
1723		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1724		if (error) {
1725			prison_deref(pr, PD_LIST_SLOCKED);
1726			goto done_errmsg;
1727		}
1728	}
1729	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1730	if (error) {
1731		prison_deref(pr, created
1732		    ? PD_LIST_SLOCKED
1733		    : PD_DEREF | PD_LIST_SLOCKED);
1734		goto done_errmsg;
1735	}
1736
1737	/* Attach this process to the prison if requested. */
1738	if (flags & JAIL_ATTACH) {
1739		mtx_lock(&pr->pr_mtx);
1740		error = do_jail_attach(td, pr);
1741		if (error) {
1742			vfs_opterror(opts, "attach failed");
1743			if (!created)
1744				prison_deref(pr, PD_DEREF);
1745			goto done_errmsg;
1746		}
1747	}
1748
1749	/*
1750	 * Now that it is all there, drop the temporary reference from existing
1751	 * prisons.  Or add a reference to newly created persistent prisons
1752	 * (which was not done earlier so that the prison would not be publicly
1753	 * visible).
1754	 */
1755	if (!created) {
1756		prison_deref(pr, (flags & JAIL_ATTACH)
1757		    ? PD_DEREF
1758		    : PD_DEREF | PD_LIST_SLOCKED);
1759	} else {
1760		if (pr_flags & PR_PERSIST) {
1761			mtx_lock(&pr->pr_mtx);
1762			pr->pr_ref++;
1763			pr->pr_uref++;
1764			mtx_unlock(&pr->pr_mtx);
1765		}
1766		if (!(flags & JAIL_ATTACH))
1767			sx_sunlock(&allprison_lock);
1768	}
1769	td->td_retval[0] = pr->pr_id;
1770	goto done_errmsg;
1771
1772 done_deref_locked:
1773	prison_deref(pr, created
1774	    ? PD_LOCKED | PD_LIST_XLOCKED
1775	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1776	goto done_releroot;
1777 done_unlock_list:
1778	sx_xunlock(&allprison_lock);
1779 done_releroot:
1780	if (root != NULL) {
1781		vfslocked = VFS_LOCK_GIANT(root->v_mount);
1782		vrele(root);
1783		VFS_UNLOCK_GIANT(vfslocked);
1784	}
1785 done_errmsg:
1786	if (error) {
1787		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1788		if (errmsg_len > 0) {
1789			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1790			if (errmsg_pos > 0) {
1791				if (optuio->uio_segflg == UIO_SYSSPACE)
1792					bcopy(errmsg,
1793					   optuio->uio_iov[errmsg_pos].iov_base,
1794					   errmsg_len);
1795				else
1796					copyout(errmsg,
1797					   optuio->uio_iov[errmsg_pos].iov_base,
1798					   errmsg_len);
1799			}
1800		}
1801	}
1802 done_free:
1803#ifdef INET
1804	free(ip4, M_PRISON);
1805#endif
1806#ifdef INET6
1807	free(ip6, M_PRISON);
1808#endif
1809	vfs_freeopts(opts);
1810	return (error);
1811}
1812
1813
1814/*
1815 * struct jail_get_args {
1816 *	struct iovec *iovp;
1817 *	unsigned int iovcnt;
1818 *	int flags;
1819 * };
1820 */
1821int
1822jail_get(struct thread *td, struct jail_get_args *uap)
1823{
1824	struct uio *auio;
1825	int error;
1826
1827	/* Check that we have an even number of iovecs. */
1828	if (uap->iovcnt & 1)
1829		return (EINVAL);
1830
1831	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1832	if (error)
1833		return (error);
1834	error = kern_jail_get(td, auio, uap->flags);
1835	if (error == 0)
1836		error = copyout(auio->uio_iov, uap->iovp,
1837		    uap->iovcnt * sizeof (struct iovec));
1838	free(auio, M_IOV);
1839	return (error);
1840}
1841
1842int
1843kern_jail_get(struct thread *td, struct uio *optuio, int flags)
1844{
1845	struct prison *pr, *mypr;
1846	struct vfsopt *opt;
1847	struct vfsoptlist *opts;
1848	char *errmsg, *name;
1849	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
1850
1851	if (flags & ~JAIL_GET_MASK)
1852		return (EINVAL);
1853
1854	/* Get the parameter list. */
1855	error = vfs_buildopts(optuio, &opts);
1856	if (error)
1857		return (error);
1858	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
1859	mypr = td->td_ucred->cr_prison;
1860
1861	/*
1862	 * Find the prison specified by one of: lastjid, jid, name.
1863	 */
1864	sx_slock(&allprison_lock);
1865	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
1866	if (error == 0) {
1867		TAILQ_FOREACH(pr, &allprison, pr_list) {
1868			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
1869				mtx_lock(&pr->pr_mtx);
1870				if (pr->pr_ref > 0 &&
1871				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
1872					break;
1873				mtx_unlock(&pr->pr_mtx);
1874			}
1875		}
1876		if (pr != NULL)
1877			goto found_prison;
1878		error = ENOENT;
1879		vfs_opterror(opts, "no jail after %d", jid);
1880		goto done_unlock_list;
1881	} else if (error != ENOENT)
1882		goto done_unlock_list;
1883
1884	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1885	if (error == 0) {
1886		if (jid != 0) {
1887			pr = prison_find_child(mypr, jid);
1888			if (pr != NULL) {
1889				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1890					mtx_unlock(&pr->pr_mtx);
1891					error = ENOENT;
1892					vfs_opterror(opts, "jail %d is dying",
1893					    jid);
1894					goto done_unlock_list;
1895				}
1896				goto found_prison;
1897			}
1898			error = ENOENT;
1899			vfs_opterror(opts, "jail %d not found", jid);
1900			goto done_unlock_list;
1901		}
1902	} else if (error != ENOENT)
1903		goto done_unlock_list;
1904
1905	error = vfs_getopt(opts, "name", (void **)&name, &len);
1906	if (error == 0) {
1907		if (len == 0 || name[len - 1] != '\0') {
1908			error = EINVAL;
1909			goto done_unlock_list;
1910		}
1911		pr = prison_find_name(mypr, name);
1912		if (pr != NULL) {
1913			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1914				mtx_unlock(&pr->pr_mtx);
1915				error = ENOENT;
1916				vfs_opterror(opts, "jail \"%s\" is dying",
1917				    name);
1918				goto done_unlock_list;
1919			}
1920			goto found_prison;
1921		}
1922		error = ENOENT;
1923		vfs_opterror(opts, "jail \"%s\" not found", name);
1924		goto done_unlock_list;
1925	} else if (error != ENOENT)
1926		goto done_unlock_list;
1927
1928	vfs_opterror(opts, "no jail specified");
1929	error = ENOENT;
1930	goto done_unlock_list;
1931
1932 found_prison:
1933	/* Get the parameters of the prison. */
1934	pr->pr_ref++;
1935	locked = PD_LOCKED;
1936	td->td_retval[0] = pr->pr_id;
1937	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
1938	if (error != 0 && error != ENOENT)
1939		goto done_deref;
1940	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
1941	error = vfs_setopt(opts, "parent", &i, sizeof(i));
1942	if (error != 0 && error != ENOENT)
1943		goto done_deref;
1944	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
1945	if (error != 0 && error != ENOENT)
1946		goto done_deref;
1947	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
1948	    sizeof(pr->pr_cpuset->cs_id));
1949	if (error != 0 && error != ENOENT)
1950		goto done_deref;
1951	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
1952	if (error != 0 && error != ENOENT)
1953		goto done_deref;
1954#ifdef INET
1955	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
1956	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1957	if (error != 0 && error != ENOENT)
1958		goto done_deref;
1959#endif
1960#ifdef INET6
1961	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
1962	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1963	if (error != 0 && error != ENOENT)
1964		goto done_deref;
1965#endif
1966	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
1967	    sizeof(pr->pr_securelevel));
1968	if (error != 0 && error != ENOENT)
1969		goto done_deref;
1970	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
1971	    sizeof(pr->pr_childcount));
1972	if (error != 0 && error != ENOENT)
1973		goto done_deref;
1974	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
1975	    sizeof(pr->pr_childmax));
1976	if (error != 0 && error != ENOENT)
1977		goto done_deref;
1978	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
1979	if (error != 0 && error != ENOENT)
1980		goto done_deref;
1981	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
1982	if (error != 0 && error != ENOENT)
1983		goto done_deref;
1984	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
1985	if (error != 0 && error != ENOENT)
1986		goto done_deref;
1987#ifdef COMPAT_FREEBSD32
1988	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
1989		uint32_t hid32 = pr->pr_hostid;
1990
1991		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
1992	} else
1993#endif
1994	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
1995	    sizeof(pr->pr_hostid));
1996	if (error != 0 && error != ENOENT)
1997		goto done_deref;
1998	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
1999	    sizeof(pr->pr_enforce_statfs));
2000	if (error != 0 && error != ENOENT)
2001		goto done_deref;
2002	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
2003	    fi++) {
2004		if (pr_flag_names[fi] == NULL)
2005			continue;
2006		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
2007		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
2008		if (error != 0 && error != ENOENT)
2009			goto done_deref;
2010		i = !i;
2011		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
2012		if (error != 0 && error != ENOENT)
2013			goto done_deref;
2014	}
2015	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
2016	    fi++) {
2017		i = pr->pr_flags &
2018		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
2019		i = pr_flag_jailsys[fi].disable &&
2020		      (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
2021		    : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
2022		    : JAIL_SYS_INHERIT;
2023		error =
2024		    vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
2025		if (error != 0 && error != ENOENT)
2026			goto done_deref;
2027	}
2028	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
2029	    fi++) {
2030		if (pr_allow_names[fi] == NULL)
2031			continue;
2032		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
2033		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
2034		if (error != 0 && error != ENOENT)
2035			goto done_deref;
2036		i = !i;
2037		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
2038		if (error != 0 && error != ENOENT)
2039			goto done_deref;
2040	}
2041	i = (pr->pr_uref == 0);
2042	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2043	if (error != 0 && error != ENOENT)
2044		goto done_deref;
2045	i = !i;
2046	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2047	if (error != 0 && error != ENOENT)
2048		goto done_deref;
2049
2050	/* Get the module parameters. */
2051	mtx_unlock(&pr->pr_mtx);
2052	locked = 0;
2053	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2054	if (error)
2055		goto done_deref;
2056	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
2057
2058	/* By now, all parameters should have been noted. */
2059	TAILQ_FOREACH(opt, opts, link) {
2060		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2061			error = EINVAL;
2062			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2063			goto done_errmsg;
2064		}
2065	}
2066
2067	/* Write the fetched parameters back to userspace. */
2068	error = 0;
2069	TAILQ_FOREACH(opt, opts, link) {
2070		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2071			pos = 2 * opt->pos + 1;
2072			optuio->uio_iov[pos].iov_len = opt->len;
2073			if (opt->value != NULL) {
2074				if (optuio->uio_segflg == UIO_SYSSPACE) {
2075					bcopy(opt->value,
2076					    optuio->uio_iov[pos].iov_base,
2077					    opt->len);
2078				} else {
2079					error = copyout(opt->value,
2080					    optuio->uio_iov[pos].iov_base,
2081					    opt->len);
2082					if (error)
2083						break;
2084				}
2085			}
2086		}
2087	}
2088	goto done_errmsg;
2089
2090 done_deref:
2091	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
2092	goto done_errmsg;
2093
2094 done_unlock_list:
2095	sx_sunlock(&allprison_lock);
2096 done_errmsg:
2097	if (error && errmsg_pos >= 0) {
2098		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2099		errmsg_pos = 2 * errmsg_pos + 1;
2100		if (errmsg_len > 0) {
2101			if (optuio->uio_segflg == UIO_SYSSPACE)
2102				bcopy(errmsg,
2103				    optuio->uio_iov[errmsg_pos].iov_base,
2104				    errmsg_len);
2105			else
2106				copyout(errmsg,
2107				    optuio->uio_iov[errmsg_pos].iov_base,
2108				    errmsg_len);
2109		}
2110	}
2111	vfs_freeopts(opts);
2112	return (error);
2113}
2114
2115
2116/*
2117 * struct jail_remove_args {
2118 *	int jid;
2119 * };
2120 */
2121int
2122jail_remove(struct thread *td, struct jail_remove_args *uap)
2123{
2124	struct prison *pr, *cpr, *lpr, *tpr;
2125	int descend, error;
2126
2127	error = priv_check(td, PRIV_JAIL_REMOVE);
2128	if (error)
2129		return (error);
2130
2131	sx_xlock(&allprison_lock);
2132	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2133	if (pr == NULL) {
2134		sx_xunlock(&allprison_lock);
2135		return (EINVAL);
2136	}
2137
2138	/* Remove all descendants of this prison, then remove this prison. */
2139	pr->pr_ref++;
2140	pr->pr_flags |= PR_REMOVE;
2141	if (!LIST_EMPTY(&pr->pr_children)) {
2142		mtx_unlock(&pr->pr_mtx);
2143		lpr = NULL;
2144		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
2145			mtx_lock(&cpr->pr_mtx);
2146			if (cpr->pr_ref > 0) {
2147				tpr = cpr;
2148				cpr->pr_ref++;
2149				cpr->pr_flags |= PR_REMOVE;
2150			} else {
2151				/* Already removed - do not do it again. */
2152				tpr = NULL;
2153			}
2154			mtx_unlock(&cpr->pr_mtx);
2155			if (lpr != NULL) {
2156				mtx_lock(&lpr->pr_mtx);
2157				prison_remove_one(lpr);
2158				sx_xlock(&allprison_lock);
2159			}
2160			lpr = tpr;
2161		}
2162		if (lpr != NULL) {
2163			mtx_lock(&lpr->pr_mtx);
2164			prison_remove_one(lpr);
2165			sx_xlock(&allprison_lock);
2166		}
2167		mtx_lock(&pr->pr_mtx);
2168	}
2169	prison_remove_one(pr);
2170	return (0);
2171}
2172
2173static void
2174prison_remove_one(struct prison *pr)
2175{
2176	struct proc *p;
2177	int deuref;
2178
2179	/* If the prison was persistent, it is not anymore. */
2180	deuref = 0;
2181	if (pr->pr_flags & PR_PERSIST) {
2182		pr->pr_ref--;
2183		deuref = PD_DEUREF;
2184		pr->pr_flags &= ~PR_PERSIST;
2185	}
2186
2187	/*
2188	 * jail_remove added a reference.  If that's the only one, remove
2189	 * the prison now.
2190	 */
2191	KASSERT(pr->pr_ref > 0,
2192	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
2193	if (pr->pr_ref == 1) {
2194		prison_deref(pr,
2195		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2196		return;
2197	}
2198
2199	mtx_unlock(&pr->pr_mtx);
2200	sx_xunlock(&allprison_lock);
2201	/*
2202	 * Kill all processes unfortunate enough to be attached to this prison.
2203	 */
2204	sx_slock(&allproc_lock);
2205	LIST_FOREACH(p, &allproc, p_list) {
2206		PROC_LOCK(p);
2207		if (p->p_state != PRS_NEW && p->p_ucred &&
2208		    p->p_ucred->cr_prison == pr)
2209			psignal(p, SIGKILL);
2210		PROC_UNLOCK(p);
2211	}
2212	sx_sunlock(&allproc_lock);
2213	/* Remove the temporary reference added by jail_remove. */
2214	prison_deref(pr, deuref | PD_DEREF);
2215}
2216
2217
2218/*
2219 * struct jail_attach_args {
2220 *	int jid;
2221 * };
2222 */
2223int
2224jail_attach(struct thread *td, struct jail_attach_args *uap)
2225{
2226	struct prison *pr;
2227	int error;
2228
2229	error = priv_check(td, PRIV_JAIL_ATTACH);
2230	if (error)
2231		return (error);
2232
2233	sx_slock(&allprison_lock);
2234	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2235	if (pr == NULL) {
2236		sx_sunlock(&allprison_lock);
2237		return (EINVAL);
2238	}
2239
2240	/*
2241	 * Do not allow a process to attach to a prison that is not
2242	 * considered to be "alive".
2243	 */
2244	if (pr->pr_uref == 0) {
2245		mtx_unlock(&pr->pr_mtx);
2246		sx_sunlock(&allprison_lock);
2247		return (EINVAL);
2248	}
2249
2250	return (do_jail_attach(td, pr));
2251}
2252
2253static int
2254do_jail_attach(struct thread *td, struct prison *pr)
2255{
2256	struct prison *ppr;
2257	struct proc *p;
2258	struct ucred *newcred, *oldcred;
2259	int vfslocked, error;
2260
2261	/*
2262	 * XXX: Note that there is a slight race here if two threads
2263	 * in the same privileged process attempt to attach to two
2264	 * different jails at the same time.  It is important for
2265	 * user processes not to do this, or they might end up with
2266	 * a process root from one prison, but attached to the jail
2267	 * of another.
2268	 */
2269	pr->pr_ref++;
2270	pr->pr_uref++;
2271	mtx_unlock(&pr->pr_mtx);
2272
2273	/* Let modules do whatever they need to prepare for attaching. */
2274	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2275	if (error) {
2276		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
2277		return (error);
2278	}
2279	sx_sunlock(&allprison_lock);
2280
2281	/*
2282	 * Reparent the newly attached process to this jail.
2283	 */
2284	ppr = td->td_ucred->cr_prison;
2285	p = td->td_proc;
2286	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2287	if (error)
2288		goto e_revert_osd;
2289
2290	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
2291	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2292	if ((error = change_dir(pr->pr_root, td)) != 0)
2293		goto e_unlock;
2294#ifdef MAC
2295	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2296		goto e_unlock;
2297#endif
2298	VOP_UNLOCK(pr->pr_root, 0);
2299	if ((error = change_root(pr->pr_root, td)))
2300		goto e_unlock_giant;
2301	VFS_UNLOCK_GIANT(vfslocked);
2302
2303	newcred = crget();
2304	PROC_LOCK(p);
2305	oldcred = p->p_ucred;
2306	setsugid(p);
2307	crcopy(newcred, oldcred);
2308	newcred->cr_prison = pr;
2309	p->p_ucred = newcred;
2310	PROC_UNLOCK(p);
2311#ifdef RACCT
2312	racct_proc_ucred_changed(p, oldcred, newcred);
2313#endif
2314	crfree(oldcred);
2315	prison_deref(ppr, PD_DEREF | PD_DEUREF);
2316	return (0);
2317 e_unlock:
2318	VOP_UNLOCK(pr->pr_root, 0);
2319 e_unlock_giant:
2320	VFS_UNLOCK_GIANT(vfslocked);
2321 e_revert_osd:
2322	/* Tell modules this thread is still in its old jail after all. */
2323	(void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
2324	prison_deref(pr, PD_DEREF | PD_DEUREF);
2325	return (error);
2326}
2327
2328
2329/*
2330 * Returns a locked prison instance, or NULL on failure.
2331 */
2332struct prison *
2333prison_find(int prid)
2334{
2335	struct prison *pr;
2336
2337	sx_assert(&allprison_lock, SX_LOCKED);
2338	TAILQ_FOREACH(pr, &allprison, pr_list) {
2339		if (pr->pr_id == prid) {
2340			mtx_lock(&pr->pr_mtx);
2341			if (pr->pr_ref > 0)
2342				return (pr);
2343			mtx_unlock(&pr->pr_mtx);
2344		}
2345	}
2346	return (NULL);
2347}
2348
2349/*
2350 * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2351 */
2352struct prison *
2353prison_find_child(struct prison *mypr, int prid)
2354{
2355	struct prison *pr;
2356	int descend;
2357
2358	sx_assert(&allprison_lock, SX_LOCKED);
2359	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2360		if (pr->pr_id == prid) {
2361			mtx_lock(&pr->pr_mtx);
2362			if (pr->pr_ref > 0)
2363				return (pr);
2364			mtx_unlock(&pr->pr_mtx);
2365		}
2366	}
2367	return (NULL);
2368}
2369
2370/*
2371 * Look for the name relative to mypr.  Returns a locked prison or NULL.
2372 */
2373struct prison *
2374prison_find_name(struct prison *mypr, const char *name)
2375{
2376	struct prison *pr, *deadpr;
2377	size_t mylen;
2378	int descend;
2379
2380	sx_assert(&allprison_lock, SX_LOCKED);
2381	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2382 again:
2383	deadpr = NULL;
2384	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2385		if (!strcmp(pr->pr_name + mylen, name)) {
2386			mtx_lock(&pr->pr_mtx);
2387			if (pr->pr_ref > 0) {
2388				if (pr->pr_uref > 0)
2389					return (pr);
2390				deadpr = pr;
2391			}
2392			mtx_unlock(&pr->pr_mtx);
2393		}
2394	}
2395	/* There was no valid prison - perhaps there was a dying one. */
2396	if (deadpr != NULL) {
2397		mtx_lock(&deadpr->pr_mtx);
2398		if (deadpr->pr_ref == 0) {
2399			mtx_unlock(&deadpr->pr_mtx);
2400			goto again;
2401		}
2402	}
2403	return (deadpr);
2404}
2405
2406/*
2407 * See if a prison has the specific flag set.
2408 */
2409int
2410prison_flag(struct ucred *cred, unsigned flag)
2411{
2412
2413	/* This is an atomic read, so no locking is necessary. */
2414	return (cred->cr_prison->pr_flags & flag);
2415}
2416
2417int
2418prison_allow(struct ucred *cred, unsigned flag)
2419{
2420
2421	/* This is an atomic read, so no locking is necessary. */
2422	return (cred->cr_prison->pr_allow & flag);
2423}
2424
2425/*
2426 * Remove a prison reference.  If that was the last reference, remove the
2427 * prison itself - but not in this context in case there are locks held.
2428 */
2429void
2430prison_free_locked(struct prison *pr)
2431{
2432
2433	mtx_assert(&pr->pr_mtx, MA_OWNED);
2434	pr->pr_ref--;
2435	if (pr->pr_ref == 0) {
2436		mtx_unlock(&pr->pr_mtx);
2437		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
2438		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2439		return;
2440	}
2441	mtx_unlock(&pr->pr_mtx);
2442}
2443
2444void
2445prison_free(struct prison *pr)
2446{
2447
2448	mtx_lock(&pr->pr_mtx);
2449	prison_free_locked(pr);
2450}
2451
/*
 * Task handler queued by prison_free_locked(): finish releasing the
 * prison passed as context.  The pending count is not used.
 */
static void
prison_complete(void *context, int pending)
{
	struct prison *pr;

	pr = (struct prison *)context;
	/* No reference is dropped here; the count has already reached zero. */
	prison_deref(pr, 0);
}
2458
2459/*
2460 * Remove a prison reference (usually).  This internal version assumes no
2461 * mutexes are held, except perhaps the prison itself.  If there are no more
2462 * references, release and delist the prison.  On completion, the prison lock
2463 * and the allprison lock are both unlocked.
2464 */
2465static void
2466prison_deref(struct prison *pr, int flags)
2467{
2468	struct prison *ppr, *tpr;
2469	int vfslocked;
2470
2471	if (!(flags & PD_LOCKED))
2472		mtx_lock(&pr->pr_mtx);
2473	/* Decrement the user references in a separate loop. */
2474	if (flags & PD_DEUREF) {
2475		for (tpr = pr;; tpr = tpr->pr_parent) {
2476			if (tpr != pr)
2477				mtx_lock(&tpr->pr_mtx);
2478			if (--tpr->pr_uref > 0)
2479				break;
2480			KASSERT(tpr != &prison0, ("prison0 pr_uref=0"));
2481			mtx_unlock(&tpr->pr_mtx);
2482		}
2483		/* Done if there were only user references to remove. */
2484		if (!(flags & PD_DEREF)) {
2485			mtx_unlock(&tpr->pr_mtx);
2486			if (flags & PD_LIST_SLOCKED)
2487				sx_sunlock(&allprison_lock);
2488			else if (flags & PD_LIST_XLOCKED)
2489				sx_xunlock(&allprison_lock);
2490			return;
2491		}
2492		if (tpr != pr) {
2493			mtx_unlock(&tpr->pr_mtx);
2494			mtx_lock(&pr->pr_mtx);
2495		}
2496	}
2497
2498	for (;;) {
2499		if (flags & PD_DEREF)
2500			pr->pr_ref--;
2501		/* If the prison still has references, nothing else to do. */
2502		if (pr->pr_ref > 0) {
2503			mtx_unlock(&pr->pr_mtx);
2504			if (flags & PD_LIST_SLOCKED)
2505				sx_sunlock(&allprison_lock);
2506			else if (flags & PD_LIST_XLOCKED)
2507				sx_xunlock(&allprison_lock);
2508			return;
2509		}
2510
2511		mtx_unlock(&pr->pr_mtx);
2512		if (flags & PD_LIST_SLOCKED) {
2513			if (!sx_try_upgrade(&allprison_lock)) {
2514				sx_sunlock(&allprison_lock);
2515				sx_xlock(&allprison_lock);
2516			}
2517		} else if (!(flags & PD_LIST_XLOCKED))
2518			sx_xlock(&allprison_lock);
2519
2520		TAILQ_REMOVE(&allprison, pr, pr_list);
2521		LIST_REMOVE(pr, pr_sibling);
2522		ppr = pr->pr_parent;
2523		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
2524			tpr->pr_childcount--;
2525		sx_xunlock(&allprison_lock);
2526
2527#ifdef VIMAGE
2528		if (pr->pr_vnet != ppr->pr_vnet)
2529			vnet_destroy(pr->pr_vnet);
2530#endif
2531		if (pr->pr_root != NULL) {
2532			vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
2533			vrele(pr->pr_root);
2534			VFS_UNLOCK_GIANT(vfslocked);
2535		}
2536		mtx_destroy(&pr->pr_mtx);
2537#ifdef INET
2538		free(pr->pr_ip4, M_PRISON);
2539#endif
2540#ifdef INET6
2541		free(pr->pr_ip6, M_PRISON);
2542#endif
2543		if (pr->pr_cpuset != NULL)
2544			cpuset_rel(pr->pr_cpuset);
2545		osd_jail_exit(pr);
2546#ifdef RACCT
2547		prison_racct_detach(pr);
2548#endif
2549		free(pr, M_PRISON);
2550
2551		/* Removing a prison frees a reference on its parent. */
2552		pr = ppr;
2553		mtx_lock(&pr->pr_mtx);
2554		flags = PD_DEREF;
2555	}
2556}
2557
2558void
2559prison_hold_locked(struct prison *pr)
2560{
2561
2562	mtx_assert(&pr->pr_mtx, MA_OWNED);
2563	KASSERT(pr->pr_ref > 0,
2564	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
2565	pr->pr_ref++;
2566}
2567
2568void
2569prison_hold(struct prison *pr)
2570{
2571
2572	mtx_lock(&pr->pr_mtx);
2573	prison_hold_locked(pr);
2574	mtx_unlock(&pr->pr_mtx);
2575}
2576
2577void
2578prison_proc_hold(struct prison *pr)
2579{
2580
2581	mtx_lock(&pr->pr_mtx);
2582	KASSERT(pr->pr_uref > 0,
2583	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2584	pr->pr_uref++;
2585	mtx_unlock(&pr->pr_mtx);
2586}
2587
2588void
2589prison_proc_free(struct prison *pr)
2590{
2591
2592	mtx_lock(&pr->pr_mtx);
2593	KASSERT(pr->pr_uref > 0,
2594	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2595	prison_deref(pr, PD_DEUREF | PD_LOCKED);
2596}
2597
2598
2599#ifdef INET
2600/*
2601 * Restrict a prison's IP address list with its parent's, possibly replacing
2602 * it.  Return true if the replacement buffer was used (or would have been).
2603 */
2604static int
2605prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
2606{
2607	int ii, ij, used;
2608	struct prison *ppr;
2609
2610	ppr = pr->pr_parent;
2611	if (!(pr->pr_flags & PR_IP4_USER)) {
2612		/* This has no user settings, so just copy the parent's list. */
2613		if (pr->pr_ip4s < ppr->pr_ip4s) {
2614			/*
2615			 * There's no room for the parent's list.  Use the
2616			 * new list buffer, which is assumed to be big enough
2617			 * (if it was passed).  If there's no buffer, try to
2618			 * allocate one.
2619			 */
2620			used = 1;
2621			if (newip4 == NULL) {
2622				newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
2623				    M_PRISON, M_NOWAIT);
2624				if (newip4 != NULL)
2625					used = 0;
2626			}
2627			if (newip4 != NULL) {
2628				bcopy(ppr->pr_ip4, newip4,
2629				    ppr->pr_ip4s * sizeof(*newip4));
2630				free(pr->pr_ip4, M_PRISON);
2631				pr->pr_ip4 = newip4;
2632				pr->pr_ip4s = ppr->pr_ip4s;
2633			}
2634			return (used);
2635		}
2636		pr->pr_ip4s = ppr->pr_ip4s;
2637		if (pr->pr_ip4s > 0)
2638			bcopy(ppr->pr_ip4, pr->pr_ip4,
2639			    pr->pr_ip4s * sizeof(*newip4));
2640		else if (pr->pr_ip4 != NULL) {
2641			free(pr->pr_ip4, M_PRISON);
2642			pr->pr_ip4 = NULL;
2643		}
2644	} else if (pr->pr_ip4s > 0) {
2645		/* Remove addresses that aren't in the parent. */
2646		for (ij = 0; ij < ppr->pr_ip4s; ij++)
2647			if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
2648				break;
2649		if (ij < ppr->pr_ip4s)
2650			ii = 1;
2651		else {
2652			bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
2653			    --pr->pr_ip4s * sizeof(*pr->pr_ip4));
2654			ii = 0;
2655		}
2656		for (ij = 1; ii < pr->pr_ip4s; ) {
2657			if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
2658				ii++;
2659				continue;
2660			}
2661			switch (ij >= ppr->pr_ip4s ? -1 :
2662				qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
2663			case -1:
2664				bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
2665				    (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
2666				break;
2667			case 0:
2668				ii++;
2669				ij++;
2670				break;
2671			case 1:
2672				ij++;
2673				break;
2674			}
2675		}
2676		if (pr->pr_ip4s == 0) {
2677			pr->pr_flags |= PR_IP4_DISABLE;
2678			free(pr->pr_ip4, M_PRISON);
2679			pr->pr_ip4 = NULL;
2680		}
2681	}
2682	return (0);
2683}
2684
2685/*
2686 * Pass back primary IPv4 address of this jail.
2687 *
2688 * If not restricted return success but do not alter the address.  Caller has
2689 * to make sure to initialize it correctly (e.g. INADDR_ANY).
2690 *
2691 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2692 * Address returned in NBO.
2693 */
2694int
2695prison_get_ip4(struct ucred *cred, struct in_addr *ia)
2696{
2697	struct prison *pr;
2698
2699	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2700	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2701
2702	pr = cred->cr_prison;
2703	if (!(pr->pr_flags & PR_IP4))
2704		return (0);
2705	mtx_lock(&pr->pr_mtx);
2706	if (!(pr->pr_flags & PR_IP4)) {
2707		mtx_unlock(&pr->pr_mtx);
2708		return (0);
2709	}
2710	if (pr->pr_ip4 == NULL) {
2711		mtx_unlock(&pr->pr_mtx);
2712		return (EAFNOSUPPORT);
2713	}
2714
2715	ia->s_addr = pr->pr_ip4[0].s_addr;
2716	mtx_unlock(&pr->pr_mtx);
2717	return (0);
2718}
2719
2720/*
2721 * Return 1 if we should do proper source address selection or are not jailed.
2722 * We will return 0 if we should bypass source address selection in favour
2723 * of the primary jail IPv4 address. Only in this case *ia will be updated and
2724 * returned in NBO.
2725 * Return EAFNOSUPPORT, in case this jail does not allow IPv4.
2726 */
2727int
2728prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia)
2729{
2730	struct prison *pr;
2731	struct in_addr lia;
2732	int error;
2733
2734	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2735	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2736
2737	if (!jailed(cred))
2738		return (1);
2739
2740	pr = cred->cr_prison;
2741	if (pr->pr_flags & PR_IP4_SADDRSEL)
2742		return (1);
2743
2744	lia.s_addr = INADDR_ANY;
2745	error = prison_get_ip4(cred, &lia);
2746	if (error)
2747		return (error);
2748	if (lia.s_addr == INADDR_ANY)
2749		return (1);
2750
2751	ia->s_addr = lia.s_addr;
2752	return (0);
2753}
2754
2755/*
2756 * Return true if pr1 and pr2 have the same IPv4 address restrictions.
2757 */
2758int
2759prison_equal_ip4(struct prison *pr1, struct prison *pr2)
2760{
2761
2762	if (pr1 == pr2)
2763		return (1);
2764
2765	/*
2766	 * No need to lock since the PR_IP4_USER flag can't be altered for
2767	 * existing prisons.
2768	 */
2769	while (pr1 != &prison0 &&
2770#ifdef VIMAGE
2771	       !(pr1->pr_flags & PR_VNET) &&
2772#endif
2773	       !(pr1->pr_flags & PR_IP4_USER))
2774		pr1 = pr1->pr_parent;
2775	while (pr2 != &prison0 &&
2776#ifdef VIMAGE
2777	       !(pr2->pr_flags & PR_VNET) &&
2778#endif
2779	       !(pr2->pr_flags & PR_IP4_USER))
2780		pr2 = pr2->pr_parent;
2781	return (pr1 == pr2);
2782}
2783
2784/*
2785 * Make sure our (source) address is set to something meaningful to this
2786 * jail.
2787 *
2788 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2789 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2790 * doesn't allow IPv4.  Address passed in in NBO and returned in NBO.
2791 */
2792int
2793prison_local_ip4(struct ucred *cred, struct in_addr *ia)
2794{
2795	struct prison *pr;
2796	struct in_addr ia0;
2797	int error;
2798
2799	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2800	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2801
2802	pr = cred->cr_prison;
2803	if (!(pr->pr_flags & PR_IP4))
2804		return (0);
2805	mtx_lock(&pr->pr_mtx);
2806	if (!(pr->pr_flags & PR_IP4)) {
2807		mtx_unlock(&pr->pr_mtx);
2808		return (0);
2809	}
2810	if (pr->pr_ip4 == NULL) {
2811		mtx_unlock(&pr->pr_mtx);
2812		return (EAFNOSUPPORT);
2813	}
2814
2815	ia0.s_addr = ntohl(ia->s_addr);
2816	if (ia0.s_addr == INADDR_LOOPBACK) {
2817		ia->s_addr = pr->pr_ip4[0].s_addr;
2818		mtx_unlock(&pr->pr_mtx);
2819		return (0);
2820	}
2821
2822	if (ia0.s_addr == INADDR_ANY) {
2823		/*
2824		 * In case there is only 1 IPv4 address, bind directly.
2825		 */
2826		if (pr->pr_ip4s == 1)
2827			ia->s_addr = pr->pr_ip4[0].s_addr;
2828		mtx_unlock(&pr->pr_mtx);
2829		return (0);
2830	}
2831
2832	error = _prison_check_ip4(pr, ia);
2833	mtx_unlock(&pr->pr_mtx);
2834	return (error);
2835}
2836
2837/*
2838 * Rewrite destination address in case we will connect to loopback address.
2839 *
2840 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2841 * Address passed in in NBO and returned in NBO.
2842 */
2843int
2844prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
2845{
2846	struct prison *pr;
2847
2848	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2849	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2850
2851	pr = cred->cr_prison;
2852	if (!(pr->pr_flags & PR_IP4))
2853		return (0);
2854	mtx_lock(&pr->pr_mtx);
2855	if (!(pr->pr_flags & PR_IP4)) {
2856		mtx_unlock(&pr->pr_mtx);
2857		return (0);
2858	}
2859	if (pr->pr_ip4 == NULL) {
2860		mtx_unlock(&pr->pr_mtx);
2861		return (EAFNOSUPPORT);
2862	}
2863
2864	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
2865		ia->s_addr = pr->pr_ip4[0].s_addr;
2866		mtx_unlock(&pr->pr_mtx);
2867		return (0);
2868	}
2869
2870	/*
2871	 * Return success because nothing had to be changed.
2872	 */
2873	mtx_unlock(&pr->pr_mtx);
2874	return (0);
2875}
2876
2877/*
2878 * Check if given address belongs to the jail referenced by cred/prison.
2879 *
2880 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2881 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2882 * doesn't allow IPv4.  Address passed in in NBO.
2883 */
2884static int
2885_prison_check_ip4(struct prison *pr, struct in_addr *ia)
2886{
2887	int i, a, z, d;
2888
2889	/*
2890	 * Check the primary IP.
2891	 */
2892	if (pr->pr_ip4[0].s_addr == ia->s_addr)
2893		return (0);
2894
2895	/*
2896	 * All the other IPs are sorted so we can do a binary search.
2897	 */
2898	a = 0;
2899	z = pr->pr_ip4s - 2;
2900	while (a <= z) {
2901		i = (a + z) / 2;
2902		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
2903		if (d > 0)
2904			z = i - 1;
2905		else if (d < 0)
2906			a = i + 1;
2907		else
2908			return (0);
2909	}
2910
2911	return (EADDRNOTAVAIL);
2912}
2913
2914int
2915prison_check_ip4(struct ucred *cred, struct in_addr *ia)
2916{
2917	struct prison *pr;
2918	int error;
2919
2920	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2921	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2922
2923	pr = cred->cr_prison;
2924	if (!(pr->pr_flags & PR_IP4))
2925		return (0);
2926	mtx_lock(&pr->pr_mtx);
2927	if (!(pr->pr_flags & PR_IP4)) {
2928		mtx_unlock(&pr->pr_mtx);
2929		return (0);
2930	}
2931	if (pr->pr_ip4 == NULL) {
2932		mtx_unlock(&pr->pr_mtx);
2933		return (EAFNOSUPPORT);
2934	}
2935
2936	error = _prison_check_ip4(pr, ia);
2937	mtx_unlock(&pr->pr_mtx);
2938	return (error);
2939}
2940#endif
2941
2942#ifdef INET6
2943static int
2944prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
2945{
2946	int ii, ij, used;
2947	struct prison *ppr;
2948
2949	ppr = pr->pr_parent;
2950	if (!(pr->pr_flags & PR_IP6_USER)) {
2951		/* This has no user settings, so just copy the parent's list. */
2952		if (pr->pr_ip6s < ppr->pr_ip6s) {
2953			/*
2954			 * There's no room for the parent's list.  Use the
2955			 * new list buffer, which is assumed to be big enough
2956			 * (if it was passed).  If there's no buffer, try to
2957			 * allocate one.
2958			 */
2959			used = 1;
2960			if (newip6 == NULL) {
2961				newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
2962				    M_PRISON, M_NOWAIT);
2963				if (newip6 != NULL)
2964					used = 0;
2965			}
2966			if (newip6 != NULL) {
2967				bcopy(ppr->pr_ip6, newip6,
2968				    ppr->pr_ip6s * sizeof(*newip6));
2969				free(pr->pr_ip6, M_PRISON);
2970				pr->pr_ip6 = newip6;
2971				pr->pr_ip6s = ppr->pr_ip6s;
2972			}
2973			return (used);
2974		}
2975		pr->pr_ip6s = ppr->pr_ip6s;
2976		if (pr->pr_ip6s > 0)
2977			bcopy(ppr->pr_ip6, pr->pr_ip6,
2978			    pr->pr_ip6s * sizeof(*newip6));
2979		else if (pr->pr_ip6 != NULL) {
2980			free(pr->pr_ip6, M_PRISON);
2981			pr->pr_ip6 = NULL;
2982		}
2983	} else if (pr->pr_ip6s > 0) {
2984		/* Remove addresses that aren't in the parent. */
2985		for (ij = 0; ij < ppr->pr_ip6s; ij++)
2986			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
2987			    &ppr->pr_ip6[ij]))
2988				break;
2989		if (ij < ppr->pr_ip6s)
2990			ii = 1;
2991		else {
2992			bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
2993			    --pr->pr_ip6s * sizeof(*pr->pr_ip6));
2994			ii = 0;
2995		}
2996		for (ij = 1; ii < pr->pr_ip6s; ) {
2997			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
2998			    &ppr->pr_ip6[0])) {
2999				ii++;
3000				continue;
3001			}
3002			switch (ij >= ppr->pr_ip4s ? -1 :
3003				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
3004			case -1:
3005				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
3006				    (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
3007				break;
3008			case 0:
3009				ii++;
3010				ij++;
3011				break;
3012			case 1:
3013				ij++;
3014				break;
3015			}
3016		}
3017		if (pr->pr_ip6s == 0) {
3018			pr->pr_flags |= PR_IP6_DISABLE;
3019			free(pr->pr_ip6, M_PRISON);
3020			pr->pr_ip6 = NULL;
3021		}
3022	}
3023	return 0;
3024}
3025
3026/*
3027 * Pass back primary IPv6 address for this jail.
3028 *
3029 * If not restricted return success but do not alter the address.  Caller has
3030 * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
3031 *
3032 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3033 */
3034int
3035prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
3036{
3037	struct prison *pr;
3038
3039	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3040	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3041
3042	pr = cred->cr_prison;
3043	if (!(pr->pr_flags & PR_IP6))
3044		return (0);
3045	mtx_lock(&pr->pr_mtx);
3046	if (!(pr->pr_flags & PR_IP6)) {
3047		mtx_unlock(&pr->pr_mtx);
3048		return (0);
3049	}
3050	if (pr->pr_ip6 == NULL) {
3051		mtx_unlock(&pr->pr_mtx);
3052		return (EAFNOSUPPORT);
3053	}
3054
3055	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3056	mtx_unlock(&pr->pr_mtx);
3057	return (0);
3058}
3059
3060/*
3061 * Return 1 if we should do proper source address selection or are not jailed.
3062 * We will return 0 if we should bypass source address selection in favour
3063 * of the primary jail IPv6 address. Only in this case *ia will be updated and
3064 * returned in NBO.
3065 * Return EAFNOSUPPORT, in case this jail does not allow IPv6.
3066 */
3067int
3068prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6)
3069{
3070	struct prison *pr;
3071	struct in6_addr lia6;
3072	int error;
3073
3074	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3075	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3076
3077	if (!jailed(cred))
3078		return (1);
3079
3080	pr = cred->cr_prison;
3081	if (pr->pr_flags & PR_IP6_SADDRSEL)
3082		return (1);
3083
3084	lia6 = in6addr_any;
3085	error = prison_get_ip6(cred, &lia6);
3086	if (error)
3087		return (error);
3088	if (IN6_IS_ADDR_UNSPECIFIED(&lia6))
3089		return (1);
3090
3091	bcopy(&lia6, ia6, sizeof(struct in6_addr));
3092	return (0);
3093}
3094
3095/*
3096 * Return true if pr1 and pr2 have the same IPv6 address restrictions.
3097 */
3098int
3099prison_equal_ip6(struct prison *pr1, struct prison *pr2)
3100{
3101
3102	if (pr1 == pr2)
3103		return (1);
3104
3105	while (pr1 != &prison0 &&
3106#ifdef VIMAGE
3107	       !(pr1->pr_flags & PR_VNET) &&
3108#endif
3109	       !(pr1->pr_flags & PR_IP6_USER))
3110		pr1 = pr1->pr_parent;
3111	while (pr2 != &prison0 &&
3112#ifdef VIMAGE
3113	       !(pr2->pr_flags & PR_VNET) &&
3114#endif
3115	       !(pr2->pr_flags & PR_IP6_USER))
3116		pr2 = pr2->pr_parent;
3117	return (pr1 == pr2);
3118}
3119
3120/*
3121 * Make sure our (source) address is set to something meaningful to this jail.
3122 *
3123 * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
3124 * when needed while binding.
3125 *
3126 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3127 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3128 * doesn't allow IPv6.
3129 */
3130int
3131prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
3132{
3133	struct prison *pr;
3134	int error;
3135
3136	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3137	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3138
3139	pr = cred->cr_prison;
3140	if (!(pr->pr_flags & PR_IP6))
3141		return (0);
3142	mtx_lock(&pr->pr_mtx);
3143	if (!(pr->pr_flags & PR_IP6)) {
3144		mtx_unlock(&pr->pr_mtx);
3145		return (0);
3146	}
3147	if (pr->pr_ip6 == NULL) {
3148		mtx_unlock(&pr->pr_mtx);
3149		return (EAFNOSUPPORT);
3150	}
3151
3152	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3153		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3154		mtx_unlock(&pr->pr_mtx);
3155		return (0);
3156	}
3157
3158	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
3159		/*
3160		 * In case there is only 1 IPv6 address, and v6only is true,
3161		 * then bind directly.
3162		 */
3163		if (v6only != 0 && pr->pr_ip6s == 1)
3164			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3165		mtx_unlock(&pr->pr_mtx);
3166		return (0);
3167	}
3168
3169	error = _prison_check_ip6(pr, ia6);
3170	mtx_unlock(&pr->pr_mtx);
3171	return (error);
3172}
3173
3174/*
3175 * Rewrite destination address in case we will connect to loopback address.
3176 *
3177 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3178 */
3179int
3180prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
3181{
3182	struct prison *pr;
3183
3184	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3185	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3186
3187	pr = cred->cr_prison;
3188	if (!(pr->pr_flags & PR_IP6))
3189		return (0);
3190	mtx_lock(&pr->pr_mtx);
3191	if (!(pr->pr_flags & PR_IP6)) {
3192		mtx_unlock(&pr->pr_mtx);
3193		return (0);
3194	}
3195	if (pr->pr_ip6 == NULL) {
3196		mtx_unlock(&pr->pr_mtx);
3197		return (EAFNOSUPPORT);
3198	}
3199
3200	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3201		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3202		mtx_unlock(&pr->pr_mtx);
3203		return (0);
3204	}
3205
3206	/*
3207	 * Return success because nothing had to be changed.
3208	 */
3209	mtx_unlock(&pr->pr_mtx);
3210	return (0);
3211}
3212
3213/*
3214 * Check if given address belongs to the jail referenced by cred/prison.
3215 *
3216 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3217 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3218 * doesn't allow IPv6.
3219 */
3220static int
3221_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
3222{
3223	int i, a, z, d;
3224
3225	/*
3226	 * Check the primary IP.
3227	 */
3228	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
3229		return (0);
3230
3231	/*
3232	 * All the other IPs are sorted so we can do a binary search.
3233	 */
3234	a = 0;
3235	z = pr->pr_ip6s - 2;
3236	while (a <= z) {
3237		i = (a + z) / 2;
3238		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
3239		if (d > 0)
3240			z = i - 1;
3241		else if (d < 0)
3242			a = i + 1;
3243		else
3244			return (0);
3245	}
3246
3247	return (EADDRNOTAVAIL);
3248}
3249
3250int
3251prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
3252{
3253	struct prison *pr;
3254	int error;
3255
3256	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3257	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3258
3259	pr = cred->cr_prison;
3260	if (!(pr->pr_flags & PR_IP6))
3261		return (0);
3262	mtx_lock(&pr->pr_mtx);
3263	if (!(pr->pr_flags & PR_IP6)) {
3264		mtx_unlock(&pr->pr_mtx);
3265		return (0);
3266	}
3267	if (pr->pr_ip6 == NULL) {
3268		mtx_unlock(&pr->pr_mtx);
3269		return (EAFNOSUPPORT);
3270	}
3271
3272	error = _prison_check_ip6(pr, ia6);
3273	mtx_unlock(&pr->pr_mtx);
3274	return (error);
3275}
3276#endif
3277
3278/*
3279 * Check if a jail supports the given address family.
3280 *
3281 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3282 * if not.
3283 */
3284int
3285prison_check_af(struct ucred *cred, int af)
3286{
3287	struct prison *pr;
3288	int error;
3289
3290	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3291
3292	pr = cred->cr_prison;
3293#ifdef VIMAGE
3294	/* Prisons with their own network stack are not limited. */
3295	if (prison_owns_vnet(cred))
3296		return (0);
3297#endif
3298
3299	error = 0;
3300	switch (af)
3301	{
3302#ifdef INET
3303	case AF_INET:
3304		if (pr->pr_flags & PR_IP4)
3305		{
3306			mtx_lock(&pr->pr_mtx);
3307			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3308				error = EAFNOSUPPORT;
3309			mtx_unlock(&pr->pr_mtx);
3310		}
3311		break;
3312#endif
3313#ifdef INET6
3314	case AF_INET6:
3315		if (pr->pr_flags & PR_IP6)
3316		{
3317			mtx_lock(&pr->pr_mtx);
3318			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3319				error = EAFNOSUPPORT;
3320			mtx_unlock(&pr->pr_mtx);
3321		}
3322		break;
3323#endif
3324	case AF_LOCAL:
3325	case AF_ROUTE:
3326		break;
3327	default:
3328		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3329			error = EAFNOSUPPORT;
3330	}
3331	return (error);
3332}
3333
3334/*
3335 * Check if given address belongs to the jail referenced by cred (wrapper to
3336 * prison_check_ip[46]).
3337 *
3338 * Returns 0 if jail doesn't restrict the address family or if address belongs
3339 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3340 * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3341 */
3342int
3343prison_if(struct ucred *cred, struct sockaddr *sa)
3344{
3345#ifdef INET
3346	struct sockaddr_in *sai;
3347#endif
3348#ifdef INET6
3349	struct sockaddr_in6 *sai6;
3350#endif
3351	int error;
3352
3353	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3354	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3355
3356#ifdef VIMAGE
3357	if (prison_owns_vnet(cred))
3358		return (0);
3359#endif
3360
3361	error = 0;
3362	switch (sa->sa_family)
3363	{
3364#ifdef INET
3365	case AF_INET:
3366		sai = (struct sockaddr_in *)sa;
3367		error = prison_check_ip4(cred, &sai->sin_addr);
3368		break;
3369#endif
3370#ifdef INET6
3371	case AF_INET6:
3372		sai6 = (struct sockaddr_in6 *)sa;
3373		error = prison_check_ip6(cred, &sai6->sin6_addr);
3374		break;
3375#endif
3376	default:
3377		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3378			error = EAFNOSUPPORT;
3379	}
3380	return (error);
3381}
3382
3383/*
3384 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3385 */
3386int
3387prison_check(struct ucred *cred1, struct ucred *cred2)
3388{
3389
3390	return ((cred1->cr_prison == cred2->cr_prison ||
3391	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3392}
3393
3394/*
3395 * Return 1 if p2 is a child of p1, otherwise 0.
3396 */
3397int
3398prison_ischild(struct prison *pr1, struct prison *pr2)
3399{
3400
3401	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3402		if (pr1 == pr2)
3403			return (1);
3404	return (0);
3405}
3406
3407/*
3408 * Return 1 if the passed credential is in a jail, otherwise 0.
3409 */
3410int
3411jailed(struct ucred *cred)
3412{
3413
3414	return (cred->cr_prison != &prison0);
3415}
3416
/*
 * Return 1 if the credential is jailed and, on VIMAGE kernels, its jail
 * does not own a virtual network stack; otherwise return 0.
 */
int
jailed_without_vnet(struct ucred *cred)
{

#ifdef VIMAGE
	/* The vnet ownership test is only reached for jailed credentials. */
	return (jailed(cred) && !prison_owns_vnet(cred));
#else
	return (jailed(cred));
#endif
}
3434
3435/*
3436 * Return the correct hostname (domainname, et al) for the passed credential.
3437 */
3438void
3439getcredhostname(struct ucred *cred, char *buf, size_t size)
3440{
3441	struct prison *pr;
3442
3443	/*
3444	 * A NULL credential can be used to shortcut to the physical
3445	 * system's hostname.
3446	 */
3447	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3448	mtx_lock(&pr->pr_mtx);
3449	strlcpy(buf, pr->pr_hostname, size);
3450	mtx_unlock(&pr->pr_mtx);
3451}
3452
3453void
3454getcreddomainname(struct ucred *cred, char *buf, size_t size)
3455{
3456
3457	mtx_lock(&cred->cr_prison->pr_mtx);
3458	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3459	mtx_unlock(&cred->cr_prison->pr_mtx);
3460}
3461
3462void
3463getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3464{
3465
3466	mtx_lock(&cred->cr_prison->pr_mtx);
3467	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3468	mtx_unlock(&cred->cr_prison->pr_mtx);
3469}
3470
3471void
3472getcredhostid(struct ucred *cred, unsigned long *hostid)
3473{
3474
3475	mtx_lock(&cred->cr_prison->pr_mtx);
3476	*hostid = cred->cr_prison->pr_hostid;
3477	mtx_unlock(&cred->cr_prison->pr_mtx);
3478}
3479
#ifdef VIMAGE
/*
 * Tell whether the prison behind cred has its own vnet (PR_VNET set in
 * pr_flags) as opposed to an inherited one.
 *
 * Returns 1 if the prison owns the vnet, 0 otherwise.
 */
int
prison_owns_vnet(struct ucred *cred)
{

	/*
	 * No locking: a vnet can be neither added to nor removed from a
	 * jail once the jail has been created.
	 */
	return ((cred->cr_prison->pr_flags & PR_VNET) != 0);
}
#endif
3498
3499/*
3500 * Determine whether the subject represented by cred can "see"
3501 * status of a mount point.
3502 * Returns: 0 for permitted, ENOENT otherwise.
3503 * XXX: This function should be called cr_canseemount() and should be
3504 *      placed in kern_prot.c.
3505 */
3506int
3507prison_canseemount(struct ucred *cred, struct mount *mp)
3508{
3509	struct prison *pr;
3510	struct statfs *sp;
3511	size_t len;
3512
3513	pr = cred->cr_prison;
3514	if (pr->pr_enforce_statfs == 0)
3515		return (0);
3516	if (pr->pr_root->v_mount == mp)
3517		return (0);
3518	if (pr->pr_enforce_statfs == 2)
3519		return (ENOENT);
3520	/*
3521	 * If jail's chroot directory is set to "/" we should be able to see
3522	 * all mount-points from inside a jail.
3523	 * This is ugly check, but this is the only situation when jail's
3524	 * directory ends with '/'.
3525	 */
3526	if (strcmp(pr->pr_path, "/") == 0)
3527		return (0);
3528	len = strlen(pr->pr_path);
3529	sp = &mp->mnt_stat;
3530	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3531		return (ENOENT);
3532	/*
3533	 * Be sure that we don't have situation where jail's root directory
3534	 * is "/some/path" and mount point is "/some/pathpath".
3535	 */
3536	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3537		return (ENOENT);
3538	return (0);
3539}
3540
3541void
3542prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3543{
3544	char jpath[MAXPATHLEN];
3545	struct prison *pr;
3546	size_t len;
3547
3548	pr = cred->cr_prison;
3549	if (pr->pr_enforce_statfs == 0)
3550		return;
3551	if (prison_canseemount(cred, mp) != 0) {
3552		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3553		strlcpy(sp->f_mntonname, "[restricted]",
3554		    sizeof(sp->f_mntonname));
3555		return;
3556	}
3557	if (pr->pr_root->v_mount == mp) {
3558		/*
3559		 * Clear current buffer data, so we are sure nothing from
3560		 * the valid path left there.
3561		 */
3562		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3563		*sp->f_mntonname = '/';
3564		return;
3565	}
3566	/*
3567	 * If jail's chroot directory is set to "/" we should be able to see
3568	 * all mount-points from inside a jail.
3569	 */
3570	if (strcmp(pr->pr_path, "/") == 0)
3571		return;
3572	len = strlen(pr->pr_path);
3573	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3574	/*
3575	 * Clear current buffer data, so we are sure nothing from
3576	 * the valid path left there.
3577	 */
3578	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3579	if (*jpath == '\0') {
3580		/* Should never happen. */
3581		*sp->f_mntonname = '/';
3582	} else {
3583		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3584	}
3585}
3586
/*
 * Check whether permission for a specific privilege is granted within jail.
 * We have a specific list of accepted privileges; the rest are denied.
 *
 * Returns 0 if cred is not jailed or if the privilege is granted to the
 * jail, and EPERM otherwise.
 */
int
prison_priv_check(struct ucred *cred, int priv)
{

	/* Credentials outside of any jail are not restricted here. */
	if (!jailed(cred))
		return (0);

#ifdef VIMAGE
	/*
	 * Privileges specific to prisons with a virtual network stack.
	 * There might be a duplicate entry here in case the privilege
	 * is only granted conditionally in the legacy jail case.
	 */
	switch (priv) {
#ifdef notyet
		/*
		 * NFS-specific privileges.
		 */
	case PRIV_NFS_DAEMON:
	case PRIV_NFS_LOCKD:
#endif
		/*
		 * Network stack privileges.
		 */
	case PRIV_NET_BRIDGE:
	case PRIV_NET_GRE:
	case PRIV_NET_BPF:
	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
	case PRIV_NET_ROUTE:
	case PRIV_NET_TAP:
	case PRIV_NET_SETIFMTU:
	case PRIV_NET_SETIFFLAGS:
	case PRIV_NET_SETIFCAP:
	case PRIV_NET_SETIFDESCR:
	case PRIV_NET_SETIFNAME	:
	case PRIV_NET_SETIFMETRIC:
	case PRIV_NET_SETIFPHYS:
	case PRIV_NET_SETIFMAC:
	case PRIV_NET_ADDMULTI:
	case PRIV_NET_DELMULTI:
	case PRIV_NET_HWIOCTL:
	case PRIV_NET_SETLLADDR:
	case PRIV_NET_ADDIFGROUP:
	case PRIV_NET_DELIFGROUP:
	case PRIV_NET_IFCREATE:
	case PRIV_NET_IFDESTROY:
	case PRIV_NET_ADDIFADDR:
	case PRIV_NET_DELIFADDR:
	case PRIV_NET_LAGG:
	case PRIV_NET_GIF:
	case PRIV_NET_SETIFVNET:
	case PRIV_NET_SETIFFIB:

		/*
		 * 802.11-related privileges.
		 */
	case PRIV_NET80211_GETKEY:
#ifdef notyet
	case PRIV_NET80211_MANAGE:		/* XXX-BZ discuss with sam@ */
#endif

#ifdef notyet
		/*
		 * AppleTalk privileges.
		 */
	case PRIV_NETATALK_RESERVEDPORT:

		/*
		 * ATM privileges.
		 */
	case PRIV_NETATM_CFG:
	case PRIV_NETATM_ADD:
	case PRIV_NETATM_DEL:
	case PRIV_NETATM_SET:

		/*
		 * Bluetooth privileges.
		 */
	case PRIV_NETBLUETOOTH_RAW:
#endif

		/*
		 * Netgraph and netgraph module privileges.
		 */
	case PRIV_NETGRAPH_CONTROL:
#ifdef notyet
	case PRIV_NETGRAPH_TTY:
#endif

		/*
		 * IPv4 and IPv6 privileges.
		 */
	case PRIV_NETINET_IPFW:
	case PRIV_NETINET_DIVERT:
	case PRIV_NETINET_PF:
	case PRIV_NETINET_DUMMYNET:
	case PRIV_NETINET_CARP:
	case PRIV_NETINET_MROUTE:
	case PRIV_NETINET_RAW:
	case PRIV_NETINET_ADDRCTRL6:
	case PRIV_NETINET_ND6:
	case PRIV_NETINET_SCOPE6:
	case PRIV_NETINET_ALIFETIME6:
	case PRIV_NETINET_IPSEC:
	case PRIV_NETINET_BINDANY:

#ifdef notyet
		/*
		 * IPX/SPX privileges.
		 */
	case PRIV_NETIPX_RESERVEDPORT:
	case PRIV_NETIPX_RAW:

		/*
		 * NCP privileges.
		 */
	case PRIV_NETNCP:

		/*
		 * SMB privileges.
		 */
	case PRIV_NETSMB:
#endif

	/*
	 * No default: or deny here.
	 * In case of no permit fall through to next switch().
	 * All of the privileges listed above are granted only to a
	 * prison that has PR_VNET set.
	 */
		if (cred->cr_prison->pr_flags & PR_VNET)
			return (0);
	}
#endif /* VIMAGE */

	/*
	 * Privileges evaluated for every jail, with or without a virtual
	 * network stack.
	 */
	switch (priv) {

		/*
		 * Allow ktrace privileges for root in jail.
		 */
	case PRIV_KTRACE:

#if 0
		/*
		 * Allow jailed processes to configure audit identity and
		 * submit audit records (login, etc).  In the future we may
		 * want to further refine the relationship between audit and
		 * jail.
		 */
	case PRIV_AUDIT_GETAUDIT:
	case PRIV_AUDIT_SETAUDIT:
	case PRIV_AUDIT_SUBMIT:
#endif

		/*
		 * Allow jailed processes to manipulate process UNIX
		 * credentials in any way they see fit.
		 */
	case PRIV_CRED_SETUID:
	case PRIV_CRED_SETEUID:
	case PRIV_CRED_SETGID:
	case PRIV_CRED_SETEGID:
	case PRIV_CRED_SETGROUPS:
	case PRIV_CRED_SETREUID:
	case PRIV_CRED_SETREGID:
	case PRIV_CRED_SETRESUID:
	case PRIV_CRED_SETRESGID:

		/*
		 * Jail implements visibility constraints already, so allow
		 * jailed root to override uid/gid-based constraints.
		 */
	case PRIV_SEEOTHERGIDS:
	case PRIV_SEEOTHERUIDS:

		/*
		 * Jail implements inter-process debugging limits already, so
		 * allow jailed root various debugging privileges.
		 */
	case PRIV_DEBUG_DIFFCRED:
	case PRIV_DEBUG_SUGID:
	case PRIV_DEBUG_UNPRIV:

		/*
		 * Allow jail to set various resource limits and login
		 * properties, and for now, exceed process resource limits.
		 */
	case PRIV_PROC_LIMIT:
	case PRIV_PROC_SETLOGIN:
	case PRIV_PROC_SETRLIMIT:

		/*
		 * System V and POSIX IPC privileges are granted in jail.
		 */
	case PRIV_IPC_READ:
	case PRIV_IPC_WRITE:
	case PRIV_IPC_ADMIN:
	case PRIV_IPC_MSGSIZE:
	case PRIV_MQ_ADMIN:

		/*
		 * Jail operations within a jail work on child jails.
		 */
	case PRIV_JAIL_ATTACH:
	case PRIV_JAIL_SET:
	case PRIV_JAIL_REMOVE:

		/*
		 * Jail implements its own inter-process limits, so allow
		 * root processes in jail to change scheduling on other
		 * processes in the same jail.  Likewise for signalling.
		 */
	case PRIV_SCHED_DIFFCRED:
	case PRIV_SCHED_CPUSET:
	case PRIV_SIGNAL_DIFFCRED:
	case PRIV_SIGNAL_SUGID:

		/*
		 * Allow jailed processes to write to sysctls marked as jail
		 * writable.
		 */
	case PRIV_SYSCTL_WRITEJAIL:

		/*
		 * Allow root in jail to manage a variety of quota
		 * properties.  These should likely be conditional on a
		 * configuration option.
		 */
	case PRIV_VFS_GETQUOTA:
	case PRIV_VFS_SETQUOTA:

		/*
		 * Since Jail relies on chroot() to implement file system
		 * protections, grant many VFS privileges to root in jail.
		 * Be careful to exclude mount-related and NFS-related
		 * privileges.
		 */
	case PRIV_VFS_READ:
	case PRIV_VFS_WRITE:
	case PRIV_VFS_ADMIN:
	case PRIV_VFS_EXEC:
	case PRIV_VFS_LOOKUP:
	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
	case PRIV_VFS_CHFLAGS_DEV:
	case PRIV_VFS_CHOWN:
	case PRIV_VFS_CHROOT:
	case PRIV_VFS_RETAINSUGID:
	case PRIV_VFS_FCHROOT:
	case PRIV_VFS_LINK:
	case PRIV_VFS_SETGID:
	case PRIV_VFS_STAT:
	case PRIV_VFS_STICKYFILE:
		/* Every case label since PRIV_KTRACE falls through to here. */
		return (0);

		/*
		 * Depending on the global setting, allow privilege of
		 * setting system flags.
		 */
	case PRIV_VFS_SYSFLAGS:
		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
			return (0);
		else
			return (EPERM);

		/*
		 * Depending on the global setting, allow privilege of
		 * mounting/unmounting file systems.
		 */
	case PRIV_VFS_MOUNT:
	case PRIV_VFS_UNMOUNT:
	case PRIV_VFS_MOUNT_NONUSER:
	case PRIV_VFS_MOUNT_OWNER:
		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT)
			return (0);
		else
			return (EPERM);

		/*
		 * Allow jailed root to bind reserved ports and reuse in-use
		 * ports.
		 */
	case PRIV_NETINET_RESERVEDPORT:
	case PRIV_NETINET_REUSEPORT:
		return (0);

		/*
		 * Allow jailed root to set certain IPv4/6 (option) headers.
		 */
	case PRIV_NETINET_SETHDROPTS:
		return (0);

		/*
		 * Conditionally allow creating raw sockets in jail.
		 */
	case PRIV_NETINET_RAW:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
			return (0);
		else
			return (EPERM);

		/*
		 * Since jail implements its own visibility limits on netstat
		 * sysctls, allow getcred.  This allows identd to work in
		 * jail.
		 */
	case PRIV_NETINET_GETCRED:
		return (0);

		/*
		 * Allow jailed root to set loginclass.
		 */
	case PRIV_PROC_SETLOGINCLASS:
		return (0);

	default:
		/*
		 * In all remaining cases, deny the privilege request.  This
		 * includes almost all network privileges, many system
		 * configuration privileges.
		 */
		return (EPERM);
	}
}
3912
3913/*
3914 * Return the part of pr2's name that is relative to pr1, or the whole name
3915 * if it does not directly follow.
3916 */
3917
3918char *
3919prison_name(struct prison *pr1, struct prison *pr2)
3920{
3921	char *name;
3922
3923	/* Jails see themselves as "0" (if they see themselves at all). */
3924	if (pr1 == pr2)
3925		return "0";
3926	name = pr2->pr_name;
3927	if (prison_ischild(pr1, pr2)) {
3928		/*
3929		 * pr1 isn't locked (and allprison_lock may not be either)
3930		 * so its length can't be counted on.  But the number of dots
3931		 * can be counted on - and counted.
3932		 */
3933		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
3934			name = strchr(name, '.') + 1;
3935	}
3936	return (name);
3937}
3938
3939/*
3940 * Return the part of pr2's path that is relative to pr1, or the whole path
3941 * if it does not directly follow.
3942 */
3943static char *
3944prison_path(struct prison *pr1, struct prison *pr2)
3945{
3946	char *path1, *path2;
3947	int len1;
3948
3949	path1 = pr1->pr_path;
3950	path2 = pr2->pr_path;
3951	if (!strcmp(path1, "/"))
3952		return (path2);
3953	len1 = strlen(path1);
3954	if (strncmp(path1, path2, len1))
3955		return (path2);
3956	if (path2[len1] == '\0')
3957		return "/";
3958	if (path2[len1] == '/')
3959		return (path2 + len1);
3960	return (path2);
3961}
3962
3963
/*
 * Jail-related sysctls.
 *
 * This declares the "jail" node beneath _security; the SYSCTL_* entries
 * that name _security_jail as their parent hang off it.
 */
SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
    "Jails");
3969
3970static int
3971sysctl_jail_list(SYSCTL_HANDLER_ARGS)
3972{
3973	struct xprison *xp;
3974	struct prison *pr, *cpr;
3975#ifdef INET
3976	struct in_addr *ip4 = NULL;
3977	int ip4s = 0;
3978#endif
3979#ifdef INET6
3980	struct in6_addr *ip6 = NULL;
3981	int ip6s = 0;
3982#endif
3983	int descend, error;
3984
3985	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
3986	pr = req->td->td_ucred->cr_prison;
3987	error = 0;
3988	sx_slock(&allprison_lock);
3989	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
3990#if defined(INET) || defined(INET6)
3991 again:
3992#endif
3993		mtx_lock(&cpr->pr_mtx);
3994#ifdef INET
3995		if (cpr->pr_ip4s > 0) {
3996			if (ip4s < cpr->pr_ip4s) {
3997				ip4s = cpr->pr_ip4s;
3998				mtx_unlock(&cpr->pr_mtx);
3999				ip4 = realloc(ip4, ip4s *
4000				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
4001				goto again;
4002			}
4003			bcopy(cpr->pr_ip4, ip4,
4004			    cpr->pr_ip4s * sizeof(struct in_addr));
4005		}
4006#endif
4007#ifdef INET6
4008		if (cpr->pr_ip6s > 0) {
4009			if (ip6s < cpr->pr_ip6s) {
4010				ip6s = cpr->pr_ip6s;
4011				mtx_unlock(&cpr->pr_mtx);
4012				ip6 = realloc(ip6, ip6s *
4013				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
4014				goto again;
4015			}
4016			bcopy(cpr->pr_ip6, ip6,
4017			    cpr->pr_ip6s * sizeof(struct in6_addr));
4018		}
4019#endif
4020		if (cpr->pr_ref == 0) {
4021			mtx_unlock(&cpr->pr_mtx);
4022			continue;
4023		}
4024		bzero(xp, sizeof(*xp));
4025		xp->pr_version = XPRISON_VERSION;
4026		xp->pr_id = cpr->pr_id;
4027		xp->pr_state = cpr->pr_uref > 0
4028		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
4029		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
4030		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
4031		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
4032#ifdef INET
4033		xp->pr_ip4s = cpr->pr_ip4s;
4034#endif
4035#ifdef INET6
4036		xp->pr_ip6s = cpr->pr_ip6s;
4037#endif
4038		mtx_unlock(&cpr->pr_mtx);
4039		error = SYSCTL_OUT(req, xp, sizeof(*xp));
4040		if (error)
4041			break;
4042#ifdef INET
4043		if (xp->pr_ip4s > 0) {
4044			error = SYSCTL_OUT(req, ip4,
4045			    xp->pr_ip4s * sizeof(struct in_addr));
4046			if (error)
4047				break;
4048		}
4049#endif
4050#ifdef INET6
4051		if (xp->pr_ip6s > 0) {
4052			error = SYSCTL_OUT(req, ip6,
4053			    xp->pr_ip6s * sizeof(struct in6_addr));
4054			if (error)
4055				break;
4056		}
4057#endif
4058	}
4059	sx_sunlock(&allprison_lock);
4060	free(xp, M_TEMP);
4061#ifdef INET
4062	free(ip4, M_TEMP);
4063#endif
4064#ifdef INET6
4065	free(ip6, M_TEMP);
4066#endif
4067	return (error);
4068}
4069
4070SYSCTL_OID(_security_jail, OID_AUTO, list,
4071    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4072    sysctl_jail_list, "S", "List of active jails");
4073
4074static int
4075sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4076{
4077	int error, injail;
4078
4079	injail = jailed(req->td->td_ucred);
4080	error = SYSCTL_OUT(req, &injail, sizeof(injail));
4081
4082	return (error);
4083}
4084
4085SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
4086    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4087    sysctl_jail_jailed, "I", "Process in jail?");
4088
#if defined(INET) || defined(INET6)
/*
 * Read-write unsigned knob under _security_jail that exposes the
 * jail_max_af_ips variable.  Only present when INET or INET6 is built in.
 */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
    &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family");
#endif
4094
4095/*
4096 * Default parameters for jail(2) compatability.  For historical reasons,
4097 * the sysctl names have varying similarity to the parameter names.  Prisons
4098 * just see their own parameters, and can't change them.
4099 */
4100static int
4101sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4102{
4103	struct prison *pr;
4104	int allow, error, i;
4105
4106	pr = req->td->td_ucred->cr_prison;
4107	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
4108
4109	/* Get the current flag value, and convert it to a boolean. */
4110	i = (allow & arg2) ? 1 : 0;
4111	if (arg1 != NULL)
4112		i = !i;
4113	error = sysctl_handle_int(oidp, &i, 0, req);
4114	if (error || !req->newptr)
4115		return (error);
4116	i = i ? arg2 : 0;
4117	if (arg1 != NULL)
4118		i ^= arg2;
4119	/*
4120	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4121	 * for writing.
4122	 */
4123	mtx_lock(&prison0.pr_mtx);
4124	jail_default_allow = (jail_default_allow & ~arg2) | i;
4125	mtx_unlock(&prison0.pr_mtx);
4126	return (0);
4127}
4128
/*
 * Boolean sysctls served by sysctl_jail_default_allow().  The second
 * handler argument names the PR_ALLOW_* bit each one controls.  The first
 * handler argument is NULL except for socket_unixiproute_only, which passes
 * a non-NULL value so that the handler inverts the bit.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
    "Processes in jail can set their hostnames");
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
    "Processes in jail are limited to creating UNIX/IP/route sockets only");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
    "Processes in jail can use System V IPC primitives");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
    "Prison root can create raw sockets");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
    "Processes in jail can alter system file flags");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
    "Processes in jail can mount/unmount jail-friendly file systems");
4153
4154static int
4155sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4156{
4157	struct prison *pr;
4158	int level, error;
4159
4160	pr = req->td->td_ucred->cr_prison;
4161	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4162	error = sysctl_handle_int(oidp, &level, 0, req);
4163	if (error || !req->newptr)
4164		return (error);
4165	*(int *)arg1 = level;
4166	return (0);
4167}
4168
4169SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
4170    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4171    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
4172    sysctl_jail_default_level, "I",
4173    "Processes in jail cannot see all mounted file systems");
4174
4175/*
4176 * Nodes to describe jail parameters.  Maximum length of string parameters
4177 * is returned in the string itself, and the other parameters exist merely
4178 * to make themselves and their types known.
4179 */
4180SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
4181    "Jail parameters");
4182
4183int
4184sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4185{
4186	int i;
4187	long l;
4188	size_t s;
4189	char numbuf[12];
4190
4191	switch (oidp->oid_kind & CTLTYPE)
4192	{
4193	case CTLTYPE_LONG:
4194	case CTLTYPE_ULONG:
4195		l = 0;
4196#ifdef SCTL_MASK32
4197		if (!(req->flags & SCTL_MASK32))
4198#endif
4199			return (SYSCTL_OUT(req, &l, sizeof(l)));
4200	case CTLTYPE_INT:
4201	case CTLTYPE_UINT:
4202		i = 0;
4203		return (SYSCTL_OUT(req, &i, sizeof(i)));
4204	case CTLTYPE_STRING:
4205		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4206		return
4207		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4208	case CTLTYPE_STRUCT:
4209		s = (size_t)arg2;
4210		return (SYSCTL_OUT(req, &s, sizeof(s)));
4211	}
4212	return (0);
4213}
4214
/*
 * Jail parameter descriptions.  Each entry names one parameter together
 * with its sysctl type and flags, a format string and a description.
 */

/* Top-level parameters. */
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail secure level");
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail cannot see all mounted file systems");
SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail persistence");
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
    "E,jailsys", "Virtual network stack");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
    "B", "Jail is in the process of shutting down");

/* children.*: child jail accounting. */
SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
    "I", "Current number of child jails");
SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Maximum number of child jails");

/* host.*: host identity. */
SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail hostname");
SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail NIS domainname");
SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
    "Jail host UUID");
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
    "LU", "Jail host ID");

/* cpuset.*: cpuset binding. */
SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");

/* ip4.* and ip6.*: present only with INET and INET6 respectively. */
#ifdef INET
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
    "Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
    "S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv4 source address selection rather than the "
    "primary jail IPv4 address.");
#endif
#ifdef INET6
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
    "Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
    "S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv6 source address selection rather than the "
    "primary jail IPv6 address.");
#endif

/* allow.*: per-jail permission flags. */
SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set hostname");
SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may use SYSV IPC");
SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create raw sockets");
SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may alter system file flags");
SYSCTL_JAIL_PARAM(_allow, mount, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount/unmount jail-friendly file systems");
SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set file quotas");
SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4285
4286void
4287prison_racct_foreach(void (*callback)(struct racct *racct,
4288    void *arg2, void *arg3), void *arg2, void *arg3)
4289{
4290	struct prison_racct *prr;
4291
4292	sx_slock(&allprison_lock);
4293	LIST_FOREACH(prr, &allprison_racct, prr_next)
4294		(callback)(prr->prr_racct, arg2, arg3);
4295	sx_sunlock(&allprison_lock);
4296}
4297
4298static struct prison_racct *
4299prison_racct_find_locked(const char *name)
4300{
4301	struct prison_racct *prr;
4302
4303	sx_assert(&allprison_lock, SA_XLOCKED);
4304
4305	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4306		return (NULL);
4307
4308	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4309		if (strcmp(name, prr->prr_name) != 0)
4310			continue;
4311
4312		/* Found prison_racct with a matching name? */
4313		prison_racct_hold(prr);
4314		return (prr);
4315	}
4316
4317	/* Add new prison_racct. */
4318	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4319	racct_create(&prr->prr_racct);
4320
4321	strcpy(prr->prr_name, name);
4322	refcount_init(&prr->prr_refcount, 1);
4323	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4324
4325	return (prr);
4326}
4327
4328struct prison_racct *
4329prison_racct_find(const char *name)
4330{
4331	struct prison_racct *prr;
4332
4333	sx_xlock(&allprison_lock);
4334	prr = prison_racct_find_locked(name);
4335	sx_xunlock(&allprison_lock);
4336	return (prr);
4337}
4338
4339void
4340prison_racct_hold(struct prison_racct *prr)
4341{
4342
4343	refcount_acquire(&prr->prr_refcount);
4344}
4345
4346void
4347prison_racct_free(struct prison_racct *prr)
4348{
4349	int old;
4350
4351	old = prr->prr_refcount;
4352	if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
4353		return;
4354
4355	sx_xlock(&allprison_lock);
4356	if (refcount_release(&prr->prr_refcount)) {
4357		racct_destroy(&prr->prr_racct);
4358		LIST_REMOVE(prr, prr_next);
4359		sx_xunlock(&allprison_lock);
4360		free(prr, M_PRISON_RACCT);
4361
4362		return;
4363	}
4364	sx_xunlock(&allprison_lock);
4365}
4366
#ifdef RACCT
/*
 * Point pr at the prison_racct that carries its name, as returned by
 * prison_racct_find_locked().
 */
static void
prison_racct_attach(struct prison *pr)
{
	struct prison_racct *found;

	found = prison_racct_find_locked(pr->pr_name);
	KASSERT(found != NULL, ("cannot find prison_racct"));

	pr->pr_prison_racct = found;
}

/*
 * Release pr's reference on its prison_racct and clear the pointer.
 */
static void
prison_racct_detach(struct prison *pr)
{

	prison_racct_free(pr->pr_prison_racct);
	pr->pr_prison_racct = NULL;
}
#endif /* RACCT */
4386
4387#ifdef DDB
4388
4389static void
4390db_show_prison(struct prison *pr)
4391{
4392	int fi;
4393#if defined(INET) || defined(INET6)
4394	int ii;
4395#endif
4396	unsigned jsf;
4397#ifdef INET6
4398	char ip6buf[INET6_ADDRSTRLEN];
4399#endif
4400
4401	db_printf("prison %p:\n", pr);
4402	db_printf(" jid             = %d\n", pr->pr_id);
4403	db_printf(" name            = %s\n", pr->pr_name);
4404	db_printf(" parent          = %p\n", pr->pr_parent);
4405	db_printf(" ref             = %d\n", pr->pr_ref);
4406	db_printf(" uref            = %d\n", pr->pr_uref);
4407	db_printf(" path            = %s\n", pr->pr_path);
4408	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4409	    ? pr->pr_cpuset->cs_id : -1);
4410#ifdef VIMAGE
4411	db_printf(" vnet            = %p\n", pr->pr_vnet);
4412#endif
4413	db_printf(" root            = %p\n", pr->pr_root);
4414	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4415	db_printf(" children.max    = %d\n", pr->pr_childmax);
4416	db_printf(" children.cur    = %d\n", pr->pr_childcount);
4417	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4418	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4419	db_printf(" flags           = 0x%x", pr->pr_flags);
4420	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
4421	    fi++)
4422		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
4423			db_printf(" %s", pr_flag_names[fi]);
4424	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
4425	    fi++) {
4426		jsf = pr->pr_flags &
4427		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
4428		db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
4429		    pr_flag_jailsys[fi].disable &&
4430		      (jsf == pr_flag_jailsys[fi].disable) ? "disable"
4431		    : (jsf == pr_flag_jailsys[fi].new) ? "new"
4432		    : "inherit");
4433	}
4434	db_printf(" allow           = 0x%x", pr->pr_allow);
4435	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
4436	    fi++)
4437		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
4438			db_printf(" %s", pr_allow_names[fi]);
4439	db_printf("\n");
4440	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4441	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4442	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4443	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4444	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4445#ifdef INET
4446	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4447	for (ii = 0; ii < pr->pr_ip4s; ii++)
4448		db_printf(" %s %s\n",
4449		    ii == 0 ? "ip4.addr        =" : "                 ",
4450		    inet_ntoa(pr->pr_ip4[ii]));
4451#endif
4452#ifdef INET6
4453	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4454	for (ii = 0; ii < pr->pr_ip6s; ii++)
4455		db_printf(" %s %s\n",
4456		    ii == 0 ? "ip6.addr        =" : "                 ",
4457		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4458#endif
4459}
4460
4461DB_SHOW_COMMAND(prison, db_show_prison_command)
4462{
4463	struct prison *pr;
4464
4465	if (!have_addr) {
4466		/*
4467		 * Show all prisons in the list, and prison0 which is not
4468		 * listed.
4469		 */
4470		db_show_prison(&prison0);
4471		if (!db_pager_quit) {
4472			TAILQ_FOREACH(pr, &allprison, pr_list) {
4473				db_show_prison(pr);
4474				if (db_pager_quit)
4475					break;
4476			}
4477		}
4478		return;
4479	}
4480
4481	if (addr == 0)
4482		pr = &prison0;
4483	else {
4484		/* Look for a prison with the ID and with references. */
4485		TAILQ_FOREACH(pr, &allprison, pr_list)
4486			if (pr->pr_id == addr && pr->pr_ref > 0)
4487				break;
4488		if (pr == NULL)
4489			/* Look again, without requiring a reference. */
4490			TAILQ_FOREACH(pr, &allprison, pr_list)
4491				if (pr->pr_id == addr)
4492					break;
4493		if (pr == NULL)
4494			/* Assume address points to a valid prison. */
4495			pr = (struct prison *)addr;
4496	}
4497	db_show_prison(pr);
4498}
4499
4500#endif /* DDB */
4501