kern_jail.c revision 195945
1/*-
2 * Copyright (c) 1999 Poul-Henning Kamp.
3 * Copyright (c) 2008 Bjoern A. Zeeb.
4 * Copyright (c) 2009 James Gritton.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 195945 2009-07-29 16:46:59Z jamie $");
31
32#include "opt_compat.h"
33#include "opt_ddb.h"
34#include "opt_inet.h"
35#include "opt_inet6.h"
36
37#include <sys/param.h>
38#include <sys/types.h>
39#include <sys/kernel.h>
40#include <sys/systm.h>
41#include <sys/errno.h>
42#include <sys/sysproto.h>
43#include <sys/malloc.h>
44#include <sys/osd.h>
45#include <sys/priv.h>
46#include <sys/proc.h>
47#include <sys/taskqueue.h>
48#include <sys/fcntl.h>
49#include <sys/jail.h>
50#include <sys/lock.h>
51#include <sys/mutex.h>
52#include <sys/sx.h>
53#include <sys/sysent.h>
54#include <sys/namei.h>
55#include <sys/mount.h>
56#include <sys/queue.h>
57#include <sys/socket.h>
58#include <sys/syscallsubr.h>
59#include <sys/sysctl.h>
60#include <sys/vnode.h>
61#include <sys/vimage.h>
62#include <net/if.h>
63#include <netinet/in.h>
64#ifdef DDB
65#include <ddb/ddb.h>
66#ifdef INET6
67#include <netinet6/in6_var.h>
68#endif /* INET6 */
69#endif /* DDB */
70
71#include <security/mac/mac_framework.h>
72
73#define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
74
75MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
76
77/* prison0 describes what is "real" about the system. */
78struct prison prison0 = {
79	.pr_id		= 0,
80	.pr_name	= "0",
81	.pr_ref		= 1,
82	.pr_uref	= 1,
83	.pr_path	= "/",
84	.pr_securelevel	= -1,
85	.pr_childmax	= JAIL_MAX,
86	.pr_hostuuid	= DEFAULT_HOSTUUID,
87	.pr_children	= LIST_HEAD_INITIALIZER(&prison0.pr_children),
88	.pr_flags	= PR_HOST,
89	.pr_allow	= PR_ALLOW_ALL,
90};
91MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
92
93/* allprison and lastprid are protected by allprison_lock. */
94struct	sx allprison_lock;
95SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
96struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
97int	lastprid = 0;
98
/* Forward declarations of helpers private to this file. */
static int	 do_jail_attach(struct thread *td, struct prison *pr);
static void	 prison_complete(void *context, int pending);
static void	 prison_deref(struct prison *pr, int flags);
static char	*prison_path(struct prison *pr1, struct prison *pr2);
static void	 prison_remove_one(struct prison *pr);
#ifdef INET
/* IPv4-only helpers. */
static int	 _prison_check_ip4(struct prison *pr, struct in_addr *ia);
static int	 prison_restrict_ip4(struct prison *pr,
		    struct in_addr *newip4);
#endif
#ifdef INET6
/* IPv6-only helpers. */
static int	 _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
static int	 prison_restrict_ip6(struct prison *pr,
		    struct in6_addr *newip6);
#endif
112
/*
 * Flags for prison_deref().  Each flag is a distinct bit so they can be
 * OR'ed together; kern_jail_set() passes, for example,
 * PD_DEREF | PD_DEUREF | PD_LIST_XLOCKED to undo the pr_ref/pr_uref it took
 * on a parent while it holds allprison_lock exclusively.
 * NOTE(review): the exact meaning of each bit is defined by prison_deref(),
 * which is not shown here; the per-flag notes below are inferred from the
 * names and from that one call site -- confirm against prison_deref().
 */
#define	PD_DEREF	(1 << 0)	/* presumably: drop a pr_ref */
#define	PD_DEUREF	(1 << 1)	/* presumably: drop a pr_uref */
#define	PD_LOCKED	(1 << 2)	/* presumably: pr_mtx already held */
#define	PD_LIST_SLOCKED	(1 << 3)	/* presumably: allprison_lock shared */
#define	PD_LIST_XLOCKED	(1 << 4)	/* allprison_lock held exclusively */
119
/*
 * Parameter names corresponding to PR_* flag values.
 *
 * Entry N names the boolean parameter for flag bit (1 << N): kern_jail_set()
 * looks up pr_flag_names[N] to set the bit and pr_flag_nonames[N] to record
 * that the bit is being changed.  The two tables must stay index-for-index
 * parallel.  Entries are placed with designators, and kern_jail_set() skips
 * any index whose pr_flag_names entry is NULL.
 */
static char *pr_flag_names[] = { [0] = "persist" };

static char *pr_flag_nonames[] = { [0] = "nopersist" };
130
131struct jailsys_flags {
132	const char	*name;
133	unsigned	 disable;
134	unsigned	 new;
135} pr_flag_jailsys[] = {
136	{ "host", 0, PR_HOST },
137#ifdef VIMAGE
138	{ "vnet", 0, PR_VNET },
139#endif
140#ifdef INET
141	{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
142#endif
143#ifdef INET6
144	{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
145#endif
146};
147
/*
 * Names of the allow.* permission parameters.
 *
 * Entry N corresponds to permission bit (1 << N): kern_jail() and
 * kern_jail_set() both index these tables by bit number.  pr_allow_names[N]
 * grants the permission and pr_allow_nonames[N] revokes it, so the two tables
 * must stay index-for-index parallel.
 */
static char *pr_allow_names[] = {
	[0] = "allow.set_hostname",
	[1] = "allow.sysvipc",
	[2] = "allow.raw_sockets",
	[3] = "allow.chflags",
	[4] = "allow.mount",
	[5] = "allow.quotas",
	[6] = "allow.socket_af",
};

static char *pr_allow_nonames[] = {
	[0] = "allow.noset_hostname",
	[1] = "allow.nosysvipc",
	[2] = "allow.noraw_sockets",
	[3] = "allow.nochflags",
	[4] = "allow.nomount",
	[5] = "allow.noquotas",
	[6] = "allow.nosocket_af",
};
167
168#define	JAIL_DEFAULT_ALLOW	PR_ALLOW_SET_HOSTNAME
169static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
170static int jail_default_enforce_statfs = 2;
171#if defined(INET) || defined(INET6)
172static unsigned jail_max_af_ips = 255;
173#endif
174
175#ifdef INET
/*
 * qsort() comparator ordering two struct in_addr by numeric address value.
 *
 * ip1 and ip2 point at struct in_addr whose s_addr is in network byte order.
 * Returns -1, 0 or 1 when *ip1 is below, equal to or above *ip2.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	const struct in_addr *lhs, *rhs;
	in_addr_t lhs_host, rhs_host;

	lhs = ip1;
	rhs = ip2;

	/*
	 * The ordering has to be computed on host-byte-order values; sorting
	 * the raw network-byte-order words would give a different order on
	 * little-endian machines than the rest of the code expects.
	 */
	lhs_host = ntohl(lhs->s_addr);
	rhs_host = ntohl(rhs->s_addr);

	/*
	 * Compare explicitly instead of subtracting: the difference of two
	 * 32-bit unsigned values does not fit in the int we return.
	 */
	if (lhs_host == rhs_host)
		return (0);
	return (lhs_host > rhs_host ? 1 : -1);
}
200#endif
201
202#ifdef INET6
/*
 * qsort() comparator ordering two struct in6_addr byte by byte.
 *
 * The 16 address bytes are compared as unsigned values from s6_addr[0]
 * onwards; the first differing byte decides.  Returns -1, 0 or 1 when *ip1
 * sorts before, equal to or after *ip2.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const uint8_t *lhs, *rhs;
	size_t idx;

	lhs = ((const struct in6_addr *)ip1)->s6_addr;
	rhs = ((const struct in6_addr *)ip2)->s6_addr;

	for (idx = 0; idx < sizeof(struct in6_addr); idx++) {
		if (lhs[idx] != rhs[idx])
			return (lhs[idx] > rhs[idx] ? 1 : -1);
	}
	return (0);
}
221#endif
222
223/*
224 * struct jail_args {
225 *	struct jail *jail;
226 * };
227 */
228int
229jail(struct thread *td, struct jail_args *uap)
230{
231	uint32_t version;
232	int error;
233	struct jail j;
234
235	error = copyin(uap->jail, &version, sizeof(uint32_t));
236	if (error)
237		return (error);
238
239	switch (version) {
240	case 0:
241	{
242		struct jail_v0 j0;
243
244		/* FreeBSD single IPv4 jails. */
245		bzero(&j, sizeof(struct jail));
246		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
247		if (error)
248			return (error);
249		j.version = j0.version;
250		j.path = j0.path;
251		j.hostname = j0.hostname;
252		j.ip4s = j0.ip_number;
253		break;
254	}
255
256	case 1:
257		/*
258		 * Version 1 was used by multi-IPv4 jail implementations
259		 * that never made it into the official kernel.
260		 */
261		return (EINVAL);
262
263	case 2:	/* JAIL_API_VERSION */
264		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
265		error = copyin(uap->jail, &j, sizeof(struct jail));
266		if (error)
267			return (error);
268		break;
269
270	default:
271		/* Sci-Fi jails are not supported, sorry. */
272		return (EINVAL);
273	}
274	return (kern_jail(td, &j));
275}
276
277int
278kern_jail(struct thread *td, struct jail *j)
279{
280	struct iovec optiov[2 * (4
281			    + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
282#ifdef INET
283			    + 1
284#endif
285#ifdef INET6
286			    + 1
287#endif
288			    )];
289	struct uio opt;
290	char *u_path, *u_hostname, *u_name;
291#ifdef INET
292	uint32_t ip4s;
293	struct in_addr *u_ip4;
294#endif
295#ifdef INET6
296	struct in6_addr *u_ip6;
297#endif
298	size_t tmplen;
299	int error, enforce_statfs, fi;
300
301	bzero(&optiov, sizeof(optiov));
302	opt.uio_iov = optiov;
303	opt.uio_iovcnt = 0;
304	opt.uio_offset = -1;
305	opt.uio_resid = -1;
306	opt.uio_segflg = UIO_SYSSPACE;
307	opt.uio_rw = UIO_READ;
308	opt.uio_td = td;
309
310	/* Set permissions for top-level jails from sysctls. */
311	if (!jailed(td->td_ucred)) {
312		for (fi = 0; fi < sizeof(pr_allow_names) /
313		     sizeof(pr_allow_names[0]); fi++) {
314			optiov[opt.uio_iovcnt].iov_base =
315			    (jail_default_allow & (1 << fi))
316			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
317			optiov[opt.uio_iovcnt].iov_len =
318			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
319			opt.uio_iovcnt += 2;
320		}
321		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
322		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
323		opt.uio_iovcnt++;
324		enforce_statfs = jail_default_enforce_statfs;
325		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
326		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
327		opt.uio_iovcnt++;
328	}
329
330	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
331#ifdef INET
332	ip4s = (j->version == 0) ? 1 : j->ip4s;
333	if (ip4s > jail_max_af_ips)
334		return (EINVAL);
335	tmplen += ip4s * sizeof(struct in_addr);
336#else
337	if (j->ip4s > 0)
338		return (EINVAL);
339#endif
340#ifdef INET6
341	if (j->ip6s > jail_max_af_ips)
342		return (EINVAL);
343	tmplen += j->ip6s * sizeof(struct in6_addr);
344#else
345	if (j->ip6s > 0)
346		return (EINVAL);
347#endif
348	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
349	u_hostname = u_path + MAXPATHLEN;
350	u_name = u_hostname + MAXHOSTNAMELEN;
351#ifdef INET
352	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
353#endif
354#ifdef INET6
355#ifdef INET
356	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
357#else
358	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
359#endif
360#endif
361	optiov[opt.uio_iovcnt].iov_base = "path";
362	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
363	opt.uio_iovcnt++;
364	optiov[opt.uio_iovcnt].iov_base = u_path;
365	error = copyinstr(j->path, u_path, MAXPATHLEN,
366	    &optiov[opt.uio_iovcnt].iov_len);
367	if (error) {
368		free(u_path, M_TEMP);
369		return (error);
370	}
371	opt.uio_iovcnt++;
372	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
373	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
374	opt.uio_iovcnt++;
375	optiov[opt.uio_iovcnt].iov_base = u_hostname;
376	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
377	    &optiov[opt.uio_iovcnt].iov_len);
378	if (error) {
379		free(u_path, M_TEMP);
380		return (error);
381	}
382	opt.uio_iovcnt++;
383	if (j->jailname != NULL) {
384		optiov[opt.uio_iovcnt].iov_base = "name";
385		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
386		opt.uio_iovcnt++;
387		optiov[opt.uio_iovcnt].iov_base = u_name;
388		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
389		    &optiov[opt.uio_iovcnt].iov_len);
390		if (error) {
391			free(u_path, M_TEMP);
392			return (error);
393		}
394		opt.uio_iovcnt++;
395	}
396#ifdef INET
397	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
398	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
399	opt.uio_iovcnt++;
400	optiov[opt.uio_iovcnt].iov_base = u_ip4;
401	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
402	if (j->version == 0)
403		u_ip4->s_addr = j->ip4s;
404	else {
405		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
406		if (error) {
407			free(u_path, M_TEMP);
408			return (error);
409		}
410	}
411	opt.uio_iovcnt++;
412#endif
413#ifdef INET6
414	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
415	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
416	opt.uio_iovcnt++;
417	optiov[opt.uio_iovcnt].iov_base = u_ip6;
418	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
419	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
420	if (error) {
421		free(u_path, M_TEMP);
422		return (error);
423	}
424	opt.uio_iovcnt++;
425#endif
426	KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
427	    ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
428	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
429	free(u_path, M_TEMP);
430	return (error);
431}
432
433
434/*
435 * struct jail_set_args {
436 *	struct iovec *iovp;
437 *	unsigned int iovcnt;
438 *	int flags;
439 * };
440 */
441int
442jail_set(struct thread *td, struct jail_set_args *uap)
443{
444	struct uio *auio;
445	int error;
446
447	/* Check that we have an even number of iovecs. */
448	if (uap->iovcnt & 1)
449		return (EINVAL);
450
451	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
452	if (error)
453		return (error);
454	error = kern_jail_set(td, auio, uap->flags);
455	free(auio, M_IOV);
456	return (error);
457}
458
459int
460kern_jail_set(struct thread *td, struct uio *optuio, int flags)
461{
462	struct nameidata nd;
463#ifdef INET
464	struct in_addr *ip4;
465#endif
466#ifdef INET6
467	struct in6_addr *ip6;
468#endif
469	struct vfsopt *opt;
470	struct vfsoptlist *opts;
471	struct prison *pr, *deadpr, *mypr, *ppr, *tpr, *tppr;
472	struct vnode *root;
473	char *domain, *errmsg, *host, *name, *p, *path, *uuid;
474#if defined(INET) || defined(INET6)
475	void *op;
476#endif
477	unsigned long hid;
478	size_t namelen, onamelen;
479	int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
480	int gotchildmax, gotenforce, gothid, gotslevel;
481	int fi, jid, jsys, len, level;
482	int childmax, slevel, vfslocked;
483#if defined(INET) || defined(INET6)
484	int ii, ij;
485#endif
486#ifdef INET
487	int ip4s, ip4a, redo_ip4;
488#endif
489#ifdef INET6
490	int ip6s, ip6a, redo_ip6;
491#endif
492	unsigned pr_flags, ch_flags;
493	unsigned pr_allow, ch_allow, tallow;
494	char numbuf[12];
495
496	error = priv_check(td, PRIV_JAIL_SET);
497	if (!error && (flags & JAIL_ATTACH))
498		error = priv_check(td, PRIV_JAIL_ATTACH);
499	if (error)
500		return (error);
501	mypr = ppr = td->td_ucred->cr_prison;
502	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
503		return (EPERM);
504	if (flags & ~JAIL_SET_MASK)
505		return (EINVAL);
506
507	/*
508	 * Check all the parameters before committing to anything.  Not all
509	 * errors can be caught early, but we may as well try.  Also, this
510	 * takes care of some expensive stuff (path lookup) before getting
511	 * the allprison lock.
512	 *
513	 * XXX Jails are not filesystems, and jail parameters are not mount
514	 *     options.  But it makes more sense to re-use the vfsopt code
515	 *     than duplicate it under a different name.
516	 */
517	error = vfs_buildopts(optuio, &opts);
518	if (error)
519		return (error);
520#ifdef INET
521	ip4a = 0;
522	ip4 = NULL;
523#endif
524#ifdef INET6
525	ip6a = 0;
526	ip6 = NULL;
527#endif
528
529#if defined(INET) || defined(INET6)
530 again:
531#endif
532	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
533	if (error == ENOENT)
534		jid = 0;
535	else if (error != 0)
536		goto done_free;
537
538	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
539	if (error == ENOENT)
540		gotslevel = 0;
541	else if (error != 0)
542		goto done_free;
543	else
544		gotslevel = 1;
545
546	error =
547	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
548	if (error == ENOENT)
549		gotchildmax = 0;
550	else if (error != 0)
551		goto done_free;
552	else
553		gotchildmax = 1;
554
555	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
556	gotenforce = (error == 0);
557	if (gotenforce) {
558		if (enforce < 0 || enforce > 2)
559			return (EINVAL);
560	} else if (error != ENOENT)
561		goto done_free;
562
563	pr_flags = ch_flags = 0;
564	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
565	    fi++) {
566		if (pr_flag_names[fi] == NULL)
567			continue;
568		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
569		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
570	}
571	ch_flags |= pr_flags;
572	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
573	    fi++) {
574		error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
575		    sizeof(jsys));
576		if (error == ENOENT)
577			continue;
578		if (error != 0)
579			goto done_free;
580		switch (jsys) {
581		case JAIL_SYS_DISABLE:
582			if (!pr_flag_jailsys[fi].disable) {
583				error = EINVAL;
584				goto done_free;
585			}
586			pr_flags |= pr_flag_jailsys[fi].disable;
587			break;
588		case JAIL_SYS_NEW:
589			pr_flags |= pr_flag_jailsys[fi].new;
590			break;
591		case JAIL_SYS_INHERIT:
592			break;
593		default:
594			error = EINVAL;
595			goto done_free;
596		}
597		ch_flags |=
598		    pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
599	}
600	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
601	    && !(pr_flags & PR_PERSIST)) {
602		error = EINVAL;
603		vfs_opterror(opts, "new jail must persist or attach");
604		goto done_errmsg;
605	}
606#ifdef VIMAGE
607	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
608		error = EINVAL;
609		vfs_opterror(opts, "vnet cannot be changed after creation");
610		goto done_errmsg;
611	}
612#endif
613
614	pr_allow = ch_allow = 0;
615	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
616	    fi++) {
617		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
618		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
619	}
620	ch_allow |= pr_allow;
621
622	error = vfs_getopt(opts, "name", (void **)&name, &len);
623	if (error == ENOENT)
624		name = NULL;
625	else if (error != 0)
626		goto done_free;
627	else {
628		if (len == 0 || name[len - 1] != '\0') {
629			error = EINVAL;
630			goto done_free;
631		}
632		if (len > MAXHOSTNAMELEN) {
633			error = ENAMETOOLONG;
634			goto done_free;
635		}
636	}
637
638	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
639	if (error == ENOENT)
640		host = NULL;
641	else if (error != 0)
642		goto done_free;
643	else {
644		ch_flags |= PR_HOST;
645		pr_flags |= PR_HOST;
646		if (len == 0 || host[len - 1] != '\0') {
647			error = EINVAL;
648			goto done_free;
649		}
650		if (len > MAXHOSTNAMELEN) {
651			error = ENAMETOOLONG;
652			goto done_free;
653		}
654	}
655
656	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
657	if (error == ENOENT)
658		domain = NULL;
659	else if (error != 0)
660		goto done_free;
661	else {
662		ch_flags |= PR_HOST;
663		pr_flags |= PR_HOST;
664		if (len == 0 || domain[len - 1] != '\0') {
665			error = EINVAL;
666			goto done_free;
667		}
668		if (len > MAXHOSTNAMELEN) {
669			error = ENAMETOOLONG;
670			goto done_free;
671		}
672	}
673
674	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
675	if (error == ENOENT)
676		uuid = NULL;
677	else if (error != 0)
678		goto done_free;
679	else {
680		ch_flags |= PR_HOST;
681		pr_flags |= PR_HOST;
682		if (len == 0 || uuid[len - 1] != '\0') {
683			error = EINVAL;
684			goto done_free;
685		}
686		if (len > HOSTUUIDLEN) {
687			error = ENAMETOOLONG;
688			goto done_free;
689		}
690	}
691
692#ifdef COMPAT_IA32
693	if (td->td_proc->p_sysent->sv_flags & SV_IA32) {
694		uint32_t hid32;
695
696		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
697		hid = hid32;
698	} else
699#endif
700		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
701	if (error == ENOENT)
702		gothid = 0;
703	else if (error != 0)
704		goto done_free;
705	else {
706		gothid = 1;
707		ch_flags |= PR_HOST;
708		pr_flags |= PR_HOST;
709	}
710
711	/* This might be the second time around for this option. */
712#ifdef INET
713	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
714	if (error == ENOENT)
715		ip4s = (pr_flags & PR_IP4_DISABLE) ? 0 : -1;
716	else if (error != 0)
717		goto done_free;
718	else if (ip4s & (sizeof(*ip4) - 1)) {
719		error = EINVAL;
720		goto done_free;
721	} else {
722		ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
723		if (ip4s == 0)
724			pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
725		else {
726			pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
727			ip4s /= sizeof(*ip4);
728			if (ip4s > jail_max_af_ips) {
729				error = EINVAL;
730				vfs_opterror(opts, "too many IPv4 addresses");
731				goto done_errmsg;
732			}
733			if (ip4a < ip4s) {
734				ip4a = ip4s;
735				free(ip4, M_PRISON);
736				ip4 = NULL;
737			}
738			if (ip4 == NULL)
739				ip4 = malloc(ip4a * sizeof(*ip4), M_PRISON,
740				    M_WAITOK);
741			bcopy(op, ip4, ip4s * sizeof(*ip4));
742			/*
743			 * IP addresses are all sorted but ip[0] to preserve
744			 * the primary IP address as given from userland.
745			 * This special IP is used for unbound outgoing
746			 * connections as well for "loopback" traffic.
747			 */
748			if (ip4s > 1)
749				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
750			/*
751			 * Check for duplicate addresses and do some simple
752			 * zero and broadcast checks. If users give other bogus
753			 * addresses it is their problem.
754			 *
755			 * We do not have to care about byte order for these
756			 * checks so we will do them in NBO.
757			 */
758			for (ii = 0; ii < ip4s; ii++) {
759				if (ip4[ii].s_addr == INADDR_ANY ||
760				    ip4[ii].s_addr == INADDR_BROADCAST) {
761					error = EINVAL;
762					goto done_free;
763				}
764				if ((ii+1) < ip4s &&
765				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
766				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
767					error = EINVAL;
768					goto done_free;
769				}
770			}
771		}
772	}
773#endif
774
775#ifdef INET6
776	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
777	if (error == ENOENT)
778		ip6s = (pr_flags & PR_IP6_DISABLE) ? 0 : -1;
779	else if (error != 0)
780		goto done_free;
781	else if (ip6s & (sizeof(*ip6) - 1)) {
782		error = EINVAL;
783		goto done_free;
784	} else {
785		ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
786		if (ip6s == 0)
787			pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
788		else {
789			pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
790			ip6s /= sizeof(*ip6);
791			if (ip6s > jail_max_af_ips) {
792				error = EINVAL;
793				vfs_opterror(opts, "too many IPv6 addresses");
794				goto done_errmsg;
795			}
796			if (ip6a < ip6s) {
797				ip6a = ip6s;
798				free(ip6, M_PRISON);
799				ip6 = NULL;
800			}
801			if (ip6 == NULL)
802				ip6 = malloc(ip6a * sizeof(*ip6), M_PRISON,
803				    M_WAITOK);
804			bcopy(op, ip6, ip6s * sizeof(*ip6));
805			if (ip6s > 1)
806				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
807			for (ii = 0; ii < ip6s; ii++) {
808				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
809					error = EINVAL;
810					goto done_free;
811				}
812				if ((ii+1) < ip6s &&
813				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
814				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
815				{
816					error = EINVAL;
817					goto done_free;
818				}
819			}
820		}
821	}
822#endif
823
824#if defined(VIMAGE) && (defined(INET) || defined(INET6))
825	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
826		error = EINVAL;
827		vfs_opterror(opts,
828		    "vnet jails cannot have IP address restrictions");
829		goto done_errmsg;
830	}
831#endif
832
833	root = NULL;
834	error = vfs_getopt(opts, "path", (void **)&path, &len);
835	if (error == ENOENT)
836		path = NULL;
837	else if (error != 0)
838		goto done_free;
839	else {
840		if (flags & JAIL_UPDATE) {
841			error = EINVAL;
842			vfs_opterror(opts,
843			    "path cannot be changed after creation");
844			goto done_errmsg;
845		}
846		if (len == 0 || path[len - 1] != '\0') {
847			error = EINVAL;
848			goto done_free;
849		}
850		if (len < 2 || (len == 2 && path[0] == '/'))
851			path = NULL;
852		else {
853			/* Leave room for a real-root full pathname. */
854			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
855			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
856				error = ENAMETOOLONG;
857				goto done_free;
858			}
859			NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_SYSSPACE,
860			    path, td);
861			error = namei(&nd);
862			if (error)
863				goto done_free;
864			vfslocked = NDHASGIANT(&nd);
865			root = nd.ni_vp;
866			NDFREE(&nd, NDF_ONLY_PNBUF);
867			if (root->v_type != VDIR) {
868				error = ENOTDIR;
869				vrele(root);
870				VFS_UNLOCK_GIANT(vfslocked);
871				goto done_free;
872			}
873			VFS_UNLOCK_GIANT(vfslocked);
874		}
875	}
876
877	/*
878	 * Grab the allprison lock before letting modules check their
879	 * parameters.  Once we have it, do not let go so we'll have a
880	 * consistent view of the OSD list.
881	 */
882	sx_xlock(&allprison_lock);
883	error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
884	if (error)
885		goto done_unlock_list;
886
887	/* By now, all parameters should have been noted. */
888	TAILQ_FOREACH(opt, opts, link) {
889		if (!opt->seen && strcmp(opt->name, "errmsg")) {
890			error = EINVAL;
891			vfs_opterror(opts, "unknown parameter: %s", opt->name);
892			goto done_unlock_list;
893		}
894	}
895
896	/*
897	 * See if we are creating a new record or updating an existing one.
898	 * This abuses the file error codes ENOENT and EEXIST.
899	 */
900	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
901	if (!cuflags) {
902		error = EINVAL;
903		vfs_opterror(opts, "no valid operation (create or update)");
904		goto done_unlock_list;
905	}
906	pr = NULL;
907	if (jid != 0) {
908		/*
909		 * See if a requested jid already exists.  There is an
910		 * information leak here if the jid exists but is not within
911		 * the caller's jail hierarchy.  Jail creators will get EEXIST
912		 * even though they cannot see the jail, and CREATE | UPDATE
913		 * will return ENOENT which is not normally a valid error.
914		 */
915		if (jid < 0) {
916			error = EINVAL;
917			vfs_opterror(opts, "negative jid");
918			goto done_unlock_list;
919		}
920		pr = prison_find(jid);
921		if (pr != NULL) {
922			ppr = pr->pr_parent;
923			/* Create: jid must not exist. */
924			if (cuflags == JAIL_CREATE) {
925				mtx_unlock(&pr->pr_mtx);
926				error = EEXIST;
927				vfs_opterror(opts, "jail %d already exists",
928				    jid);
929				goto done_unlock_list;
930			}
931			if (!prison_ischild(mypr, pr)) {
932				mtx_unlock(&pr->pr_mtx);
933				pr = NULL;
934			} else if (pr->pr_uref == 0) {
935				if (!(flags & JAIL_DYING)) {
936					mtx_unlock(&pr->pr_mtx);
937					error = ENOENT;
938					vfs_opterror(opts, "jail %d is dying",
939					    jid);
940					goto done_unlock_list;
941				} else if ((flags & JAIL_ATTACH) ||
942				    (pr_flags & PR_PERSIST)) {
943					/*
944					 * A dying jail might be resurrected
945					 * (via attach or persist), but first
946					 * it must determine if another jail
947					 * has claimed its name.  Accomplish
948					 * this by implicitly re-setting the
949					 * name.
950					 */
951					if (name == NULL)
952						name = prison_name(mypr, pr);
953				}
954			}
955		}
956		if (pr == NULL) {
957			/* Update: jid must exist. */
958			if (cuflags == JAIL_UPDATE) {
959				error = ENOENT;
960				vfs_opterror(opts, "jail %d not found", jid);
961				goto done_unlock_list;
962			}
963		}
964	}
965	/*
966	 * If the caller provided a name, look for a jail by that name.
967	 * This has different semantics for creates and updates keyed by jid
968	 * (where the name must not already exist in a different jail),
969	 * and updates keyed by the name itself (where the name must exist
970	 * because that is the jail being updated).
971	 */
972	if (name != NULL) {
973		p = strrchr(name, '.');
974		if (p != NULL) {
975			/*
976			 * This is a hierarchical name.  Split it into the
977			 * parent and child names, and make sure the parent
978			 * exists or matches an already found jail.
979			 */
980			*p = '\0';
981			if (pr != NULL) {
982				if (strncmp(name, ppr->pr_name, p - name) ||
983				    ppr->pr_name[p - name] != '\0') {
984					mtx_unlock(&pr->pr_mtx);
985					error = EINVAL;
986					vfs_opterror(opts,
987					    "cannot change jail's parent");
988					goto done_unlock_list;
989				}
990			} else {
991				ppr = prison_find_name(mypr, name);
992				if (ppr == NULL) {
993					error = ENOENT;
994					vfs_opterror(opts,
995					    "jail \"%s\" not found", name);
996					goto done_unlock_list;
997				}
998				mtx_unlock(&ppr->pr_mtx);
999			}
1000			name = p + 1;
1001		}
1002		if (name[0] != '\0') {
1003			namelen =
1004			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1005 name_again:
1006			deadpr = NULL;
1007			FOREACH_PRISON_CHILD(ppr, tpr) {
1008				if (tpr != pr && tpr->pr_ref > 0 &&
1009				    !strcmp(tpr->pr_name + namelen, name)) {
1010					if (pr == NULL &&
1011					    cuflags != JAIL_CREATE) {
1012						mtx_lock(&tpr->pr_mtx);
1013						if (tpr->pr_ref > 0) {
1014							/*
1015							 * Use this jail
1016							 * for updates.
1017							 */
1018							if (tpr->pr_uref > 0) {
1019								pr = tpr;
1020								break;
1021							}
1022							deadpr = tpr;
1023						}
1024						mtx_unlock(&tpr->pr_mtx);
1025					} else if (tpr->pr_uref > 0) {
1026						/*
1027						 * Create, or update(jid):
1028						 * name must not exist in an
1029						 * active sibling jail.
1030						 */
1031						error = EEXIST;
1032						if (pr != NULL)
1033							mtx_unlock(&pr->pr_mtx);
1034						vfs_opterror(opts,
1035						   "jail \"%s\" already exists",
1036						   name);
1037						goto done_unlock_list;
1038					}
1039				}
1040			}
1041			/* If no active jail is found, use a dying one. */
1042			if (deadpr != NULL && pr == NULL) {
1043				if (flags & JAIL_DYING) {
1044					mtx_lock(&deadpr->pr_mtx);
1045					if (deadpr->pr_ref == 0) {
1046						mtx_unlock(&deadpr->pr_mtx);
1047						goto name_again;
1048					}
1049					pr = deadpr;
1050				} else if (cuflags == JAIL_UPDATE) {
1051					error = ENOENT;
1052					vfs_opterror(opts,
1053					    "jail \"%s\" is dying", name);
1054					goto done_unlock_list;
1055				}
1056			}
1057			/* Update: name must exist if no jid. */
1058			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1059				error = ENOENT;
1060				vfs_opterror(opts, "jail \"%s\" not found",
1061				    name);
1062				goto done_unlock_list;
1063			}
1064		}
1065	}
1066	/* Update: must provide a jid or name. */
1067	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1068		error = ENOENT;
1069		vfs_opterror(opts, "update specified no jail");
1070		goto done_unlock_list;
1071	}
1072
1073	/* If there's no prison to update, create a new one and link it in. */
1074	if (pr == NULL) {
1075		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1076			if (tpr->pr_childcount >= tpr->pr_childmax) {
1077				error = EPERM;
1078				vfs_opterror(opts, "prison limit exceeded");
1079				goto done_unlock_list;
1080			}
1081		created = 1;
1082		mtx_lock(&ppr->pr_mtx);
1083		if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
1084			mtx_unlock(&ppr->pr_mtx);
1085			error = ENOENT;
1086			vfs_opterror(opts, "parent jail went away!");
1087			goto done_unlock_list;
1088		}
1089		ppr->pr_ref++;
1090		ppr->pr_uref++;
1091		mtx_unlock(&ppr->pr_mtx);
1092		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1093		if (jid == 0) {
1094			/* Find the next free jid. */
1095			jid = lastprid + 1;
1096 findnext:
1097			if (jid == JAIL_MAX)
1098				jid = 1;
1099			TAILQ_FOREACH(tpr, &allprison, pr_list) {
1100				if (tpr->pr_id < jid)
1101					continue;
1102				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1103					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1104					break;
1105				}
1106				if (jid == lastprid) {
1107					error = EAGAIN;
1108					vfs_opterror(opts,
1109					    "no available jail IDs");
1110					free(pr, M_PRISON);
1111					prison_deref(ppr, PD_DEREF |
1112					    PD_DEUREF | PD_LIST_XLOCKED);
1113					goto done_releroot;
1114				}
1115				jid++;
1116				goto findnext;
1117			}
1118			lastprid = jid;
1119		} else {
1120			/*
1121			 * The jail already has a jid (that did not yet exist),
1122			 * so just find where to insert it.
1123			 */
1124			TAILQ_FOREACH(tpr, &allprison, pr_list)
1125				if (tpr->pr_id >= jid) {
1126					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1127					break;
1128				}
1129		}
1130		if (tpr == NULL)
1131			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1132		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1133		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1134			tpr->pr_childcount++;
1135
1136		pr->pr_parent = ppr;
1137		pr->pr_id = jid;
1138
1139		/* Set some default values, and inherit some from the parent. */
1140		if (name == NULL)
1141			name = "";
1142		if (path == NULL) {
1143			path = "/";
1144			root = mypr->pr_root;
1145			vref(root);
1146		}
1147		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1148		pr->pr_flags |= PR_HOST;
1149#if defined(INET) || defined(INET6)
1150#ifdef VIMAGE
1151		if (!(pr_flags & PR_VNET))
1152#endif
1153		{
1154#ifdef INET
1155			pr->pr_flags |= PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
1156#endif
1157#ifdef INET6
1158			pr->pr_flags |= PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
1159#endif
1160		}
1161#endif
1162		pr->pr_securelevel = ppr->pr_securelevel;
1163		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1164		pr->pr_enforce_statfs = ppr->pr_enforce_statfs;
1165
1166		LIST_INIT(&pr->pr_children);
1167		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1168
1169#ifdef VIMAGE
1170		/* Allocate a new vnet if specified. */
1171		pr->pr_vnet = (pr_flags & PR_VNET)
1172		    ? vnet_alloc() : ppr->pr_vnet;
1173#endif
1174		/*
1175		 * Allocate a dedicated cpuset for each jail.
1176		 * Unlike other initial settings, this may return an erorr.
1177		 */
1178		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1179		if (error) {
1180			prison_deref(pr, PD_LIST_XLOCKED);
1181			goto done_releroot;
1182		}
1183
1184		mtx_lock(&pr->pr_mtx);
1185		/*
1186		 * New prisons do not yet have a reference, because we do not
1187		 * want other to see the incomplete prison once the
1188		 * allprison_lock is downgraded.
1189		 */
1190	} else {
1191		created = 0;
1192#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1193		if ((pr->pr_flags & PR_VNET) &&
1194		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1195			error = EINVAL;
1196			vfs_opterror(opts,
1197			    "vnet jails cannot have IP address restrictions");
1198			goto done_deref_locked;
1199		}
1200#endif
1201		/*
1202		 * Grab a reference for existing prisons, to ensure they
1203		 * continue to exist for the duration of the call.
1204		 */
1205		pr->pr_ref++;
1206	}
1207
1208	/* Do final error checking before setting anything. */
1209	if (gotslevel) {
1210		if (slevel < ppr->pr_securelevel) {
1211			error = EPERM;
1212			goto done_deref_locked;
1213		}
1214	}
1215	if (gotchildmax) {
1216		if (childmax >= ppr->pr_childmax) {
1217			error = EPERM;
1218			goto done_deref_locked;
1219		}
1220	}
1221	if (gotenforce) {
1222		if (enforce < ppr->pr_enforce_statfs) {
1223			error = EPERM;
1224			goto done_deref_locked;
1225		}
1226	}
1227#ifdef INET
1228	if (ch_flags & PR_IP4_USER) {
1229		if (ppr->pr_flags & PR_IP4) {
1230			if (!(pr_flags & PR_IP4_USER)) {
1231				/*
1232				 * Silently ignore attempts to make the IP
1233				 * addresses unrestricted when the parent is
1234				 * restricted; in other words, interpret
1235				 * "unrestricted" as "as unrestricted as
1236				 * possible".
1237				 */
1238				ip4s = ppr->pr_ip4s;
1239				if (ip4s == 0) {
1240					free(ip4, M_PRISON);
1241					ip4 = NULL;
1242				} else if (ip4s <= ip4a) {
1243					/* Inherit the parent's address(es). */
1244					bcopy(ppr->pr_ip4, ip4,
1245					    ip4s * sizeof(*ip4));
1246				} else {
1247					/*
1248					 * There's no room for the parent's
1249					 * address list.  Allocate some more.
1250					 */
1251					ip4a = ip4s;
1252					free(ip4, M_PRISON);
1253					ip4 = malloc(ip4a * sizeof(*ip4),
1254					    M_PRISON, M_NOWAIT);
1255					if (ip4 != NULL)
1256						bcopy(ppr->pr_ip4, ip4,
1257						    ip4s * sizeof(*ip4));
1258					else {
1259						/* Allocation failed without
1260						 * sleeping.  Unlocking the
1261						 * prison now will invalidate
1262						 * some checks and prematurely
1263						 * show an unfinished new jail.
1264						 * So let go of everything and
1265						 * start over.
1266						 */
1267						prison_deref(pr, created
1268						    ? PD_LOCKED |
1269						      PD_LIST_XLOCKED
1270						    : PD_DEREF | PD_LOCKED |
1271						      PD_LIST_XLOCKED);
1272						if (root != NULL) {
1273							vfslocked =
1274							    VFS_LOCK_GIANT(
1275							    root->v_mount);
1276							vrele(root);
1277							VFS_UNLOCK_GIANT(
1278							    vfslocked);
1279						}
1280						ip4 = malloc(ip4a *
1281						    sizeof(*ip4), M_PRISON,
1282						    M_WAITOK);
1283						goto again;
1284					}
1285				}
1286			} else if (ip4s > 0) {
1287				/*
1288				 * Make sure the new set of IP addresses is a
1289				 * subset of the parent's list.  Don't worry
1290				 * about the parent being unlocked, as any
1291				 * setting is done with allprison_lock held.
1292				 */
1293				for (ij = 0; ij < ppr->pr_ip4s; ij++)
1294					if (ip4[0].s_addr ==
1295					    ppr->pr_ip4[ij].s_addr)
1296						break;
1297				if (ij == ppr->pr_ip4s) {
1298					error = EPERM;
1299					goto done_deref_locked;
1300				}
1301				if (ip4s > 1) {
1302					for (ii = ij = 1; ii < ip4s; ii++) {
1303						if (ip4[ii].s_addr ==
1304						    ppr->pr_ip4[0].s_addr)
1305							continue;
1306						for (; ij < ppr->pr_ip4s; ij++)
1307						    if (ip4[ii].s_addr ==
1308							ppr->pr_ip4[ij].s_addr)
1309							    break;
1310						if (ij == ppr->pr_ip4s)
1311							break;
1312					}
1313					if (ij == ppr->pr_ip4s) {
1314						error = EPERM;
1315						goto done_deref_locked;
1316					}
1317				}
1318			}
1319		}
1320		if (ip4s > 0) {
1321			/*
1322			 * Check for conflicting IP addresses.  We permit them
1323			 * if there is no more than one IP on each jail.  If
1324			 * there is a duplicate on a jail with more than one
1325			 * IP stop checking and return error.
1326			 */
1327			tppr = ppr;
1328#ifdef VIMAGE
1329			for (; tppr != &prison0; tppr = tppr->pr_parent)
1330				if (tppr->pr_flags & PR_VNET)
1331					break;
1332#endif
1333			FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1334				if (tpr == pr ||
1335#ifdef VIMAGE
1336				    (tpr != tppr &&
1337				     (tpr->pr_flags & PR_VNET)) ||
1338#endif
1339				    tpr->pr_uref == 0) {
1340					descend = 0;
1341					continue;
1342				}
1343				if (!(tpr->pr_flags & PR_IP4_USER))
1344					continue;
1345				descend = 0;
1346				if (tpr->pr_ip4 == NULL ||
1347				    (ip4s == 1 && tpr->pr_ip4s == 1))
1348					continue;
1349				for (ii = 0; ii < ip4s; ii++) {
1350					if (_prison_check_ip4(tpr,
1351					    &ip4[ii]) == 0) {
1352						error = EADDRINUSE;
1353						vfs_opterror(opts,
1354						    "IPv4 addresses clash");
1355						goto done_deref_locked;
1356					}
1357				}
1358			}
1359		}
1360	}
1361#endif
1362#ifdef INET6
1363	if (ch_flags & PR_IP6_USER) {
1364		if (ppr->pr_flags & PR_IP6) {
1365			if (!(pr_flags & PR_IP6_USER)) {
1366				/*
1367				 * Silently ignore attempts to make the IP
1368				 * addresses unrestricted when the parent is
1369				 * restricted.
1370				 */
1371				ip6s = ppr->pr_ip6s;
1372				if (ip6s == 0) {
1373					free(ip6, M_PRISON);
1374					ip6 = NULL;
1375				} else if (ip6s <= ip6a) {
1376					/* Inherit the parent's address(es). */
1377					bcopy(ppr->pr_ip6, ip6,
1378					    ip6s * sizeof(*ip6));
1379				} else {
1380					/*
1381					 * There's no room for the parent's
1382					 * address list.
1383					 */
1384					ip6a = ip6s;
1385					free(ip6, M_PRISON);
1386					ip6 = malloc(ip6a * sizeof(*ip6),
1387					    M_PRISON, M_NOWAIT);
1388					if (ip6 != NULL)
1389						bcopy(ppr->pr_ip6, ip6,
1390						    ip6s * sizeof(*ip6));
1391					else {
1392						prison_deref(pr, created
1393						    ? PD_LOCKED |
1394						      PD_LIST_XLOCKED
1395						    : PD_DEREF | PD_LOCKED |
1396						      PD_LIST_XLOCKED);
1397						if (root != NULL) {
1398							vfslocked =
1399							    VFS_LOCK_GIANT(
1400							    root->v_mount);
1401							vrele(root);
1402							VFS_UNLOCK_GIANT(
1403							    vfslocked);
1404						}
1405						ip6 = malloc(ip6a *
1406						    sizeof(*ip6), M_PRISON,
1407						    M_WAITOK);
1408						goto again;
1409					}
1410				}
1411			} else if (ip6s > 0) {
1412				/*
1413				 * Make sure the new set of IP addresses is a
1414				 * subset of the parent's list.
1415				 */
1416				for (ij = 0; ij < ppr->pr_ip6s; ij++)
1417					if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1418					    &ppr->pr_ip6[ij]))
1419						break;
1420				if (ij == ppr->pr_ip6s) {
1421					error = EPERM;
1422					goto done_deref_locked;
1423				}
1424				if (ip6s > 1) {
1425					for (ii = ij = 1; ii < ip6s; ii++) {
1426						if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1427						    &ppr->pr_ip6[0]))
1428							continue;
1429						for (; ij < ppr->pr_ip6s; ij++)
1430							if (IN6_ARE_ADDR_EQUAL(
1431							    &ip6[ii],
1432							    &ppr->pr_ip6[ij]))
1433								break;
1434						if (ij == ppr->pr_ip6s)
1435							break;
1436					}
1437					if (ij == ppr->pr_ip6s) {
1438						error = EPERM;
1439						goto done_deref_locked;
1440					}
1441				}
1442			}
1443		}
1444		if (ip6s > 0) {
1445			/* Check for conflicting IP addresses. */
1446			tppr = ppr;
1447#ifdef VIMAGE
1448			for (; tppr != &prison0; tppr = tppr->pr_parent)
1449				if (tppr->pr_flags & PR_VNET)
1450					break;
1451#endif
1452			FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1453				if (tpr == pr ||
1454#ifdef VIMAGE
1455				    (tpr != tppr &&
1456				     (tpr->pr_flags & PR_VNET)) ||
1457#endif
1458				    tpr->pr_uref == 0) {
1459					descend = 0;
1460					continue;
1461				}
1462				if (!(tpr->pr_flags & PR_IP6_USER))
1463					continue;
1464				descend = 0;
1465				if (tpr->pr_ip6 == NULL ||
1466				    (ip6s == 1 && tpr->pr_ip6s == 1))
1467					continue;
1468				for (ii = 0; ii < ip6s; ii++) {
1469					if (_prison_check_ip6(tpr,
1470					    &ip6[ii]) == 0) {
1471						error = EADDRINUSE;
1472						vfs_opterror(opts,
1473						    "IPv6 addresses clash");
1474						goto done_deref_locked;
1475					}
1476				}
1477			}
1478		}
1479	}
1480#endif
1481	onamelen = namelen = 0;
1482	if (name != NULL) {
1483		/* Give a default name of the jid. */
1484		if (name[0] == '\0')
1485			snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
1486		else if (strtoul(name, &p, 10) != jid && *p == '\0') {
1487			error = EINVAL;
1488			vfs_opterror(opts, "name cannot be numeric");
1489			goto done_deref_locked;
1490		}
1491		/*
1492		 * Make sure the name isn't too long for the prison or its
1493		 * children.
1494		 */
1495		onamelen = strlen(pr->pr_name);
1496		namelen = strlen(name);
1497		if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
1498			error = ENAMETOOLONG;
1499			goto done_deref_locked;
1500		}
1501		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1502			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1503			    sizeof(pr->pr_name)) {
1504				error = ENAMETOOLONG;
1505				goto done_deref_locked;
1506			}
1507		}
1508	}
1509	if (pr_allow & ~ppr->pr_allow) {
1510		error = EPERM;
1511		goto done_deref_locked;
1512	}
1513
1514	/* Set the parameters of the prison. */
1515#ifdef INET
1516	redo_ip4 = 0;
1517	if (ch_flags & PR_IP4_USER) {
1518		if (pr_flags & PR_IP4_USER) {
1519			/* Some restriction set. */
1520			pr->pr_flags |= PR_IP4;
1521			if (ip4s >= 0) {
1522				free(pr->pr_ip4, M_PRISON);
1523				pr->pr_ip4s = ip4s;
1524				pr->pr_ip4 = ip4;
1525				ip4 = NULL;
1526			}
1527		} else if (ppr->pr_flags & PR_IP4) {
1528			/* This restriction cleared, but keep inherited. */
1529			free(pr->pr_ip4, M_PRISON);
1530			pr->pr_ip4s = ip4s;
1531			pr->pr_ip4 = ip4;
1532			ip4 = NULL;
1533		} else {
1534			/* Restriction cleared, now unrestricted. */
1535			pr->pr_flags &= ~PR_IP4;
1536			free(pr->pr_ip4, M_PRISON);
1537			pr->pr_ip4s = 0;
1538		}
1539		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1540#ifdef VIMAGE
1541			if (tpr->pr_flags & PR_VNET) {
1542				descend = 0;
1543				continue;
1544			}
1545#endif
1546			if (prison_restrict_ip4(tpr, NULL)) {
1547				redo_ip4 = 1;
1548				descend = 0;
1549			}
1550		}
1551	}
1552#endif
1553#ifdef INET6
1554	redo_ip6 = 0;
1555	if (ch_flags & PR_IP6_USER) {
1556		if (pr_flags & PR_IP6_USER) {
1557			/* Some restriction set. */
1558			pr->pr_flags |= PR_IP6;
1559			if (ip6s >= 0) {
1560				free(pr->pr_ip6, M_PRISON);
1561				pr->pr_ip6s = ip6s;
1562				pr->pr_ip6 = ip6;
1563				ip6 = NULL;
1564			}
1565		} else if (ppr->pr_flags & PR_IP6) {
1566			/* This restriction cleared, but keep inherited. */
1567			free(pr->pr_ip6, M_PRISON);
1568			pr->pr_ip6s = ip6s;
1569			pr->pr_ip6 = ip6;
1570			ip6 = NULL;
1571		} else {
1572			/* Restriction cleared, now unrestricted. */
1573			pr->pr_flags &= ~PR_IP6;
1574			free(pr->pr_ip6, M_PRISON);
1575			pr->pr_ip6s = 0;
1576		}
1577		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1578#ifdef VIMAGE
1579			if (tpr->pr_flags & PR_VNET) {
1580				descend = 0;
1581				continue;
1582			}
1583#endif
1584			if (prison_restrict_ip6(tpr, NULL)) {
1585				redo_ip6 = 1;
1586				descend = 0;
1587			}
1588		}
1589	}
1590#endif
1591	if (gotslevel) {
1592		pr->pr_securelevel = slevel;
1593		/* Set all child jails to be at least this level. */
1594		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1595			if (tpr->pr_securelevel < slevel)
1596				tpr->pr_securelevel = slevel;
1597	}
1598	if (gotchildmax) {
1599		pr->pr_childmax = childmax;
1600		/* Set all child jails to under this limit. */
1601		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1602			if (tpr->pr_childmax > childmax - level)
1603				tpr->pr_childmax = childmax > level
1604				    ? childmax - level : 0;
1605	}
1606	if (gotenforce) {
1607		pr->pr_enforce_statfs = enforce;
1608		/* Pass this restriction on to the children. */
1609		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1610			if (tpr->pr_enforce_statfs < enforce)
1611				tpr->pr_enforce_statfs = enforce;
1612	}
1613	if (name != NULL) {
1614		if (ppr == &prison0)
1615			strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
1616		else
1617			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1618			    ppr->pr_name, name);
1619		/* Change this component of child names. */
1620		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1621			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1622			    strlen(tpr->pr_name + onamelen) + 1);
1623			bcopy(pr->pr_name, tpr->pr_name, namelen);
1624		}
1625	}
1626	if (path != NULL) {
1627		/* Try to keep a real-rooted full pathname. */
1628		if (path[0] == '/' && strcmp(mypr->pr_path, "/"))
1629			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1630			    mypr->pr_path, path);
1631		else
1632			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1633		pr->pr_root = root;
1634	}
1635	if (PR_HOST & ch_flags & ~pr_flags) {
1636		if (pr->pr_flags & PR_HOST) {
1637			/*
1638			 * Copy the parent's host info.  As with pr_ip4 above,
1639			 * the lack of a lock on the parent is not a problem;
1640			 * it is always set with allprison_lock at least
1641			 * shared, and is held exclusively here.
1642			 */
1643			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1644			    sizeof(pr->pr_hostname));
1645			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1646			    sizeof(pr->pr_domainname));
1647			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1648			    sizeof(pr->pr_hostuuid));
1649			pr->pr_hostid = pr->pr_parent->pr_hostid;
1650		}
1651	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1652		/* Set this prison, and any descendants without PR_HOST. */
1653		if (host != NULL)
1654			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1655		if (domain != NULL)
1656			strlcpy(pr->pr_domainname, domain,
1657			    sizeof(pr->pr_domainname));
1658		if (uuid != NULL)
1659			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1660		if (gothid)
1661			pr->pr_hostid = hid;
1662		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1663			if (tpr->pr_flags & PR_HOST)
1664				descend = 0;
1665			else {
1666				if (host != NULL)
1667					strlcpy(tpr->pr_hostname,
1668					    pr->pr_hostname,
1669					    sizeof(tpr->pr_hostname));
1670				if (domain != NULL)
1671					strlcpy(tpr->pr_domainname,
1672					    pr->pr_domainname,
1673					    sizeof(tpr->pr_domainname));
1674				if (uuid != NULL)
1675					strlcpy(tpr->pr_hostuuid,
1676					    pr->pr_hostuuid,
1677					    sizeof(tpr->pr_hostuuid));
1678				if (gothid)
1679					tpr->pr_hostid = hid;
1680			}
1681		}
1682	}
1683	if ((tallow = ch_allow & ~pr_allow)) {
1684		/* Clear allow bits in all children. */
1685		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1686			tpr->pr_allow &= ~tallow;
1687	}
1688	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1689	/*
1690	 * Persistent prisons get an extra reference, and prisons losing their
1691	 * persist flag lose that reference.  Only do this for existing prisons
1692	 * for now, so new ones will remain unseen until after the module
1693	 * handlers have completed.
1694	 */
1695	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1696		if (pr_flags & PR_PERSIST) {
1697			pr->pr_ref++;
1698			pr->pr_uref++;
1699		} else {
1700			pr->pr_ref--;
1701			pr->pr_uref--;
1702		}
1703	}
1704	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1705	mtx_unlock(&pr->pr_mtx);
1706
1707	/* Locks may have prevented a complete restriction of child IP
1708	 * addresses.  If so, allocate some more memory and try again.
1709	 */
1710#ifdef INET
1711	while (redo_ip4) {
1712		ip4s = pr->pr_ip4s;
1713		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1714		mtx_lock(&pr->pr_mtx);
1715		redo_ip4 = 0;
1716		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1717#ifdef VIMAGE
1718			if (tpr->pr_flags & PR_VNET) {
1719				descend = 0;
1720				continue;
1721			}
1722#endif
1723			if (prison_restrict_ip4(tpr, ip4)) {
1724				if (ip4 != NULL)
1725					ip4 = NULL;
1726				else
1727					redo_ip4 = 1;
1728			}
1729		}
1730		mtx_unlock(&pr->pr_mtx);
1731	}
1732#endif
1733#ifdef INET6
1734	while (redo_ip6) {
1735		ip6s = pr->pr_ip6s;
1736		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1737		mtx_lock(&pr->pr_mtx);
1738		redo_ip6 = 0;
1739		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1740#ifdef VIMAGE
1741			if (tpr->pr_flags & PR_VNET) {
1742				descend = 0;
1743				continue;
1744			}
1745#endif
1746			if (prison_restrict_ip6(tpr, ip6)) {
1747				if (ip6 != NULL)
1748					ip6 = NULL;
1749				else
1750					redo_ip6 = 1;
1751			}
1752		}
1753		mtx_unlock(&pr->pr_mtx);
1754	}
1755#endif
1756
1757	/* Let the modules do their work. */
1758	sx_downgrade(&allprison_lock);
1759	if (created) {
1760		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1761		if (error) {
1762			prison_deref(pr, PD_LIST_SLOCKED);
1763			goto done_errmsg;
1764		}
1765	}
1766	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1767	if (error) {
1768		prison_deref(pr, created
1769		    ? PD_LIST_SLOCKED
1770		    : PD_DEREF | PD_LIST_SLOCKED);
1771		goto done_errmsg;
1772	}
1773
1774	/* Attach this process to the prison if requested. */
1775	if (flags & JAIL_ATTACH) {
1776		mtx_lock(&pr->pr_mtx);
1777		error = do_jail_attach(td, pr);
1778		if (error) {
1779			vfs_opterror(opts, "attach failed");
1780			if (!created)
1781				prison_deref(pr, PD_DEREF);
1782			goto done_errmsg;
1783		}
1784	}
1785
1786	/*
1787	 * Now that it is all there, drop the temporary reference from existing
1788	 * prisons.  Or add a reference to newly created persistent prisons
1789	 * (which was not done earlier so that the prison would not be publicly
1790	 * visible).
1791	 */
1792	if (!created) {
1793		prison_deref(pr, (flags & JAIL_ATTACH)
1794		    ? PD_DEREF
1795		    : PD_DEREF | PD_LIST_SLOCKED);
1796	} else {
1797		if (pr_flags & PR_PERSIST) {
1798			mtx_lock(&pr->pr_mtx);
1799			pr->pr_ref++;
1800			pr->pr_uref++;
1801			mtx_unlock(&pr->pr_mtx);
1802		}
1803		if (!(flags & JAIL_ATTACH))
1804			sx_sunlock(&allprison_lock);
1805	}
1806	td->td_retval[0] = pr->pr_id;
1807	goto done_errmsg;
1808
1809 done_deref_locked:
1810	prison_deref(pr, created
1811	    ? PD_LOCKED | PD_LIST_XLOCKED
1812	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1813	goto done_releroot;
1814 done_unlock_list:
1815	sx_xunlock(&allprison_lock);
1816 done_releroot:
1817	if (root != NULL) {
1818		vfslocked = VFS_LOCK_GIANT(root->v_mount);
1819		vrele(root);
1820		VFS_UNLOCK_GIANT(vfslocked);
1821	}
1822 done_errmsg:
1823	if (error) {
1824		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1825		if (errmsg_len > 0) {
1826			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1827			if (errmsg_pos > 0) {
1828				if (optuio->uio_segflg == UIO_SYSSPACE)
1829					bcopy(errmsg,
1830					   optuio->uio_iov[errmsg_pos].iov_base,
1831					   errmsg_len);
1832				else
1833					copyout(errmsg,
1834					   optuio->uio_iov[errmsg_pos].iov_base,
1835					   errmsg_len);
1836			}
1837		}
1838	}
1839 done_free:
1840#ifdef INET
1841	free(ip4, M_PRISON);
1842#endif
1843#ifdef INET6
1844	free(ip6, M_PRISON);
1845#endif
1846	vfs_freeopts(opts);
1847	return (error);
1848}
1849
1850
1851/*
1852 * struct jail_get_args {
1853 *	struct iovec *iovp;
1854 *	unsigned int iovcnt;
1855 *	int flags;
1856 * };
1857 */
1858int
1859jail_get(struct thread *td, struct jail_get_args *uap)
1860{
1861	struct uio *auio;
1862	int error;
1863
1864	/* Check that we have an even number of iovecs. */
1865	if (uap->iovcnt & 1)
1866		return (EINVAL);
1867
1868	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1869	if (error)
1870		return (error);
1871	error = kern_jail_get(td, auio, uap->flags);
1872	if (error == 0)
1873		error = copyout(auio->uio_iov, uap->iovp,
1874		    uap->iovcnt * sizeof (struct iovec));
1875	free(auio, M_IOV);
1876	return (error);
1877}
1878
1879int
1880kern_jail_get(struct thread *td, struct uio *optuio, int flags)
1881{
1882	struct prison *pr, *mypr;
1883	struct vfsopt *opt;
1884	struct vfsoptlist *opts;
1885	char *errmsg, *name;
1886	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
1887
1888	if (flags & ~JAIL_GET_MASK)
1889		return (EINVAL);
1890
1891	/* Get the parameter list. */
1892	error = vfs_buildopts(optuio, &opts);
1893	if (error)
1894		return (error);
1895	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
1896	mypr = td->td_ucred->cr_prison;
1897
1898	/*
1899	 * Find the prison specified by one of: lastjid, jid, name.
1900	 */
1901	sx_slock(&allprison_lock);
1902	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
1903	if (error == 0) {
1904		TAILQ_FOREACH(pr, &allprison, pr_list) {
1905			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
1906				mtx_lock(&pr->pr_mtx);
1907				if (pr->pr_ref > 0 &&
1908				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
1909					break;
1910				mtx_unlock(&pr->pr_mtx);
1911			}
1912		}
1913		if (pr != NULL)
1914			goto found_prison;
1915		error = ENOENT;
1916		vfs_opterror(opts, "no jail after %d", jid);
1917		goto done_unlock_list;
1918	} else if (error != ENOENT)
1919		goto done_unlock_list;
1920
1921	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1922	if (error == 0) {
1923		if (jid != 0) {
1924			pr = prison_find_child(mypr, jid);
1925			if (pr != NULL) {
1926				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1927					mtx_unlock(&pr->pr_mtx);
1928					error = ENOENT;
1929					vfs_opterror(opts, "jail %d is dying",
1930					    jid);
1931					goto done_unlock_list;
1932				}
1933				goto found_prison;
1934			}
1935			error = ENOENT;
1936			vfs_opterror(opts, "jail %d not found", jid);
1937			goto done_unlock_list;
1938		}
1939	} else if (error != ENOENT)
1940		goto done_unlock_list;
1941
1942	error = vfs_getopt(opts, "name", (void **)&name, &len);
1943	if (error == 0) {
1944		if (len == 0 || name[len - 1] != '\0') {
1945			error = EINVAL;
1946			goto done_unlock_list;
1947		}
1948		pr = prison_find_name(mypr, name);
1949		if (pr != NULL) {
1950			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1951				mtx_unlock(&pr->pr_mtx);
1952				error = ENOENT;
1953				vfs_opterror(opts, "jail \"%s\" is dying",
1954				    name);
1955				goto done_unlock_list;
1956			}
1957			goto found_prison;
1958		}
1959		error = ENOENT;
1960		vfs_opterror(opts, "jail \"%s\" not found", name);
1961		goto done_unlock_list;
1962	} else if (error != ENOENT)
1963		goto done_unlock_list;
1964
1965	vfs_opterror(opts, "no jail specified");
1966	error = ENOENT;
1967	goto done_unlock_list;
1968
1969 found_prison:
1970	/* Get the parameters of the prison. */
1971	pr->pr_ref++;
1972	locked = PD_LOCKED;
1973	td->td_retval[0] = pr->pr_id;
1974	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
1975	if (error != 0 && error != ENOENT)
1976		goto done_deref;
1977	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
1978	error = vfs_setopt(opts, "parent", &i, sizeof(i));
1979	if (error != 0 && error != ENOENT)
1980		goto done_deref;
1981	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
1982	if (error != 0 && error != ENOENT)
1983		goto done_deref;
1984	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
1985	    sizeof(pr->pr_cpuset->cs_id));
1986	if (error != 0 && error != ENOENT)
1987		goto done_deref;
1988	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
1989	if (error != 0 && error != ENOENT)
1990		goto done_deref;
1991#ifdef INET
1992	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
1993	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1994	if (error != 0 && error != ENOENT)
1995		goto done_deref;
1996#endif
1997#ifdef INET6
1998	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
1999	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
2000	if (error != 0 && error != ENOENT)
2001		goto done_deref;
2002#endif
2003	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2004	    sizeof(pr->pr_securelevel));
2005	if (error != 0 && error != ENOENT)
2006		goto done_deref;
2007	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2008	    sizeof(pr->pr_childcount));
2009	if (error != 0 && error != ENOENT)
2010		goto done_deref;
2011	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2012	    sizeof(pr->pr_childmax));
2013	if (error != 0 && error != ENOENT)
2014		goto done_deref;
2015	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2016	if (error != 0 && error != ENOENT)
2017		goto done_deref;
2018	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2019	if (error != 0 && error != ENOENT)
2020		goto done_deref;
2021	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2022	if (error != 0 && error != ENOENT)
2023		goto done_deref;
2024#ifdef COMPAT_IA32
2025	if (td->td_proc->p_sysent->sv_flags & SV_IA32) {
2026		uint32_t hid32 = pr->pr_hostid;
2027
2028		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2029	} else
2030#endif
2031	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2032	    sizeof(pr->pr_hostid));
2033	if (error != 0 && error != ENOENT)
2034		goto done_deref;
2035	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2036	    sizeof(pr->pr_enforce_statfs));
2037	if (error != 0 && error != ENOENT)
2038		goto done_deref;
2039	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
2040	    fi++) {
2041		if (pr_flag_names[fi] == NULL)
2042			continue;
2043		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
2044		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
2045		if (error != 0 && error != ENOENT)
2046			goto done_deref;
2047		i = !i;
2048		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
2049		if (error != 0 && error != ENOENT)
2050			goto done_deref;
2051	}
2052	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
2053	    fi++) {
2054		i = pr->pr_flags &
2055		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
2056		i = pr_flag_jailsys[fi].disable &&
2057		      (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
2058		    : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
2059		    : JAIL_SYS_INHERIT;
2060		error =
2061		    vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
2062		if (error != 0 && error != ENOENT)
2063			goto done_deref;
2064	}
2065	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
2066	    fi++) {
2067		if (pr_allow_names[fi] == NULL)
2068			continue;
2069		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
2070		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
2071		if (error != 0 && error != ENOENT)
2072			goto done_deref;
2073		i = !i;
2074		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
2075		if (error != 0 && error != ENOENT)
2076			goto done_deref;
2077	}
2078	i = (pr->pr_uref == 0);
2079	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2080	if (error != 0 && error != ENOENT)
2081		goto done_deref;
2082	i = !i;
2083	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2084	if (error != 0 && error != ENOENT)
2085		goto done_deref;
2086
2087	/* Get the module parameters. */
2088	mtx_unlock(&pr->pr_mtx);
2089	locked = 0;
2090	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2091	if (error)
2092		goto done_deref;
2093	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
2094
2095	/* By now, all parameters should have been noted. */
2096	TAILQ_FOREACH(opt, opts, link) {
2097		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2098			error = EINVAL;
2099			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2100			goto done_errmsg;
2101		}
2102	}
2103
2104	/* Write the fetched parameters back to userspace. */
2105	error = 0;
2106	TAILQ_FOREACH(opt, opts, link) {
2107		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2108			pos = 2 * opt->pos + 1;
2109			optuio->uio_iov[pos].iov_len = opt->len;
2110			if (opt->value != NULL) {
2111				if (optuio->uio_segflg == UIO_SYSSPACE) {
2112					bcopy(opt->value,
2113					    optuio->uio_iov[pos].iov_base,
2114					    opt->len);
2115				} else {
2116					error = copyout(opt->value,
2117					    optuio->uio_iov[pos].iov_base,
2118					    opt->len);
2119					if (error)
2120						break;
2121				}
2122			}
2123		}
2124	}
2125	goto done_errmsg;
2126
2127 done_deref:
2128	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
2129	goto done_errmsg;
2130
2131 done_unlock_list:
2132	sx_sunlock(&allprison_lock);
2133 done_errmsg:
2134	if (error && errmsg_pos >= 0) {
2135		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2136		errmsg_pos = 2 * errmsg_pos + 1;
2137		if (errmsg_len > 0) {
2138			if (optuio->uio_segflg == UIO_SYSSPACE)
2139				bcopy(errmsg,
2140				    optuio->uio_iov[errmsg_pos].iov_base,
2141				    errmsg_len);
2142			else
2143				copyout(errmsg,
2144				    optuio->uio_iov[errmsg_pos].iov_base,
2145				    errmsg_len);
2146		}
2147	}
2148	vfs_freeopts(opts);
2149	return (error);
2150}
2151
2152
/*
 * struct jail_remove_args {
 *	int jid;
 * };
 *
 * Remove the jail identified by uap->jid together with all of its
 * descendants.  The caller needs PRIV_JAIL_REMOVE, and the jail is looked
 * up among the descendants of the caller's own prison.
 *
 * Returns 0 on success, the priv_check() error if the privilege check
 * fails, or EINVAL if no matching jail is found.
 */
int
jail_remove(struct thread *td, struct jail_remove_args *uap)
{
	struct prison *pr, *cpr, *lpr, *tpr;
	int descend, error;

	error = priv_check(td, PRIV_JAIL_REMOVE);
	if (error)
		return (error);

	sx_xlock(&allprison_lock);
	/* On success the prison is returned with its mutex held. */
	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
	if (pr == NULL) {
		sx_xunlock(&allprison_lock);
		return (EINVAL);
	}

	/* Remove all descendants of this prison, then remove this prison. */
	/* This is the temporary reference that prison_remove_one() drops. */
	pr->pr_ref++;
	pr->pr_flags |= PR_REMOVE;
	if (!LIST_EMPTY(&pr->pr_children)) {
		mtx_unlock(&pr->pr_mtx);
		/*
		 * lpr trails the iterator by one prison: a descendant is
		 * marked and referenced when it is visited, but only handed
		 * to prison_remove_one() after the walk has stepped on to
		 * the next descendant.
		 */
		lpr = NULL;
		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
			mtx_lock(&cpr->pr_mtx);
			if (cpr->pr_ref > 0) {
				tpr = cpr;
				cpr->pr_ref++;
				cpr->pr_flags |= PR_REMOVE;
			} else {
				/* Already removed - do not do it again. */
				tpr = NULL;
			}
			mtx_unlock(&cpr->pr_mtx);
			if (lpr != NULL) {
				mtx_lock(&lpr->pr_mtx);
				prison_remove_one(lpr);
				/*
				 * prison_remove_one() released the list
				 * lock; take it back to keep walking.
				 */
				sx_xlock(&allprison_lock);
			}
			lpr = tpr;
		}
		/* Remove the last descendant that was queued up, if any. */
		if (lpr != NULL) {
			mtx_lock(&lpr->pr_mtx);
			prison_remove_one(lpr);
			sx_xlock(&allprison_lock);
		}
		mtx_lock(&pr->pr_mtx);
	}
	/* Called with pr locked and the list lock held exclusively. */
	prison_remove_one(pr);
	return (0);
}
2209
2210static void
2211prison_remove_one(struct prison *pr)
2212{
2213	struct proc *p;
2214	int deuref;
2215
2216	/* If the prison was persistent, it is not anymore. */
2217	deuref = 0;
2218	if (pr->pr_flags & PR_PERSIST) {
2219		pr->pr_ref--;
2220		deuref = PD_DEUREF;
2221		pr->pr_flags &= ~PR_PERSIST;
2222	}
2223
2224	/*
2225	 * jail_remove added a reference.  If that's the only one, remove
2226	 * the prison now.
2227	 */
2228	KASSERT(pr->pr_ref > 0,
2229	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
2230	if (pr->pr_ref == 1) {
2231		prison_deref(pr,
2232		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2233		return;
2234	}
2235
2236	mtx_unlock(&pr->pr_mtx);
2237	sx_xunlock(&allprison_lock);
2238	/*
2239	 * Kill all processes unfortunate enough to be attached to this prison.
2240	 */
2241	sx_slock(&allproc_lock);
2242	LIST_FOREACH(p, &allproc, p_list) {
2243		PROC_LOCK(p);
2244		if (p->p_state != PRS_NEW && p->p_ucred &&
2245		    p->p_ucred->cr_prison == pr)
2246			psignal(p, SIGKILL);
2247		PROC_UNLOCK(p);
2248	}
2249	sx_sunlock(&allproc_lock);
2250	/* Remove the temporary reference added by jail_remove. */
2251	prison_deref(pr, deuref | PD_DEREF);
2252}
2253
2254
2255/*
2256 * struct jail_attach_args {
2257 *	int jid;
2258 * };
2259 */
2260int
2261jail_attach(struct thread *td, struct jail_attach_args *uap)
2262{
2263	struct prison *pr;
2264	int error;
2265
2266	error = priv_check(td, PRIV_JAIL_ATTACH);
2267	if (error)
2268		return (error);
2269
2270	sx_slock(&allprison_lock);
2271	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2272	if (pr == NULL) {
2273		sx_sunlock(&allprison_lock);
2274		return (EINVAL);
2275	}
2276
2277	/*
2278	 * Do not allow a process to attach to a prison that is not
2279	 * considered to be "alive".
2280	 */
2281	if (pr->pr_uref == 0) {
2282		mtx_unlock(&pr->pr_mtx);
2283		sx_sunlock(&allprison_lock);
2284		return (EINVAL);
2285	}
2286
2287	return (do_jail_attach(td, pr));
2288}
2289
/*
 * Attach the calling thread's process to prison pr: run the
 * PR_METHOD_ATTACH hooks, apply the prison's cpuset to the process, call
 * change_dir()/change_root() on pr->pr_root, and install a new credential
 * whose cr_prison is pr.
 *
 * jail_attach() calls this with pr's mutex held and allprison_lock held
 * shared; both are released before returning.  Returns 0 on success, or
 * the error of the step that failed, in which case the references taken
 * on pr are dropped again.
 */
static int
do_jail_attach(struct thread *td, struct prison *pr)
{
	struct prison *ppr;
	struct proc *p;
	struct ucred *newcred, *oldcred;
	int vfslocked, error;

	/*
	 * XXX: Note that there is a slight race here if two threads
	 * in the same privileged process attempt to attach to two
	 * different jails at the same time.  It is important for
	 * user processes not to do this, or they might end up with
	 * a process root from one prison, but attached to the jail
	 * of another.
	 */
	/* Take a reference and a user reference for the attaching process. */
	pr->pr_ref++;
	pr->pr_uref++;
	mtx_unlock(&pr->pr_mtx);

	/* Let modules do whatever they need to prepare for attaching. */
	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
	if (error) {
		/* prison_deref() also releases the shared list lock here. */
		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
		return (error);
	}
	sx_sunlock(&allprison_lock);

	/*
	 * Reparent the newly attached process to this jail.
	 */
	/* Remember the old prison so its references can be dropped later. */
	ppr = td->td_ucred->cr_prison;
	p = td->td_proc;
	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
	if (error)
		goto e_revert_osd;

	/* change_dir() and the MAC check run with the root vnode locked. */
	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
	if ((error = change_dir(pr->pr_root, td)) != 0)
		goto e_unlock;
#ifdef MAC
	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
		goto e_unlock;
#endif
	VOP_UNLOCK(pr->pr_root, 0);
	if ((error = change_root(pr->pr_root, td)))
		goto e_unlock_giant;
	VFS_UNLOCK_GIANT(vfslocked);

	/* Swap in a copy of the old credential that points at the new prison. */
	newcred = crget();
	PROC_LOCK(p);
	oldcred = p->p_ucred;
	setsugid(p);
	crcopy(newcred, oldcred);
	newcred->cr_prison = pr;
	p->p_ucred = newcred;
	PROC_UNLOCK(p);
	crfree(oldcred);
	/* The process has left its old prison; drop the references held there. */
	prison_deref(ppr, PD_DEREF | PD_DEUREF);
	return (0);
 e_unlock:
	VOP_UNLOCK(pr->pr_root, 0);
 e_unlock_giant:
	VFS_UNLOCK_GIANT(vfslocked);
 e_revert_osd:
	/* Tell modules this thread is still in its old jail after all. */
	(void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
	/* Undo the references taken at the top of the function. */
	prison_deref(pr, PD_DEREF | PD_DEUREF);
	return (error);
}
2361
2362
2363/*
2364 * Returns a locked prison instance, or NULL on failure.
2365 */
2366struct prison *
2367prison_find(int prid)
2368{
2369	struct prison *pr;
2370
2371	sx_assert(&allprison_lock, SX_LOCKED);
2372	TAILQ_FOREACH(pr, &allprison, pr_list) {
2373		if (pr->pr_id == prid) {
2374			mtx_lock(&pr->pr_mtx);
2375			if (pr->pr_ref > 0)
2376				return (pr);
2377			mtx_unlock(&pr->pr_mtx);
2378		}
2379	}
2380	return (NULL);
2381}
2382
2383/*
2384 * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2385 */
2386struct prison *
2387prison_find_child(struct prison *mypr, int prid)
2388{
2389	struct prison *pr;
2390	int descend;
2391
2392	sx_assert(&allprison_lock, SX_LOCKED);
2393	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2394		if (pr->pr_id == prid) {
2395			mtx_lock(&pr->pr_mtx);
2396			if (pr->pr_ref > 0)
2397				return (pr);
2398			mtx_unlock(&pr->pr_mtx);
2399		}
2400	}
2401	return (NULL);
2402}
2403
2404/*
2405 * Look for the name relative to mypr.  Returns a locked prison or NULL.
2406 */
2407struct prison *
2408prison_find_name(struct prison *mypr, const char *name)
2409{
2410	struct prison *pr, *deadpr;
2411	size_t mylen;
2412	int descend;
2413
2414	sx_assert(&allprison_lock, SX_LOCKED);
2415	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2416 again:
2417	deadpr = NULL;
2418	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2419		if (!strcmp(pr->pr_name + mylen, name)) {
2420			mtx_lock(&pr->pr_mtx);
2421			if (pr->pr_ref > 0) {
2422				if (pr->pr_uref > 0)
2423					return (pr);
2424				deadpr = pr;
2425			}
2426			mtx_unlock(&pr->pr_mtx);
2427		}
2428	}
2429	/* There was no valid prison - perhaps there was a dying one. */
2430	if (deadpr != NULL) {
2431		mtx_lock(&deadpr->pr_mtx);
2432		if (deadpr->pr_ref == 0) {
2433			mtx_unlock(&deadpr->pr_mtx);
2434			goto again;
2435		}
2436	}
2437	return (deadpr);
2438}
2439
2440/*
2441 * See if a prison has the specific flag set.
2442 */
2443int
2444prison_flag(struct ucred *cred, unsigned flag)
2445{
2446
2447	/* This is an atomic read, so no locking is necessary. */
2448	return (cred->cr_prison->pr_flags & flag);
2449}
2450
2451int
2452prison_allow(struct ucred *cred, unsigned flag)
2453{
2454
2455	/* This is an atomic read, so no locking is necessary. */
2456	return (cred->cr_prison->pr_allow & flag);
2457}
2458
2459/*
2460 * Remove a prison reference.  If that was the last reference, remove the
2461 * prison itself - but not in this context in case there are locks held.
2462 */
2463void
2464prison_free_locked(struct prison *pr)
2465{
2466
2467	mtx_assert(&pr->pr_mtx, MA_OWNED);
2468	pr->pr_ref--;
2469	if (pr->pr_ref == 0) {
2470		mtx_unlock(&pr->pr_mtx);
2471		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
2472		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2473		return;
2474	}
2475	mtx_unlock(&pr->pr_mtx);
2476}
2477
/*
 * Drop one reference on an unlocked prison.  The mutex taken here is
 * released by prison_free_locked().
 */
void
prison_free(struct prison *pr)
{

	mtx_lock(&pr->pr_mtx);
	prison_free_locked(pr);
}
2485
/*
 * Task handler queued by prison_free_locked().  The context is the prison
 * whose last reference was dropped; prison_deref() is called with no flags,
 * so no further reference is removed here.  The pending count is unused.
 */
static void
prison_complete(void *context, int pending)
{
	struct prison *pr;

	pr = context;
	prison_deref(pr, 0);
}
2492
/*
 * Remove a prison reference (usually).  This internal version assumes no
 * mutexes are held, except perhaps the prison itself.  If there are no more
 * references, release and delist the prison.  On completion, the prison lock
 * and the allprison lock are both unlocked.
 *
 * The flags describe the caller's state and what to drop:
 *	PD_LOCKED	the prison's mutex is already held
 *	PD_DEUREF	drop a user reference (pr_uref)
 *	PD_DEREF	drop a reference (pr_ref)
 *	PD_LIST_SLOCKED	allprison_lock is held shared
 *	PD_LIST_XLOCKED	allprison_lock is held exclusively
 */
static void
prison_deref(struct prison *pr, int flags)
{
	struct prison *ppr, *tpr;
	int vfslocked;

	if (!(flags & PD_LOCKED))
		mtx_lock(&pr->pr_mtx);
	/* Decrement the user references in a separate loop. */
	if (flags & PD_DEUREF) {
		/*
		 * Each prison whose pr_uref reaches zero passes the
		 * decrement on to its parent; stop at the first prison
		 * that still has user references, leaving it locked.
		 */
		for (tpr = pr;; tpr = tpr->pr_parent) {
			if (tpr != pr)
				mtx_lock(&tpr->pr_mtx);
			if (--tpr->pr_uref > 0)
				break;
			KASSERT(tpr != &prison0, ("prison0 pr_uref=0"));
			mtx_unlock(&tpr->pr_mtx);
		}
		/* Done if there were only user references to remove. */
		if (!(flags & PD_DEREF)) {
			mtx_unlock(&tpr->pr_mtx);
			if (flags & PD_LIST_SLOCKED)
				sx_sunlock(&allprison_lock);
			else if (flags & PD_LIST_XLOCKED)
				sx_xunlock(&allprison_lock);
			return;
		}
		/* Go back to holding the original prison's mutex. */
		if (tpr != pr) {
			mtx_unlock(&tpr->pr_mtx);
			mtx_lock(&pr->pr_mtx);
		}
	}

	/* Each pass handles one prison, then moves up to its parent. */
	for (;;) {
		if (flags & PD_DEREF)
			pr->pr_ref--;
		/* If the prison still has references, nothing else to do. */
		if (pr->pr_ref > 0) {
			mtx_unlock(&pr->pr_mtx);
			if (flags & PD_LIST_SLOCKED)
				sx_sunlock(&allprison_lock);
			else if (flags & PD_LIST_XLOCKED)
				sx_xunlock(&allprison_lock);
			return;
		}

		mtx_unlock(&pr->pr_mtx);
		/* Unlinking needs the list lock held exclusively. */
		if (flags & PD_LIST_SLOCKED) {
			if (!sx_try_upgrade(&allprison_lock)) {
				sx_sunlock(&allprison_lock);
				sx_xlock(&allprison_lock);
			}
		} else if (!(flags & PD_LIST_XLOCKED))
			sx_xlock(&allprison_lock);

		/* Delist the prison and fix up the ancestors' child counts. */
		TAILQ_REMOVE(&allprison, pr, pr_list);
		LIST_REMOVE(pr, pr_sibling);
		ppr = pr->pr_parent;
		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
			tpr->pr_childcount--;
		/* The list is consistent again; a shared hold is kept from here. */
		sx_downgrade(&allprison_lock);

		/* Release everything the prison owned, then the prison itself. */
#ifdef VIMAGE
		if (pr->pr_flags & PR_VNET)
			vnet_destroy(pr->pr_vnet);
#endif
		if (pr->pr_root != NULL) {
			vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
			vrele(pr->pr_root);
			VFS_UNLOCK_GIANT(vfslocked);
		}
		mtx_destroy(&pr->pr_mtx);
#ifdef INET
		free(pr->pr_ip4, M_PRISON);
#endif
#ifdef INET6
		free(pr->pr_ip6, M_PRISON);
#endif
		if (pr->pr_cpuset != NULL)
			cpuset_rel(pr->pr_cpuset);
		osd_jail_exit(pr);
		free(pr, M_PRISON);

		/* Removing a prison frees a reference on its parent. */
		pr = ppr;
		mtx_lock(&pr->pr_mtx);
		/* The next pass drops that reference under the shared lock. */
		flags = PD_DEREF | PD_LIST_SLOCKED;
	}
}
2588
/*
 * Add a reference to a prison whose mutex the caller already holds.  The
 * prison must still have at least one reference; the mutex stays held.
 */
void
prison_hold_locked(struct prison *pr)
{

	mtx_assert(&pr->pr_mtx, MA_OWNED);
	KASSERT(pr->pr_ref > 0,
	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
	pr->pr_ref++;
}
2598
/*
 * Add a reference to an unlocked prison; the prison's mutex is taken and
 * released around the update.
 */
void
prison_hold(struct prison *pr)
{

	mtx_lock(&pr->pr_mtx);
	prison_hold_locked(pr);
	mtx_unlock(&pr->pr_mtx);
}
2607
/*
 * Add a user reference (pr_uref) to a prison.  The prison must already
 * have at least one user reference, i.e. it must still be alive.
 */
void
prison_proc_hold(struct prison *pr)
{

	mtx_lock(&pr->pr_mtx);
	KASSERT(pr->pr_uref > 0,
	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
	pr->pr_uref++;
	mtx_unlock(&pr->pr_mtx);
}
2618
/*
 * Drop a user reference (pr_uref) from a prison.  The mutex taken here is
 * released by prison_deref().
 */
void
prison_proc_free(struct prison *pr)
{

	mtx_lock(&pr->pr_mtx);
	KASSERT(pr->pr_uref > 0,
	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
	/* Only the user reference is dropped; pr_ref is left untouched. */
	prison_deref(pr, PD_DEUREF | PD_LOCKED);
}
2628
2629
2630#ifdef INET
/*
 * Restrict a prison's IP address list with its parent's, possibly replacing
 * it.  Return true if the replacement buffer was used (or would have been).
 *
 * Without PR_IP4_USER the prison simply takes a copy of the parent's list.
 * With PR_IP4_USER, and a parent that restricts IPv4, every address that
 * the parent does not have is removed from the prison's list in place.
 * In both lists entry 0 is the primary address and the remaining entries
 * are kept sorted.
 */
static int
prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
{
	int ii, ij, used;
	struct prison *ppr;

	ppr = pr->pr_parent;
	if (!(pr->pr_flags & PR_IP4_USER)) {
		/* This has no user settings, so just copy the parent's list. */
		if (pr->pr_ip4s < ppr->pr_ip4s) {
			/*
			 * There's no room for the parent's list.  Use the
			 * new list buffer, which is assumed to be big enough
			 * (if it was passed).  If there's no buffer, try to
			 * allocate one.
			 */
			used = 1;
			if (newip4 == NULL) {
				newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
				    M_PRISON, M_NOWAIT);
				/* A buffer allocated here is not the caller's. */
				if (newip4 != NULL)
					used = 0;
			}
			/* If the allocation failed the old list is left alone. */
			if (newip4 != NULL) {
				bcopy(ppr->pr_ip4, newip4,
				    ppr->pr_ip4s * sizeof(*newip4));
				free(pr->pr_ip4, M_PRISON);
				pr->pr_ip4 = newip4;
				pr->pr_ip4s = ppr->pr_ip4s;
				pr->pr_flags |= PR_IP4;
			}
			return (used);
		}
		/* The existing buffer is big enough; reuse it. */
		pr->pr_ip4s = ppr->pr_ip4s;
		if (pr->pr_ip4s > 0)
			bcopy(ppr->pr_ip4, pr->pr_ip4,
			    pr->pr_ip4s * sizeof(*newip4));
		else if (pr->pr_ip4 != NULL) {
			free(pr->pr_ip4, M_PRISON);
			pr->pr_ip4 = NULL;
		}
		/* Inherit the parent's PR_IP4 setting. */
		pr->pr_flags =
			(pr->pr_flags & ~PR_IP4) | (ppr->pr_flags & PR_IP4);
	} else if (pr->pr_ip4s > 0 && (ppr->pr_flags & PR_IP4)) {
		/* Remove addresses that aren't in the parent. */
		/* The primary address may sit anywhere in the parent's list. */
		for (ij = 0; ij < ppr->pr_ip4s; ij++)
			if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
				break;
		if (ij < ppr->pr_ip4s)
			ii = 1;
		else {
			/* Not found: drop it and shift the rest down. */
			bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
			    --pr->pr_ip4s * sizeof(*pr->pr_ip4));
			ii = 0;
		}
		/*
		 * Walk the remaining addresses (ii) against the parent's
		 * sorted entries (ij, starting after its primary).
		 */
		for (ij = 1; ii < pr->pr_ip4s; ) {
			/* The parent's primary is not in its sorted part. */
			if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
				ii++;
				continue;
			}
			/*
			 * NOTE(review): the cases assume qcmp_v4() returns
			 * exactly -1, 0 or 1 - confirm against its
			 * definition.
			 */
			switch (ij >= ppr->pr_ip4s ? -1 :
				qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
			case -1:
				/* Not in the parent: remove this address. */
				bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
				    (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
				break;
			case 0:
				/* Present in both lists: keep it. */
				ii++;
				ij++;
				break;
			case 1:
				/* Move on to the parent's next entry. */
				ij++;
				break;
			}
		}
		/* Nothing survived: IPv4 is disabled for this prison. */
		if (pr->pr_ip4s == 0) {
			pr->pr_flags |= PR_IP4_DISABLE;
			free(pr->pr_ip4, M_PRISON);
			pr->pr_ip4 = NULL;
		}
	}
	return (0);
}
2718
2719/*
2720 * Pass back primary IPv4 address of this jail.
2721 *
2722 * If not restricted return success but do not alter the address.  Caller has
2723 * to make sure to initialize it correctly (e.g. INADDR_ANY).
2724 *
2725 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2726 * Address returned in NBO.
2727 */
2728int
2729prison_get_ip4(struct ucred *cred, struct in_addr *ia)
2730{
2731	struct prison *pr;
2732
2733	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2734	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2735
2736	pr = cred->cr_prison;
2737	if (!(pr->pr_flags & PR_IP4))
2738		return (0);
2739	mtx_lock(&pr->pr_mtx);
2740	if (!(pr->pr_flags & PR_IP4)) {
2741		mtx_unlock(&pr->pr_mtx);
2742		return (0);
2743	}
2744	if (pr->pr_ip4 == NULL) {
2745		mtx_unlock(&pr->pr_mtx);
2746		return (EAFNOSUPPORT);
2747	}
2748
2749	ia->s_addr = pr->pr_ip4[0].s_addr;
2750	mtx_unlock(&pr->pr_mtx);
2751	return (0);
2752}
2753
2754/*
2755 * Return true if pr1 and pr2 have the same IPv4 address restrictions.
2756 */
2757int
2758prison_equal_ip4(struct prison *pr1, struct prison *pr2)
2759{
2760
2761	if (pr1 == pr2)
2762		return (1);
2763
2764	/*
2765	 * jail_set maintains an exclusive hold on allprison_lock while it
2766	 * changes the IP addresses, so only a shared hold is needed.  This is
2767	 * easier than locking the two prisons which would require finding the
2768	 * proper locking order and end up needing allprison_lock anyway.
2769	 */
2770	sx_slock(&allprison_lock);
2771	while (pr1 != &prison0 &&
2772#ifdef VIMAGE
2773	       !(pr1->pr_flags & PR_VNET) &&
2774#endif
2775	       !(pr1->pr_flags & PR_IP4_USER))
2776		pr1 = pr1->pr_parent;
2777	while (pr2 != &prison0 &&
2778#ifdef VIMAGE
2779	       !(pr2->pr_flags & PR_VNET) &&
2780#endif
2781	       !(pr2->pr_flags & PR_IP4_USER))
2782		pr2 = pr2->pr_parent;
2783	sx_sunlock(&allprison_lock);
2784	return (pr1 == pr2);
2785}
2786
2787/*
2788 * Make sure our (source) address is set to something meaningful to this
2789 * jail.
2790 *
2791 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2792 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2793 * doesn't allow IPv4.  Address passed in in NBO and returned in NBO.
2794 */
2795int
2796prison_local_ip4(struct ucred *cred, struct in_addr *ia)
2797{
2798	struct prison *pr;
2799	struct in_addr ia0;
2800	int error;
2801
2802	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2803	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2804
2805	pr = cred->cr_prison;
2806	if (!(pr->pr_flags & PR_IP4))
2807		return (0);
2808	mtx_lock(&pr->pr_mtx);
2809	if (!(pr->pr_flags & PR_IP4)) {
2810		mtx_unlock(&pr->pr_mtx);
2811		return (0);
2812	}
2813	if (pr->pr_ip4 == NULL) {
2814		mtx_unlock(&pr->pr_mtx);
2815		return (EAFNOSUPPORT);
2816	}
2817
2818	ia0.s_addr = ntohl(ia->s_addr);
2819	if (ia0.s_addr == INADDR_LOOPBACK) {
2820		ia->s_addr = pr->pr_ip4[0].s_addr;
2821		mtx_unlock(&pr->pr_mtx);
2822		return (0);
2823	}
2824
2825	if (ia0.s_addr == INADDR_ANY) {
2826		/*
2827		 * In case there is only 1 IPv4 address, bind directly.
2828		 */
2829		if (pr->pr_ip4s == 1)
2830			ia->s_addr = pr->pr_ip4[0].s_addr;
2831		mtx_unlock(&pr->pr_mtx);
2832		return (0);
2833	}
2834
2835	error = _prison_check_ip4(pr, ia);
2836	mtx_unlock(&pr->pr_mtx);
2837	return (error);
2838}
2839
2840/*
2841 * Rewrite destination address in case we will connect to loopback address.
2842 *
2843 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2844 * Address passed in in NBO and returned in NBO.
2845 */
2846int
2847prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
2848{
2849	struct prison *pr;
2850
2851	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2852	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2853
2854	pr = cred->cr_prison;
2855	if (!(pr->pr_flags & PR_IP4))
2856		return (0);
2857	mtx_lock(&pr->pr_mtx);
2858	if (!(pr->pr_flags & PR_IP4)) {
2859		mtx_unlock(&pr->pr_mtx);
2860		return (0);
2861	}
2862	if (pr->pr_ip4 == NULL) {
2863		mtx_unlock(&pr->pr_mtx);
2864		return (EAFNOSUPPORT);
2865	}
2866
2867	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
2868		ia->s_addr = pr->pr_ip4[0].s_addr;
2869		mtx_unlock(&pr->pr_mtx);
2870		return (0);
2871	}
2872
2873	/*
2874	 * Return success because nothing had to be changed.
2875	 */
2876	mtx_unlock(&pr->pr_mtx);
2877	return (0);
2878}
2879
2880/*
2881 * Check if given address belongs to the jail referenced by cred/prison.
2882 *
2883 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2884 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2885 * doesn't allow IPv4.  Address passed in in NBO.
2886 */
2887static int
2888_prison_check_ip4(struct prison *pr, struct in_addr *ia)
2889{
2890	int i, a, z, d;
2891
2892	/*
2893	 * Check the primary IP.
2894	 */
2895	if (pr->pr_ip4[0].s_addr == ia->s_addr)
2896		return (0);
2897
2898	/*
2899	 * All the other IPs are sorted so we can do a binary search.
2900	 */
2901	a = 0;
2902	z = pr->pr_ip4s - 2;
2903	while (a <= z) {
2904		i = (a + z) / 2;
2905		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
2906		if (d > 0)
2907			z = i - 1;
2908		else if (d < 0)
2909			a = i + 1;
2910		else
2911			return (0);
2912	}
2913
2914	return (EADDRNOTAVAIL);
2915}
2916
2917int
2918prison_check_ip4(struct ucred *cred, struct in_addr *ia)
2919{
2920	struct prison *pr;
2921	int error;
2922
2923	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2924	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2925
2926	pr = cred->cr_prison;
2927	if (!(pr->pr_flags & PR_IP4))
2928		return (0);
2929	mtx_lock(&pr->pr_mtx);
2930	if (!(pr->pr_flags & PR_IP4)) {
2931		mtx_unlock(&pr->pr_mtx);
2932		return (0);
2933	}
2934	if (pr->pr_ip4 == NULL) {
2935		mtx_unlock(&pr->pr_mtx);
2936		return (EAFNOSUPPORT);
2937	}
2938
2939	error = _prison_check_ip4(pr, ia);
2940	mtx_unlock(&pr->pr_mtx);
2941	return (error);
2942}
2943#endif
2944
2945#ifdef INET6
2946static int
2947prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
2948{
2949	int ii, ij, used;
2950	struct prison *ppr;
2951
2952	ppr = pr->pr_parent;
2953	if (!(pr->pr_flags & PR_IP6_USER)) {
2954		/* This has no user settings, so just copy the parent's list. */
2955		if (pr->pr_ip6s < ppr->pr_ip6s) {
2956			/*
2957			 * There's no room for the parent's list.  Use the
2958			 * new list buffer, which is assumed to be big enough
2959			 * (if it was passed).  If there's no buffer, try to
2960			 * allocate one.
2961			 */
2962			used = 1;
2963			if (newip6 == NULL) {
2964				newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
2965				    M_PRISON, M_NOWAIT);
2966				if (newip6 != NULL)
2967					used = 0;
2968			}
2969			if (newip6 != NULL) {
2970				bcopy(ppr->pr_ip6, newip6,
2971				    ppr->pr_ip6s * sizeof(*newip6));
2972				free(pr->pr_ip6, M_PRISON);
2973				pr->pr_ip6 = newip6;
2974				pr->pr_ip6s = ppr->pr_ip6s;
2975				pr->pr_flags |= PR_IP6;
2976			}
2977			return (used);
2978		}
2979		pr->pr_ip6s = ppr->pr_ip6s;
2980		if (pr->pr_ip6s > 0)
2981			bcopy(ppr->pr_ip6, pr->pr_ip6,
2982			    pr->pr_ip6s * sizeof(*newip6));
2983		else if (pr->pr_ip6 != NULL) {
2984			free(pr->pr_ip6, M_PRISON);
2985			pr->pr_ip6 = NULL;
2986		}
2987		pr->pr_flags =
2988			(pr->pr_flags & ~PR_IP6) | (ppr->pr_flags & PR_IP6);
2989	} else if (pr->pr_ip6s > 0 && (ppr->pr_flags & PR_IP6)) {
2990		/* Remove addresses that aren't in the parent. */
2991		for (ij = 0; ij < ppr->pr_ip6s; ij++)
2992			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
2993			    &ppr->pr_ip6[ij]))
2994				break;
2995		if (ij < ppr->pr_ip6s)
2996			ii = 1;
2997		else {
2998			bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
2999			    --pr->pr_ip6s * sizeof(*pr->pr_ip6));
3000			ii = 0;
3001		}
3002		for (ij = 1; ii < pr->pr_ip6s; ) {
3003			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
3004			    &ppr->pr_ip6[0])) {
3005				ii++;
3006				continue;
3007			}
3008			switch (ij >= ppr->pr_ip4s ? -1 :
3009				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
3010			case -1:
3011				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
3012				    (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
3013				break;
3014			case 0:
3015				ii++;
3016				ij++;
3017				break;
3018			case 1:
3019				ij++;
3020				break;
3021			}
3022		}
3023		if (pr->pr_ip6s == 0) {
3024			pr->pr_flags |= PR_IP6_DISABLE;
3025			free(pr->pr_ip6, M_PRISON);
3026			pr->pr_ip6 = NULL;
3027		}
3028	}
3029	return 0;
3030}
3031
3032/*
3033 * Pass back primary IPv6 address for this jail.
3034 *
3035 * If not restricted return success but do not alter the address.  Caller has
3036 * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
3037 *
3038 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3039 */
3040int
3041prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
3042{
3043	struct prison *pr;
3044
3045	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3046	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3047
3048	pr = cred->cr_prison;
3049	if (!(pr->pr_flags & PR_IP6))
3050		return (0);
3051	mtx_lock(&pr->pr_mtx);
3052	if (!(pr->pr_flags & PR_IP6)) {
3053		mtx_unlock(&pr->pr_mtx);
3054		return (0);
3055	}
3056	if (pr->pr_ip6 == NULL) {
3057		mtx_unlock(&pr->pr_mtx);
3058		return (EAFNOSUPPORT);
3059	}
3060
3061	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3062	mtx_unlock(&pr->pr_mtx);
3063	return (0);
3064}
3065
3066/*
3067 * Return true if pr1 and pr2 have the same IPv6 address restrictions.
3068 */
3069int
3070prison_equal_ip6(struct prison *pr1, struct prison *pr2)
3071{
3072
3073	if (pr1 == pr2)
3074		return (1);
3075
3076	sx_slock(&allprison_lock);
3077	while (pr1 != &prison0 &&
3078#ifdef VIMAGE
3079	       !(pr1->pr_flags & PR_VNET) &&
3080#endif
3081	       !(pr1->pr_flags & PR_IP6_USER))
3082		pr1 = pr1->pr_parent;
3083	while (pr2 != &prison0 &&
3084#ifdef VIMAGE
3085	       !(pr2->pr_flags & PR_VNET) &&
3086#endif
3087	       !(pr2->pr_flags & PR_IP6_USER))
3088		pr2 = pr2->pr_parent;
3089	sx_sunlock(&allprison_lock);
3090	return (pr1 == pr2);
3091}
3092
3093/*
3094 * Make sure our (source) address is set to something meaningful to this jail.
3095 *
3096 * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
3097 * when needed while binding.
3098 *
3099 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3100 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3101 * doesn't allow IPv6.
3102 */
3103int
3104prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
3105{
3106	struct prison *pr;
3107	int error;
3108
3109	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3110	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3111
3112	pr = cred->cr_prison;
3113	if (!(pr->pr_flags & PR_IP6))
3114		return (0);
3115	mtx_lock(&pr->pr_mtx);
3116	if (!(pr->pr_flags & PR_IP6)) {
3117		mtx_unlock(&pr->pr_mtx);
3118		return (0);
3119	}
3120	if (pr->pr_ip6 == NULL) {
3121		mtx_unlock(&pr->pr_mtx);
3122		return (EAFNOSUPPORT);
3123	}
3124
3125	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3126		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3127		mtx_unlock(&pr->pr_mtx);
3128		return (0);
3129	}
3130
3131	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
3132		/*
3133		 * In case there is only 1 IPv6 address, and v6only is true,
3134		 * then bind directly.
3135		 */
3136		if (v6only != 0 && pr->pr_ip6s == 1)
3137			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3138		mtx_unlock(&pr->pr_mtx);
3139		return (0);
3140	}
3141
3142	error = _prison_check_ip6(pr, ia6);
3143	mtx_unlock(&pr->pr_mtx);
3144	return (error);
3145}
3146
3147/*
3148 * Rewrite destination address in case we will connect to loopback address.
3149 *
3150 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3151 */
3152int
3153prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
3154{
3155	struct prison *pr;
3156
3157	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3158	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3159
3160	pr = cred->cr_prison;
3161	if (!(pr->pr_flags & PR_IP6))
3162		return (0);
3163	mtx_lock(&pr->pr_mtx);
3164	if (!(pr->pr_flags & PR_IP6)) {
3165		mtx_unlock(&pr->pr_mtx);
3166		return (0);
3167	}
3168	if (pr->pr_ip6 == NULL) {
3169		mtx_unlock(&pr->pr_mtx);
3170		return (EAFNOSUPPORT);
3171	}
3172
3173	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3174		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3175		mtx_unlock(&pr->pr_mtx);
3176		return (0);
3177	}
3178
3179	/*
3180	 * Return success because nothing had to be changed.
3181	 */
3182	mtx_unlock(&pr->pr_mtx);
3183	return (0);
3184}
3185
3186/*
3187 * Check if given address belongs to the jail referenced by cred/prison.
3188 *
3189 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3190 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3191 * doesn't allow IPv6.
3192 */
3193static int
3194_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
3195{
3196	int i, a, z, d;
3197
3198	/*
3199	 * Check the primary IP.
3200	 */
3201	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
3202		return (0);
3203
3204	/*
3205	 * All the other IPs are sorted so we can do a binary search.
3206	 */
3207	a = 0;
3208	z = pr->pr_ip6s - 2;
3209	while (a <= z) {
3210		i = (a + z) / 2;
3211		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
3212		if (d > 0)
3213			z = i - 1;
3214		else if (d < 0)
3215			a = i + 1;
3216		else
3217			return (0);
3218	}
3219
3220	return (EADDRNOTAVAIL);
3221}
3222
3223int
3224prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
3225{
3226	struct prison *pr;
3227	int error;
3228
3229	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3230	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3231
3232	pr = cred->cr_prison;
3233	if (!(pr->pr_flags & PR_IP6))
3234		return (0);
3235	mtx_lock(&pr->pr_mtx);
3236	if (!(pr->pr_flags & PR_IP6)) {
3237		mtx_unlock(&pr->pr_mtx);
3238		return (0);
3239	}
3240	if (pr->pr_ip6 == NULL) {
3241		mtx_unlock(&pr->pr_mtx);
3242		return (EAFNOSUPPORT);
3243	}
3244
3245	error = _prison_check_ip6(pr, ia6);
3246	mtx_unlock(&pr->pr_mtx);
3247	return (error);
3248}
3249#endif
3250
3251/*
3252 * Check if a jail supports the given address family.
3253 *
3254 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3255 * if not.
3256 */
3257int
3258prison_check_af(struct ucred *cred, int af)
3259{
3260	struct prison *pr;
3261	int error;
3262
3263	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3264
3265	pr = cred->cr_prison;
3266#ifdef VIMAGE
3267	/* Prisons with their own network stack are not limited. */
3268	if (pr->pr_flags & PR_VNET)
3269		return (0);
3270#endif
3271
3272	error = 0;
3273	switch (af)
3274	{
3275#ifdef INET
3276	case AF_INET:
3277		if (pr->pr_flags & PR_IP4)
3278		{
3279			mtx_lock(&pr->pr_mtx);
3280			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3281				error = EAFNOSUPPORT;
3282			mtx_unlock(&pr->pr_mtx);
3283		}
3284		break;
3285#endif
3286#ifdef INET6
3287	case AF_INET6:
3288		if (pr->pr_flags & PR_IP6)
3289		{
3290			mtx_lock(&pr->pr_mtx);
3291			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3292				error = EAFNOSUPPORT;
3293			mtx_unlock(&pr->pr_mtx);
3294		}
3295		break;
3296#endif
3297	case AF_LOCAL:
3298	case AF_ROUTE:
3299		break;
3300	default:
3301		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3302			error = EAFNOSUPPORT;
3303	}
3304	return (error);
3305}
3306
3307/*
3308 * Check if given address belongs to the jail referenced by cred (wrapper to
3309 * prison_check_ip[46]).
3310 *
3311 * Returns 0 if jail doesn't restrict the address family or if address belongs
3312 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3313 * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3314 */
3315int
3316prison_if(struct ucred *cred, struct sockaddr *sa)
3317{
3318#ifdef INET
3319	struct sockaddr_in *sai;
3320#endif
3321#ifdef INET6
3322	struct sockaddr_in6 *sai6;
3323#endif
3324	int error;
3325
3326	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3327	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3328
3329	error = 0;
3330	switch (sa->sa_family)
3331	{
3332#ifdef INET
3333	case AF_INET:
3334		sai = (struct sockaddr_in *)sa;
3335		error = prison_check_ip4(cred, &sai->sin_addr);
3336		break;
3337#endif
3338#ifdef INET6
3339	case AF_INET6:
3340		sai6 = (struct sockaddr_in6 *)sa;
3341		error = prison_check_ip6(cred, &sai6->sin6_addr);
3342		break;
3343#endif
3344	default:
3345		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3346			error = EAFNOSUPPORT;
3347	}
3348	return (error);
3349}
3350
3351/*
3352 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3353 */
3354int
3355prison_check(struct ucred *cred1, struct ucred *cred2)
3356{
3357
3358	return ((cred1->cr_prison == cred2->cr_prison ||
3359	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3360}
3361
3362/*
3363 * Return 1 if p2 is a child of p1, otherwise 0.
3364 */
3365int
3366prison_ischild(struct prison *pr1, struct prison *pr2)
3367{
3368
3369	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3370		if (pr1 == pr2)
3371			return (1);
3372	return (0);
3373}
3374
3375/*
3376 * Return 1 if the passed credential is in a jail, otherwise 0.
3377 */
3378int
3379jailed(struct ucred *cred)
3380{
3381
3382	return (cred->cr_prison != &prison0);
3383}
3384
3385/*
3386 * Return the correct hostname (domainname, et al) for the passed credential.
3387 */
3388void
3389getcredhostname(struct ucred *cred, char *buf, size_t size)
3390{
3391	struct prison *pr;
3392
3393	/*
3394	 * A NULL credential can be used to shortcut to the physical
3395	 * system's hostname.
3396	 */
3397	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3398	mtx_lock(&pr->pr_mtx);
3399	strlcpy(buf, pr->pr_hostname, size);
3400	mtx_unlock(&pr->pr_mtx);
3401}
3402
3403void
3404getcreddomainname(struct ucred *cred, char *buf, size_t size)
3405{
3406
3407	mtx_lock(&cred->cr_prison->pr_mtx);
3408	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3409	mtx_unlock(&cred->cr_prison->pr_mtx);
3410}
3411
3412void
3413getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3414{
3415
3416	mtx_lock(&cred->cr_prison->pr_mtx);
3417	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3418	mtx_unlock(&cred->cr_prison->pr_mtx);
3419}
3420
3421void
3422getcredhostid(struct ucred *cred, unsigned long *hostid)
3423{
3424
3425	mtx_lock(&cred->cr_prison->pr_mtx);
3426	*hostid = cred->cr_prison->pr_hostid;
3427	mtx_unlock(&cred->cr_prison->pr_mtx);
3428}
3429
3430/*
3431 * Determine whether the subject represented by cred can "see"
3432 * status of a mount point.
3433 * Returns: 0 for permitted, ENOENT otherwise.
3434 * XXX: This function should be called cr_canseemount() and should be
3435 *      placed in kern_prot.c.
3436 */
3437int
3438prison_canseemount(struct ucred *cred, struct mount *mp)
3439{
3440	struct prison *pr;
3441	struct statfs *sp;
3442	size_t len;
3443
3444	pr = cred->cr_prison;
3445	if (pr->pr_enforce_statfs == 0)
3446		return (0);
3447	if (pr->pr_root->v_mount == mp)
3448		return (0);
3449	if (pr->pr_enforce_statfs == 2)
3450		return (ENOENT);
3451	/*
3452	 * If jail's chroot directory is set to "/" we should be able to see
3453	 * all mount-points from inside a jail.
3454	 * This is ugly check, but this is the only situation when jail's
3455	 * directory ends with '/'.
3456	 */
3457	if (strcmp(pr->pr_path, "/") == 0)
3458		return (0);
3459	len = strlen(pr->pr_path);
3460	sp = &mp->mnt_stat;
3461	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3462		return (ENOENT);
3463	/*
3464	 * Be sure that we don't have situation where jail's root directory
3465	 * is "/some/path" and mount point is "/some/pathpath".
3466	 */
3467	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3468		return (ENOENT);
3469	return (0);
3470}
3471
3472void
3473prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3474{
3475	char jpath[MAXPATHLEN];
3476	struct prison *pr;
3477	size_t len;
3478
3479	pr = cred->cr_prison;
3480	if (pr->pr_enforce_statfs == 0)
3481		return;
3482	if (prison_canseemount(cred, mp) != 0) {
3483		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3484		strlcpy(sp->f_mntonname, "[restricted]",
3485		    sizeof(sp->f_mntonname));
3486		return;
3487	}
3488	if (pr->pr_root->v_mount == mp) {
3489		/*
3490		 * Clear current buffer data, so we are sure nothing from
3491		 * the valid path left there.
3492		 */
3493		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3494		*sp->f_mntonname = '/';
3495		return;
3496	}
3497	/*
3498	 * If jail's chroot directory is set to "/" we should be able to see
3499	 * all mount-points from inside a jail.
3500	 */
3501	if (strcmp(pr->pr_path, "/") == 0)
3502		return;
3503	len = strlen(pr->pr_path);
3504	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3505	/*
3506	 * Clear current buffer data, so we are sure nothing from
3507	 * the valid path left there.
3508	 */
3509	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3510	if (*jpath == '\0') {
3511		/* Should never happen. */
3512		*sp->f_mntonname = '/';
3513	} else {
3514		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3515	}
3516}
3517
3518/*
3519 * Check with permission for a specific privilege is granted within jail.  We
3520 * have a specific list of accepted privileges; the rest are denied.
3521 */
3522int
3523prison_priv_check(struct ucred *cred, int priv)
3524{
3525
3526	if (!jailed(cred))
3527		return (0);
3528
3529#ifdef VIMAGE
3530	/*
3531	 * Privileges specific to prisons with a virtual network stack.
3532	 * There might be a duplicate entry here in case the privilege
3533	 * is only granted conditionally in the legacy jail case.
3534	 */
3535	switch (priv) {
3536#ifdef notyet
3537		/*
3538		 * NFS-specific privileges.
3539		 */
3540	case PRIV_NFS_DAEMON:
3541	case PRIV_NFS_LOCKD:
3542#endif
3543		/*
3544		 * Network stack privileges.
3545		 */
3546	case PRIV_NET_BRIDGE:
3547	case PRIV_NET_GRE:
3548	case PRIV_NET_BPF:
3549	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
3550	case PRIV_NET_ROUTE:
3551	case PRIV_NET_TAP:
3552	case PRIV_NET_SETIFMTU:
3553	case PRIV_NET_SETIFFLAGS:
3554	case PRIV_NET_SETIFCAP:
3555	case PRIV_NET_SETIFNAME	:
3556	case PRIV_NET_SETIFMETRIC:
3557	case PRIV_NET_SETIFPHYS:
3558	case PRIV_NET_SETIFMAC:
3559	case PRIV_NET_ADDMULTI:
3560	case PRIV_NET_DELMULTI:
3561	case PRIV_NET_HWIOCTL:
3562	case PRIV_NET_SETLLADDR:
3563	case PRIV_NET_ADDIFGROUP:
3564	case PRIV_NET_DELIFGROUP:
3565	case PRIV_NET_IFCREATE:
3566	case PRIV_NET_IFDESTROY:
3567	case PRIV_NET_ADDIFADDR:
3568	case PRIV_NET_DELIFADDR:
3569	case PRIV_NET_LAGG:
3570	case PRIV_NET_GIF:
3571	case PRIV_NET_SETIFVNET:
3572
3573		/*
3574		 * 802.11-related privileges.
3575		 */
3576	case PRIV_NET80211_GETKEY:
3577#ifdef notyet
3578	case PRIV_NET80211_MANAGE:		/* XXX-BZ discuss with sam@ */
3579#endif
3580
3581#ifdef notyet
3582		/*
3583		 * AppleTalk privileges.
3584		 */
3585	case PRIV_NETATALK_RESERVEDPORT:
3586
3587		/*
3588		 * ATM privileges.
3589		 */
3590	case PRIV_NETATM_CFG:
3591	case PRIV_NETATM_ADD:
3592	case PRIV_NETATM_DEL:
3593	case PRIV_NETATM_SET:
3594
3595		/*
3596		 * Bluetooth privileges.
3597		 */
3598	case PRIV_NETBLUETOOTH_RAW:
3599#endif
3600
3601		/*
3602		 * Netgraph and netgraph module privileges.
3603		 */
3604	case PRIV_NETGRAPH_CONTROL:
3605#ifdef notyet
3606	case PRIV_NETGRAPH_TTY:
3607#endif
3608
3609		/*
3610		 * IPv4 and IPv6 privileges.
3611		 */
3612	case PRIV_NETINET_IPFW:
3613	case PRIV_NETINET_DIVERT:
3614	case PRIV_NETINET_PF:
3615	case PRIV_NETINET_DUMMYNET:
3616	case PRIV_NETINET_CARP:
3617	case PRIV_NETINET_MROUTE:
3618	case PRIV_NETINET_RAW:
3619	case PRIV_NETINET_ADDRCTRL6:
3620	case PRIV_NETINET_ND6:
3621	case PRIV_NETINET_SCOPE6:
3622	case PRIV_NETINET_ALIFETIME6:
3623	case PRIV_NETINET_IPSEC:
3624	case PRIV_NETINET_BINDANY:
3625
3626#ifdef notyet
3627		/*
3628		 * IPX/SPX privileges.
3629		 */
3630	case PRIV_NETIPX_RESERVEDPORT:
3631	case PRIV_NETIPX_RAW:
3632
3633		/*
3634		 * NCP privileges.
3635		 */
3636	case PRIV_NETNCP:
3637
3638		/*
3639		 * SMB privileges.
3640		 */
3641	case PRIV_NETSMB:
3642#endif
3643
3644	/*
3645	 * No default: or deny here.
3646	 * In case of no permit fall through to next switch().
3647	 */
3648		if (cred->cr_prison->pr_flags & PR_VNET)
3649			return (0);
3650	}
3651#endif /* VIMAGE */
3652
3653	switch (priv) {
3654
3655		/*
3656		 * Allow ktrace privileges for root in jail.
3657		 */
3658	case PRIV_KTRACE:
3659
3660#if 0
3661		/*
3662		 * Allow jailed processes to configure audit identity and
3663		 * submit audit records (login, etc).  In the future we may
3664		 * want to further refine the relationship between audit and
3665		 * jail.
3666		 */
3667	case PRIV_AUDIT_GETAUDIT:
3668	case PRIV_AUDIT_SETAUDIT:
3669	case PRIV_AUDIT_SUBMIT:
3670#endif
3671
3672		/*
3673		 * Allow jailed processes to manipulate process UNIX
3674		 * credentials in any way they see fit.
3675		 */
3676	case PRIV_CRED_SETUID:
3677	case PRIV_CRED_SETEUID:
3678	case PRIV_CRED_SETGID:
3679	case PRIV_CRED_SETEGID:
3680	case PRIV_CRED_SETGROUPS:
3681	case PRIV_CRED_SETREUID:
3682	case PRIV_CRED_SETREGID:
3683	case PRIV_CRED_SETRESUID:
3684	case PRIV_CRED_SETRESGID:
3685
3686		/*
3687		 * Jail implements visibility constraints already, so allow
3688		 * jailed root to override uid/gid-based constraints.
3689		 */
3690	case PRIV_SEEOTHERGIDS:
3691	case PRIV_SEEOTHERUIDS:
3692
3693		/*
3694		 * Jail implements inter-process debugging limits already, so
3695		 * allow jailed root various debugging privileges.
3696		 */
3697	case PRIV_DEBUG_DIFFCRED:
3698	case PRIV_DEBUG_SUGID:
3699	case PRIV_DEBUG_UNPRIV:
3700
3701		/*
3702		 * Allow jail to set various resource limits and login
3703		 * properties, and for now, exceed process resource limits.
3704		 */
3705	case PRIV_PROC_LIMIT:
3706	case PRIV_PROC_SETLOGIN:
3707	case PRIV_PROC_SETRLIMIT:
3708
3709		/*
3710		 * System V and POSIX IPC privileges are granted in jail.
3711		 */
3712	case PRIV_IPC_READ:
3713	case PRIV_IPC_WRITE:
3714	case PRIV_IPC_ADMIN:
3715	case PRIV_IPC_MSGSIZE:
3716	case PRIV_MQ_ADMIN:
3717
3718		/*
3719		 * Jail operations within a jail work on child jails.
3720		 */
3721	case PRIV_JAIL_ATTACH:
3722	case PRIV_JAIL_SET:
3723	case PRIV_JAIL_REMOVE:
3724
3725		/*
3726		 * Jail implements its own inter-process limits, so allow
3727		 * root processes in jail to change scheduling on other
3728		 * processes in the same jail.  Likewise for signalling.
3729		 */
3730	case PRIV_SCHED_DIFFCRED:
3731	case PRIV_SCHED_CPUSET:
3732	case PRIV_SIGNAL_DIFFCRED:
3733	case PRIV_SIGNAL_SUGID:
3734
3735		/*
3736		 * Allow jailed processes to write to sysctls marked as jail
3737		 * writable.
3738		 */
3739	case PRIV_SYSCTL_WRITEJAIL:
3740
3741		/*
3742		 * Allow root in jail to manage a variety of quota
3743		 * properties.  These should likely be conditional on a
3744		 * configuration option.
3745		 */
3746	case PRIV_VFS_GETQUOTA:
3747	case PRIV_VFS_SETQUOTA:
3748
3749		/*
3750		 * Since Jail relies on chroot() to implement file system
3751		 * protections, grant many VFS privileges to root in jail.
3752		 * Be careful to exclude mount-related and NFS-related
3753		 * privileges.
3754		 */
3755	case PRIV_VFS_READ:
3756	case PRIV_VFS_WRITE:
3757	case PRIV_VFS_ADMIN:
3758	case PRIV_VFS_EXEC:
3759	case PRIV_VFS_LOOKUP:
3760	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
3761	case PRIV_VFS_CHFLAGS_DEV:
3762	case PRIV_VFS_CHOWN:
3763	case PRIV_VFS_CHROOT:
3764	case PRIV_VFS_RETAINSUGID:
3765	case PRIV_VFS_FCHROOT:
3766	case PRIV_VFS_LINK:
3767	case PRIV_VFS_SETGID:
3768	case PRIV_VFS_STAT:
3769	case PRIV_VFS_STICKYFILE:
3770		return (0);
3771
3772		/*
3773		 * Depending on the global setting, allow privilege of
3774		 * setting system flags.
3775		 */
3776	case PRIV_VFS_SYSFLAGS:
3777		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
3778			return (0);
3779		else
3780			return (EPERM);
3781
3782		/*
3783		 * Depending on the global setting, allow privilege of
3784		 * mounting/unmounting file systems.
3785		 */
3786	case PRIV_VFS_MOUNT:
3787	case PRIV_VFS_UNMOUNT:
3788	case PRIV_VFS_MOUNT_NONUSER:
3789	case PRIV_VFS_MOUNT_OWNER:
3790		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT)
3791			return (0);
3792		else
3793			return (EPERM);
3794
3795		/*
3796		 * Allow jailed root to bind reserved ports and reuse in-use
3797		 * ports.
3798		 */
3799	case PRIV_NETINET_RESERVEDPORT:
3800	case PRIV_NETINET_REUSEPORT:
3801		return (0);
3802
3803		/*
3804		 * Allow jailed root to set certian IPv4/6 (option) headers.
3805		 */
3806	case PRIV_NETINET_SETHDROPTS:
3807		return (0);
3808
3809		/*
3810		 * Conditionally allow creating raw sockets in jail.
3811		 */
3812	case PRIV_NETINET_RAW:
3813		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
3814			return (0);
3815		else
3816			return (EPERM);
3817
3818		/*
3819		 * Since jail implements its own visibility limits on netstat
3820		 * sysctls, allow getcred.  This allows identd to work in
3821		 * jail.
3822		 */
3823	case PRIV_NETINET_GETCRED:
3824		return (0);
3825
3826	default:
3827		/*
3828		 * In all remaining cases, deny the privilege request.  This
3829		 * includes almost all network privileges, many system
3830		 * configuration privileges.
3831		 */
3832		return (EPERM);
3833	}
3834}
3835
3836/*
3837 * Return the part of pr2's name that is relative to pr1, or the whole name
3838 * if it does not directly follow.
3839 */
3840
3841char *
3842prison_name(struct prison *pr1, struct prison *pr2)
3843{
3844	char *name;
3845
3846	/* Jails see themselves as "0" (if they see themselves at all). */
3847	if (pr1 == pr2)
3848		return "0";
3849	name = pr2->pr_name;
3850	if (prison_ischild(pr1, pr2)) {
3851		/*
3852		 * pr1 isn't locked (and allprison_lock may not be either)
3853		 * so its length can't be counted on.  But the number of dots
3854		 * can be counted on - and counted.
3855		 */
3856		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
3857			name = strchr(name, '.') + 1;
3858	}
3859	return (name);
3860}
3861
3862/*
3863 * Return the part of pr2's path that is relative to pr1, or the whole path
3864 * if it does not directly follow.
3865 */
3866static char *
3867prison_path(struct prison *pr1, struct prison *pr2)
3868{
3869	char *path1, *path2;
3870	int len1;
3871
3872	path1 = pr1->pr_path;
3873	path2 = pr2->pr_path;
3874	if (!strcmp(path1, "/"))
3875		return (path2);
3876	len1 = strlen(path1);
3877	if (strncmp(path1, path2, len1))
3878		return (path2);
3879	if (path2[len1] == '\0')
3880		return "/";
3881	if (path2[len1] == '/')
3882		return (path2 + len1);
3883	return (path2);
3884}
3885
3886
/*
 * Jail-related sysctls.
 *
 * Declare the "jail" node beneath the _security parent.  The sysctls
 * declared later in this file with a _security_jail parent attach to it.
 */
SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
    "Jails");
3892
3893static int
3894sysctl_jail_list(SYSCTL_HANDLER_ARGS)
3895{
3896	struct xprison *xp;
3897	struct prison *pr, *cpr;
3898#ifdef INET
3899	struct in_addr *ip4 = NULL;
3900	int ip4s = 0;
3901#endif
3902#ifdef INET6
3903	struct in_addr *ip6 = NULL;
3904	int ip6s = 0;
3905#endif
3906	int descend, error;
3907
3908	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
3909	pr = req->td->td_ucred->cr_prison;
3910	error = 0;
3911	sx_slock(&allprison_lock);
3912	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
3913#if defined(INET) || defined(INET6)
3914 again:
3915#endif
3916		mtx_lock(&cpr->pr_mtx);
3917#ifdef INET
3918		if (cpr->pr_ip4s > 0) {
3919			if (ip4s < cpr->pr_ip4s) {
3920				ip4s = cpr->pr_ip4s;
3921				mtx_unlock(&cpr->pr_mtx);
3922				ip4 = realloc(ip4, ip4s *
3923				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
3924				goto again;
3925			}
3926			bcopy(cpr->pr_ip4, ip4,
3927			    cpr->pr_ip4s * sizeof(struct in_addr));
3928		}
3929#endif
3930#ifdef INET6
3931		if (cpr->pr_ip6s > 0) {
3932			if (ip6s < cpr->pr_ip6s) {
3933				ip6s = cpr->pr_ip6s;
3934				mtx_unlock(&cpr->pr_mtx);
3935				ip6 = realloc(ip6, ip6s *
3936				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
3937				goto again;
3938			}
3939			bcopy(cpr->pr_ip6, ip6,
3940			    cpr->pr_ip6s * sizeof(struct in6_addr));
3941		}
3942#endif
3943		if (cpr->pr_ref == 0) {
3944			mtx_unlock(&cpr->pr_mtx);
3945			continue;
3946		}
3947		bzero(xp, sizeof(*xp));
3948		xp->pr_version = XPRISON_VERSION;
3949		xp->pr_id = cpr->pr_id;
3950		xp->pr_state = cpr->pr_uref > 0
3951		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
3952		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
3953		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
3954		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
3955#ifdef INET
3956		xp->pr_ip4s = cpr->pr_ip4s;
3957#endif
3958#ifdef INET6
3959		xp->pr_ip6s = cpr->pr_ip6s;
3960#endif
3961		mtx_unlock(&cpr->pr_mtx);
3962		error = SYSCTL_OUT(req, xp, sizeof(*xp));
3963		if (error)
3964			break;
3965#ifdef INET
3966		if (xp->pr_ip4s > 0) {
3967			error = SYSCTL_OUT(req, ip4,
3968			    xp->pr_ip4s * sizeof(struct in_addr));
3969			if (error)
3970				break;
3971		}
3972#endif
3973#ifdef INET6
3974		if (xp->pr_ip6s > 0) {
3975			error = SYSCTL_OUT(req, ip6,
3976			    xp->pr_ip6s * sizeof(struct in6_addr));
3977			if (error)
3978				break;
3979		}
3980#endif
3981	}
3982	sx_sunlock(&allprison_lock);
3983	free(xp, M_TEMP);
3984#ifdef INET
3985	free(ip4, M_TEMP);
3986#endif
3987#ifdef INET6
3988	free(ip6, M_TEMP);
3989#endif
3990	return (error);
3991}
3992
/*
 * "list" under _security_jail: a read-only, MPSAFE struct-typed sysctl
 * whose output is produced by sysctl_jail_list().
 */
SYSCTL_OID(_security_jail, OID_AUTO, list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_list, "S", "List of active jails");
3996
3997static int
3998sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
3999{
4000	int error, injail;
4001
4002	injail = jailed(req->td->td_ucred);
4003	error = SYSCTL_OUT(req, &injail, sizeof(injail));
4004
4005	return (error);
4006}
4007
/*
 * "jailed" under _security_jail: read-only int produced by
 * sysctl_jail_jailed().
 */
SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_jailed, "I", "Process in jail?");

#if defined(INET) || defined(INET6)
/*
 * Read-write knob backed directly by the jail_max_af_ips variable; only
 * present when at least one of INET and INET6 is compiled in.
 */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
    &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family");
#endif
4017
4018/*
4019 * Default parameters for jail(2) compatability.  For historical reasons,
4020 * the sysctl names have varying similarity to the parameter names.  Prisons
4021 * just see their own parameters, and can't change them.
4022 */
4023static int
4024sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4025{
4026	struct prison *pr;
4027	int allow, error, i;
4028
4029	pr = req->td->td_ucred->cr_prison;
4030	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
4031
4032	/* Get the current flag value, and convert it to a boolean. */
4033	i = (allow & arg2) ? 1 : 0;
4034	if (arg1 != NULL)
4035		i = !i;
4036	error = sysctl_handle_int(oidp, &i, 0, req);
4037	if (error || !req->newptr)
4038		return (error);
4039	i = i ? arg2 : 0;
4040	if (arg1 != NULL)
4041		i ^= arg2;
4042	/*
4043	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4044	 * for writing.
4045	 */
4046	mtx_lock(&prison0.pr_mtx);
4047	jail_default_allow = (jail_default_allow & ~arg2) | i;
4048	mtx_unlock(&prison0.pr_mtx);
4049	return (0);
4050}
4051
/*
 * Knobs served by sysctl_jail_default_allow().  Each passes the PR_ALLOW_*
 * bit it controls as arg2.  arg1 is NULL for all of them except
 * socket_unixiproute_only, which passes a non-NULL arg1 so that the handler
 * reports and stores the inverse of PR_ALLOW_SOCKET_AF.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
    "Processes in jail can set their hostnames");
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
    "Processes in jail are limited to creating UNIX/IP/route sockets only");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
    "Processes in jail can use System V IPC primitives");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
    "Prison root can create raw sockets");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
    "Processes in jail can alter system file flags");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
    "Processes in jail can mount/unmount jail-friendly file systems");
4076
4077static int
4078sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4079{
4080	struct prison *pr;
4081	int level, error;
4082
4083	pr = req->td->td_ucred->cr_prison;
4084	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4085	error = sysctl_handle_int(oidp, &level, 0, req);
4086	if (error || !req->newptr)
4087		return (error);
4088	*(int *)arg1 = level;
4089	return (0);
4090}
4091
/*
 * enforce_statfs default, served by sysctl_jail_default_level(): arg1 is
 * the global default and arg2 the offset of pr_enforce_statfs within
 * struct prison.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
    sysctl_jail_default_level, "I",
    "Processes in jail cannot see all mounted file systems");

/*
 * Nodes to describe jail parameters.  Maximum length of string parameters
 * is returned in the string itself, and the other parameters exist merely
 * to make themselves and their types known.
 */
SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
    "Jail parameters");
4105
4106int
4107sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4108{
4109	int i;
4110	long l;
4111	size_t s;
4112	char numbuf[12];
4113
4114	switch (oidp->oid_kind & CTLTYPE)
4115	{
4116	case CTLTYPE_LONG:
4117	case CTLTYPE_ULONG:
4118		l = 0;
4119#ifdef SCTL_MASK32
4120		if (!(req->flags & SCTL_MASK32))
4121#endif
4122			return (SYSCTL_OUT(req, &l, sizeof(l)));
4123	case CTLTYPE_INT:
4124	case CTLTYPE_UINT:
4125		i = 0;
4126		return (SYSCTL_OUT(req, &i, sizeof(i)));
4127	case CTLTYPE_STRING:
4128		snprintf(numbuf, sizeof(numbuf), "%d", arg2);
4129		return
4130		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4131	case CTLTYPE_STRUCT:
4132		s = (size_t)arg2;
4133		return (SYSCTL_OUT(req, &s, sizeof(s)));
4134	}
4135	return (0);
4136}
4137
/* Top-level jail parameters. */
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail secure level");
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail cannot see all mounted file systems");
SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail persistence");
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
    "E,jailsys", "Virtual network stack");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
    "B", "Jail is in the process of shutting down");

/* children.*: child jail counts. */
SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
    "I", "Current number of child jails");
SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Maximum number of child jails");

/* host.*: per-jail host identity. */
SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail hostname");
SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail NIS domainname");
SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
    "Jail host UUID");
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
    "LU", "Jail host ID");

/* cpuset.*: the jail's cpuset. */
SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");

/* ip4.* / ip6.*: address lists, present only when the family is compiled in. */
#ifdef INET
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RW, "Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
    "S,in_addr,a", "Jail IPv4 addresses");
#endif
#ifdef INET6
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RW, "Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
    "S,in6_addr,a", "Jail IPv6 addresses");
#endif

/* allow.*: boolean permission flags. */
SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set hostname");
SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may use SYSV IPC");
SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create raw sockets");
SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may alter system file flags");
SYSCTL_JAIL_PARAM(_allow, mount, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount/unmount jail-friendly file systems");
SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set file quotas");
SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4200
4201
4202#ifdef DDB
4203
4204static void
4205db_show_prison(struct prison *pr)
4206{
4207	int fi;
4208#if defined(INET) || defined(INET6)
4209	int ii;
4210#endif
4211	unsigned jsf;
4212#ifdef INET6
4213	char ip6buf[INET6_ADDRSTRLEN];
4214#endif
4215
4216	db_printf("prison %p:\n", pr);
4217	db_printf(" jid             = %d\n", pr->pr_id);
4218	db_printf(" name            = %s\n", pr->pr_name);
4219	db_printf(" parent          = %p\n", pr->pr_parent);
4220	db_printf(" ref             = %d\n", pr->pr_ref);
4221	db_printf(" uref            = %d\n", pr->pr_uref);
4222	db_printf(" path            = %s\n", pr->pr_path);
4223	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4224	    ? pr->pr_cpuset->cs_id : -1);
4225#ifdef VIMAGE
4226	db_printf(" vnet            = %p\n", pr->pr_vnet);
4227#endif
4228	db_printf(" root            = %p\n", pr->pr_root);
4229	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4230	db_printf(" childcount      = %d\n", pr->pr_childcount);
4231	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4232	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4233	db_printf(" flags           = %x", pr->pr_flags);
4234	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
4235	    fi++)
4236		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
4237			db_printf(" %s", pr_flag_names[fi]);
4238	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
4239	    fi++) {
4240		jsf = pr->pr_flags &
4241		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
4242		db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
4243		    pr_flag_jailsys[fi].disable &&
4244		      (jsf == pr_flag_jailsys[fi].disable) ? "disable"
4245		    : (jsf == pr_flag_jailsys[fi].new) ? "new"
4246		    : "inherit");
4247	}
4248	db_printf(" allow           = %x", pr->pr_allow);
4249	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
4250	    fi++)
4251		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
4252			db_printf(" %s", pr_allow_names[fi]);
4253	db_printf("\n");
4254	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4255	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4256	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4257	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4258	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4259#ifdef INET
4260	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4261	for (ii = 0; ii < pr->pr_ip4s; ii++)
4262		db_printf(" %s %s\n",
4263		    ii == 0 ? "ip4             =" : "                 ",
4264		    inet_ntoa(pr->pr_ip4[ii]));
4265#endif
4266#ifdef INET6
4267	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4268	for (ii = 0; ii < pr->pr_ip6s; ii++)
4269		db_printf(" %s %s\n",
4270		    ii == 0 ? "ip6             =" : "                 ",
4271		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4272#endif
4273}
4274
4275DB_SHOW_COMMAND(prison, db_show_prison_command)
4276{
4277	struct prison *pr;
4278
4279	if (!have_addr) {
4280		/*
4281		 * Show all prisons in the list, and prison0 which is not
4282		 * listed.
4283		 */
4284		db_show_prison(&prison0);
4285		if (!db_pager_quit) {
4286			TAILQ_FOREACH(pr, &allprison, pr_list) {
4287				db_show_prison(pr);
4288				if (db_pager_quit)
4289					break;
4290			}
4291		}
4292		return;
4293	}
4294
4295	if (addr == 0)
4296		pr = &prison0;
4297	else {
4298		/* Look for a prison with the ID and with references. */
4299		TAILQ_FOREACH(pr, &allprison, pr_list)
4300			if (pr->pr_id == addr && pr->pr_ref > 0)
4301				break;
4302		if (pr == NULL)
4303			/* Look again, without requiring a reference. */
4304			TAILQ_FOREACH(pr, &allprison, pr_list)
4305				if (pr->pr_id == addr)
4306					break;
4307		if (pr == NULL)
4308			/* Assume address points to a valid prison. */
4309			pr = (struct prison *)addr;
4310	}
4311	db_show_prison(pr);
4312}
4313
4314#endif /* DDB */
4315