kern_jail.c revision 302229
1219820Sjeff/*-
2219820Sjeff * Copyright (c) 1999 Poul-Henning Kamp.
3219820Sjeff * Copyright (c) 2008 Bjoern A. Zeeb.
4219820Sjeff * Copyright (c) 2009 James Gritton.
5219820Sjeff * All rights reserved.
6219820Sjeff *
7219820Sjeff * Redistribution and use in source and binary forms, with or without
8219820Sjeff * modification, are permitted provided that the following conditions
9219820Sjeff * are met:
10219820Sjeff * 1. Redistributions of source code must retain the above copyright
11219820Sjeff *    notice, this list of conditions and the following disclaimer.
12219820Sjeff * 2. Redistributions in binary form must reproduce the above copyright
13219820Sjeff *    notice, this list of conditions and the following disclaimer in the
14219820Sjeff *    documentation and/or other materials provided with the distribution.
15219820Sjeff *
16219820Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17219820Sjeff * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18219820Sjeff * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19219820Sjeff * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20219820Sjeff * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21219820Sjeff * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22219820Sjeff * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23219820Sjeff * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24219820Sjeff * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25219820Sjeff * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26219820Sjeff * SUCH DAMAGE.
27219820Sjeff */
28219820Sjeff
29219820Sjeff#include <sys/cdefs.h>
30219820Sjeff__FBSDID("$FreeBSD: stable/10/sys/kern/kern_jail.c 302229 2016-06-27 21:25:01Z bdrewery $");
31219820Sjeff
32219820Sjeff#include "opt_compat.h"
33219820Sjeff#include "opt_ddb.h"
34219820Sjeff#include "opt_inet.h"
35219820Sjeff#include "opt_inet6.h"
36219820Sjeff
37219820Sjeff#include <sys/param.h>
38219820Sjeff#include <sys/types.h>
39219820Sjeff#include <sys/kernel.h>
40219820Sjeff#include <sys/systm.h>
41219820Sjeff#include <sys/errno.h>
42219820Sjeff#include <sys/sysproto.h>
43219820Sjeff#include <sys/malloc.h>
44219820Sjeff#include <sys/osd.h>
45219820Sjeff#include <sys/priv.h>
46219820Sjeff#include <sys/proc.h>
47219820Sjeff#include <sys/taskqueue.h>
48219820Sjeff#include <sys/fcntl.h>
49219820Sjeff#include <sys/jail.h>
50219820Sjeff#include <sys/lock.h>
51219820Sjeff#include <sys/mutex.h>
52219820Sjeff#include <sys/racct.h>
53219820Sjeff#include <sys/refcount.h>
54219820Sjeff#include <sys/sx.h>
55219820Sjeff#include <sys/sysent.h>
56219820Sjeff#include <sys/namei.h>
57219820Sjeff#include <sys/mount.h>
58219820Sjeff#include <sys/queue.h>
59219820Sjeff#include <sys/socket.h>
60219820Sjeff#include <sys/syscallsubr.h>
61219820Sjeff#include <sys/sysctl.h>
62219820Sjeff#include <sys/vnode.h>
63219820Sjeff
64219820Sjeff#include <net/if.h>
65219820Sjeff#include <net/vnet.h>
66219820Sjeff
67219820Sjeff#include <netinet/in.h>
68219820Sjeff
69219820Sjeff#ifdef DDB
70219820Sjeff#include <ddb/ddb.h>
71219820Sjeff#ifdef INET6
72219820Sjeff#include <netinet6/in6_var.h>
73219820Sjeff#endif /* INET6 */
74219820Sjeff#endif /* DDB */
75219820Sjeff
76219820Sjeff#include <security/mac/mac_framework.h>
77219820Sjeff
/* All-zero UUID string used as the initial pr_hostuuid of prison0. */
#define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
79219820Sjeff
80219820SjeffMALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
81219820Sjeffstatic MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
82219820Sjeff
/*
 * Keep struct prison prison0 and some code in kern_jail_set() readable.
 *
 * _PR_IP_SADDRSEL is the set of source-address-selection flags for the
 * address families compiled into this kernel (0 when neither INET nor INET6
 * is configured).  The replacement list is parenthesized so that the macro
 * behaves as a single operand wherever it is expanded.
 */
#ifdef INET
#ifdef INET6
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL)
#endif
#else /* !INET */
#ifdef INET6
#define	_PR_IP_SADDRSEL	(PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	0
#endif
#endif
97219820Sjeff
98219820Sjeff/* prison0 describes what is "real" about the system. */
99219820Sjeffstruct prison prison0 = {
100219820Sjeff	.pr_id		= 0,
101219820Sjeff	.pr_name	= "0",
102219820Sjeff	.pr_ref		= 1,
103219820Sjeff	.pr_uref	= 1,
104219820Sjeff	.pr_path	= "/",
105219820Sjeff	.pr_securelevel	= -1,
106219820Sjeff	.pr_devfs_rsnum = 0,
107219820Sjeff	.pr_childmax	= JAIL_MAX,
108219820Sjeff	.pr_hostuuid	= DEFAULT_HOSTUUID,
109219820Sjeff	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
110219820Sjeff#ifdef VIMAGE
111219820Sjeff	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
112219820Sjeff#else
113219820Sjeff	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
114219820Sjeff#endif
115219820Sjeff	.pr_allow	= PR_ALLOW_ALL,
116219820Sjeff};
117219820SjeffMTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
118219820Sjeff
119219820Sjeff/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
120219820Sjeffstruct	sx allprison_lock;
121219820SjeffSX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
122219820Sjeffstruct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
123LIST_HEAD(, prison_racct) allprison_racct;
124int	lastprid = 0;
125
/*
 * Forward declarations of file-local (static) functions.  The racct and
 * per-address-family helpers exist only when the matching kernel option
 * (RACCT, INET, INET6) is configured.
 */
static int do_jail_attach(struct thread *td, struct prison *pr);
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static char *prison_path(struct prison *pr1, struct prison *pr2);
static void prison_remove_one(struct prison *pr);
#ifdef RACCT
static void prison_racct_attach(struct prison *pr);
static void prison_racct_modify(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif
#ifdef INET
static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
#endif
#ifdef INET6
static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
#endif
144
/*
 * Flags for prison_deref.  Each value is a distinct single bit, so they
 * can be OR-ed together in the flags argument.
 */
#define	PD_DEREF	0x01
#define	PD_DEUREF	0x02
#define	PD_LOCKED	0x04
#define	PD_LIST_SLOCKED	0x08
#define	PD_LIST_XLOCKED	0x10
151
/*
 * Parameter names corresponding to PR_* flag values.  Size values are for kvm
 * as we cannot figure out the size of a sparse array, or an array without a
 * terminating entry.
 *
 * The array index is the bit number of the flag: kern_jail_set() maps entry
 * fi to the mask (1 << fi).  Indices without an initializer are NULL and are
 * skipped there.
 */
static char *pr_flag_names[] = {
	[0] = "persist",
#ifdef INET
	[7] = "ip4.saddrsel",
#endif
#ifdef INET6
	[8] = "ip6.saddrsel",
#endif
};
const size_t pr_flag_names_size = sizeof(pr_flag_names);
167
/*
 * Negated forms of the names in pr_flag_names.  kern_jail_set() indexes both
 * arrays with the same bit number, so the two tables must stay parallel.
 */
static char *pr_flag_nonames[] = {
	[0] = "nopersist",
#ifdef INET
	[7] = "ip4.nosaddrsel",
#endif
#ifdef INET6
	[8] = "ip6.nosaddrsel",
#endif
};
const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames);
178
179struct jailsys_flags {
180	const char	*name;
181	unsigned	 disable;
182	unsigned	 new;
183} pr_flag_jailsys[] = {
184	{ "host", 0, PR_HOST },
185#ifdef VIMAGE
186	{ "vnet", 0, PR_VNET },
187#endif
188#ifdef INET
189	{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
190#endif
191#ifdef INET6
192	{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
193#endif
194};
195const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
196
/*
 * Names of the allow.* permission parameters.  Entry fi corresponds to the
 * permission bit (1 << fi); kern_jail() and kern_jail_set() rely on that
 * ordering, and on pr_allow_nonames having the same number of entries in
 * the same order.
 */
static char *pr_allow_names[] = {
	"allow.set_hostname",
	"allow.sysvipc",
	"allow.raw_sockets",
	"allow.chflags",
	"allow.mount",
	"allow.quotas",
	"allow.socket_af",
	"allow.mount.devfs",
	"allow.mount.nullfs",
	"allow.mount.zfs",
	"allow.mount.procfs",
	"allow.mount.tmpfs",
	"allow.mount.fdescfs",
	"allow.mount.linprocfs",
	"allow.mount.linsysfs",
};
const size_t pr_allow_names_size = sizeof(pr_allow_names);
215
/*
 * Negated forms of the allow.* parameter names, in the same order as
 * pr_allow_names: entry fi clears the permission bit (1 << fi).
 */
static char *pr_allow_nonames[] = {
	"allow.noset_hostname",
	"allow.nosysvipc",
	"allow.noraw_sockets",
	"allow.nochflags",
	"allow.nomount",
	"allow.noquotas",
	"allow.nosocket_af",
	"allow.mount.nodevfs",
	"allow.mount.nonullfs",
	"allow.mount.nozfs",
	"allow.mount.noprocfs",
	"allow.mount.notmpfs",
	"allow.mount.nofdescfs",
	"allow.mount.nolinprocfs",
	"allow.mount.nolinsysfs",
};
const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
234
235#define	JAIL_DEFAULT_ALLOW		PR_ALLOW_SET_HOSTNAME
236#define	JAIL_DEFAULT_ENFORCE_STATFS	2
237#define	JAIL_DEFAULT_DEVFS_RSNUM	0
238static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
239static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
240static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
241#if defined(INET) || defined(INET6)
242static unsigned jail_max_af_ips = 255;
243#endif
244
245/*
246 * Initialize the parts of prison0 that can't be static-initialized with
247 * constants.  This is called from proc0_init() after creating thread0 cpuset.
248 */
249void
250prison0_init(void)
251{
252
253	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
254	prison0.pr_osreldate = osreldate;
255	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
256}
257
258#ifdef INET
/*
 * qsort() comparison callback for arrays of struct in_addr.
 *
 * ip1 and ip2 point at struct in_addr values in network byte order.
 * Returns -1, 0 or 1 according to the ordering of the two addresses when
 * interpreted in host byte order.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	in_addr_t iaa, iab;

	/*
	 * We need to compare in HBO here to get the list sorted as expected
	 * by the result of the code.  Sorting NBO addresses gives you
	 * interesting results.  If you do not understand, do not try.
	 */
	iaa = ntohl(((const struct in_addr *)ip1)->s_addr);
	iab = ntohl(((const struct in_addr *)ip2)->s_addr);

	/*
	 * Do not simply return the difference of the two numbers, the int is
	 * not wide enough.
	 */
	if (iaa > iab)
		return (1);
	if (iaa < iab)
		return (-1);
	return (0);
}
283#endif
284
285#ifdef INET6
/*
 * qsort() comparison callback for arrays of struct in6_addr.
 *
 * Compares the two addresses byte by byte, most significant byte first,
 * and returns -1, 0 or 1 as soon as a differing byte decides the order.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const struct in6_addr *ia6a, *ia6b;
	size_t i;	/* unsigned, to match the sizeof() bound below */
	int rc;

	ia6a = (const struct in6_addr *)ip1;
	ia6b = (const struct in6_addr *)ip2;

	rc = 0;
	for (i = 0; rc == 0 && i < sizeof(struct in6_addr); i++) {
		if (ia6a->s6_addr[i] > ia6b->s6_addr[i])
			rc = 1;
		else if (ia6a->s6_addr[i] < ia6b->s6_addr[i])
			rc = -1;
	}
	return (rc);
}
304#endif
305
306/*
307 * struct jail_args {
308 *	struct jail *jail;
309 * };
310 */
311int
312sys_jail(struct thread *td, struct jail_args *uap)
313{
314	uint32_t version;
315	int error;
316	struct jail j;
317
318	error = copyin(uap->jail, &version, sizeof(uint32_t));
319	if (error)
320		return (error);
321
322	switch (version) {
323	case 0:
324	{
325		struct jail_v0 j0;
326
327		/* FreeBSD single IPv4 jails. */
328		bzero(&j, sizeof(struct jail));
329		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
330		if (error)
331			return (error);
332		j.version = j0.version;
333		j.path = j0.path;
334		j.hostname = j0.hostname;
335		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
336		break;
337	}
338
339	case 1:
340		/*
341		 * Version 1 was used by multi-IPv4 jail implementations
342		 * that never made it into the official kernel.
343		 */
344		return (EINVAL);
345
346	case 2:	/* JAIL_API_VERSION */
347		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
348		error = copyin(uap->jail, &j, sizeof(struct jail));
349		if (error)
350			return (error);
351		break;
352
353	default:
354		/* Sci-Fi jails are not supported, sorry. */
355		return (EINVAL);
356	}
357	return (kern_jail(td, &j));
358}
359
360int
361kern_jail(struct thread *td, struct jail *j)
362{
363	struct iovec optiov[2 * (4
364			    + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
365#ifdef INET
366			    + 1
367#endif
368#ifdef INET6
369			    + 1
370#endif
371			    )];
372	struct uio opt;
373	char *u_path, *u_hostname, *u_name;
374#ifdef INET
375	uint32_t ip4s;
376	struct in_addr *u_ip4;
377#endif
378#ifdef INET6
379	struct in6_addr *u_ip6;
380#endif
381	size_t tmplen;
382	int error, enforce_statfs, fi;
383
384	bzero(&optiov, sizeof(optiov));
385	opt.uio_iov = optiov;
386	opt.uio_iovcnt = 0;
387	opt.uio_offset = -1;
388	opt.uio_resid = -1;
389	opt.uio_segflg = UIO_SYSSPACE;
390	opt.uio_rw = UIO_READ;
391	opt.uio_td = td;
392
393	/* Set permissions for top-level jails from sysctls. */
394	if (!jailed(td->td_ucred)) {
395		for (fi = 0; fi < sizeof(pr_allow_names) /
396		     sizeof(pr_allow_names[0]); fi++) {
397			optiov[opt.uio_iovcnt].iov_base =
398			    (jail_default_allow & (1 << fi))
399			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
400			optiov[opt.uio_iovcnt].iov_len =
401			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
402			opt.uio_iovcnt += 2;
403		}
404		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
405		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
406		opt.uio_iovcnt++;
407		enforce_statfs = jail_default_enforce_statfs;
408		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
409		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
410		opt.uio_iovcnt++;
411	}
412
413	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
414#ifdef INET
415	ip4s = (j->version == 0) ? 1 : j->ip4s;
416	if (ip4s > jail_max_af_ips)
417		return (EINVAL);
418	tmplen += ip4s * sizeof(struct in_addr);
419#else
420	if (j->ip4s > 0)
421		return (EINVAL);
422#endif
423#ifdef INET6
424	if (j->ip6s > jail_max_af_ips)
425		return (EINVAL);
426	tmplen += j->ip6s * sizeof(struct in6_addr);
427#else
428	if (j->ip6s > 0)
429		return (EINVAL);
430#endif
431	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
432	u_hostname = u_path + MAXPATHLEN;
433	u_name = u_hostname + MAXHOSTNAMELEN;
434#ifdef INET
435	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
436#endif
437#ifdef INET6
438#ifdef INET
439	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
440#else
441	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
442#endif
443#endif
444	optiov[opt.uio_iovcnt].iov_base = "path";
445	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
446	opt.uio_iovcnt++;
447	optiov[opt.uio_iovcnt].iov_base = u_path;
448	error = copyinstr(j->path, u_path, MAXPATHLEN,
449	    &optiov[opt.uio_iovcnt].iov_len);
450	if (error) {
451		free(u_path, M_TEMP);
452		return (error);
453	}
454	opt.uio_iovcnt++;
455	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
456	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
457	opt.uio_iovcnt++;
458	optiov[opt.uio_iovcnt].iov_base = u_hostname;
459	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
460	    &optiov[opt.uio_iovcnt].iov_len);
461	if (error) {
462		free(u_path, M_TEMP);
463		return (error);
464	}
465	opt.uio_iovcnt++;
466	if (j->jailname != NULL) {
467		optiov[opt.uio_iovcnt].iov_base = "name";
468		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
469		opt.uio_iovcnt++;
470		optiov[opt.uio_iovcnt].iov_base = u_name;
471		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
472		    &optiov[opt.uio_iovcnt].iov_len);
473		if (error) {
474			free(u_path, M_TEMP);
475			return (error);
476		}
477		opt.uio_iovcnt++;
478	}
479#ifdef INET
480	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
481	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
482	opt.uio_iovcnt++;
483	optiov[opt.uio_iovcnt].iov_base = u_ip4;
484	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
485	if (j->version == 0)
486		u_ip4->s_addr = j->ip4s;
487	else {
488		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
489		if (error) {
490			free(u_path, M_TEMP);
491			return (error);
492		}
493	}
494	opt.uio_iovcnt++;
495#endif
496#ifdef INET6
497	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
498	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
499	opt.uio_iovcnt++;
500	optiov[opt.uio_iovcnt].iov_base = u_ip6;
501	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
502	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
503	if (error) {
504		free(u_path, M_TEMP);
505		return (error);
506	}
507	opt.uio_iovcnt++;
508#endif
509	KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
510	    ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
511	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
512	free(u_path, M_TEMP);
513	return (error);
514}
515
516
517/*
518 * struct jail_set_args {
519 *	struct iovec *iovp;
520 *	unsigned int iovcnt;
521 *	int flags;
522 * };
523 */
524int
525sys_jail_set(struct thread *td, struct jail_set_args *uap)
526{
527	struct uio *auio;
528	int error;
529
530	/* Check that we have an even number of iovecs. */
531	if (uap->iovcnt & 1)
532		return (EINVAL);
533
534	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
535	if (error)
536		return (error);
537	error = kern_jail_set(td, auio, uap->flags);
538	free(auio, M_IOV);
539	return (error);
540}
541
542int
543kern_jail_set(struct thread *td, struct uio *optuio, int flags)
544{
545	struct nameidata nd;
546#ifdef INET
547	struct in_addr *ip4;
548#endif
549#ifdef INET6
550	struct in6_addr *ip6;
551#endif
552	struct vfsopt *opt;
553	struct vfsoptlist *opts;
554	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
555	struct vnode *root;
556	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
557	char *g_path, *osrelstr;
558#if defined(INET) || defined(INET6)
559	struct prison *tppr;
560	void *op;
561#endif
562	unsigned long hid;
563	size_t namelen, onamelen, pnamelen;
564	int born, created, cuflags, descend, enforce;
565	int error, errmsg_len, errmsg_pos;
566	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
567	int fi, jid, jsys, len, level;
568	int childmax, osreldt, rsnum, slevel;
569	int fullpath_disabled;
570#if defined(INET) || defined(INET6)
571	int ii, ij;
572#endif
573#ifdef INET
574	int ip4s, redo_ip4;
575#endif
576#ifdef INET6
577	int ip6s, redo_ip6;
578#endif
579	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
580	unsigned tallow;
581	char numbuf[12];
582
583	error = priv_check(td, PRIV_JAIL_SET);
584	if (!error && (flags & JAIL_ATTACH))
585		error = priv_check(td, PRIV_JAIL_ATTACH);
586	if (error)
587		return (error);
588	mypr = td->td_ucred->cr_prison;
589	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
590		return (EPERM);
591	if (flags & ~JAIL_SET_MASK)
592		return (EINVAL);
593
594	/*
595	 * Check all the parameters before committing to anything.  Not all
596	 * errors can be caught early, but we may as well try.  Also, this
597	 * takes care of some expensive stuff (path lookup) before getting
598	 * the allprison lock.
599	 *
600	 * XXX Jails are not filesystems, and jail parameters are not mount
601	 *     options.  But it makes more sense to re-use the vfsopt code
602	 *     than duplicate it under a different name.
603	 */
604	error = vfs_buildopts(optuio, &opts);
605	if (error)
606		return (error);
607#ifdef INET
608	ip4 = NULL;
609#endif
610#ifdef INET6
611	ip6 = NULL;
612#endif
613	g_path = NULL;
614
615	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
616	if (!cuflags) {
617		error = EINVAL;
618		vfs_opterror(opts, "no valid operation (create or update)");
619		goto done_errmsg;
620	}
621
622	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
623	if (error == ENOENT)
624		jid = 0;
625	else if (error != 0)
626		goto done_free;
627
628	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
629	if (error == ENOENT)
630		gotslevel = 0;
631	else if (error != 0)
632		goto done_free;
633	else
634		gotslevel = 1;
635
636	error =
637	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
638	if (error == ENOENT)
639		gotchildmax = 0;
640	else if (error != 0)
641		goto done_free;
642	else
643		gotchildmax = 1;
644
645	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
646	if (error == ENOENT)
647		gotenforce = 0;
648	else if (error != 0)
649		goto done_free;
650	else if (enforce < 0 || enforce > 2) {
651		error = EINVAL;
652		goto done_free;
653	} else
654		gotenforce = 1;
655
656	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
657	if (error == ENOENT)
658		gotrsnum = 0;
659	else if (error != 0)
660		goto done_free;
661	else
662		gotrsnum = 1;
663
664	pr_flags = ch_flags = 0;
665	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
666	    fi++) {
667		if (pr_flag_names[fi] == NULL)
668			continue;
669		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
670		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
671	}
672	ch_flags |= pr_flags;
673	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
674	    fi++) {
675		error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
676		    sizeof(jsys));
677		if (error == ENOENT)
678			continue;
679		if (error != 0)
680			goto done_free;
681		switch (jsys) {
682		case JAIL_SYS_DISABLE:
683			if (!pr_flag_jailsys[fi].disable) {
684				error = EINVAL;
685				goto done_free;
686			}
687			pr_flags |= pr_flag_jailsys[fi].disable;
688			break;
689		case JAIL_SYS_NEW:
690			pr_flags |= pr_flag_jailsys[fi].new;
691			break;
692		case JAIL_SYS_INHERIT:
693			break;
694		default:
695			error = EINVAL;
696			goto done_free;
697		}
698		ch_flags |=
699		    pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
700	}
701	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
702	    && !(pr_flags & PR_PERSIST)) {
703		error = EINVAL;
704		vfs_opterror(opts, "new jail must persist or attach");
705		goto done_errmsg;
706	}
707#ifdef VIMAGE
708	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
709		error = EINVAL;
710		vfs_opterror(opts, "vnet cannot be changed after creation");
711		goto done_errmsg;
712	}
713#endif
714#ifdef INET
715	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
716		error = EINVAL;
717		vfs_opterror(opts, "ip4 cannot be changed after creation");
718		goto done_errmsg;
719	}
720#endif
721#ifdef INET6
722	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
723		error = EINVAL;
724		vfs_opterror(opts, "ip6 cannot be changed after creation");
725		goto done_errmsg;
726	}
727#endif
728
729	pr_allow = ch_allow = 0;
730	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
731	    fi++) {
732		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
733		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
734	}
735	ch_allow |= pr_allow;
736
737	error = vfs_getopt(opts, "name", (void **)&name, &len);
738	if (error == ENOENT)
739		name = NULL;
740	else if (error != 0)
741		goto done_free;
742	else {
743		if (len == 0 || name[len - 1] != '\0') {
744			error = EINVAL;
745			goto done_free;
746		}
747		if (len > MAXHOSTNAMELEN) {
748			error = ENAMETOOLONG;
749			goto done_free;
750		}
751	}
752
753	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
754	if (error == ENOENT)
755		host = NULL;
756	else if (error != 0)
757		goto done_free;
758	else {
759		ch_flags |= PR_HOST;
760		pr_flags |= PR_HOST;
761		if (len == 0 || host[len - 1] != '\0') {
762			error = EINVAL;
763			goto done_free;
764		}
765		if (len > MAXHOSTNAMELEN) {
766			error = ENAMETOOLONG;
767			goto done_free;
768		}
769	}
770
771	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
772	if (error == ENOENT)
773		domain = NULL;
774	else if (error != 0)
775		goto done_free;
776	else {
777		ch_flags |= PR_HOST;
778		pr_flags |= PR_HOST;
779		if (len == 0 || domain[len - 1] != '\0') {
780			error = EINVAL;
781			goto done_free;
782		}
783		if (len > MAXHOSTNAMELEN) {
784			error = ENAMETOOLONG;
785			goto done_free;
786		}
787	}
788
789	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
790	if (error == ENOENT)
791		uuid = NULL;
792	else if (error != 0)
793		goto done_free;
794	else {
795		ch_flags |= PR_HOST;
796		pr_flags |= PR_HOST;
797		if (len == 0 || uuid[len - 1] != '\0') {
798			error = EINVAL;
799			goto done_free;
800		}
801		if (len > HOSTUUIDLEN) {
802			error = ENAMETOOLONG;
803			goto done_free;
804		}
805	}
806
807#ifdef COMPAT_FREEBSD32
808	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
809		uint32_t hid32;
810
811		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
812		hid = hid32;
813	} else
814#endif
815		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
816	if (error == ENOENT)
817		gothid = 0;
818	else if (error != 0)
819		goto done_free;
820	else {
821		gothid = 1;
822		ch_flags |= PR_HOST;
823		pr_flags |= PR_HOST;
824	}
825
826#ifdef INET
827	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
828	if (error == ENOENT)
829		ip4s = 0;
830	else if (error != 0)
831		goto done_free;
832	else if (ip4s & (sizeof(*ip4) - 1)) {
833		error = EINVAL;
834		goto done_free;
835	} else {
836		ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
837		if (ip4s == 0)
838			pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
839		else {
840			pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
841			ip4s /= sizeof(*ip4);
842			if (ip4s > jail_max_af_ips) {
843				error = EINVAL;
844				vfs_opterror(opts, "too many IPv4 addresses");
845				goto done_errmsg;
846			}
847			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
848			bcopy(op, ip4, ip4s * sizeof(*ip4));
849			/*
850			 * IP addresses are all sorted but ip[0] to preserve
851			 * the primary IP address as given from userland.
852			 * This special IP is used for unbound outgoing
853			 * connections as well for "loopback" traffic in case
854			 * source address selection cannot find any more fitting
855			 * address to connect from.
856			 */
857			if (ip4s > 1)
858				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
859			/*
860			 * Check for duplicate addresses and do some simple
861			 * zero and broadcast checks. If users give other bogus
862			 * addresses it is their problem.
863			 *
864			 * We do not have to care about byte order for these
865			 * checks so we will do them in NBO.
866			 */
867			for (ii = 0; ii < ip4s; ii++) {
868				if (ip4[ii].s_addr == INADDR_ANY ||
869				    ip4[ii].s_addr == INADDR_BROADCAST) {
870					error = EINVAL;
871					goto done_free;
872				}
873				if ((ii+1) < ip4s &&
874				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
875				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
876					error = EINVAL;
877					goto done_free;
878				}
879			}
880		}
881	}
882#endif
883
884#ifdef INET6
885	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
886	if (error == ENOENT)
887		ip6s = 0;
888	else if (error != 0)
889		goto done_free;
890	else if (ip6s & (sizeof(*ip6) - 1)) {
891		error = EINVAL;
892		goto done_free;
893	} else {
894		ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
895		if (ip6s == 0)
896			pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
897		else {
898			pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
899			ip6s /= sizeof(*ip6);
900			if (ip6s > jail_max_af_ips) {
901				error = EINVAL;
902				vfs_opterror(opts, "too many IPv6 addresses");
903				goto done_errmsg;
904			}
905			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
906			bcopy(op, ip6, ip6s * sizeof(*ip6));
907			if (ip6s > 1)
908				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
909			for (ii = 0; ii < ip6s; ii++) {
910				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
911					error = EINVAL;
912					goto done_free;
913				}
914				if ((ii+1) < ip6s &&
915				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
916				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
917				{
918					error = EINVAL;
919					goto done_free;
920				}
921			}
922		}
923	}
924#endif
925
926#if defined(VIMAGE) && (defined(INET) || defined(INET6))
927	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
928		error = EINVAL;
929		vfs_opterror(opts,
930		    "vnet jails cannot have IP address restrictions");
931		goto done_errmsg;
932	}
933#endif
934
935	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
936	if (error == ENOENT)
937		osrelstr = NULL;
938	else if (error != 0)
939		goto done_free;
940	else {
941		if (flags & JAIL_UPDATE) {
942			error = EINVAL;
943			vfs_opterror(opts,
944			    "osrelease cannot be changed after creation");
945			goto done_errmsg;
946		}
947		if (len == 0 || len >= OSRELEASELEN) {
948			error = EINVAL;
949			vfs_opterror(opts,
950			    "osrelease string must be 1-%d bytes long",
951			    OSRELEASELEN - 1);
952			goto done_errmsg;
953		}
954	}
955
956	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
957	if (error == ENOENT)
958		osreldt = 0;
959	else if (error != 0)
960		goto done_free;
961	else {
962		if (flags & JAIL_UPDATE) {
963			error = EINVAL;
964			vfs_opterror(opts,
965			    "osreldate cannot be changed after creation");
966			goto done_errmsg;
967		}
968		if (osreldt == 0) {
969			error = EINVAL;
970			vfs_opterror(opts, "osreldate cannot be 0");
971			goto done_errmsg;
972		}
973	}
974
975	fullpath_disabled = 0;
976	root = NULL;
977	error = vfs_getopt(opts, "path", (void **)&path, &len);
978	if (error == ENOENT)
979		path = NULL;
980	else if (error != 0)
981		goto done_free;
982	else {
983		if (flags & JAIL_UPDATE) {
984			error = EINVAL;
985			vfs_opterror(opts,
986			    "path cannot be changed after creation");
987			goto done_errmsg;
988		}
989		if (len == 0 || path[len - 1] != '\0') {
990			error = EINVAL;
991			goto done_free;
992		}
993		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
994		    path, td);
995		error = namei(&nd);
996		if (error)
997			goto done_free;
998		root = nd.ni_vp;
999		NDFREE(&nd, NDF_ONLY_PNBUF);
1000		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1001		strlcpy(g_path, path, MAXPATHLEN);
1002		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
1003		if (error == 0)
1004			path = g_path;
1005		else if (error == ENODEV) {
1006			/* proceed if sysctl debug.disablefullpath == 1 */
1007			fullpath_disabled = 1;
1008			if (len < 2 || (len == 2 && path[0] == '/'))
1009				path = NULL;
1010		} else {
1011			/* exit on other errors */
1012			goto done_free;
1013		}
1014		if (root->v_type != VDIR) {
1015			error = ENOTDIR;
1016			vput(root);
1017			goto done_free;
1018		}
1019		VOP_UNLOCK(root, 0);
1020		if (fullpath_disabled) {
1021			/* Leave room for a real-root full pathname. */
1022			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
1023			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
1024				error = ENAMETOOLONG;
1025				vrele(root);
1026				goto done_free;
1027			}
1028		}
1029	}
1030
1031	/*
1032	 * Find the specified jail, or at least its parent.
1033	 * This abuses the file error codes ENOENT and EEXIST.
1034	 */
1035	pr = NULL;
1036	ppr = mypr;
1037	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1038		namelc = strrchr(name, '.');
1039		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1040		if (*p != '\0')
1041			jid = 0;
1042	}
1043	sx_xlock(&allprison_lock);
1044	if (jid != 0) {
1045		/*
1046		 * See if a requested jid already exists.  There is an
1047		 * information leak here if the jid exists but is not within
1048		 * the caller's jail hierarchy.  Jail creators will get EEXIST
1049		 * even though they cannot see the jail, and CREATE | UPDATE
1050		 * will return ENOENT which is not normally a valid error.
1051		 */
1052		if (jid < 0) {
1053			error = EINVAL;
1054			vfs_opterror(opts, "negative jid");
1055			goto done_unlock_list;
1056		}
1057		pr = prison_find(jid);
1058		if (pr != NULL) {
1059			ppr = pr->pr_parent;
1060			/* Create: jid must not exist. */
1061			if (cuflags == JAIL_CREATE) {
1062				mtx_unlock(&pr->pr_mtx);
1063				error = EEXIST;
1064				vfs_opterror(opts, "jail %d already exists",
1065				    jid);
1066				goto done_unlock_list;
1067			}
1068			if (!prison_ischild(mypr, pr)) {
1069				mtx_unlock(&pr->pr_mtx);
1070				pr = NULL;
1071			} else if (pr->pr_uref == 0) {
1072				if (!(flags & JAIL_DYING)) {
1073					mtx_unlock(&pr->pr_mtx);
1074					error = ENOENT;
1075					vfs_opterror(opts, "jail %d is dying",
1076					    jid);
1077					goto done_unlock_list;
1078				} else if ((flags & JAIL_ATTACH) ||
1079				    (pr_flags & PR_PERSIST)) {
1080					/*
1081					 * A dying jail might be resurrected
1082					 * (via attach or persist), but first
1083					 * it must determine if another jail
1084					 * has claimed its name.  Accomplish
1085					 * this by implicitly re-setting the
1086					 * name.
1087					 */
1088					if (name == NULL)
1089						name = prison_name(mypr, pr);
1090				}
1091			}
1092		}
1093		if (pr == NULL) {
1094			/* Update: jid must exist. */
1095			if (cuflags == JAIL_UPDATE) {
1096				error = ENOENT;
1097				vfs_opterror(opts, "jail %d not found", jid);
1098				goto done_unlock_list;
1099			}
1100		}
1101	}
1102	/*
1103	 * If the caller provided a name, look for a jail by that name.
1104	 * This has different semantics for creates and updates keyed by jid
1105	 * (where the name must not already exist in a different jail),
1106	 * and updates keyed by the name itself (where the name must exist
1107	 * because that is the jail being updated).
1108	 */
1109	namelc = NULL;
1110	if (name != NULL) {
1111		namelc = strrchr(name, '.');
1112		if (namelc == NULL)
1113			namelc = name;
1114		else {
1115			/*
1116			 * This is a hierarchical name.  Split it into the
1117			 * parent and child names, and make sure the parent
1118			 * exists or matches an already found jail.
1119			 */
1120			if (pr != NULL) {
1121				if (strncmp(name, ppr->pr_name, namelc - name)
1122				    || ppr->pr_name[namelc - name] != '\0') {
1123					mtx_unlock(&pr->pr_mtx);
1124					error = EINVAL;
1125					vfs_opterror(opts,
1126					    "cannot change jail's parent");
1127					goto done_unlock_list;
1128				}
1129			} else {
1130				*namelc = '\0';
1131				ppr = prison_find_name(mypr, name);
1132				if (ppr == NULL) {
1133					error = ENOENT;
1134					vfs_opterror(opts,
1135					    "jail \"%s\" not found", name);
1136					goto done_unlock_list;
1137				}
1138				mtx_unlock(&ppr->pr_mtx);
1139				*namelc = '.';
1140			}
1141			namelc++;
1142		}
1143		if (namelc[0] != '\0') {
1144			pnamelen =
1145			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1146 name_again:
1147			deadpr = NULL;
1148			FOREACH_PRISON_CHILD(ppr, tpr) {
1149				if (tpr != pr && tpr->pr_ref > 0 &&
1150				    !strcmp(tpr->pr_name + pnamelen, namelc)) {
1151					if (pr == NULL &&
1152					    cuflags != JAIL_CREATE) {
1153						mtx_lock(&tpr->pr_mtx);
1154						if (tpr->pr_ref > 0) {
1155							/*
1156							 * Use this jail
1157							 * for updates.
1158							 */
1159							if (tpr->pr_uref > 0) {
1160								pr = tpr;
1161								break;
1162							}
1163							deadpr = tpr;
1164						}
1165						mtx_unlock(&tpr->pr_mtx);
1166					} else if (tpr->pr_uref > 0) {
1167						/*
1168						 * Create, or update(jid):
1169						 * name must not exist in an
1170						 * active sibling jail.
1171						 */
1172						error = EEXIST;
1173						if (pr != NULL)
1174							mtx_unlock(&pr->pr_mtx);
1175						vfs_opterror(opts,
1176						   "jail \"%s\" already exists",
1177						   name);
1178						goto done_unlock_list;
1179					}
1180				}
1181			}
1182			/* If no active jail is found, use a dying one. */
1183			if (deadpr != NULL && pr == NULL) {
1184				if (flags & JAIL_DYING) {
1185					mtx_lock(&deadpr->pr_mtx);
1186					if (deadpr->pr_ref == 0) {
1187						mtx_unlock(&deadpr->pr_mtx);
1188						goto name_again;
1189					}
1190					pr = deadpr;
1191				} else if (cuflags == JAIL_UPDATE) {
1192					error = ENOENT;
1193					vfs_opterror(opts,
1194					    "jail \"%s\" is dying", name);
1195					goto done_unlock_list;
1196				}
1197			}
1198			/* Update: name must exist if no jid. */
1199			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1200				error = ENOENT;
1201				vfs_opterror(opts, "jail \"%s\" not found",
1202				    name);
1203				goto done_unlock_list;
1204			}
1205		}
1206	}
1207	/* Update: must provide a jid or name. */
1208	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1209		error = ENOENT;
1210		vfs_opterror(opts, "update specified no jail");
1211		goto done_unlock_list;
1212	}
1213
1214	/* If there's no prison to update, create a new one and link it in. */
1215	if (pr == NULL) {
1216		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1217			if (tpr->pr_childcount >= tpr->pr_childmax) {
1218				error = EPERM;
1219				vfs_opterror(opts, "prison limit exceeded");
1220				goto done_unlock_list;
1221			}
1222		created = 1;
1223		mtx_lock(&ppr->pr_mtx);
1224		if (ppr->pr_ref == 0) {
1225			mtx_unlock(&ppr->pr_mtx);
1226			error = ENOENT;
1227			vfs_opterror(opts, "jail \"%s\" not found",
1228			    prison_name(mypr, ppr));
1229			goto done_unlock_list;
1230		}
1231		ppr->pr_ref++;
1232		ppr->pr_uref++;
1233		mtx_unlock(&ppr->pr_mtx);
1234		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1235		if (jid == 0) {
1236			/* Find the next free jid. */
1237			jid = lastprid + 1;
1238 findnext:
1239			if (jid == JAIL_MAX)
1240				jid = 1;
1241			TAILQ_FOREACH(tpr, &allprison, pr_list) {
1242				if (tpr->pr_id < jid)
1243					continue;
1244				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1245					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1246					break;
1247				}
1248				if (jid == lastprid) {
1249					error = EAGAIN;
1250					vfs_opterror(opts,
1251					    "no available jail IDs");
1252					free(pr, M_PRISON);
1253					prison_deref(ppr, PD_DEREF |
1254					    PD_DEUREF | PD_LIST_XLOCKED);
1255					goto done_releroot;
1256				}
1257				jid++;
1258				goto findnext;
1259			}
1260			lastprid = jid;
1261		} else {
1262			/*
1263			 * The jail already has a jid (that did not yet exist),
1264			 * so just find where to insert it.
1265			 */
1266			TAILQ_FOREACH(tpr, &allprison, pr_list)
1267				if (tpr->pr_id >= jid) {
1268					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1269					break;
1270				}
1271		}
1272		if (tpr == NULL)
1273			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1274		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1275		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1276			tpr->pr_childcount++;
1277
1278		pr->pr_parent = ppr;
1279		pr->pr_id = jid;
1280
1281		/* Set some default values, and inherit some from the parent. */
1282		if (namelc == NULL)
1283			namelc = "";
1284		if (path == NULL) {
1285			path = "/";
1286			root = mypr->pr_root;
1287			vref(root);
1288		}
1289		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1290		pr->pr_flags |= PR_HOST;
1291#if defined(INET) || defined(INET6)
1292#ifdef VIMAGE
1293		if (!(pr_flags & PR_VNET))
1294#endif
1295		{
1296#ifdef INET
1297			if (!(ch_flags & PR_IP4_USER))
1298				pr->pr_flags |=
1299				    PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
1300			else if (!(pr_flags & PR_IP4_USER)) {
1301				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1302				if (ppr->pr_ip4 != NULL) {
1303					pr->pr_ip4s = ppr->pr_ip4s;
1304					pr->pr_ip4 = malloc(pr->pr_ip4s *
1305					    sizeof(struct in_addr), M_PRISON,
1306					    M_WAITOK);
1307					bcopy(ppr->pr_ip4, pr->pr_ip4,
1308					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1309				}
1310			}
1311#endif
1312#ifdef INET6
1313			if (!(ch_flags & PR_IP6_USER))
1314				pr->pr_flags |=
1315				    PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
1316			else if (!(pr_flags & PR_IP6_USER)) {
1317				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1318				if (ppr->pr_ip6 != NULL) {
1319					pr->pr_ip6s = ppr->pr_ip6s;
1320					pr->pr_ip6 = malloc(pr->pr_ip6s *
1321					    sizeof(struct in6_addr), M_PRISON,
1322					    M_WAITOK);
1323					bcopy(ppr->pr_ip6, pr->pr_ip6,
1324					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1325				}
1326			}
1327#endif
1328		}
1329#endif
1330		/* Source address selection is always on by default. */
1331		pr->pr_flags |= _PR_IP_SADDRSEL;
1332
1333		pr->pr_securelevel = ppr->pr_securelevel;
1334		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1335		pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
1336		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1337
1338		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1339		if (osrelstr == NULL)
1340		    strcpy(pr->pr_osrelease, ppr->pr_osrelease);
1341		else
1342		    strcpy(pr->pr_osrelease, osrelstr);
1343
1344		LIST_INIT(&pr->pr_children);
1345		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1346		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1347
1348#ifdef VIMAGE
1349		/* Allocate a new vnet if specified. */
1350		pr->pr_vnet = (pr_flags & PR_VNET)
1351		    ? vnet_alloc() : ppr->pr_vnet;
1352#endif
1353		/*
1354		 * Allocate a dedicated cpuset for each jail.
1355		 * Unlike other initial settings, this may return an error.
1356		 */
1357		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1358		if (error) {
1359			prison_deref(pr, PD_LIST_XLOCKED);
1360			goto done_releroot;
1361		}
1362
1363		mtx_lock(&pr->pr_mtx);
1364		/*
1365		 * New prisons do not yet have a reference, because we do not
1366		 * want others to see the incomplete prison once the
1367		 * allprison_lock is downgraded.
1368		 */
1369	} else {
1370		created = 0;
1371		/*
1372		 * Grab a reference for existing prisons, to ensure they
1373		 * continue to exist for the duration of the call.
1374		 */
1375		pr->pr_ref++;
1376#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1377		if ((pr->pr_flags & PR_VNET) &&
1378		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1379			error = EINVAL;
1380			vfs_opterror(opts,
1381			    "vnet jails cannot have IP address restrictions");
1382			goto done_deref_locked;
1383		}
1384#endif
1385#ifdef INET
1386		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1387			error = EINVAL;
1388			vfs_opterror(opts,
1389			    "ip4 cannot be changed after creation");
1390			goto done_deref_locked;
1391		}
1392#endif
1393#ifdef INET6
1394		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1395			error = EINVAL;
1396			vfs_opterror(opts,
1397			    "ip6 cannot be changed after creation");
1398			goto done_deref_locked;
1399		}
1400#endif
1401	}
1402
1403	/* Do final error checking before setting anything. */
1404	if (gotslevel) {
1405		if (slevel < ppr->pr_securelevel) {
1406			error = EPERM;
1407			goto done_deref_locked;
1408		}
1409	}
1410	if (gotchildmax) {
1411		if (childmax >= ppr->pr_childmax) {
1412			error = EPERM;
1413			goto done_deref_locked;
1414		}
1415	}
1416	if (gotenforce) {
1417		if (enforce < ppr->pr_enforce_statfs) {
1418			error = EPERM;
1419			goto done_deref_locked;
1420		}
1421	}
1422	if (gotrsnum) {
1423		/*
1424		 * devfs_rsnum is a uint16_t
1425		 */
1426		if (rsnum < 0 || rsnum > 65535) {
1427			error = EINVAL;
1428			goto done_deref_locked;
1429		}
1430		/*
1431		 * Nested jails always inherit parent's devfs ruleset
1432		 */
1433		if (jailed(td->td_ucred)) {
1434			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1435				error = EPERM;
1436				goto done_deref_locked;
1437			} else
1438				rsnum = ppr->pr_devfs_rsnum;
1439		}
1440	}
1441#ifdef INET
1442	if (ip4s > 0) {
1443		if (ppr->pr_flags & PR_IP4) {
1444			/*
1445			 * Make sure the new set of IP addresses is a
1446			 * subset of the parent's list.  Don't worry
1447			 * about the parent being unlocked, as any
1448			 * setting is done with allprison_lock held.
1449			 */
1450			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1451				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1452					break;
1453			if (ij == ppr->pr_ip4s) {
1454				error = EPERM;
1455				goto done_deref_locked;
1456			}
1457			if (ip4s > 1) {
1458				for (ii = ij = 1; ii < ip4s; ii++) {
1459					if (ip4[ii].s_addr ==
1460					    ppr->pr_ip4[0].s_addr)
1461						continue;
1462					for (; ij < ppr->pr_ip4s; ij++)
1463						if (ip4[ii].s_addr ==
1464						    ppr->pr_ip4[ij].s_addr)
1465							break;
1466					if (ij == ppr->pr_ip4s)
1467						break;
1468				}
1469				if (ij == ppr->pr_ip4s) {
1470					error = EPERM;
1471					goto done_deref_locked;
1472				}
1473			}
1474		}
1475		/*
1476		 * Check for conflicting IP addresses.  We permit them
1477		 * if there is no more than one IP on each jail.  If
1478		 * there is a duplicate on a jail with more than one
1479		 * IP stop checking and return error.
1480		 */
1481		tppr = ppr;
1482#ifdef VIMAGE
1483		for (; tppr != &prison0; tppr = tppr->pr_parent)
1484			if (tppr->pr_flags & PR_VNET)
1485				break;
1486#endif
1487		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1488			if (tpr == pr ||
1489#ifdef VIMAGE
1490			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1491#endif
1492			    tpr->pr_uref == 0) {
1493				descend = 0;
1494				continue;
1495			}
1496			if (!(tpr->pr_flags & PR_IP4_USER))
1497				continue;
1498			descend = 0;
1499			if (tpr->pr_ip4 == NULL ||
1500			    (ip4s == 1 && tpr->pr_ip4s == 1))
1501				continue;
1502			for (ii = 0; ii < ip4s; ii++) {
1503				if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
1504					error = EADDRINUSE;
1505					vfs_opterror(opts,
1506					    "IPv4 addresses clash");
1507					goto done_deref_locked;
1508				}
1509			}
1510		}
1511	}
1512#endif
1513#ifdef INET6
1514	if (ip6s > 0) {
1515		if (ppr->pr_flags & PR_IP6) {
1516			/*
1517			 * Make sure the new set of IP addresses is a
1518			 * subset of the parent's list.
1519			 */
1520			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1521				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1522				    &ppr->pr_ip6[ij]))
1523					break;
1524			if (ij == ppr->pr_ip6s) {
1525				error = EPERM;
1526				goto done_deref_locked;
1527			}
1528			if (ip6s > 1) {
1529				for (ii = ij = 1; ii < ip6s; ii++) {
1530					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1531					     &ppr->pr_ip6[0]))
1532						continue;
1533					for (; ij < ppr->pr_ip6s; ij++)
1534						if (IN6_ARE_ADDR_EQUAL(
1535						    &ip6[ii], &ppr->pr_ip6[ij]))
1536							break;
1537					if (ij == ppr->pr_ip6s)
1538						break;
1539				}
1540				if (ij == ppr->pr_ip6s) {
1541					error = EPERM;
1542					goto done_deref_locked;
1543				}
1544			}
1545		}
1546		/* Check for conflicting IP addresses. */
1547		tppr = ppr;
1548#ifdef VIMAGE
1549		for (; tppr != &prison0; tppr = tppr->pr_parent)
1550			if (tppr->pr_flags & PR_VNET)
1551				break;
1552#endif
1553		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1554			if (tpr == pr ||
1555#ifdef VIMAGE
1556			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1557#endif
1558			    tpr->pr_uref == 0) {
1559				descend = 0;
1560				continue;
1561			}
1562			if (!(tpr->pr_flags & PR_IP6_USER))
1563				continue;
1564			descend = 0;
1565			if (tpr->pr_ip6 == NULL ||
1566			    (ip6s == 1 && tpr->pr_ip6s == 1))
1567				continue;
1568			for (ii = 0; ii < ip6s; ii++) {
1569				if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
1570					error = EADDRINUSE;
1571					vfs_opterror(opts,
1572					    "IPv6 addresses clash");
1573					goto done_deref_locked;
1574				}
1575			}
1576		}
1577	}
1578#endif
1579	onamelen = namelen = 0;
1580	if (namelc != NULL) {
1581		/* Give a default name of the jid.  Also allow the name to be
1582		 * explicitly the jid - but not any other number, and only in
1583		 * normal form (no leading zero/etc).
1584		 */
1585		if (namelc[0] == '\0')
1586			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1587		else if ((strtoul(namelc, &p, 10) != jid ||
1588			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1589			error = EINVAL;
1590			vfs_opterror(opts,
1591			    "name cannot be numeric (unless it is the jid)");
1592			goto done_deref_locked;
1593		}
1594		/*
1595		 * Make sure the name isn't too long for the prison or its
1596		 * children.
1597		 */
1598		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1599		onamelen = strlen(pr->pr_name + pnamelen);
1600		namelen = strlen(namelc);
1601		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1602			error = ENAMETOOLONG;
1603			goto done_deref_locked;
1604		}
1605		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1606			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1607			    sizeof(pr->pr_name)) {
1608				error = ENAMETOOLONG;
1609				goto done_deref_locked;
1610			}
1611		}
1612	}
1613	if (pr_allow & ~ppr->pr_allow) {
1614		error = EPERM;
1615		goto done_deref_locked;
1616	}
1617
1618	/*
1619	 * Let modules check their parameters.  This requires unlocking and
1620	 * then re-locking the prison, but this is still a valid state as long
1621	 * as allprison_lock remains xlocked.
1622	 */
1623	mtx_unlock(&pr->pr_mtx);
1624	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1625	if (error != 0) {
1626		prison_deref(pr, created
1627		    ? PD_LIST_XLOCKED
1628		    : PD_DEREF | PD_LIST_XLOCKED);
1629		goto done_releroot;
1630	}
1631	mtx_lock(&pr->pr_mtx);
1632
1633	/* At this point, all valid parameters should have been noted. */
1634	TAILQ_FOREACH(opt, opts, link) {
1635		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1636			error = EINVAL;
1637			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1638			goto done_deref_locked;
1639		}
1640	}
1641
1642	/* Set the parameters of the prison. */
1643#ifdef INET
1644	redo_ip4 = 0;
1645	if (pr_flags & PR_IP4_USER) {
1646		pr->pr_flags |= PR_IP4;
1647		free(pr->pr_ip4, M_PRISON);
1648		pr->pr_ip4s = ip4s;
1649		pr->pr_ip4 = ip4;
1650		ip4 = NULL;
1651		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1652#ifdef VIMAGE
1653			if (tpr->pr_flags & PR_VNET) {
1654				descend = 0;
1655				continue;
1656			}
1657#endif
1658			if (prison_restrict_ip4(tpr, NULL)) {
1659				redo_ip4 = 1;
1660				descend = 0;
1661			}
1662		}
1663	}
1664#endif
1665#ifdef INET6
1666	redo_ip6 = 0;
1667	if (pr_flags & PR_IP6_USER) {
1668		pr->pr_flags |= PR_IP6;
1669		free(pr->pr_ip6, M_PRISON);
1670		pr->pr_ip6s = ip6s;
1671		pr->pr_ip6 = ip6;
1672		ip6 = NULL;
1673		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1674#ifdef VIMAGE
1675			if (tpr->pr_flags & PR_VNET) {
1676				descend = 0;
1677				continue;
1678			}
1679#endif
1680			if (prison_restrict_ip6(tpr, NULL)) {
1681				redo_ip6 = 1;
1682				descend = 0;
1683			}
1684		}
1685	}
1686#endif
1687	if (gotslevel) {
1688		pr->pr_securelevel = slevel;
1689		/* Set all child jails to be at least this level. */
1690		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1691			if (tpr->pr_securelevel < slevel)
1692				tpr->pr_securelevel = slevel;
1693	}
1694	if (gotchildmax) {
1695		pr->pr_childmax = childmax;
1696		/* Set all child jails to under this limit. */
1697		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1698			if (tpr->pr_childmax > childmax - level)
1699				tpr->pr_childmax = childmax > level
1700				    ? childmax - level : 0;
1701	}
1702	if (gotenforce) {
1703		pr->pr_enforce_statfs = enforce;
1704		/* Pass this restriction on to the children. */
1705		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1706			if (tpr->pr_enforce_statfs < enforce)
1707				tpr->pr_enforce_statfs = enforce;
1708	}
1709	if (gotrsnum) {
1710		pr->pr_devfs_rsnum = rsnum;
1711		/* Pass this restriction on to the children. */
1712		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1713			tpr->pr_devfs_rsnum = rsnum;
1714	}
1715	if (namelc != NULL) {
1716		if (ppr == &prison0)
1717			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
1718		else
1719			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1720			    ppr->pr_name, namelc);
1721		/* Change this component of child names. */
1722		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1723			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1724			    strlen(tpr->pr_name + onamelen) + 1);
1725			bcopy(pr->pr_name, tpr->pr_name, namelen);
1726		}
1727	}
1728	if (path != NULL) {
1729		/* Try to keep a real-rooted full pathname. */
1730		if (fullpath_disabled && path[0] == '/' &&
1731		    strcmp(mypr->pr_path, "/"))
1732			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1733			    mypr->pr_path, path);
1734		else
1735			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1736		pr->pr_root = root;
1737	}
1738	if (PR_HOST & ch_flags & ~pr_flags) {
1739		if (pr->pr_flags & PR_HOST) {
1740			/*
1741			 * Copy the parent's host info.  As with pr_ip4 above,
1742			 * the lack of a lock on the parent is not a problem;
1743			 * it is always set with allprison_lock at least
1744			 * shared, and is held exclusively here.
1745			 */
1746			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1747			    sizeof(pr->pr_hostname));
1748			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1749			    sizeof(pr->pr_domainname));
1750			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1751			    sizeof(pr->pr_hostuuid));
1752			pr->pr_hostid = pr->pr_parent->pr_hostid;
1753		}
1754	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1755		/* Set this prison, and any descendants without PR_HOST. */
1756		if (host != NULL)
1757			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1758		if (domain != NULL)
1759			strlcpy(pr->pr_domainname, domain,
1760			    sizeof(pr->pr_domainname));
1761		if (uuid != NULL)
1762			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1763		if (gothid)
1764			pr->pr_hostid = hid;
1765		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1766			if (tpr->pr_flags & PR_HOST)
1767				descend = 0;
1768			else {
1769				if (host != NULL)
1770					strlcpy(tpr->pr_hostname,
1771					    pr->pr_hostname,
1772					    sizeof(tpr->pr_hostname));
1773				if (domain != NULL)
1774					strlcpy(tpr->pr_domainname,
1775					    pr->pr_domainname,
1776					    sizeof(tpr->pr_domainname));
1777				if (uuid != NULL)
1778					strlcpy(tpr->pr_hostuuid,
1779					    pr->pr_hostuuid,
1780					    sizeof(tpr->pr_hostuuid));
1781				if (gothid)
1782					tpr->pr_hostid = hid;
1783			}
1784		}
1785	}
1786	if ((tallow = ch_allow & ~pr_allow)) {
1787		/* Clear allow bits in all children. */
1788		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1789			tpr->pr_allow &= ~tallow;
1790	}
1791	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1792	/*
1793	 * Persistent prisons get an extra reference, and prisons losing their
1794	 * persist flag lose that reference.  Only do this for existing prisons
1795	 * for now, so new ones will remain unseen until after the module
1796	 * handlers have completed.
1797	 */
1798	born = pr->pr_uref == 0;
1799	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1800		if (pr_flags & PR_PERSIST) {
1801			pr->pr_ref++;
1802			pr->pr_uref++;
1803		} else {
1804			pr->pr_ref--;
1805			pr->pr_uref--;
1806		}
1807	}
1808	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1809	mtx_unlock(&pr->pr_mtx);
1810
1811#ifdef RACCT
1812	if (racct_enable && created)
1813		prison_racct_attach(pr);
1814#endif
1815
1816	/* Locks may have prevented a complete restriction of child IP
1817	 * addresses.  If so, allocate some more memory and try again.
1818	 */
1819#ifdef INET
1820	while (redo_ip4) {
1821		ip4s = pr->pr_ip4s;
1822		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1823		mtx_lock(&pr->pr_mtx);
1824		redo_ip4 = 0;
1825		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1826#ifdef VIMAGE
1827			if (tpr->pr_flags & PR_VNET) {
1828				descend = 0;
1829				continue;
1830			}
1831#endif
1832			if (prison_restrict_ip4(tpr, ip4)) {
1833				if (ip4 != NULL)
1834					ip4 = NULL;
1835				else
1836					redo_ip4 = 1;
1837			}
1838		}
1839		mtx_unlock(&pr->pr_mtx);
1840	}
1841#endif
1842#ifdef INET6
1843	while (redo_ip6) {
1844		ip6s = pr->pr_ip6s;
1845		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1846		mtx_lock(&pr->pr_mtx);
1847		redo_ip6 = 0;
1848		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1849#ifdef VIMAGE
1850			if (tpr->pr_flags & PR_VNET) {
1851				descend = 0;
1852				continue;
1853			}
1854#endif
1855			if (prison_restrict_ip6(tpr, ip6)) {
1856				if (ip6 != NULL)
1857					ip6 = NULL;
1858				else
1859					redo_ip6 = 1;
1860			}
1861		}
1862		mtx_unlock(&pr->pr_mtx);
1863	}
1864#endif
1865
1866	/* Let the modules do their work. */
1867	sx_downgrade(&allprison_lock);
1868	if (born) {
1869		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1870		if (error) {
1871			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
1872			prison_deref(pr, created
1873			    ? PD_LIST_SLOCKED
1874			    : PD_DEREF | PD_LIST_SLOCKED);
1875			goto done_errmsg;
1876		}
1877	}
1878	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1879	if (error) {
1880		if (born)
1881			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
1882		prison_deref(pr, created
1883		    ? PD_LIST_SLOCKED
1884		    : PD_DEREF | PD_LIST_SLOCKED);
1885		goto done_errmsg;
1886	}
1887
1888	/* Attach this process to the prison if requested. */
1889	if (flags & JAIL_ATTACH) {
1890		mtx_lock(&pr->pr_mtx);
1891		error = do_jail_attach(td, pr);
1892		if (error) {
1893			vfs_opterror(opts, "attach failed");
1894			if (!created)
1895				prison_deref(pr, PD_DEREF);
1896			goto done_errmsg;
1897		}
1898	}
1899
1900#ifdef RACCT
1901	if (racct_enable && !created) {
1902		if (!(flags & JAIL_ATTACH))
1903			sx_sunlock(&allprison_lock);
1904		prison_racct_modify(pr);
1905		if (!(flags & JAIL_ATTACH))
1906			sx_slock(&allprison_lock);
1907	}
1908#endif
1909
1910	td->td_retval[0] = pr->pr_id;
1911
1912	/*
1913	 * Now that it is all there, drop the temporary reference from existing
1914	 * prisons.  Or add a reference to newly created persistent prisons
1915	 * (which was not done earlier so that the prison would not be publicly
1916	 * visible).
1917	 */
1918	if (!created) {
1919		prison_deref(pr, (flags & JAIL_ATTACH)
1920		    ? PD_DEREF
1921		    : PD_DEREF | PD_LIST_SLOCKED);
1922	} else {
1923		if (pr_flags & PR_PERSIST) {
1924			mtx_lock(&pr->pr_mtx);
1925			pr->pr_ref++;
1926			pr->pr_uref++;
1927			mtx_unlock(&pr->pr_mtx);
1928		}
1929		if (!(flags & JAIL_ATTACH))
1930			sx_sunlock(&allprison_lock);
1931	}
1932
1933	goto done_free;
1934
1935 done_deref_locked:
1936	prison_deref(pr, created
1937	    ? PD_LOCKED | PD_LIST_XLOCKED
1938	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1939	goto done_releroot;
1940 done_unlock_list:
1941	sx_xunlock(&allprison_lock);
1942 done_releroot:
1943	if (root != NULL)
1944		vrele(root);
1945 done_errmsg:
1946	if (error) {
1947		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
1948		    &errmsg_len) == 0 && errmsg_len > 0) {
1949			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1950			if (optuio->uio_segflg == UIO_SYSSPACE)
1951				bcopy(errmsg,
1952				    optuio->uio_iov[errmsg_pos].iov_base,
1953				    errmsg_len);
1954			else
1955				copyout(errmsg,
1956				    optuio->uio_iov[errmsg_pos].iov_base,
1957				    errmsg_len);
1958		}
1959	}
1960 done_free:
1961#ifdef INET
1962	free(ip4, M_PRISON);
1963#endif
1964#ifdef INET6
1965	free(ip6, M_PRISON);
1966#endif
1967	if (g_path != NULL)
1968		free(g_path, M_TEMP);
1969	vfs_freeopts(opts);
1970	return (error);
1971}
1972
1973
1974/*
1975 * struct jail_get_args {
1976 *	struct iovec *iovp;
1977 *	unsigned int iovcnt;
1978 *	int flags;
1979 * };
1980 */
1981int
1982sys_jail_get(struct thread *td, struct jail_get_args *uap)
1983{
1984	struct uio *auio;
1985	int error;
1986
1987	/* Check that we have an even number of iovecs. */
1988	if (uap->iovcnt & 1)
1989		return (EINVAL);
1990
1991	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1992	if (error)
1993		return (error);
1994	error = kern_jail_get(td, auio, uap->flags);
1995	if (error == 0)
1996		error = copyout(auio->uio_iov, uap->iovp,
1997		    uap->iovcnt * sizeof (struct iovec));
1998	free(auio, M_IOV);
1999	return (error);
2000}
2001
2002int
2003kern_jail_get(struct thread *td, struct uio *optuio, int flags)
2004{
2005	struct prison *pr, *mypr;
2006	struct vfsopt *opt;
2007	struct vfsoptlist *opts;
2008	char *errmsg, *name;
2009	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
2010
2011	if (flags & ~JAIL_GET_MASK)
2012		return (EINVAL);
2013
2014	/* Get the parameter list. */
2015	error = vfs_buildopts(optuio, &opts);
2016	if (error)
2017		return (error);
2018	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2019	mypr = td->td_ucred->cr_prison;
2020
2021	/*
2022	 * Find the prison specified by one of: lastjid, jid, name.
2023	 */
2024	sx_slock(&allprison_lock);
2025	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2026	if (error == 0) {
2027		TAILQ_FOREACH(pr, &allprison, pr_list) {
2028			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
2029				mtx_lock(&pr->pr_mtx);
2030				if (pr->pr_ref > 0 &&
2031				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
2032					break;
2033				mtx_unlock(&pr->pr_mtx);
2034			}
2035		}
2036		if (pr != NULL)
2037			goto found_prison;
2038		error = ENOENT;
2039		vfs_opterror(opts, "no jail after %d", jid);
2040		goto done_unlock_list;
2041	} else if (error != ENOENT)
2042		goto done_unlock_list;
2043
2044	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2045	if (error == 0) {
2046		if (jid != 0) {
2047			pr = prison_find_child(mypr, jid);
2048			if (pr != NULL) {
2049				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2050					mtx_unlock(&pr->pr_mtx);
2051					error = ENOENT;
2052					vfs_opterror(opts, "jail %d is dying",
2053					    jid);
2054					goto done_unlock_list;
2055				}
2056				goto found_prison;
2057			}
2058			error = ENOENT;
2059			vfs_opterror(opts, "jail %d not found", jid);
2060			goto done_unlock_list;
2061		}
2062	} else if (error != ENOENT)
2063		goto done_unlock_list;
2064
2065	error = vfs_getopt(opts, "name", (void **)&name, &len);
2066	if (error == 0) {
2067		if (len == 0 || name[len - 1] != '\0') {
2068			error = EINVAL;
2069			goto done_unlock_list;
2070		}
2071		pr = prison_find_name(mypr, name);
2072		if (pr != NULL) {
2073			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2074				mtx_unlock(&pr->pr_mtx);
2075				error = ENOENT;
2076				vfs_opterror(opts, "jail \"%s\" is dying",
2077				    name);
2078				goto done_unlock_list;
2079			}
2080			goto found_prison;
2081		}
2082		error = ENOENT;
2083		vfs_opterror(opts, "jail \"%s\" not found", name);
2084		goto done_unlock_list;
2085	} else if (error != ENOENT)
2086		goto done_unlock_list;
2087
2088	vfs_opterror(opts, "no jail specified");
2089	error = ENOENT;
2090	goto done_unlock_list;
2091
2092 found_prison:
2093	/* Get the parameters of the prison. */
2094	pr->pr_ref++;
2095	locked = PD_LOCKED;
2096	td->td_retval[0] = pr->pr_id;
2097	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2098	if (error != 0 && error != ENOENT)
2099		goto done_deref;
2100	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2101	error = vfs_setopt(opts, "parent", &i, sizeof(i));
2102	if (error != 0 && error != ENOENT)
2103		goto done_deref;
2104	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2105	if (error != 0 && error != ENOENT)
2106		goto done_deref;
2107	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2108	    sizeof(pr->pr_cpuset->cs_id));
2109	if (error != 0 && error != ENOENT)
2110		goto done_deref;
2111	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2112	if (error != 0 && error != ENOENT)
2113		goto done_deref;
2114#ifdef INET
2115	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
2116	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
2117	if (error != 0 && error != ENOENT)
2118		goto done_deref;
2119#endif
2120#ifdef INET6
2121	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
2122	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
2123	if (error != 0 && error != ENOENT)
2124		goto done_deref;
2125#endif
2126	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2127	    sizeof(pr->pr_securelevel));
2128	if (error != 0 && error != ENOENT)
2129		goto done_deref;
2130	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2131	    sizeof(pr->pr_childcount));
2132	if (error != 0 && error != ENOENT)
2133		goto done_deref;
2134	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2135	    sizeof(pr->pr_childmax));
2136	if (error != 0 && error != ENOENT)
2137		goto done_deref;
2138	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2139	if (error != 0 && error != ENOENT)
2140		goto done_deref;
2141	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2142	if (error != 0 && error != ENOENT)
2143		goto done_deref;
2144	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2145	if (error != 0 && error != ENOENT)
2146		goto done_deref;
2147#ifdef COMPAT_FREEBSD32
2148	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2149		uint32_t hid32 = pr->pr_hostid;
2150
2151		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2152	} else
2153#endif
2154	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2155	    sizeof(pr->pr_hostid));
2156	if (error != 0 && error != ENOENT)
2157		goto done_deref;
2158	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2159	    sizeof(pr->pr_enforce_statfs));
2160	if (error != 0 && error != ENOENT)
2161		goto done_deref;
2162	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2163	    sizeof(pr->pr_devfs_rsnum));
2164	if (error != 0 && error != ENOENT)
2165		goto done_deref;
2166	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
2167	    fi++) {
2168		if (pr_flag_names[fi] == NULL)
2169			continue;
2170		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
2171		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
2172		if (error != 0 && error != ENOENT)
2173			goto done_deref;
2174		i = !i;
2175		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
2176		if (error != 0 && error != ENOENT)
2177			goto done_deref;
2178	}
2179	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
2180	    fi++) {
2181		i = pr->pr_flags &
2182		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
2183		i = pr_flag_jailsys[fi].disable &&
2184		      (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
2185		    : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
2186		    : JAIL_SYS_INHERIT;
2187		error =
2188		    vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
2189		if (error != 0 && error != ENOENT)
2190			goto done_deref;
2191	}
2192	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
2193	    fi++) {
2194		if (pr_allow_names[fi] == NULL)
2195			continue;
2196		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
2197		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
2198		if (error != 0 && error != ENOENT)
2199			goto done_deref;
2200		i = !i;
2201		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
2202		if (error != 0 && error != ENOENT)
2203			goto done_deref;
2204	}
2205	i = (pr->pr_uref == 0);
2206	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2207	if (error != 0 && error != ENOENT)
2208		goto done_deref;
2209	i = !i;
2210	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2211	if (error != 0 && error != ENOENT)
2212		goto done_deref;
2213	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2214	    sizeof(pr->pr_osreldate));
2215	if (error != 0 && error != ENOENT)
2216		goto done_deref;
2217	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2218	if (error != 0 && error != ENOENT)
2219		goto done_deref;
2220
2221	/* Get the module parameters. */
2222	mtx_unlock(&pr->pr_mtx);
2223	locked = 0;
2224	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2225	if (error)
2226		goto done_deref;
2227	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
2228
2229	/* By now, all parameters should have been noted. */
2230	TAILQ_FOREACH(opt, opts, link) {
2231		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2232			error = EINVAL;
2233			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2234			goto done_errmsg;
2235		}
2236	}
2237
2238	/* Write the fetched parameters back to userspace. */
2239	error = 0;
2240	TAILQ_FOREACH(opt, opts, link) {
2241		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2242			pos = 2 * opt->pos + 1;
2243			optuio->uio_iov[pos].iov_len = opt->len;
2244			if (opt->value != NULL) {
2245				if (optuio->uio_segflg == UIO_SYSSPACE) {
2246					bcopy(opt->value,
2247					    optuio->uio_iov[pos].iov_base,
2248					    opt->len);
2249				} else {
2250					error = copyout(opt->value,
2251					    optuio->uio_iov[pos].iov_base,
2252					    opt->len);
2253					if (error)
2254						break;
2255				}
2256			}
2257		}
2258	}
2259	goto done_errmsg;
2260
2261 done_deref:
2262	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
2263	goto done_errmsg;
2264
2265 done_unlock_list:
2266	sx_sunlock(&allprison_lock);
2267 done_errmsg:
2268	if (error && errmsg_pos >= 0) {
2269		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2270		errmsg_pos = 2 * errmsg_pos + 1;
2271		if (errmsg_len > 0) {
2272			if (optuio->uio_segflg == UIO_SYSSPACE)
2273				bcopy(errmsg,
2274				    optuio->uio_iov[errmsg_pos].iov_base,
2275				    errmsg_len);
2276			else
2277				copyout(errmsg,
2278				    optuio->uio_iov[errmsg_pos].iov_base,
2279				    errmsg_len);
2280		}
2281	}
2282	vfs_freeopts(opts);
2283	return (error);
2284}
2285
2286
/*
 * struct jail_remove_args {
 *	int jid;
 * };
 *
 * System call: remove the jail identified by uap->jid along with all of
 * its descendants.  The jail is looked up with prison_find_child() starting
 * from the calling thread's own prison.
 *
 * Returns 0 on success, the error from priv_check(PRIV_JAIL_REMOVE) if the
 * caller lacks the privilege, or EINVAL if no matching jail is found.
 */
int
sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
{
	struct prison *pr, *cpr, *lpr, *tpr;
	int descend, error;

	error = priv_check(td, PRIV_JAIL_REMOVE);
	if (error)
		return (error);

	/* On success prison_find_child() returns pr with its mutex held. */
	sx_xlock(&allprison_lock);
	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
	if (pr == NULL) {
		sx_xunlock(&allprison_lock);
		return (EINVAL);
	}

	/* Remove all descendants of this prison, then remove this prison. */
	/* This reference is the one prison_remove_one(pr) drops at the end. */
	pr->pr_ref++;
	if (!LIST_EMPTY(&pr->pr_children)) {
		mtx_unlock(&pr->pr_mtx);
		lpr = NULL;
		/*
		 * Each descendant is removed one iteration late: the current
		 * one is remembered in lpr and only removed after the walk has
		 * stepped to the next prison (presumably so that the iterator
		 * never has to advance from a prison that is already gone).
		 */
		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
			mtx_lock(&cpr->pr_mtx);
			if (cpr->pr_ref > 0) {
				/* Give prison_remove_one() a reference to drop. */
				tpr = cpr;
				cpr->pr_ref++;
			} else {
				/* Already removed - do not do it again. */
				tpr = NULL;
			}
			mtx_unlock(&cpr->pr_mtx);
			if (lpr != NULL) {
				mtx_lock(&lpr->pr_mtx);
				prison_remove_one(lpr);
				/*
				 * prison_remove_one() returns with
				 * allprison_lock released; take it again.
				 */
				sx_xlock(&allprison_lock);
			}
			lpr = tpr;
		}
		/* Remove the final descendant left pending by the loop. */
		if (lpr != NULL) {
			mtx_lock(&lpr->pr_mtx);
			prison_remove_one(lpr);
			sx_xlock(&allprison_lock);
		}
		mtx_lock(&pr->pr_mtx);
	}
	/* Releases both pr->pr_mtx and allprison_lock. */
	prison_remove_one(pr);
	return (0);
}
2341
/*
 * Remove a single prison on behalf of jail_remove.
 *
 * Called with pr->pr_mtx held, allprison_lock held exclusively, and with
 * an extra pr_ref taken by the caller.  Both locks are released by the
 * time this returns, and the caller's reference has been dropped.
 */
static void
prison_remove_one(struct prison *pr)
{
	struct proc *p;
	int deuref;

	/* If the prison was persistent, it is not anymore. */
	deuref = 0;
	if (pr->pr_flags & PR_PERSIST) {
		/*
		 * Drop the pr_ref here and remember to drop the matching
		 * pr_uref in the prison_deref() call below.
		 */
		pr->pr_ref--;
		deuref = PD_DEUREF;
		pr->pr_flags &= ~PR_PERSIST;
	}

	/*
	 * jail_remove added a reference.  If that's the only one, remove
	 * the prison now.
	 */
	KASSERT(pr->pr_ref > 0,
	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
	if (pr->pr_ref == 1) {
		/* prison_deref() releases both locks passed in as held. */
		prison_deref(pr,
		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
		return;
	}

	/* Other references remain: drop our locks before scanning allproc. */
	mtx_unlock(&pr->pr_mtx);
	sx_xunlock(&allprison_lock);
	/*
	 * Kill all processes unfortunate enough to be attached to this prison.
	 */
	sx_slock(&allproc_lock);
	LIST_FOREACH(p, &allproc, p_list) {
		PROC_LOCK(p);
		/* Only processes whose credential points directly at pr. */
		if (p->p_state != PRS_NEW && p->p_ucred &&
		    p->p_ucred->cr_prison == pr)
			kern_psignal(p, SIGKILL);
		PROC_UNLOCK(p);
	}
	sx_sunlock(&allproc_lock);
	/* Remove the temporary reference added by jail_remove. */
	prison_deref(pr, deuref | PD_DEREF);
}
2385
2386
2387/*
2388 * struct jail_attach_args {
2389 *	int jid;
2390 * };
2391 */
2392int
2393sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2394{
2395	struct prison *pr;
2396	int error;
2397
2398	error = priv_check(td, PRIV_JAIL_ATTACH);
2399	if (error)
2400		return (error);
2401
2402	/*
2403	 * Start with exclusive hold on allprison_lock to ensure that a possible
2404	 * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove.
2405	 * But then immediately downgrade it since we don't need to stop
2406	 * readers.
2407	 */
2408	sx_xlock(&allprison_lock);
2409	sx_downgrade(&allprison_lock);
2410	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2411	if (pr == NULL) {
2412		sx_sunlock(&allprison_lock);
2413		return (EINVAL);
2414	}
2415
2416	/*
2417	 * Do not allow a process to attach to a prison that is not
2418	 * considered to be "alive".
2419	 */
2420	if (pr->pr_uref == 0) {
2421		mtx_unlock(&pr->pr_mtx);
2422		sx_sunlock(&allprison_lock);
2423		return (EINVAL);
2424	}
2425
2426	return (do_jail_attach(td, pr));
2427}
2428
/*
 * Move the calling process into prison pr.
 *
 * Entered with pr->pr_mtx held and allprison_lock held shared (this is the
 * state sys_jail_attach() calls it in); both are released before returning.
 * Returns 0 on success.  On failure the references taken here are dropped
 * again and the error from the failing step is returned.
 */
static int
do_jail_attach(struct thread *td, struct prison *pr)
{
	struct proc *p;
	struct ucred *newcred, *oldcred;
	int error;

	/*
	 * XXX: Note that there is a slight race here if two threads
	 * in the same privileged process attempt to attach to two
	 * different jails at the same time.  It is important for
	 * user processes not to do this, or they might end up with
	 * a process root from one prison, but attached to the jail
	 * of another.
	 */
	/* Take both a reference and a user reference for the new process. */
	pr->pr_ref++;
	pr->pr_uref++;
	mtx_unlock(&pr->pr_mtx);

	/* Let modules do whatever they need to prepare for attaching. */
	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
	if (error) {
		/* Drops both references and the shared allprison_lock. */
		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
		return (error);
	}
	sx_sunlock(&allprison_lock);

	/*
	 * Reparent the newly attached process to this jail.
	 */
	p = td->td_proc;
	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
	if (error)
		goto e_revert_osd;

	/* Switch the working directory, then the root, to the jail's root. */
	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
	if ((error = change_dir(pr->pr_root, td)) != 0)
		goto e_unlock;
#ifdef MAC
	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
		goto e_unlock;
#endif
	VOP_UNLOCK(pr->pr_root, 0);
	if ((error = change_root(pr->pr_root, td)))
		goto e_revert_osd;

	/* Install a copy of the credential that points at the new prison. */
	newcred = crget();
	PROC_LOCK(p);
	oldcred = crcopysafe(p, newcred);
	newcred->cr_prison = pr;
	proc_set_cred(p, newcred);
	setsugid(p);
	PROC_UNLOCK(p);
#ifdef RACCT
	racct_proc_ucred_changed(p, oldcred, newcred);
#endif
	/* The process no longer counts against its previous prison. */
	prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF);
	crfree(oldcred);
	return (0);

 e_unlock:
	VOP_UNLOCK(pr->pr_root, 0);
 e_revert_osd:
	/* Tell modules this thread is still in its old jail after all. */
	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
	/* Give back the references taken at the top of the function. */
	prison_deref(pr, PD_DEREF | PD_DEUREF);
	return (error);
}
2497
2498
2499/*
2500 * Returns a locked prison instance, or NULL on failure.
2501 */
2502struct prison *
2503prison_find(int prid)
2504{
2505	struct prison *pr;
2506
2507	sx_assert(&allprison_lock, SX_LOCKED);
2508	TAILQ_FOREACH(pr, &allprison, pr_list) {
2509		if (pr->pr_id == prid) {
2510			mtx_lock(&pr->pr_mtx);
2511			if (pr->pr_ref > 0)
2512				return (pr);
2513			mtx_unlock(&pr->pr_mtx);
2514		}
2515	}
2516	return (NULL);
2517}
2518
2519/*
2520 * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2521 */
2522struct prison *
2523prison_find_child(struct prison *mypr, int prid)
2524{
2525	struct prison *pr;
2526	int descend;
2527
2528	sx_assert(&allprison_lock, SX_LOCKED);
2529	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2530		if (pr->pr_id == prid) {
2531			mtx_lock(&pr->pr_mtx);
2532			if (pr->pr_ref > 0)
2533				return (pr);
2534			mtx_unlock(&pr->pr_mtx);
2535		}
2536	}
2537	return (NULL);
2538}
2539
2540/*
2541 * Look for the name relative to mypr.  Returns a locked prison or NULL.
2542 */
2543struct prison *
2544prison_find_name(struct prison *mypr, const char *name)
2545{
2546	struct prison *pr, *deadpr;
2547	size_t mylen;
2548	int descend;
2549
2550	sx_assert(&allprison_lock, SX_LOCKED);
2551	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2552 again:
2553	deadpr = NULL;
2554	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2555		if (!strcmp(pr->pr_name + mylen, name)) {
2556			mtx_lock(&pr->pr_mtx);
2557			if (pr->pr_ref > 0) {
2558				if (pr->pr_uref > 0)
2559					return (pr);
2560				deadpr = pr;
2561			}
2562			mtx_unlock(&pr->pr_mtx);
2563		}
2564	}
2565	/* There was no valid prison - perhaps there was a dying one. */
2566	if (deadpr != NULL) {
2567		mtx_lock(&deadpr->pr_mtx);
2568		if (deadpr->pr_ref == 0) {
2569			mtx_unlock(&deadpr->pr_mtx);
2570			goto again;
2571		}
2572	}
2573	return (deadpr);
2574}
2575
2576/*
2577 * See if a prison has the specific flag set.
2578 */
2579int
2580prison_flag(struct ucred *cred, unsigned flag)
2581{
2582
2583	/* This is an atomic read, so no locking is necessary. */
2584	return (cred->cr_prison->pr_flags & flag);
2585}
2586
2587int
2588prison_allow(struct ucred *cred, unsigned flag)
2589{
2590
2591	/* This is an atomic read, so no locking is necessary. */
2592	return (cred->cr_prison->pr_allow & flag);
2593}
2594
2595/*
2596 * Remove a prison reference.  If that was the last reference, remove the
2597 * prison itself - but not in this context in case there are locks held.
2598 */
2599void
2600prison_free_locked(struct prison *pr)
2601{
2602	int ref;
2603
2604	mtx_assert(&pr->pr_mtx, MA_OWNED);
2605	ref = --pr->pr_ref;
2606	mtx_unlock(&pr->pr_mtx);
2607	if (ref == 0)
2608		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2609}
2610
2611void
2612prison_free(struct prison *pr)
2613{
2614
2615	mtx_lock(&pr->pr_mtx);
2616	prison_free_locked(pr);
2617}
2618
2619/*
2620 * Complete a call to either prison_free or prison_proc_free.
2621 */
2622static void
2623prison_complete(void *context, int pending)
2624{
2625	struct prison *pr = context;
2626
2627	sx_xlock(&allprison_lock);
2628	mtx_lock(&pr->pr_mtx);
2629	prison_deref(pr, pr->pr_uref
2630	    ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED
2631	    : PD_LOCKED | PD_LIST_XLOCKED);
2632}
2633
/*
 * Remove a prison reference (usually).  This internal version assumes no
 * mutexes are held, except perhaps the prison itself.  If there are no more
 * references, release and delist the prison.  On completion, the prison lock
 * and the allprison lock are both unlocked.
 *
 * The flags argument, as interpreted below:
 *	PD_DEREF	drop one pr_ref.
 *	PD_DEUREF	drop one pr_uref.
 *	PD_LOCKED	the caller already holds pr->pr_mtx.
 *	PD_LIST_SLOCKED	the caller holds allprison_lock shared.
 *	PD_LIST_XLOCKED	the caller holds allprison_lock exclusively.
 *
 * When a prison is freed, the loop continues with its parent, dropping one
 * pr_ref and one pr_uref from it, so a removal can cascade upward.
 */
static void
prison_deref(struct prison *pr, int flags)
{
	struct prison *ppr, *tpr;
	int ref, lasturef;

	if (!(flags & PD_LOCKED))
		mtx_lock(&pr->pr_mtx);
	for (;;) {
		if (flags & PD_DEUREF) {
			KASSERT(pr->pr_uref > 0,
			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
			     pr->pr_id));
			pr->pr_uref--;
			lasturef = pr->pr_uref == 0;
			/*
			 * Take a temporary pr_ref; it is dropped again after
			 * the PR_METHOD_REMOVE call below, which runs with the
			 * prison mutex released.
			 */
			if (lasturef)
				pr->pr_ref++;
			KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
		} else
			lasturef = 0;
		if (flags & PD_DEREF) {
			KASSERT(pr->pr_ref > 0,
			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
			     pr->pr_id));
			pr->pr_ref--;
		}
		ref = pr->pr_ref;
		mtx_unlock(&pr->pr_mtx);

		/*
		 * Tell the modules if the last user reference was removed
		 * (even if it sticks around in a dying state).
		 */
		if (lasturef) {
			/* The call is made with allprison_lock held. */
			if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) {
				sx_xlock(&allprison_lock);
				flags |= PD_LIST_XLOCKED;
			}
			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
			/* Drop the temporary reference taken above. */
			mtx_lock(&pr->pr_mtx);
			ref = --pr->pr_ref;
			mtx_unlock(&pr->pr_mtx);
		}

		/* If the prison still has references, nothing else to do. */
		if (ref > 0) {
			if (flags & PD_LIST_SLOCKED)
				sx_sunlock(&allprison_lock);
			else if (flags & PD_LIST_XLOCKED)
				sx_xunlock(&allprison_lock);
			return;
		}

		/*
		 * No references remain.  Get allprison_lock exclusively; if a
		 * shared hold cannot be upgraded, it is dropped and the lock
		 * is retaken exclusively.
		 */
		if (flags & PD_LIST_SLOCKED) {
			if (!sx_try_upgrade(&allprison_lock)) {
				sx_sunlock(&allprison_lock);
				sx_xlock(&allprison_lock);
			}
		} else if (!(flags & PD_LIST_XLOCKED))
			sx_xlock(&allprison_lock);

		/* Unlink from both lists and fix every ancestor's child count. */
		TAILQ_REMOVE(&allprison, pr, pr_list);
		LIST_REMOVE(pr, pr_sibling);
		ppr = pr->pr_parent;
		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
			tpr->pr_childcount--;
		sx_xunlock(&allprison_lock);

		/* Release everything the prison owns, then the prison itself. */
#ifdef VIMAGE
		/* Only destroy a vnet that is not shared with the parent. */
		if (pr->pr_vnet != ppr->pr_vnet)
			vnet_destroy(pr->pr_vnet);
#endif
		if (pr->pr_root != NULL)
			vrele(pr->pr_root);
		mtx_destroy(&pr->pr_mtx);
#ifdef INET
		free(pr->pr_ip4, M_PRISON);
#endif
#ifdef INET6
		free(pr->pr_ip6, M_PRISON);
#endif
		if (pr->pr_cpuset != NULL)
			cpuset_rel(pr->pr_cpuset);
		osd_jail_exit(pr);
#ifdef RACCT
		if (racct_enable)
			prison_racct_detach(pr);
#endif
		free(pr, M_PRISON);

		/* Removing a prison frees a reference on its parent. */
		pr = ppr;
		mtx_lock(&pr->pr_mtx);
		/* allprison_lock is no longer held, so no PD_LIST_* flag. */
		flags = PD_DEREF | PD_DEUREF;
	}
}
2736
2737void
2738prison_hold_locked(struct prison *pr)
2739{
2740
2741	mtx_assert(&pr->pr_mtx, MA_OWNED);
2742	KASSERT(pr->pr_ref > 0,
2743	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
2744	pr->pr_ref++;
2745}
2746
2747void
2748prison_hold(struct prison *pr)
2749{
2750
2751	mtx_lock(&pr->pr_mtx);
2752	prison_hold_locked(pr);
2753	mtx_unlock(&pr->pr_mtx);
2754}
2755
2756void
2757prison_proc_hold(struct prison *pr)
2758{
2759
2760	mtx_lock(&pr->pr_mtx);
2761	KASSERT(pr->pr_uref > 0,
2762	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2763	pr->pr_uref++;
2764	mtx_unlock(&pr->pr_mtx);
2765}
2766
2767void
2768prison_proc_free(struct prison *pr)
2769{
2770
2771	mtx_lock(&pr->pr_mtx);
2772	KASSERT(pr->pr_uref > 0,
2773	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2774	if (pr->pr_uref > 1)
2775		pr->pr_uref--;
2776	else {
2777		/*
2778		 * Don't remove the last user reference in this context, which
2779		 * is expected to be a process that is not only locked, but
2780		 * also half dead.
2781		 */
2782		pr->pr_ref++;
2783		mtx_unlock(&pr->pr_mtx);
2784		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2785		return;
2786	}
2787	mtx_unlock(&pr->pr_mtx);
2788}
2789
2790
2791#ifdef INET
/*
 * Restrict a prison's IP address list with its parent's, possibly replacing
 * it.  Return true if the replacement buffer was used (or would have been).
 *
 * newip4 is an optional caller-supplied buffer for the case where the
 * prison's own array is too small for the parent's list; it may be NULL.
 * No locking is done here.
 */
static int
prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
{
	int ii, ij, used;
	struct prison *ppr;

	ppr = pr->pr_parent;
	if (!(pr->pr_flags & PR_IP4_USER)) {
		/* This has no user settings, so just copy the parent's list. */
		if (pr->pr_ip4s < ppr->pr_ip4s) {
			/*
			 * There's no room for the parent's list.  Use the
			 * new list buffer, which is assumed to be big enough
			 * (if it was passed).  If there's no buffer, try to
			 * allocate one.
			 */
			used = 1;
			if (newip4 == NULL) {
				/*
				 * A buffer allocated here does not count as
				 * "used"; if the allocation fails the list is
				 * left alone and 1 is still returned.
				 */
				newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
				    M_PRISON, M_NOWAIT);
				if (newip4 != NULL)
					used = 0;
			}
			if (newip4 != NULL) {
				bcopy(ppr->pr_ip4, newip4,
				    ppr->pr_ip4s * sizeof(*newip4));
				free(pr->pr_ip4, M_PRISON);
				pr->pr_ip4 = newip4;
				pr->pr_ip4s = ppr->pr_ip4s;
			}
			return (used);
		}
		/* The existing array is big enough: copy in place. */
		pr->pr_ip4s = ppr->pr_ip4s;
		if (pr->pr_ip4s > 0)
			bcopy(ppr->pr_ip4, pr->pr_ip4,
			    pr->pr_ip4s * sizeof(*newip4));
		else if (pr->pr_ip4 != NULL) {
			free(pr->pr_ip4, M_PRISON);
			pr->pr_ip4 = NULL;
		}
	} else if (pr->pr_ip4s > 0) {
		/* Remove addresses that aren't in the parent. */
		/*
		 * The first address is handled on its own: it is looked for
		 * anywhere in the parent's list, and dropped (shifting the
		 * rest down) if it is not there.
		 */
		for (ij = 0; ij < ppr->pr_ip4s; ij++)
			if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
				break;
		if (ij < ppr->pr_ip4s)
			ii = 1;
		else {
			bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
			    --pr->pr_ip4s * sizeof(*pr->pr_ip4));
			ii = 0;
		}
		/*
		 * Walk the remaining addresses (index ii) against the parent's
		 * entries from index 1 on (index ij), stepping by qcmp_v4()
		 * order.  NOTE(review): this relies on both tails being sorted
		 * in that order - confirm where the lists are built.
		 */
		for (ij = 1; ii < pr->pr_ip4s; ) {
			/* An address equal to the parent's first is kept. */
			if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
				ii++;
				continue;
			}
			switch (ij >= ppr->pr_ip4s ? -1 :
				qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
			case -1:
				/*
				 * Not in the parent (or the parent's list is
				 * exhausted): delete by shifting the tail down.
				 */
				bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
				    (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
				break;
			case 0:
				/* Present in both lists: keep it. */
				ii++;
				ij++;
				break;
			case 1:
				/* Try the parent's next entry. */
				ij++;
				break;
			}
		}
		/* Nothing survived: IPv4 is disabled for this prison. */
		if (pr->pr_ip4s == 0) {
			pr->pr_flags |= PR_IP4_DISABLE;
			free(pr->pr_ip4, M_PRISON);
			pr->pr_ip4 = NULL;
		}
	}
	return (0);
}
2876
2877/*
2878 * Pass back primary IPv4 address of this jail.
2879 *
2880 * If not restricted return success but do not alter the address.  Caller has
2881 * to make sure to initialize it correctly (e.g. INADDR_ANY).
2882 *
2883 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2884 * Address returned in NBO.
2885 */
2886int
2887prison_get_ip4(struct ucred *cred, struct in_addr *ia)
2888{
2889	struct prison *pr;
2890
2891	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2892	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2893
2894	pr = cred->cr_prison;
2895	if (!(pr->pr_flags & PR_IP4))
2896		return (0);
2897	mtx_lock(&pr->pr_mtx);
2898	if (!(pr->pr_flags & PR_IP4)) {
2899		mtx_unlock(&pr->pr_mtx);
2900		return (0);
2901	}
2902	if (pr->pr_ip4 == NULL) {
2903		mtx_unlock(&pr->pr_mtx);
2904		return (EAFNOSUPPORT);
2905	}
2906
2907	ia->s_addr = pr->pr_ip4[0].s_addr;
2908	mtx_unlock(&pr->pr_mtx);
2909	return (0);
2910}
2911
2912/*
2913 * Return 1 if we should do proper source address selection or are not jailed.
2914 * We will return 0 if we should bypass source address selection in favour
2915 * of the primary jail IPv4 address. Only in this case *ia will be updated and
2916 * returned in NBO.
2917 * Return EAFNOSUPPORT, in case this jail does not allow IPv4.
2918 */
2919int
2920prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia)
2921{
2922	struct prison *pr;
2923	struct in_addr lia;
2924	int error;
2925
2926	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2927	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2928
2929	if (!jailed(cred))
2930		return (1);
2931
2932	pr = cred->cr_prison;
2933	if (pr->pr_flags & PR_IP4_SADDRSEL)
2934		return (1);
2935
2936	lia.s_addr = INADDR_ANY;
2937	error = prison_get_ip4(cred, &lia);
2938	if (error)
2939		return (error);
2940	if (lia.s_addr == INADDR_ANY)
2941		return (1);
2942
2943	ia->s_addr = lia.s_addr;
2944	return (0);
2945}
2946
2947/*
2948 * Return true if pr1 and pr2 have the same IPv4 address restrictions.
2949 */
2950int
2951prison_equal_ip4(struct prison *pr1, struct prison *pr2)
2952{
2953
2954	if (pr1 == pr2)
2955		return (1);
2956
2957	/*
2958	 * No need to lock since the PR_IP4_USER flag can't be altered for
2959	 * existing prisons.
2960	 */
2961	while (pr1 != &prison0 &&
2962#ifdef VIMAGE
2963	       !(pr1->pr_flags & PR_VNET) &&
2964#endif
2965	       !(pr1->pr_flags & PR_IP4_USER))
2966		pr1 = pr1->pr_parent;
2967	while (pr2 != &prison0 &&
2968#ifdef VIMAGE
2969	       !(pr2->pr_flags & PR_VNET) &&
2970#endif
2971	       !(pr2->pr_flags & PR_IP4_USER))
2972		pr2 = pr2->pr_parent;
2973	return (pr1 == pr2);
2974}
2975
2976/*
2977 * Make sure our (source) address is set to something meaningful to this
2978 * jail.
2979 *
2980 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2981 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2982 * doesn't allow IPv4.  Address passed in in NBO and returned in NBO.
2983 */
2984int
2985prison_local_ip4(struct ucred *cred, struct in_addr *ia)
2986{
2987	struct prison *pr;
2988	struct in_addr ia0;
2989	int error;
2990
2991	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2992	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2993
2994	pr = cred->cr_prison;
2995	if (!(pr->pr_flags & PR_IP4))
2996		return (0);
2997	mtx_lock(&pr->pr_mtx);
2998	if (!(pr->pr_flags & PR_IP4)) {
2999		mtx_unlock(&pr->pr_mtx);
3000		return (0);
3001	}
3002	if (pr->pr_ip4 == NULL) {
3003		mtx_unlock(&pr->pr_mtx);
3004		return (EAFNOSUPPORT);
3005	}
3006
3007	ia0.s_addr = ntohl(ia->s_addr);
3008	if (ia0.s_addr == INADDR_LOOPBACK) {
3009		ia->s_addr = pr->pr_ip4[0].s_addr;
3010		mtx_unlock(&pr->pr_mtx);
3011		return (0);
3012	}
3013
3014	if (ia0.s_addr == INADDR_ANY) {
3015		/*
3016		 * In case there is only 1 IPv4 address, bind directly.
3017		 */
3018		if (pr->pr_ip4s == 1)
3019			ia->s_addr = pr->pr_ip4[0].s_addr;
3020		mtx_unlock(&pr->pr_mtx);
3021		return (0);
3022	}
3023
3024	error = _prison_check_ip4(pr, ia);
3025	mtx_unlock(&pr->pr_mtx);
3026	return (error);
3027}
3028
3029/*
3030 * Rewrite destination address in case we will connect to loopback address.
3031 *
3032 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
3033 * Address passed in in NBO and returned in NBO.
3034 */
3035int
3036prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
3037{
3038	struct prison *pr;
3039
3040	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3041	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
3042
3043	pr = cred->cr_prison;
3044	if (!(pr->pr_flags & PR_IP4))
3045		return (0);
3046	mtx_lock(&pr->pr_mtx);
3047	if (!(pr->pr_flags & PR_IP4)) {
3048		mtx_unlock(&pr->pr_mtx);
3049		return (0);
3050	}
3051	if (pr->pr_ip4 == NULL) {
3052		mtx_unlock(&pr->pr_mtx);
3053		return (EAFNOSUPPORT);
3054	}
3055
3056	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
3057		ia->s_addr = pr->pr_ip4[0].s_addr;
3058		mtx_unlock(&pr->pr_mtx);
3059		return (0);
3060	}
3061
3062	/*
3063	 * Return success because nothing had to be changed.
3064	 */
3065	mtx_unlock(&pr->pr_mtx);
3066	return (0);
3067}
3068
3069/*
3070 * Check if given address belongs to the jail referenced by cred/prison.
3071 *
3072 * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
3073 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3074 * doesn't allow IPv4.  Address passed in in NBO.
3075 */
3076static int
3077_prison_check_ip4(struct prison *pr, struct in_addr *ia)
3078{
3079	int i, a, z, d;
3080
3081	/*
3082	 * Check the primary IP.
3083	 */
3084	if (pr->pr_ip4[0].s_addr == ia->s_addr)
3085		return (0);
3086
3087	/*
3088	 * All the other IPs are sorted so we can do a binary search.
3089	 */
3090	a = 0;
3091	z = pr->pr_ip4s - 2;
3092	while (a <= z) {
3093		i = (a + z) / 2;
3094		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
3095		if (d > 0)
3096			z = i - 1;
3097		else if (d < 0)
3098			a = i + 1;
3099		else
3100			return (0);
3101	}
3102
3103	return (EADDRNOTAVAIL);
3104}
3105
3106int
3107prison_check_ip4(struct ucred *cred, struct in_addr *ia)
3108{
3109	struct prison *pr;
3110	int error;
3111
3112	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3113	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
3114
3115	pr = cred->cr_prison;
3116	if (!(pr->pr_flags & PR_IP4))
3117		return (0);
3118	mtx_lock(&pr->pr_mtx);
3119	if (!(pr->pr_flags & PR_IP4)) {
3120		mtx_unlock(&pr->pr_mtx);
3121		return (0);
3122	}
3123	if (pr->pr_ip4 == NULL) {
3124		mtx_unlock(&pr->pr_mtx);
3125		return (EAFNOSUPPORT);
3126	}
3127
3128	error = _prison_check_ip4(pr, ia);
3129	mtx_unlock(&pr->pr_mtx);
3130	return (error);
3131}
3132#endif
3133
3134#ifdef INET6
/*
 * Restrict a prison's IPv6 address list with its parent's, possibly
 * replacing it.  Return true if the replacement buffer was used (or would
 * have been).
 *
 * newip6 is an optional caller-supplied buffer for the case where the
 * prison's own array is too small for the parent's list; it may be NULL.
 * No locking is done here.
 */
static int
prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
{
	int ii, ij, used;
	struct prison *ppr;

	ppr = pr->pr_parent;
	if (!(pr->pr_flags & PR_IP6_USER)) {
		/* This has no user settings, so just copy the parent's list. */
		if (pr->pr_ip6s < ppr->pr_ip6s) {
			/*
			 * There's no room for the parent's list.  Use the
			 * new list buffer, which is assumed to be big enough
			 * (if it was passed).  If there's no buffer, try to
			 * allocate one.
			 */
			used = 1;
			if (newip6 == NULL) {
				/*
				 * A buffer allocated here does not count as
				 * "used"; if the allocation fails the list is
				 * left alone and 1 is still returned.
				 */
				newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
				    M_PRISON, M_NOWAIT);
				if (newip6 != NULL)
					used = 0;
			}
			if (newip6 != NULL) {
				bcopy(ppr->pr_ip6, newip6,
				    ppr->pr_ip6s * sizeof(*newip6));
				free(pr->pr_ip6, M_PRISON);
				pr->pr_ip6 = newip6;
				pr->pr_ip6s = ppr->pr_ip6s;
			}
			return (used);
		}
		/* The existing array is big enough: copy in place. */
		pr->pr_ip6s = ppr->pr_ip6s;
		if (pr->pr_ip6s > 0)
			bcopy(ppr->pr_ip6, pr->pr_ip6,
			    pr->pr_ip6s * sizeof(*newip6));
		else if (pr->pr_ip6 != NULL) {
			free(pr->pr_ip6, M_PRISON);
			pr->pr_ip6 = NULL;
		}
	} else if (pr->pr_ip6s > 0) {
		/* Remove addresses that aren't in the parent. */
		/*
		 * The first address is handled on its own: it is looked for
		 * anywhere in the parent's list, and dropped (shifting the
		 * rest down) if it is not there.
		 */
		for (ij = 0; ij < ppr->pr_ip6s; ij++)
			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
			    &ppr->pr_ip6[ij]))
				break;
		if (ij < ppr->pr_ip6s)
			ii = 1;
		else {
			bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
			    --pr->pr_ip6s * sizeof(*pr->pr_ip6));
			ii = 0;
		}
		/*
		 * Walk the remaining addresses (index ii) against the parent's
		 * entries from index 1 on (index ij), stepping by qcmp_v6()
		 * order.  NOTE(review): this relies on both tails being sorted
		 * in that order - confirm where the lists are built.
		 */
		for (ij = 1; ii < pr->pr_ip6s; ) {
			/* An address equal to the parent's first is kept. */
			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
			    &ppr->pr_ip6[0])) {
				ii++;
				continue;
			}
			switch (ij >= ppr->pr_ip6s ? -1 :
				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
			case -1:
				/*
				 * Not in the parent (or the parent's list is
				 * exhausted): delete by shifting the tail down.
				 */
				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
				    (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
				break;
			case 0:
				/* Present in both lists: keep it. */
				ii++;
				ij++;
				break;
			case 1:
				/* Try the parent's next entry. */
				ij++;
				break;
			}
		}
		/* Nothing survived: IPv6 is disabled for this prison. */
		if (pr->pr_ip6s == 0) {
			pr->pr_flags |= PR_IP6_DISABLE;
			free(pr->pr_ip6, M_PRISON);
			pr->pr_ip6 = NULL;
		}
	}
	return 0;
}
3217
3218/*
3219 * Pass back primary IPv6 address for this jail.
3220 *
3221 * If not restricted return success but do not alter the address.  Caller has
3222 * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
3223 *
3224 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3225 */
3226int
3227prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
3228{
3229	struct prison *pr;
3230
3231	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3232	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3233
3234	pr = cred->cr_prison;
3235	if (!(pr->pr_flags & PR_IP6))
3236		return (0);
3237	mtx_lock(&pr->pr_mtx);
3238	if (!(pr->pr_flags & PR_IP6)) {
3239		mtx_unlock(&pr->pr_mtx);
3240		return (0);
3241	}
3242	if (pr->pr_ip6 == NULL) {
3243		mtx_unlock(&pr->pr_mtx);
3244		return (EAFNOSUPPORT);
3245	}
3246
3247	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3248	mtx_unlock(&pr->pr_mtx);
3249	return (0);
3250}
3251
3252/*
3253 * Return 1 if we should do proper source address selection or are not jailed.
3254 * We will return 0 if we should bypass source address selection in favour
3255 * of the primary jail IPv6 address. Only in this case *ia will be updated and
3256 * returned in NBO.
3257 * Return EAFNOSUPPORT, in case this jail does not allow IPv6.
3258 */
3259int
3260prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6)
3261{
3262	struct prison *pr;
3263	struct in6_addr lia6;
3264	int error;
3265
3266	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3267	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3268
3269	if (!jailed(cred))
3270		return (1);
3271
3272	pr = cred->cr_prison;
3273	if (pr->pr_flags & PR_IP6_SADDRSEL)
3274		return (1);
3275
3276	lia6 = in6addr_any;
3277	error = prison_get_ip6(cred, &lia6);
3278	if (error)
3279		return (error);
3280	if (IN6_IS_ADDR_UNSPECIFIED(&lia6))
3281		return (1);
3282
3283	bcopy(&lia6, ia6, sizeof(struct in6_addr));
3284	return (0);
3285}
3286
3287/*
3288 * Return true if pr1 and pr2 have the same IPv6 address restrictions.
3289 */
3290int
3291prison_equal_ip6(struct prison *pr1, struct prison *pr2)
3292{
3293
3294	if (pr1 == pr2)
3295		return (1);
3296
3297	while (pr1 != &prison0 &&
3298#ifdef VIMAGE
3299	       !(pr1->pr_flags & PR_VNET) &&
3300#endif
3301	       !(pr1->pr_flags & PR_IP6_USER))
3302		pr1 = pr1->pr_parent;
3303	while (pr2 != &prison0 &&
3304#ifdef VIMAGE
3305	       !(pr2->pr_flags & PR_VNET) &&
3306#endif
3307	       !(pr2->pr_flags & PR_IP6_USER))
3308		pr2 = pr2->pr_parent;
3309	return (pr1 == pr2);
3310}
3311
3312/*
3313 * Make sure our (source) address is set to something meaningful to this jail.
3314 *
3315 * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
3316 * when needed while binding.
3317 *
3318 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3319 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3320 * doesn't allow IPv6.
3321 */
3322int
3323prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
3324{
3325	struct prison *pr;
3326	int error;
3327
3328	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3329	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3330
3331	pr = cred->cr_prison;
3332	if (!(pr->pr_flags & PR_IP6))
3333		return (0);
3334	mtx_lock(&pr->pr_mtx);
3335	if (!(pr->pr_flags & PR_IP6)) {
3336		mtx_unlock(&pr->pr_mtx);
3337		return (0);
3338	}
3339	if (pr->pr_ip6 == NULL) {
3340		mtx_unlock(&pr->pr_mtx);
3341		return (EAFNOSUPPORT);
3342	}
3343
3344	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3345		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3346		mtx_unlock(&pr->pr_mtx);
3347		return (0);
3348	}
3349
3350	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
3351		/*
3352		 * In case there is only 1 IPv6 address, and v6only is true,
3353		 * then bind directly.
3354		 */
3355		if (v6only != 0 && pr->pr_ip6s == 1)
3356			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3357		mtx_unlock(&pr->pr_mtx);
3358		return (0);
3359	}
3360
3361	error = _prison_check_ip6(pr, ia6);
3362	mtx_unlock(&pr->pr_mtx);
3363	return (error);
3364}
3365
3366/*
3367 * Rewrite destination address in case we will connect to loopback address.
3368 *
3369 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3370 */
3371int
3372prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
3373{
3374	struct prison *pr;
3375
3376	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3377	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3378
3379	pr = cred->cr_prison;
3380	if (!(pr->pr_flags & PR_IP6))
3381		return (0);
3382	mtx_lock(&pr->pr_mtx);
3383	if (!(pr->pr_flags & PR_IP6)) {
3384		mtx_unlock(&pr->pr_mtx);
3385		return (0);
3386	}
3387	if (pr->pr_ip6 == NULL) {
3388		mtx_unlock(&pr->pr_mtx);
3389		return (EAFNOSUPPORT);
3390	}
3391
3392	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3393		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3394		mtx_unlock(&pr->pr_mtx);
3395		return (0);
3396	}
3397
3398	/*
3399	 * Return success because nothing had to be changed.
3400	 */
3401	mtx_unlock(&pr->pr_mtx);
3402	return (0);
3403}
3404
3405/*
3406 * Check if given address belongs to the jail referenced by cred/prison.
3407 *
3408 * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3409 * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3410 * doesn't allow IPv6.
3411 */
3412static int
3413_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
3414{
3415	int i, a, z, d;
3416
3417	/*
3418	 * Check the primary IP.
3419	 */
3420	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
3421		return (0);
3422
3423	/*
3424	 * All the other IPs are sorted so we can do a binary search.
3425	 */
3426	a = 0;
3427	z = pr->pr_ip6s - 2;
3428	while (a <= z) {
3429		i = (a + z) / 2;
3430		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
3431		if (d > 0)
3432			z = i - 1;
3433		else if (d < 0)
3434			a = i + 1;
3435		else
3436			return (0);
3437	}
3438
3439	return (EADDRNOTAVAIL);
3440}
3441
3442int
3443prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
3444{
3445	struct prison *pr;
3446	int error;
3447
3448	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3449	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3450
3451	pr = cred->cr_prison;
3452	if (!(pr->pr_flags & PR_IP6))
3453		return (0);
3454	mtx_lock(&pr->pr_mtx);
3455	if (!(pr->pr_flags & PR_IP6)) {
3456		mtx_unlock(&pr->pr_mtx);
3457		return (0);
3458	}
3459	if (pr->pr_ip6 == NULL) {
3460		mtx_unlock(&pr->pr_mtx);
3461		return (EAFNOSUPPORT);
3462	}
3463
3464	error = _prison_check_ip6(pr, ia6);
3465	mtx_unlock(&pr->pr_mtx);
3466	return (error);
3467}
3468#endif
3469
3470/*
3471 * Check if a jail supports the given address family.
3472 *
3473 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3474 * if not.
3475 */
3476int
3477prison_check_af(struct ucred *cred, int af)
3478{
3479	struct prison *pr;
3480	int error;
3481
3482	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3483
3484	pr = cred->cr_prison;
3485#ifdef VIMAGE
3486	/* Prisons with their own network stack are not limited. */
3487	if (prison_owns_vnet(cred))
3488		return (0);
3489#endif
3490
3491	error = 0;
3492	switch (af)
3493	{
3494#ifdef INET
3495	case AF_INET:
3496		if (pr->pr_flags & PR_IP4)
3497		{
3498			mtx_lock(&pr->pr_mtx);
3499			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3500				error = EAFNOSUPPORT;
3501			mtx_unlock(&pr->pr_mtx);
3502		}
3503		break;
3504#endif
3505#ifdef INET6
3506	case AF_INET6:
3507		if (pr->pr_flags & PR_IP6)
3508		{
3509			mtx_lock(&pr->pr_mtx);
3510			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3511				error = EAFNOSUPPORT;
3512			mtx_unlock(&pr->pr_mtx);
3513		}
3514		break;
3515#endif
3516	case AF_LOCAL:
3517	case AF_ROUTE:
3518		break;
3519	default:
3520		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3521			error = EAFNOSUPPORT;
3522	}
3523	return (error);
3524}
3525
3526/*
3527 * Check if given address belongs to the jail referenced by cred (wrapper to
3528 * prison_check_ip[46]).
3529 *
3530 * Returns 0 if jail doesn't restrict the address family or if address belongs
3531 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3532 * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3533 */
3534int
3535prison_if(struct ucred *cred, struct sockaddr *sa)
3536{
3537#ifdef INET
3538	struct sockaddr_in *sai;
3539#endif
3540#ifdef INET6
3541	struct sockaddr_in6 *sai6;
3542#endif
3543	int error;
3544
3545	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3546	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3547
3548#ifdef VIMAGE
3549	if (prison_owns_vnet(cred))
3550		return (0);
3551#endif
3552
3553	error = 0;
3554	switch (sa->sa_family)
3555	{
3556#ifdef INET
3557	case AF_INET:
3558		sai = (struct sockaddr_in *)sa;
3559		error = prison_check_ip4(cred, &sai->sin_addr);
3560		break;
3561#endif
3562#ifdef INET6
3563	case AF_INET6:
3564		sai6 = (struct sockaddr_in6 *)sa;
3565		error = prison_check_ip6(cred, &sai6->sin6_addr);
3566		break;
3567#endif
3568	default:
3569		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3570			error = EAFNOSUPPORT;
3571	}
3572	return (error);
3573}
3574
3575/*
3576 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3577 */
3578int
3579prison_check(struct ucred *cred1, struct ucred *cred2)
3580{
3581
3582	return ((cred1->cr_prison == cred2->cr_prison ||
3583	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3584}
3585
3586/*
3587 * Return 1 if p2 is a child of p1, otherwise 0.
3588 */
3589int
3590prison_ischild(struct prison *pr1, struct prison *pr2)
3591{
3592
3593	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3594		if (pr1 == pr2)
3595			return (1);
3596	return (0);
3597}
3598
3599/*
3600 * Return 1 if the passed credential is in a jail, otherwise 0.
3601 */
3602int
3603jailed(struct ucred *cred)
3604{
3605
3606	return (cred->cr_prison != &prison0);
3607}
3608
/*
 * Return 1 if the passed credential is in a jail and that jail does not
 * have its own virtual network stack, otherwise 0.  Without VIMAGE this is
 * the same answer as jailed().
 */
int
jailed_without_vnet(struct ucred *cred)
{

#ifdef VIMAGE
	return (jailed(cred) && !prison_owns_vnet(cred));
#else
	return (jailed(cred));
#endif
}
3626
3627/*
3628 * Return the correct hostname (domainname, et al) for the passed credential.
3629 */
3630void
3631getcredhostname(struct ucred *cred, char *buf, size_t size)
3632{
3633	struct prison *pr;
3634
3635	/*
3636	 * A NULL credential can be used to shortcut to the physical
3637	 * system's hostname.
3638	 */
3639	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3640	mtx_lock(&pr->pr_mtx);
3641	strlcpy(buf, pr->pr_hostname, size);
3642	mtx_unlock(&pr->pr_mtx);
3643}
3644
3645void
3646getcreddomainname(struct ucred *cred, char *buf, size_t size)
3647{
3648
3649	mtx_lock(&cred->cr_prison->pr_mtx);
3650	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3651	mtx_unlock(&cred->cr_prison->pr_mtx);
3652}
3653
3654void
3655getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3656{
3657
3658	mtx_lock(&cred->cr_prison->pr_mtx);
3659	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3660	mtx_unlock(&cred->cr_prison->pr_mtx);
3661}
3662
3663void
3664getcredhostid(struct ucred *cred, unsigned long *hostid)
3665{
3666
3667	mtx_lock(&cred->cr_prison->pr_mtx);
3668	*hostid = cred->cr_prison->pr_hostid;
3669	mtx_unlock(&cred->cr_prison->pr_mtx);
3670}
3671
#ifdef VIMAGE
/*
 * Determine whether the prison represented by cred owns
 * its vnet rather than having it inherited.
 *
 * Returns 1 in case the prison owns the vnet (PR_VNET is set), 0 otherwise.
 */
int
prison_owns_vnet(struct ucred *cred)
{

	/*
	 * vnets cannot be added/removed after jail creation,
	 * so no need to lock here.
	 */
	return ((cred->cr_prison->pr_flags & PR_VNET) != 0);
}
#endif
3690
3691/*
3692 * Determine whether the subject represented by cred can "see"
3693 * status of a mount point.
3694 * Returns: 0 for permitted, ENOENT otherwise.
3695 * XXX: This function should be called cr_canseemount() and should be
3696 *      placed in kern_prot.c.
3697 */
3698int
3699prison_canseemount(struct ucred *cred, struct mount *mp)
3700{
3701	struct prison *pr;
3702	struct statfs *sp;
3703	size_t len;
3704
3705	pr = cred->cr_prison;
3706	if (pr->pr_enforce_statfs == 0)
3707		return (0);
3708	if (pr->pr_root->v_mount == mp)
3709		return (0);
3710	if (pr->pr_enforce_statfs == 2)
3711		return (ENOENT);
3712	/*
3713	 * If jail's chroot directory is set to "/" we should be able to see
3714	 * all mount-points from inside a jail.
3715	 * This is ugly check, but this is the only situation when jail's
3716	 * directory ends with '/'.
3717	 */
3718	if (strcmp(pr->pr_path, "/") == 0)
3719		return (0);
3720	len = strlen(pr->pr_path);
3721	sp = &mp->mnt_stat;
3722	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3723		return (ENOENT);
3724	/*
3725	 * Be sure that we don't have situation where jail's root directory
3726	 * is "/some/path" and mount point is "/some/pathpath".
3727	 */
3728	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3729		return (ENOENT);
3730	return (0);
3731}
3732
3733void
3734prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3735{
3736	char jpath[MAXPATHLEN];
3737	struct prison *pr;
3738	size_t len;
3739
3740	pr = cred->cr_prison;
3741	if (pr->pr_enforce_statfs == 0)
3742		return;
3743	if (prison_canseemount(cred, mp) != 0) {
3744		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3745		strlcpy(sp->f_mntonname, "[restricted]",
3746		    sizeof(sp->f_mntonname));
3747		return;
3748	}
3749	if (pr->pr_root->v_mount == mp) {
3750		/*
3751		 * Clear current buffer data, so we are sure nothing from
3752		 * the valid path left there.
3753		 */
3754		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3755		*sp->f_mntonname = '/';
3756		return;
3757	}
3758	/*
3759	 * If jail's chroot directory is set to "/" we should be able to see
3760	 * all mount-points from inside a jail.
3761	 */
3762	if (strcmp(pr->pr_path, "/") == 0)
3763		return;
3764	len = strlen(pr->pr_path);
3765	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3766	/*
3767	 * Clear current buffer data, so we are sure nothing from
3768	 * the valid path left there.
3769	 */
3770	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3771	if (*jpath == '\0') {
3772		/* Should never happen. */
3773		*sp->f_mntonname = '/';
3774	} else {
3775		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3776	}
3777}
3778
/*
 * Check whether permission for a specific privilege is granted within jail.
 * We have a specific list of accepted privileges; the rest are denied.
 *
 * Returns 0 if cred is not jailed or the privilege is granted, EPERM
 * otherwise.
 *
 * With VIMAGE, a first switch grants an additional list of network
 * privileges to prisons that have PR_VNET set; for every other prison (and
 * every privilege not on that list) the decision falls through to the
 * second, general switch.
 */
int
prison_priv_check(struct ucred *cred, int priv)
{

	/* Unjailed credentials are never restricted here. */
	if (!jailed(cred))
		return (0);

#ifdef VIMAGE
	/*
	 * Privileges specific to prisons with a virtual network stack.
	 * There might be a duplicate entry here in case the privilege
	 * is only granted conditionally in the legacy jail case.
	 */
	switch (priv) {
#ifdef notyet
		/*
		 * NFS-specific privileges.
		 */
	case PRIV_NFS_DAEMON:
	case PRIV_NFS_LOCKD:
#endif
		/*
		 * Network stack privileges.
		 */
	case PRIV_NET_BRIDGE:
	case PRIV_NET_GRE:
	case PRIV_NET_BPF:
	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
	case PRIV_NET_ROUTE:
	case PRIV_NET_TAP:
	case PRIV_NET_SETIFMTU:
	case PRIV_NET_SETIFFLAGS:
	case PRIV_NET_SETIFCAP:
	case PRIV_NET_SETIFDESCR:
	case PRIV_NET_SETIFNAME	:
	case PRIV_NET_SETIFMETRIC:
	case PRIV_NET_SETIFPHYS:
	case PRIV_NET_SETIFMAC:
	case PRIV_NET_ADDMULTI:
	case PRIV_NET_DELMULTI:
	case PRIV_NET_HWIOCTL:
	case PRIV_NET_SETLLADDR:
	case PRIV_NET_ADDIFGROUP:
	case PRIV_NET_DELIFGROUP:
	case PRIV_NET_IFCREATE:
	case PRIV_NET_IFDESTROY:
	case PRIV_NET_ADDIFADDR:
	case PRIV_NET_DELIFADDR:
	case PRIV_NET_LAGG:
	case PRIV_NET_GIF:
	case PRIV_NET_SETIFVNET:
	case PRIV_NET_SETIFFIB:

		/*
		 * 802.11-related privileges.
		 */
	case PRIV_NET80211_GETKEY:
#ifdef notyet
	case PRIV_NET80211_MANAGE:		/* XXX-BZ discuss with sam@ */
#endif

#ifdef notyet
		/*
		 * AppleTalk privileges.
		 */
	case PRIV_NETATALK_RESERVEDPORT:

		/*
		 * ATM privileges.
		 */
	case PRIV_NETATM_CFG:
	case PRIV_NETATM_ADD:
	case PRIV_NETATM_DEL:
	case PRIV_NETATM_SET:

		/*
		 * Bluetooth privileges.
		 */
	case PRIV_NETBLUETOOTH_RAW:
#endif

		/*
		 * Netgraph and netgraph module privileges.
		 */
	case PRIV_NETGRAPH_CONTROL:
#ifdef notyet
	case PRIV_NETGRAPH_TTY:
#endif

		/*
		 * IPv4 and IPv6 privileges.
		 */
	case PRIV_NETINET_IPFW:
	case PRIV_NETINET_DIVERT:
	case PRIV_NETINET_PF:
	case PRIV_NETINET_DUMMYNET:
	case PRIV_NETINET_CARP:
	case PRIV_NETINET_MROUTE:
	case PRIV_NETINET_RAW:
	case PRIV_NETINET_ADDRCTRL6:
	case PRIV_NETINET_ND6:
	case PRIV_NETINET_SCOPE6:
	case PRIV_NETINET_ALIFETIME6:
	case PRIV_NETINET_IPSEC:
	case PRIV_NETINET_BINDANY:

#ifdef notyet
		/*
		 * IPX/SPX privileges.
		 */
	case PRIV_NETIPX_RESERVEDPORT:
	case PRIV_NETIPX_RAW:

		/*
		 * NCP privileges.
		 */
	case PRIV_NETNCP:

		/*
		 * SMB privileges.
		 */
	case PRIV_NETSMB:
#endif

	/*
	 * No default: or deny here.
	 * In case of no permit fall through to next switch().
	 */
		/* All of the cases above share this single PR_VNET test. */
		if (cred->cr_prison->pr_flags & PR_VNET)
			return (0);
	}
#endif /* VIMAGE */

	switch (priv) {

		/*
		 * Allow ktrace privileges for root in jail.
		 */
	case PRIV_KTRACE:

#if 0
		/*
		 * Allow jailed processes to configure audit identity and
		 * submit audit records (login, etc).  In the future we may
		 * want to further refine the relationship between audit and
		 * jail.
		 */
	case PRIV_AUDIT_GETAUDIT:
	case PRIV_AUDIT_SETAUDIT:
	case PRIV_AUDIT_SUBMIT:
#endif

		/*
		 * Allow jailed processes to manipulate process UNIX
		 * credentials in any way they see fit.
		 */
	case PRIV_CRED_SETUID:
	case PRIV_CRED_SETEUID:
	case PRIV_CRED_SETGID:
	case PRIV_CRED_SETEGID:
	case PRIV_CRED_SETGROUPS:
	case PRIV_CRED_SETREUID:
	case PRIV_CRED_SETREGID:
	case PRIV_CRED_SETRESUID:
	case PRIV_CRED_SETRESGID:

		/*
		 * Jail implements visibility constraints already, so allow
		 * jailed root to override uid/gid-based constraints.
		 */
	case PRIV_SEEOTHERGIDS:
	case PRIV_SEEOTHERUIDS:

		/*
		 * Jail implements inter-process debugging limits already, so
		 * allow jailed root various debugging privileges.
		 */
	case PRIV_DEBUG_DIFFCRED:
	case PRIV_DEBUG_SUGID:
	case PRIV_DEBUG_UNPRIV:

		/*
		 * Allow jail to set various resource limits and login
		 * properties, and for now, exceed process resource limits.
		 */
	case PRIV_PROC_LIMIT:
	case PRIV_PROC_SETLOGIN:
	case PRIV_PROC_SETRLIMIT:

		/*
		 * System V and POSIX IPC privileges are granted in jail.
		 */
	case PRIV_IPC_READ:
	case PRIV_IPC_WRITE:
	case PRIV_IPC_ADMIN:
	case PRIV_IPC_MSGSIZE:
	case PRIV_MQ_ADMIN:

		/*
		 * Jail operations within a jail work on child jails.
		 */
	case PRIV_JAIL_ATTACH:
	case PRIV_JAIL_SET:
	case PRIV_JAIL_REMOVE:

		/*
		 * Jail implements its own inter-process limits, so allow
		 * root processes in jail to change scheduling on other
		 * processes in the same jail.  Likewise for signalling.
		 */
	case PRIV_SCHED_DIFFCRED:
	case PRIV_SCHED_CPUSET:
	case PRIV_SIGNAL_DIFFCRED:
	case PRIV_SIGNAL_SUGID:

		/*
		 * Allow jailed processes to write to sysctls marked as jail
		 * writable.
		 */
	case PRIV_SYSCTL_WRITEJAIL:

		/*
		 * Allow root in jail to manage a variety of quota
		 * properties.  These should likely be conditional on a
		 * configuration option.
		 */
	case PRIV_VFS_GETQUOTA:
	case PRIV_VFS_SETQUOTA:

		/*
		 * Since Jail relies on chroot() to implement file system
		 * protections, grant many VFS privileges to root in jail.
		 * Be careful to exclude mount-related and NFS-related
		 * privileges.
		 */
	case PRIV_VFS_READ:
	case PRIV_VFS_WRITE:
	case PRIV_VFS_ADMIN:
	case PRIV_VFS_EXEC:
	case PRIV_VFS_LOOKUP:
	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
	case PRIV_VFS_CHFLAGS_DEV:
	case PRIV_VFS_CHOWN:
	case PRIV_VFS_CHROOT:
	case PRIV_VFS_RETAINSUGID:
	case PRIV_VFS_FCHROOT:
	case PRIV_VFS_LINK:
	case PRIV_VFS_SETGID:
	case PRIV_VFS_STAT:
	case PRIV_VFS_STICKYFILE:

		/*
		 * As in the non-jail case, non-root users are expected to be
		 * able to read kernel/physical memory (provided /dev/[k]mem
		 * exists in the jail and they have permission to access it).
		 */
	case PRIV_KMEM_READ:
		/* Every case label since PRIV_KTRACE lands here. */
		return (0);

		/*
		 * Depending on the global setting, allow privilege of
		 * setting system flags.
		 */
	case PRIV_VFS_SYSFLAGS:
		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
			return (0);
		else
			return (EPERM);

		/*
		 * Depending on the global setting, allow privilege of
		 * mounting/unmounting file systems.  Besides PR_ALLOW_MOUNT
		 * this also requires pr_enforce_statfs to be below 2.
		 */
	case PRIV_VFS_MOUNT:
	case PRIV_VFS_UNMOUNT:
	case PRIV_VFS_MOUNT_NONUSER:
	case PRIV_VFS_MOUNT_OWNER:
		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
		    cred->cr_prison->pr_enforce_statfs < 2)
			return (0);
		else
			return (EPERM);

		/*
		 * Allow jailed root to bind reserved ports and reuse in-use
		 * ports.
		 */
	case PRIV_NETINET_RESERVEDPORT:
	case PRIV_NETINET_REUSEPORT:
		return (0);

		/*
		 * Allow jailed root to set certain IPv4/6 (option) headers.
		 */
	case PRIV_NETINET_SETHDROPTS:
		return (0);

		/*
		 * Conditionally allow creating raw sockets in jail.
		 */
	case PRIV_NETINET_RAW:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
			return (0);
		else
			return (EPERM);

		/*
		 * Since jail implements its own visibility limits on netstat
		 * sysctls, allow getcred.  This allows identd to work in
		 * jail.
		 */
	case PRIV_NETINET_GETCRED:
		return (0);

		/*
		 * Allow jailed root to set loginclass.
		 */
	case PRIV_PROC_SETLOGINCLASS:
		return (0);

	default:
		/*
		 * In all remaining cases, deny the privilege request.  This
		 * includes almost all network privileges, many system
		 * configuration privileges.
		 */
		return (EPERM);
	}
}
4112
4113/*
4114 * Return the part of pr2's name that is relative to pr1, or the whole name
4115 * if it does not directly follow.
4116 */
4117
4118char *
4119prison_name(struct prison *pr1, struct prison *pr2)
4120{
4121	char *name;
4122
4123	/* Jails see themselves as "0" (if they see themselves at all). */
4124	if (pr1 == pr2)
4125		return "0";
4126	name = pr2->pr_name;
4127	if (prison_ischild(pr1, pr2)) {
4128		/*
4129		 * pr1 isn't locked (and allprison_lock may not be either)
4130		 * so its length can't be counted on.  But the number of dots
4131		 * can be counted on - and counted.
4132		 */
4133		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
4134			name = strchr(name, '.') + 1;
4135	}
4136	return (name);
4137}
4138
4139/*
4140 * Return the part of pr2's path that is relative to pr1, or the whole path
4141 * if it does not directly follow.
4142 */
4143static char *
4144prison_path(struct prison *pr1, struct prison *pr2)
4145{
4146	char *path1, *path2;
4147	int len1;
4148
4149	path1 = pr1->pr_path;
4150	path2 = pr2->pr_path;
4151	if (!strcmp(path1, "/"))
4152		return (path2);
4153	len1 = strlen(path1);
4154	if (strncmp(path1, path2, len1))
4155		return (path2);
4156	if (path2[len1] == '\0')
4157		return "/";
4158	if (path2[len1] == '/')
4159		return (path2 + len1);
4160	return (path2);
4161}
4162
4163
/*
 * Jail-related sysctls.
 *
 * This declares the "jail" node under the "security" tree; the handlers and
 * knobs registered with a _security_jail parent live below it.
 */
static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
    "Jails");
4169
/*
 * Handler for the security.jail.list sysctl.
 *
 * For each descendant of the requesting thread's prison that still has a
 * non-zero pr_ref, writes out one struct xprison followed by that prison's
 * IPv4 addresses and then its IPv6 addresses.  Paths and names are reported
 * relative to the requesting prison via prison_path()/prison_name().
 *
 * Returns 0 on success or the first error reported by SYSCTL_OUT().
 */
static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)
{
	struct xprison *xp;
	struct prison *pr, *cpr;
#ifdef INET
	/* Scratch copy of the current prison's IPv4 list and its capacity. */
	struct in_addr *ip4 = NULL;
	int ip4s = 0;
#endif
#ifdef INET6
	/* Scratch copy of the current prison's IPv6 list and its capacity. */
	struct in6_addr *ip6 = NULL;
	int ip6s = 0;
#endif
	int descend, error;

	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
	pr = req->td->td_ucred->cr_prison;
	error = 0;
	sx_slock(&allprison_lock);
	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
#if defined(INET) || defined(INET6)
 again:
#endif
		mtx_lock(&cpr->pr_mtx);
#ifdef INET
		if (cpr->pr_ip4s > 0) {
			if (ip4s < cpr->pr_ip4s) {
				/*
				 * Scratch buffer too small: remember the
				 * needed size, drop the mutex before the
				 * M_WAITOK realloc, then start over with
				 * this prison since its lists are re-read
				 * under the lock.
				 */
				ip4s = cpr->pr_ip4s;
				mtx_unlock(&cpr->pr_mtx);
				ip4 = realloc(ip4, ip4s *
				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
				goto again;
			}
			bcopy(cpr->pr_ip4, ip4,
			    cpr->pr_ip4s * sizeof(struct in_addr));
		}
#endif
#ifdef INET6
		if (cpr->pr_ip6s > 0) {
			if (ip6s < cpr->pr_ip6s) {
				/* Same grow-and-retry dance as for IPv4. */
				ip6s = cpr->pr_ip6s;
				mtx_unlock(&cpr->pr_mtx);
				ip6 = realloc(ip6, ip6s *
				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
				goto again;
			}
			bcopy(cpr->pr_ip6, ip6,
			    cpr->pr_ip6s * sizeof(struct in6_addr));
		}
#endif
		/* Prisons without any reference left are not reported. */
		if (cpr->pr_ref == 0) {
			mtx_unlock(&cpr->pr_mtx);
			continue;
		}
		bzero(xp, sizeof(*xp));
		xp->pr_version = XPRISON_VERSION;
		xp->pr_id = cpr->pr_id;
		xp->pr_state = cpr->pr_uref > 0
		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
#ifdef INET
		xp->pr_ip4s = cpr->pr_ip4s;
#endif
#ifdef INET6
		xp->pr_ip6s = cpr->pr_ip6s;
#endif
		/*
		 * Everything needed was snapshotted into xp/ip4/ip6 under
		 * the mutex; the copy-out below runs without it.
		 */
		mtx_unlock(&cpr->pr_mtx);
		error = SYSCTL_OUT(req, xp, sizeof(*xp));
		if (error)
			break;
#ifdef INET
		if (xp->pr_ip4s > 0) {
			error = SYSCTL_OUT(req, ip4,
			    xp->pr_ip4s * sizeof(struct in_addr));
			if (error)
				break;
		}
#endif
#ifdef INET6
		if (xp->pr_ip6s > 0) {
			error = SYSCTL_OUT(req, ip6,
			    xp->pr_ip6s * sizeof(struct in6_addr));
			if (error)
				break;
		}
#endif
	}
	sx_sunlock(&allprison_lock);
	free(xp, M_TEMP);
#ifdef INET
	free(ip4, M_TEMP);
#endif
#ifdef INET6
	free(ip6, M_TEMP);
#endif
	return (error);
}

/* Read-only structured sysctl security.jail.list, served by the handler. */
SYSCTL_OID(_security_jail, OID_AUTO, list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_list, "S", "List of active jails");
4273
4274static int
4275sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4276{
4277	int error, injail;
4278
4279	injail = jailed(req->td->td_ucred);
4280	error = SYSCTL_OUT(req, &injail, sizeof(injail));
4281
4282	return (error);
4283}
4284
4285SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
4286    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4287    sysctl_jail_jailed, "I", "Process in jail?");
4288
4289static int
4290sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
4291{
4292	int error, havevnet;
4293#ifdef VIMAGE
4294	struct ucred *cred = req->td->td_ucred;
4295
4296	havevnet = jailed(cred) && prison_owns_vnet(cred);
4297#else
4298	havevnet = 0;
4299#endif
4300	error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
4301
4302	return (error);
4303}
4304
4305SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
4306    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4307    sysctl_jail_vnet, "I", "Jail owns VNET?");
4308
#if defined(INET) || defined(INET6)
/*
 * Deprecated read-write knob security.jail.jail_max_af_ips, backed directly
 * by the jail_max_af_ips variable.  Only present when the kernel is built
 * with INET or INET6.
 */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
    &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
4314
4315/*
4316 * Default parameters for jail(2) compatability.  For historical reasons,
4317 * the sysctl names have varying similarity to the parameter names.  Prisons
4318 * just see their own parameters, and can't change them.
4319 */
4320static int
4321sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4322{
4323	struct prison *pr;
4324	int allow, error, i;
4325
4326	pr = req->td->td_ucred->cr_prison;
4327	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
4328
4329	/* Get the current flag value, and convert it to a boolean. */
4330	i = (allow & arg2) ? 1 : 0;
4331	if (arg1 != NULL)
4332		i = !i;
4333	error = sysctl_handle_int(oidp, &i, 0, req);
4334	if (error || !req->newptr)
4335		return (error);
4336	i = i ? arg2 : 0;
4337	if (arg1 != NULL)
4338		i ^= arg2;
4339	/*
4340	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4341	 * for writing.
4342	 */
4343	mtx_lock(&prison0.pr_mtx);
4344	jail_default_allow = (jail_default_allow & ~arg2) | i;
4345	mtx_unlock(&prison0.pr_mtx);
4346	return (0);
4347}
4348
/*
 * Deprecated security.jail.* boolean knobs, all served by
 * sysctl_jail_default_allow().  The second argument on each middle line is
 * the PR_ALLOW_* bit the knob controls.  The first is NULL when the knob's
 * value mirrors the bit, and (void *)1 for socket_unixiproute_only, whose
 * value is the inverse of PR_ALLOW_SOCKET_AF.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
    "Processes in jail can set their hostnames (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
    "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
    "Processes in jail can use System V IPC primitives (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
    "Prison root can create raw sockets (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
    "Processes in jail can alter system file flags (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
    "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the devfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the fdescfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the nullfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the procfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the linprocfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the linsysfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the tmpfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the zfs file system (deprecated)");
4405
4406static int
4407sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4408{
4409	struct prison *pr;
4410	int level, error;
4411
4412	pr = req->td->td_ucred->cr_prison;
4413	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4414	error = sysctl_handle_int(oidp, &level, 0, req);
4415	if (error || !req->newptr)
4416		return (error);
4417	*(int *)arg1 = level;
4418	return (0);
4419}
4420
4421SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
4422    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4423    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
4424    sysctl_jail_default_level, "I",
4425    "Processes in jail cannot see all mounted file systems (deprecated)");
4426
4427SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
4428    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4429    &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
4430    sysctl_jail_default_level, "I",
4431    "Ruleset for the devfs filesystem in jail (deprecated)");
4432
4433/*
4434 * Nodes to describe jail parameters.  Maximum length of string parameters
4435 * is returned in the string itself, and the other parameters exist merely
4436 * to make themselves and their types known.
4437 */
4438SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
4439    "Jail parameters");
4440
4441int
4442sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4443{
4444	int i;
4445	long l;
4446	size_t s;
4447	char numbuf[12];
4448
4449	switch (oidp->oid_kind & CTLTYPE)
4450	{
4451	case CTLTYPE_LONG:
4452	case CTLTYPE_ULONG:
4453		l = 0;
4454#ifdef SCTL_MASK32
4455		if (!(req->flags & SCTL_MASK32))
4456#endif
4457			return (SYSCTL_OUT(req, &l, sizeof(l)));
4458	case CTLTYPE_INT:
4459	case CTLTYPE_UINT:
4460		i = 0;
4461		return (SYSCTL_OUT(req, &i, sizeof(i)));
4462	case CTLTYPE_STRING:
4463		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4464		return
4465		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4466	case CTLTYPE_STRUCT:
4467		s = (size_t)arg2;
4468		return (SYSCTL_OUT(req, &s, sizeof(s)));
4469	}
4470	return (0);
4471}
4472
4473/*
4474 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
4475 * jail creation time but cannot be changed in an existing jail.
4476 */
4477SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4478SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4479SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4480SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4481SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4482    "I", "Jail secure level");
4483SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
4484    "Jail value for kern.osreldate and uname -K");
4485SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
4486    "Jail value for kern.osrelease and uname -r");
4487SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4488    "I", "Jail cannot see all mounted file systems");
4489SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
4490    "I", "Ruleset for in-jail devfs mounts");
4491SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4492    "B", "Jail persistence");
4493#ifdef VIMAGE
4494SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4495    "E,jailsys", "Virtual network stack");
4496#endif
4497SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4498    "B", "Jail is in the process of shutting down");
4499
4500SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4501SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4502    "I", "Current number of child jails");
4503SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4504    "I", "Maximum number of child jails");
4505
4506SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4507SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4508    "Jail hostname");
4509SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4510    "Jail NIS domainname");
4511SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4512    "Jail host UUID");
4513SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4514    "LU", "Jail host ID");
4515
4516SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4517SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4518
/* IPv4 parameters, present only in kernels built with INET. */
#ifdef INET
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
    "Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
    "S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B",
    "Do (not) use IPv4 source address selection rather than the "
    "primary jail IPv4 address.");
#endif
/* IPv6 parameters, present only in kernels built with INET6. */
#ifdef INET6
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
    "Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
    "S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B",
    "Do (not) use IPv6 source address selection rather than the "
    "primary jail IPv6 address.");
#endif
4537
4538SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
4539SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
4540    "B", "Jail may set hostname");
4541SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
4542    "B", "Jail may use SYSV IPC");
4543SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
4544    "B", "Jail may create raw sockets");
4545SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
4546    "B", "Jail may alter system file flags");
4547SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
4548    "B", "Jail may set file quotas");
4549SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
4550    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4551
4552SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
4553SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
4554    "B", "Jail may mount/unmount jail-friendly file systems in general");
4555SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW,
4556    "B", "Jail may mount the devfs file system");
4557SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW,
4558    "B", "Jail may mount the fdescfs file system");
4559SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW,
4560    "B", "Jail may mount the nullfs file system");
4561SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW,
4562    "B", "Jail may mount the procfs file system");
4563SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW,
4564    "B", "Jail may mount the linprocfs file system");
4565SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW,
4566    "B", "Jail may mount the linsysfs file system");
4567SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW,
4568    "B", "Jail may mount the tmpfs file system");
4569SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW,
4570    "B", "Jail may mount the zfs file system");
4571
4572#ifdef RACCT
4573void
4574prison_racct_foreach(void (*callback)(struct racct *racct,
4575    void *arg2, void *arg3), void *arg2, void *arg3)
4576{
4577	struct prison_racct *prr;
4578
4579	ASSERT_RACCT_ENABLED();
4580
4581	sx_slock(&allprison_lock);
4582	LIST_FOREACH(prr, &allprison_racct, prr_next)
4583		(callback)(prr->prr_racct, arg2, arg3);
4584	sx_sunlock(&allprison_lock);
4585}
4586
4587static struct prison_racct *
4588prison_racct_find_locked(const char *name)
4589{
4590	struct prison_racct *prr;
4591
4592	ASSERT_RACCT_ENABLED();
4593	sx_assert(&allprison_lock, SA_XLOCKED);
4594
4595	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4596		return (NULL);
4597
4598	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4599		if (strcmp(name, prr->prr_name) != 0)
4600			continue;
4601
4602		/* Found prison_racct with a matching name? */
4603		prison_racct_hold(prr);
4604		return (prr);
4605	}
4606
4607	/* Add new prison_racct. */
4608	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4609	racct_create(&prr->prr_racct);
4610
4611	strcpy(prr->prr_name, name);
4612	refcount_init(&prr->prr_refcount, 1);
4613	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4614
4615	return (prr);
4616}
4617
4618struct prison_racct *
4619prison_racct_find(const char *name)
4620{
4621	struct prison_racct *prr;
4622
4623	ASSERT_RACCT_ENABLED();
4624
4625	sx_xlock(&allprison_lock);
4626	prr = prison_racct_find_locked(name);
4627	sx_xunlock(&allprison_lock);
4628	return (prr);
4629}
4630
4631void
4632prison_racct_hold(struct prison_racct *prr)
4633{
4634
4635	ASSERT_RACCT_ENABLED();
4636
4637	refcount_acquire(&prr->prr_refcount);
4638}
4639
4640static void
4641prison_racct_free_locked(struct prison_racct *prr)
4642{
4643
4644	ASSERT_RACCT_ENABLED();
4645	sx_assert(&allprison_lock, SA_XLOCKED);
4646
4647	if (refcount_release(&prr->prr_refcount)) {
4648		racct_destroy(&prr->prr_racct);
4649		LIST_REMOVE(prr, prr_next);
4650		free(prr, M_PRISON_RACCT);
4651	}
4652}
4653
4654void
4655prison_racct_free(struct prison_racct *prr)
4656{
4657	int old;
4658
4659	ASSERT_RACCT_ENABLED();
4660	sx_assert(&allprison_lock, SA_UNLOCKED);
4661
4662	old = prr->prr_refcount;
4663	if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
4664		return;
4665
4666	sx_xlock(&allprison_lock);
4667	prison_racct_free_locked(prr);
4668	sx_xunlock(&allprison_lock);
4669}
4670
4671static void
4672prison_racct_attach(struct prison *pr)
4673{
4674	struct prison_racct *prr;
4675
4676	ASSERT_RACCT_ENABLED();
4677	sx_assert(&allprison_lock, SA_XLOCKED);
4678
4679	prr = prison_racct_find_locked(pr->pr_name);
4680	KASSERT(prr != NULL, ("cannot find prison_racct"));
4681
4682	pr->pr_prison_racct = prr;
4683}
4684
4685/*
4686 * Handle jail renaming.  From the racct point of view, renaming means
4687 * moving from one prison_racct to another.
4688 */
4689static void
4690prison_racct_modify(struct prison *pr)
4691{
4692	struct proc *p;
4693	struct ucred *cred;
4694	struct prison_racct *oldprr;
4695
4696	ASSERT_RACCT_ENABLED();
4697
4698	sx_slock(&allproc_lock);
4699	sx_xlock(&allprison_lock);
4700
4701	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4702		sx_xunlock(&allprison_lock);
4703		sx_sunlock(&allproc_lock);
4704		return;
4705	}
4706
4707	oldprr = pr->pr_prison_racct;
4708	pr->pr_prison_racct = NULL;
4709
4710	prison_racct_attach(pr);
4711
4712	/*
4713	 * Move resource utilisation records.
4714	 */
4715	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4716
4717	/*
4718	 * Force rctl to reattach rules to processes.
4719	 */
4720	FOREACH_PROC_IN_SYSTEM(p) {
4721		PROC_LOCK(p);
4722		cred = crhold(p->p_ucred);
4723		PROC_UNLOCK(p);
4724		racct_proc_ucred_changed(p, cred, cred);
4725		crfree(cred);
4726	}
4727
4728	sx_sunlock(&allproc_lock);
4729	prison_racct_free_locked(oldprr);
4730	sx_xunlock(&allprison_lock);
4731}
4732
4733static void
4734prison_racct_detach(struct prison *pr)
4735{
4736
4737	ASSERT_RACCT_ENABLED();
4738	sx_assert(&allprison_lock, SA_UNLOCKED);
4739
4740	if (pr->pr_prison_racct == NULL)
4741		return;
4742	prison_racct_free(pr->pr_prison_racct);
4743	pr->pr_prison_racct = NULL;
4744}
4745#endif /* RACCT */
4746
4747#ifdef DDB
4748
4749static void
4750db_show_prison(struct prison *pr)
4751{
4752	int fi;
4753#if defined(INET) || defined(INET6)
4754	int ii;
4755#endif
4756	unsigned jsf;
4757#ifdef INET6
4758	char ip6buf[INET6_ADDRSTRLEN];
4759#endif
4760
4761	db_printf("prison %p:\n", pr);
4762	db_printf(" jid             = %d\n", pr->pr_id);
4763	db_printf(" name            = %s\n", pr->pr_name);
4764	db_printf(" parent          = %p\n", pr->pr_parent);
4765	db_printf(" ref             = %d\n", pr->pr_ref);
4766	db_printf(" uref            = %d\n", pr->pr_uref);
4767	db_printf(" path            = %s\n", pr->pr_path);
4768	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4769	    ? pr->pr_cpuset->cs_id : -1);
4770#ifdef VIMAGE
4771	db_printf(" vnet            = %p\n", pr->pr_vnet);
4772#endif
4773	db_printf(" root            = %p\n", pr->pr_root);
4774	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4775	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
4776	db_printf(" children.max    = %d\n", pr->pr_childmax);
4777	db_printf(" children.cur    = %d\n", pr->pr_childcount);
4778	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4779	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4780	db_printf(" flags           = 0x%x", pr->pr_flags);
4781	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
4782	    fi++)
4783		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
4784			db_printf(" %s", pr_flag_names[fi]);
4785	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
4786	    fi++) {
4787		jsf = pr->pr_flags &
4788		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
4789		db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
4790		    pr_flag_jailsys[fi].disable &&
4791		      (jsf == pr_flag_jailsys[fi].disable) ? "disable"
4792		    : (jsf == pr_flag_jailsys[fi].new) ? "new"
4793		    : "inherit");
4794	}
4795	db_printf(" allow           = 0x%x", pr->pr_allow);
4796	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
4797	    fi++)
4798		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
4799			db_printf(" %s", pr_allow_names[fi]);
4800	db_printf("\n");
4801	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4802	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4803	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4804	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4805	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4806#ifdef INET
4807	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4808	for (ii = 0; ii < pr->pr_ip4s; ii++)
4809		db_printf(" %s %s\n",
4810		    ii == 0 ? "ip4.addr        =" : "                 ",
4811		    inet_ntoa(pr->pr_ip4[ii]));
4812#endif
4813#ifdef INET6
4814	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4815	for (ii = 0; ii < pr->pr_ip6s; ii++)
4816		db_printf(" %s %s\n",
4817		    ii == 0 ? "ip6.addr        =" : "                 ",
4818		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4819#endif
4820}
4821
4822DB_SHOW_COMMAND(prison, db_show_prison_command)
4823{
4824	struct prison *pr;
4825
4826	if (!have_addr) {
4827		/*
4828		 * Show all prisons in the list, and prison0 which is not
4829		 * listed.
4830		 */
4831		db_show_prison(&prison0);
4832		if (!db_pager_quit) {
4833			TAILQ_FOREACH(pr, &allprison, pr_list) {
4834				db_show_prison(pr);
4835				if (db_pager_quit)
4836					break;
4837			}
4838		}
4839		return;
4840	}
4841
4842	if (addr == 0)
4843		pr = &prison0;
4844	else {
4845		/* Look for a prison with the ID and with references. */
4846		TAILQ_FOREACH(pr, &allprison, pr_list)
4847			if (pr->pr_id == addr && pr->pr_ref > 0)
4848				break;
4849		if (pr == NULL)
4850			/* Look again, without requiring a reference. */
4851			TAILQ_FOREACH(pr, &allprison, pr_list)
4852				if (pr->pr_id == addr)
4853					break;
4854		if (pr == NULL)
4855			/* Assume address points to a valid prison. */
4856			pr = (struct prison *)addr;
4857	}
4858	db_show_prison(pr);
4859}
4860
4861#endif /* DDB */
4862