kern_jail.c revision 168699
1139804Simp/*-
246197Sphk * ----------------------------------------------------------------------------
346197Sphk * "THE BEER-WARE LICENSE" (Revision 42):
446197Sphk * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
546197Sphk * can do whatever you want with this stuff. If we meet some day, and you think
646197Sphk * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
746197Sphk * ----------------------------------------------------------------------------
846197Sphk */
946155Sphk
10116182Sobrien#include <sys/cdefs.h>
11116182Sobrien__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 168699 2007-04-13 23:54:22Z pjd $");
12116182Sobrien
13131177Spjd#include "opt_mac.h"
14131177Spjd
1546155Sphk#include <sys/param.h>
1646155Sphk#include <sys/types.h>
1746155Sphk#include <sys/kernel.h>
1846155Sphk#include <sys/systm.h>
1946155Sphk#include <sys/errno.h>
2046155Sphk#include <sys/sysproto.h>
2146155Sphk#include <sys/malloc.h>
22164032Srwatson#include <sys/priv.h>
2346155Sphk#include <sys/proc.h>
24124882Srwatson#include <sys/taskqueue.h>
2546155Sphk#include <sys/jail.h>
2687275Srwatson#include <sys/lock.h>
2787275Srwatson#include <sys/mutex.h>
28168401Spjd#include <sys/sx.h>
29113275Smike#include <sys/namei.h>
30147185Spjd#include <sys/mount.h>
31113275Smike#include <sys/queue.h>
3246155Sphk#include <sys/socket.h>
33113275Smike#include <sys/syscallsubr.h>
3457163Srwatson#include <sys/sysctl.h>
35113275Smike#include <sys/vnode.h>
3646155Sphk#include <net/if.h>
3746155Sphk#include <netinet/in.h>
3846155Sphk
39163606Srwatson#include <security/mac/mac_framework.h>
40163606Srwatson
4146155SphkMALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
4246155Sphk
4389414SarrSYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
4457163Srwatson    "Jail rules");
4557163Srwatson
4657163Srwatsonint	jail_set_hostname_allowed = 1;
4789414SarrSYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
4857163Srwatson    &jail_set_hostname_allowed, 0,
4957163Srwatson    "Processes in jail can set their hostnames");
5057163Srwatson
5161235Srwatsonint	jail_socket_unixiproute_only = 1;
5289414SarrSYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
5361235Srwatson    &jail_socket_unixiproute_only, 0,
5461235Srwatson    "Processes in jail are limited to creating UNIX/IPv4/route sockets only");
5561235Srwatson
5668024Srwatsonint	jail_sysvipc_allowed = 0;
5789414SarrSYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
5868024Srwatson    &jail_sysvipc_allowed, 0,
5968024Srwatson    "Processes in jail can use System V IPC primitives");
6068024Srwatson
61147185Spjdstatic int jail_enforce_statfs = 2;
62147185SpjdSYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW,
63147185Spjd    &jail_enforce_statfs, 0,
64147185Spjd    "Processes in jail cannot see all mounted file systems");
65125804Srwatson
66128664Sbmilekicint	jail_allow_raw_sockets = 0;
67128664SbmilekicSYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW,
68128664Sbmilekic    &jail_allow_raw_sockets, 0,
69128664Sbmilekic    "Prison root can create raw sockets");
70128664Sbmilekic
71141543Scpercivaint	jail_chflags_allowed = 0;
72141543ScpercivaSYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW,
73141543Scperciva    &jail_chflags_allowed, 0,
74141543Scperciva    "Processes in jail can alter system file flags");
75141543Scperciva
76168396Spjdint	jail_mount_allowed = 0;
77168396SpjdSYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
78168396Spjd    &jail_mount_allowed, 0,
79168396Spjd    "Processes in jail can mount/unmount jail-friendly file systems");
80168396Spjd
81168401Spjd/* allprison, lastprid, and prisoncount are protected by allprison_lock. */
82113275Smikestruct	prisonlist allprison;
83168401Spjdstruct	sx allprison_lock;
84113275Smikeint	lastprid = 0;
85113275Smikeint	prisoncount = 0;
86113275Smike
87168401Spjd/*
88168401Spjd * List of jail services. Protected by allprison_lock.
89168401Spjd */
90168401SpjdTAILQ_HEAD(prison_services_head, prison_service);
91168401Spjdstatic struct prison_services_head prison_services =
92168401Spjd    TAILQ_HEAD_INITIALIZER(prison_services);
93168401Spjdstatic int prison_service_slots = 0;
94168401Spjd
95168401Spjdstruct prison_service {
96168401Spjd	prison_create_t ps_create;
97168401Spjd	prison_destroy_t ps_destroy;
98168401Spjd	int		ps_slotno;
99168401Spjd	TAILQ_ENTRY(prison_service) ps_next;
100168401Spjd	char	ps_name[0];
101168401Spjd};
102168401Spjd
103113275Smikestatic void		 init_prison(void *);
104124882Srwatsonstatic void		 prison_complete(void *context, int pending);
105113275Smikestatic int		 sysctl_jail_list(SYSCTL_HANDLER_ARGS);
106113275Smike
107113275Smikestatic void
108113275Smikeinit_prison(void *data __unused)
109113275Smike{
110113275Smike
111168401Spjd	sx_init(&allprison_lock, "allprison");
112113275Smike	LIST_INIT(&allprison);
113113275Smike}
114113275Smike
115113275SmikeSYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
116113275Smike
11782710Sdillon/*
118114168Smike * struct jail_args {
119114168Smike *	struct jail *jail;
120114168Smike * };
12182710Sdillon */
12246155Sphkint
123114168Smikejail(struct thread *td, struct jail_args *uap)
12446155Sphk{
125113275Smike	struct nameidata nd;
126113275Smike	struct prison *pr, *tpr;
127168401Spjd	struct prison_service *psrv;
12846155Sphk	struct jail j;
129113275Smike	struct jail_attach_args jaa;
130150652Scsjp	int vfslocked, error, tryprid;
13146155Sphk
132114168Smike	error = copyin(uap->jail, &j, sizeof(j));
13346155Sphk	if (error)
13484828Sjhb		return (error);
13584828Sjhb	if (j.version != 0)
13684828Sjhb		return (EINVAL);
13784828Sjhb
138114168Smike	MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
13993818Sjhb	mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
140113275Smike	pr->pr_ref = 1;
141114168Smike	error = copyinstr(j.path, &pr->pr_path, sizeof(pr->pr_path), 0);
142113275Smike	if (error)
143113275Smike		goto e_killmtx;
144150652Scsjp	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE,
145150652Scsjp	    pr->pr_path, td);
146113275Smike	error = namei(&nd);
147150652Scsjp	if (error)
148113275Smike		goto e_killmtx;
149150652Scsjp	vfslocked = NDHASGIANT(&nd);
150113275Smike	pr->pr_root = nd.ni_vp;
151113275Smike	VOP_UNLOCK(nd.ni_vp, 0, td);
152113275Smike	NDFREE(&nd, NDF_ONLY_PNBUF);
153150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
154114168Smike	error = copyinstr(j.hostname, &pr->pr_host, sizeof(pr->pr_host), 0);
15584828Sjhb	if (error)
156113275Smike		goto e_dropvnref;
157113275Smike	pr->pr_ip = j.ip_number;
158113275Smike	pr->pr_linux = NULL;
159113275Smike	pr->pr_securelevel = securelevel;
160168401Spjd	if (prison_service_slots == 0)
161168401Spjd		pr->pr_slots = NULL;
162168401Spjd	else {
163168401Spjd		pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots,
164168401Spjd		    M_PRISON, M_ZERO | M_WAITOK);
165168401Spjd	}
166113275Smike
167113275Smike	/* Determine next pr_id and add prison to allprison list. */
168168401Spjd	sx_xlock(&allprison_lock);
169113275Smike	tryprid = lastprid + 1;
170113275Smike	if (tryprid == JAIL_MAX)
171113275Smike		tryprid = 1;
172113275Smikenext:
173113275Smike	LIST_FOREACH(tpr, &allprison, pr_list) {
174113275Smike		if (tpr->pr_id == tryprid) {
175113275Smike			tryprid++;
176113275Smike			if (tryprid == JAIL_MAX) {
177168401Spjd				sx_xunlock(&allprison_lock);
178113275Smike				error = EAGAIN;
179113275Smike				goto e_dropvnref;
180113275Smike			}
181113275Smike			goto next;
182113275Smike		}
183113275Smike	}
184113275Smike	pr->pr_id = jaa.jid = lastprid = tryprid;
185113275Smike	LIST_INSERT_HEAD(&allprison, pr, pr_list);
186113275Smike	prisoncount++;
187168401Spjd	sx_downgrade(&allprison_lock);
188168401Spjd	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
189168401Spjd		psrv->ps_create(psrv, pr);
190168401Spjd	}
191168401Spjd	sx_sunlock(&allprison_lock);
192113275Smike
193113275Smike	error = jail_attach(td, &jaa);
194113275Smike	if (error)
195113275Smike		goto e_dropprref;
196113275Smike	mtx_lock(&pr->pr_mtx);
197113275Smike	pr->pr_ref--;
198113275Smike	mtx_unlock(&pr->pr_mtx);
199113275Smike	td->td_retval[0] = jaa.jid;
200113275Smike	return (0);
201113275Smikee_dropprref:
202168401Spjd	sx_xlock(&allprison_lock);
203113275Smike	LIST_REMOVE(pr, pr_list);
204113275Smike	prisoncount--;
205168401Spjd	sx_downgrade(&allprison_lock);
206168401Spjd	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
207168401Spjd		psrv->ps_destroy(psrv, pr);
208168401Spjd	}
209168401Spjd	sx_sunlock(&allprison_lock);
210113275Smikee_dropvnref:
211150652Scsjp	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
212113275Smike	vrele(pr->pr_root);
213150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
214113275Smikee_killmtx:
215113275Smike	mtx_destroy(&pr->pr_mtx);
216113275Smike	FREE(pr, M_PRISON);
217113275Smike	return (error);
218113275Smike}
219113275Smike
220113275Smike/*
221114168Smike * struct jail_attach_args {
222114168Smike *	int jid;
223114168Smike * };
224113275Smike */
225113275Smikeint
226114168Smikejail_attach(struct thread *td, struct jail_attach_args *uap)
227113275Smike{
228113275Smike	struct proc *p;
229113275Smike	struct ucred *newcred, *oldcred;
230113275Smike	struct prison *pr;
231150652Scsjp	int vfslocked, error;
232167309Spjd
233126023Snectar	/*
234126023Snectar	 * XXX: Note that there is a slight race here if two threads
235126023Snectar	 * in the same privileged process attempt to attach to two
236126023Snectar	 * different jails at the same time.  It is important for
237126023Snectar	 * user processes not to do this, or they might end up with
238126023Snectar	 * a process root from one prison, but attached to the jail
239126023Snectar	 * of another.
240126023Snectar	 */
241164032Srwatson	error = priv_check(td, PRIV_JAIL_ATTACH);
242126023Snectar	if (error)
243126023Snectar		return (error);
244126023Snectar
245113275Smike	p = td->td_proc;
246168401Spjd	sx_slock(&allprison_lock);
247113275Smike	pr = prison_find(uap->jid);
248113275Smike	if (pr == NULL) {
249168401Spjd		sx_sunlock(&allprison_lock);
250113275Smike		return (EINVAL);
251113275Smike	}
252113275Smike	pr->pr_ref++;
253113275Smike	mtx_unlock(&pr->pr_mtx);
254168401Spjd	sx_sunlock(&allprison_lock);
255113275Smike
256150652Scsjp	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
257113275Smike	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY, td);
258113275Smike	if ((error = change_dir(pr->pr_root, td)) != 0)
259113275Smike		goto e_unlock;
260113275Smike#ifdef MAC
261113275Smike	if ((error = mac_check_vnode_chroot(td->td_ucred, pr->pr_root)))
262113275Smike		goto e_unlock;
263113275Smike#endif
264113275Smike	VOP_UNLOCK(pr->pr_root, 0, td);
265113275Smike	change_root(pr->pr_root, td);
266150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
267113275Smike
26884828Sjhb	newcred = crget();
26984828Sjhb	PROC_LOCK(p);
27084828Sjhb	oldcred = p->p_ucred;
271113275Smike	setsugid(p);
27284828Sjhb	crcopy(newcred, oldcred);
273113630Sjhb	newcred->cr_prison = pr;
27484828Sjhb	p->p_ucred = newcred;
27584828Sjhb	PROC_UNLOCK(p);
27684828Sjhb	crfree(oldcred);
27746155Sphk	return (0);
278113275Smikee_unlock:
279113275Smike	VOP_UNLOCK(pr->pr_root, 0, td);
280150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
281113275Smike	mtx_lock(&pr->pr_mtx);
282113275Smike	pr->pr_ref--;
283113275Smike	mtx_unlock(&pr->pr_mtx);
28446155Sphk	return (error);
28546155Sphk}
28646155Sphk
287113275Smike/*
288113275Smike * Returns a locked prison instance, or NULL on failure.
289113275Smike */
290168399Spjdstruct prison *
291113275Smikeprison_find(int prid)
292113275Smike{
293113275Smike	struct prison *pr;
294113275Smike
295168401Spjd	sx_assert(&allprison_lock, SX_LOCKED);
296113275Smike	LIST_FOREACH(pr, &allprison, pr_list) {
297113275Smike		if (pr->pr_id == prid) {
298113275Smike			mtx_lock(&pr->pr_mtx);
299168489Spjd			if (pr->pr_ref == 0) {
300168489Spjd				mtx_unlock(&pr->pr_mtx);
301168489Spjd				break;
302168489Spjd			}
303113275Smike			return (pr);
304113275Smike		}
305113275Smike	}
306113275Smike	return (NULL);
307113275Smike}
308113275Smike
30972786Srwatsonvoid
31072786Srwatsonprison_free(struct prison *pr)
31172786Srwatson{
31272786Srwatson
31387275Srwatson	mtx_lock(&pr->pr_mtx);
31472786Srwatson	pr->pr_ref--;
31572786Srwatson	if (pr->pr_ref == 0) {
316168483Spjd		mtx_unlock(&pr->pr_mtx);
317124882Srwatson		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
318144660Sjeff		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
31987275Srwatson		return;
32072786Srwatson	}
32187275Srwatson	mtx_unlock(&pr->pr_mtx);
32272786Srwatson}
32372786Srwatson
324124882Srwatsonstatic void
325124882Srwatsonprison_complete(void *context, int pending)
326124882Srwatson{
327168489Spjd	struct prison_service *psrv;
328124882Srwatson	struct prison *pr;
329150652Scsjp	int vfslocked;
330124882Srwatson
331124882Srwatson	pr = (struct prison *)context;
332124882Srwatson
333168489Spjd	sx_xlock(&allprison_lock);
334168489Spjd	LIST_REMOVE(pr, pr_list);
335168489Spjd	prisoncount--;
336168489Spjd	sx_downgrade(&allprison_lock);
337168489Spjd	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
338168489Spjd		psrv->ps_destroy(psrv, pr);
339168489Spjd	}
340168489Spjd	sx_sunlock(&allprison_lock);
341168489Spjd
342150652Scsjp	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
343124882Srwatson	vrele(pr->pr_root);
344150652Scsjp	VFS_UNLOCK_GIANT(vfslocked);
345124882Srwatson
346124882Srwatson	mtx_destroy(&pr->pr_mtx);
347124882Srwatson	if (pr->pr_linux != NULL)
348124882Srwatson		FREE(pr->pr_linux, M_PRISON);
349124882Srwatson	FREE(pr, M_PRISON);
350124882Srwatson}
351124882Srwatson
35272786Srwatsonvoid
35372786Srwatsonprison_hold(struct prison *pr)
35472786Srwatson{
35572786Srwatson
35687275Srwatson	mtx_lock(&pr->pr_mtx);
357168489Spjd	KASSERT(pr->pr_ref > 0,
358168489Spjd	    ("Trying to hold dead prison (id=%d).", pr->pr_id));
35972786Srwatson	pr->pr_ref++;
36087275Srwatson	mtx_unlock(&pr->pr_mtx);
36172786Srwatson}
36272786Srwatson
36387275Srwatsonu_int32_t
36487275Srwatsonprison_getip(struct ucred *cred)
36587275Srwatson{
36687275Srwatson
36787275Srwatson	return (cred->cr_prison->pr_ip);
36887275Srwatson}
36987275Srwatson
37046155Sphkint
37172786Srwatsonprison_ip(struct ucred *cred, int flag, u_int32_t *ip)
37246155Sphk{
37346155Sphk	u_int32_t tmp;
37446155Sphk
37572786Srwatson	if (!jailed(cred))
37646155Sphk		return (0);
377167309Spjd	if (flag)
37846155Sphk		tmp = *ip;
37946155Sphk	else
38046155Sphk		tmp = ntohl(*ip);
38146155Sphk	if (tmp == INADDR_ANY) {
382167309Spjd		if (flag)
38372786Srwatson			*ip = cred->cr_prison->pr_ip;
38446155Sphk		else
38572786Srwatson			*ip = htonl(cred->cr_prison->pr_ip);
38646155Sphk		return (0);
38746155Sphk	}
38881114Srwatson	if (tmp == INADDR_LOOPBACK) {
38981114Srwatson		if (flag)
39081114Srwatson			*ip = cred->cr_prison->pr_ip;
39181114Srwatson		else
39281114Srwatson			*ip = htonl(cred->cr_prison->pr_ip);
39381114Srwatson		return (0);
39481114Srwatson	}
39572786Srwatson	if (cred->cr_prison->pr_ip != tmp)
39646155Sphk		return (1);
39746155Sphk	return (0);
39846155Sphk}
39946155Sphk
40046155Sphkvoid
40172786Srwatsonprison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip)
40246155Sphk{
40346155Sphk	u_int32_t tmp;
40446155Sphk
40572786Srwatson	if (!jailed(cred))
40646155Sphk		return;
40746155Sphk	if (flag)
40846155Sphk		tmp = *ip;
40946155Sphk	else
41046155Sphk		tmp = ntohl(*ip);
41181114Srwatson	if (tmp == INADDR_LOOPBACK) {
41246155Sphk		if (flag)
41372786Srwatson			*ip = cred->cr_prison->pr_ip;
41446155Sphk		else
41572786Srwatson			*ip = htonl(cred->cr_prison->pr_ip);
41646155Sphk		return;
41746155Sphk	}
41846155Sphk	return;
41946155Sphk}
42046155Sphk
42146155Sphkint
42272786Srwatsonprison_if(struct ucred *cred, struct sockaddr *sa)
42346155Sphk{
424114168Smike	struct sockaddr_in *sai;
42546155Sphk	int ok;
42646155Sphk
427114168Smike	sai = (struct sockaddr_in *)sa;
42861235Srwatson	if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only)
42961235Srwatson		ok = 1;
43061235Srwatson	else if (sai->sin_family != AF_INET)
43146155Sphk		ok = 0;
43272786Srwatson	else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr))
43346155Sphk		ok = 1;
43446155Sphk	else
43546155Sphk		ok = 0;
43646155Sphk	return (ok);
43746155Sphk}
43872786Srwatson
43972786Srwatson/*
44072786Srwatson * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
44172786Srwatson */
44272786Srwatsonint
443114168Smikeprison_check(struct ucred *cred1, struct ucred *cred2)
44472786Srwatson{
44572786Srwatson
44672786Srwatson	if (jailed(cred1)) {
44772786Srwatson		if (!jailed(cred2))
44872786Srwatson			return (ESRCH);
44972786Srwatson		if (cred2->cr_prison != cred1->cr_prison)
45072786Srwatson			return (ESRCH);
45172786Srwatson	}
45272786Srwatson
45372786Srwatson	return (0);
45472786Srwatson}
45572786Srwatson
45672786Srwatson/*
45772786Srwatson * Return 1 if the passed credential is in a jail, otherwise 0.
45872786Srwatson */
45972786Srwatsonint
460114168Smikejailed(struct ucred *cred)
46172786Srwatson{
46272786Srwatson
46372786Srwatson	return (cred->cr_prison != NULL);
46472786Srwatson}
46591384Srobert
46691384Srobert/*
46791384Srobert * Return the correct hostname for the passed credential.
46891384Srobert */
46991391Srobertvoid
470114168Smikegetcredhostname(struct ucred *cred, char *buf, size_t size)
47191384Srobert{
47291384Srobert
47391391Srobert	if (jailed(cred)) {
47491391Srobert		mtx_lock(&cred->cr_prison->pr_mtx);
475105354Srobert		strlcpy(buf, cred->cr_prison->pr_host, size);
47691391Srobert		mtx_unlock(&cred->cr_prison->pr_mtx);
477114168Smike	} else
478105354Srobert		strlcpy(buf, hostname, size);
47991384Srobert}
480113275Smike
481125804Srwatson/*
482147185Spjd * Determine whether the subject represented by cred can "see"
483147185Spjd * status of a mount point.
484147185Spjd * Returns: 0 for permitted, ENOENT otherwise.
485147185Spjd * XXX: This function should be called cr_canseemount() and should be
486147185Spjd *      placed in kern_prot.c.
487125804Srwatson */
488125804Srwatsonint
489147185Spjdprison_canseemount(struct ucred *cred, struct mount *mp)
490125804Srwatson{
491147185Spjd	struct prison *pr;
492147185Spjd	struct statfs *sp;
493147185Spjd	size_t len;
494125804Srwatson
495147185Spjd	if (!jailed(cred) || jail_enforce_statfs == 0)
496147185Spjd		return (0);
497147185Spjd	pr = cred->cr_prison;
498147185Spjd	if (pr->pr_root->v_mount == mp)
499147185Spjd		return (0);
500147185Spjd	if (jail_enforce_statfs == 2)
501147185Spjd		return (ENOENT);
502147185Spjd	/*
503147185Spjd	 * If jail's chroot directory is set to "/" we should be able to see
504147185Spjd	 * all mount-points from inside a jail.
505147185Spjd	 * This is ugly check, but this is the only situation when jail's
506147185Spjd	 * directory ends with '/'.
507147185Spjd	 */
508147185Spjd	if (strcmp(pr->pr_path, "/") == 0)
509147185Spjd		return (0);
510147185Spjd	len = strlen(pr->pr_path);
511147185Spjd	sp = &mp->mnt_stat;
512147185Spjd	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
513147185Spjd		return (ENOENT);
514147185Spjd	/*
515147185Spjd	 * Be sure that we don't have situation where jail's root directory
516147185Spjd	 * is "/some/path" and mount point is "/some/pathpath".
517147185Spjd	 */
518147185Spjd	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
519147185Spjd		return (ENOENT);
520147185Spjd	return (0);
521147185Spjd}
522147185Spjd
523147185Spjdvoid
524147185Spjdprison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
525147185Spjd{
526147185Spjd	char jpath[MAXPATHLEN];
527147185Spjd	struct prison *pr;
528147185Spjd	size_t len;
529147185Spjd
530147185Spjd	if (!jailed(cred) || jail_enforce_statfs == 0)
531147185Spjd		return;
532147185Spjd	pr = cred->cr_prison;
533147185Spjd	if (prison_canseemount(cred, mp) != 0) {
534147185Spjd		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
535147185Spjd		strlcpy(sp->f_mntonname, "[restricted]",
536147185Spjd		    sizeof(sp->f_mntonname));
537147185Spjd		return;
538125804Srwatson	}
539147185Spjd	if (pr->pr_root->v_mount == mp) {
540147185Spjd		/*
541147185Spjd		 * Clear current buffer data, so we are sure nothing from
542147185Spjd		 * the valid path left there.
543147185Spjd		 */
544147185Spjd		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
545147185Spjd		*sp->f_mntonname = '/';
546147185Spjd		return;
547147185Spjd	}
548147185Spjd	/*
549147185Spjd	 * If jail's chroot directory is set to "/" we should be able to see
550147185Spjd	 * all mount-points from inside a jail.
551147185Spjd	 */
552147185Spjd	if (strcmp(pr->pr_path, "/") == 0)
553147185Spjd		return;
554147185Spjd	len = strlen(pr->pr_path);
555147185Spjd	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
556147185Spjd	/*
557147185Spjd	 * Clear current buffer data, so we are sure nothing from
558147185Spjd	 * the valid path left there.
559147185Spjd	 */
560147185Spjd	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
561147185Spjd	if (*jpath == '\0') {
562147185Spjd		/* Should never happen. */
563147185Spjd		*sp->f_mntonname = '/';
564147185Spjd	} else {
565147185Spjd		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
566147185Spjd	}
567125804Srwatson}
568125804Srwatson
569164032Srwatson/*
570164032Srwatson * Check with permission for a specific privilege is granted within jail.  We
571164032Srwatson * have a specific list of accepted privileges; the rest are denied.
572164032Srwatson */
573164032Srwatsonint
574164032Srwatsonprison_priv_check(struct ucred *cred, int priv)
575164032Srwatson{
576164032Srwatson
577164032Srwatson	if (!jailed(cred))
578164032Srwatson		return (0);
579164032Srwatson
580164032Srwatson	switch (priv) {
581164032Srwatson
582164032Srwatson		/*
583164032Srwatson		 * Allow ktrace privileges for root in jail.
584164032Srwatson		 */
585164032Srwatson	case PRIV_KTRACE:
586164032Srwatson
587166827Srwatson#if 0
588164032Srwatson		/*
589164032Srwatson		 * Allow jailed processes to configure audit identity and
590164032Srwatson		 * submit audit records (login, etc).  In the future we may
591164032Srwatson		 * want to further refine the relationship between audit and
592164032Srwatson		 * jail.
593164032Srwatson		 */
594164032Srwatson	case PRIV_AUDIT_GETAUDIT:
595164032Srwatson	case PRIV_AUDIT_SETAUDIT:
596164032Srwatson	case PRIV_AUDIT_SUBMIT:
597166827Srwatson#endif
598164032Srwatson
599164032Srwatson		/*
600164032Srwatson		 * Allow jailed processes to manipulate process UNIX
601164032Srwatson		 * credentials in any way they see fit.
602164032Srwatson		 */
603164032Srwatson	case PRIV_CRED_SETUID:
604164032Srwatson	case PRIV_CRED_SETEUID:
605164032Srwatson	case PRIV_CRED_SETGID:
606164032Srwatson	case PRIV_CRED_SETEGID:
607164032Srwatson	case PRIV_CRED_SETGROUPS:
608164032Srwatson	case PRIV_CRED_SETREUID:
609164032Srwatson	case PRIV_CRED_SETREGID:
610164032Srwatson	case PRIV_CRED_SETRESUID:
611164032Srwatson	case PRIV_CRED_SETRESGID:
612164032Srwatson
613164032Srwatson		/*
614164032Srwatson		 * Jail implements visibility constraints already, so allow
615164032Srwatson		 * jailed root to override uid/gid-based constraints.
616164032Srwatson		 */
617164032Srwatson	case PRIV_SEEOTHERGIDS:
618164032Srwatson	case PRIV_SEEOTHERUIDS:
619164032Srwatson
620164032Srwatson		/*
621164032Srwatson		 * Jail implements inter-process debugging limits already, so
622164032Srwatson		 * allow jailed root various debugging privileges.
623164032Srwatson		 */
624164032Srwatson	case PRIV_DEBUG_DIFFCRED:
625164032Srwatson	case PRIV_DEBUG_SUGID:
626164032Srwatson	case PRIV_DEBUG_UNPRIV:
627164032Srwatson
628164032Srwatson		/*
629164032Srwatson		 * Allow jail to set various resource limits and login
630164032Srwatson		 * properties, and for now, exceed process resource limits.
631164032Srwatson		 */
632164032Srwatson	case PRIV_PROC_LIMIT:
633164032Srwatson	case PRIV_PROC_SETLOGIN:
634164032Srwatson	case PRIV_PROC_SETRLIMIT:
635164032Srwatson
636164032Srwatson		/*
637164032Srwatson		 * System V and POSIX IPC privileges are granted in jail.
638164032Srwatson		 */
639164032Srwatson	case PRIV_IPC_READ:
640164032Srwatson	case PRIV_IPC_WRITE:
641164032Srwatson	case PRIV_IPC_ADMIN:
642164032Srwatson	case PRIV_IPC_MSGSIZE:
643164032Srwatson	case PRIV_MQ_ADMIN:
644164032Srwatson
645164032Srwatson		/*
646164032Srwatson		 * Jail implements its own inter-process limits, so allow
647164032Srwatson		 * root processes in jail to change scheduling on other
648164032Srwatson		 * processes in the same jail.  Likewise for signalling.
649164032Srwatson		 */
650164032Srwatson	case PRIV_SCHED_DIFFCRED:
651164032Srwatson	case PRIV_SIGNAL_DIFFCRED:
652164032Srwatson	case PRIV_SIGNAL_SUGID:
653164032Srwatson
654164032Srwatson		/*
655164032Srwatson		 * Allow jailed processes to write to sysctls marked as jail
656164032Srwatson		 * writable.
657164032Srwatson		 */
658164032Srwatson	case PRIV_SYSCTL_WRITEJAIL:
659164032Srwatson
660164032Srwatson		/*
661164032Srwatson		 * Allow root in jail to manage a variety of quota
662166831Srwatson		 * properties.  These should likely be conditional on a
663166831Srwatson		 * configuration option.
664164032Srwatson		 */
665166832Srwatson	case PRIV_VFS_GETQUOTA:
666166832Srwatson	case PRIV_VFS_SETQUOTA:
667164032Srwatson
668164032Srwatson		/*
669164032Srwatson		 * Since Jail relies on chroot() to implement file system
670164032Srwatson		 * protections, grant many VFS privileges to root in jail.
671164032Srwatson		 * Be careful to exclude mount-related and NFS-related
672164032Srwatson		 * privileges.
673164032Srwatson		 */
674164032Srwatson	case PRIV_VFS_READ:
675164032Srwatson	case PRIV_VFS_WRITE:
676164032Srwatson	case PRIV_VFS_ADMIN:
677164032Srwatson	case PRIV_VFS_EXEC:
678164032Srwatson	case PRIV_VFS_LOOKUP:
679164032Srwatson	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
680164032Srwatson	case PRIV_VFS_CHFLAGS_DEV:
681164032Srwatson	case PRIV_VFS_CHOWN:
682164032Srwatson	case PRIV_VFS_CHROOT:
683167152Spjd	case PRIV_VFS_RETAINSUGID:
684164032Srwatson	case PRIV_VFS_FCHROOT:
685164032Srwatson	case PRIV_VFS_LINK:
686164032Srwatson	case PRIV_VFS_SETGID:
687164032Srwatson	case PRIV_VFS_STICKYFILE:
688164032Srwatson		return (0);
689164032Srwatson
690164032Srwatson		/*
691164032Srwatson		 * Depending on the global setting, allow privilege of
692164032Srwatson		 * setting system flags.
693164032Srwatson		 */
694164032Srwatson	case PRIV_VFS_SYSFLAGS:
695164032Srwatson		if (jail_chflags_allowed)
696164032Srwatson			return (0);
697164032Srwatson		else
698164032Srwatson			return (EPERM);
699164032Srwatson
700164032Srwatson		/*
701168396Spjd		 * Depending on the global setting, allow privilege of
702168396Spjd		 * mounting/unmounting file systems.
703168396Spjd		 */
704168396Spjd	case PRIV_VFS_MOUNT:
705168396Spjd	case PRIV_VFS_UNMOUNT:
706168396Spjd	case PRIV_VFS_MOUNT_NONUSER:
707168699Spjd	case PRIV_VFS_MOUNT_OWNER:
708168396Spjd		if (jail_mount_allowed)
709168396Spjd			return (0);
710168396Spjd		else
711168396Spjd			return (EPERM);
712168396Spjd
713168396Spjd		/*
714168591Srwatson		 * Allow jailed root to bind reserved ports and reuse in-use
715168591Srwatson		 * ports.
716164032Srwatson		 */
717164032Srwatson	case PRIV_NETINET_RESERVEDPORT:
718168591Srwatson	case PRIV_NETINET_REUSEPORT:
719164032Srwatson		return (0);
720164032Srwatson
721164032Srwatson		/*
722164032Srwatson		 * Conditionally allow creating raw sockets in jail.
723164032Srwatson		 */
724164032Srwatson	case PRIV_NETINET_RAW:
725164032Srwatson		if (jail_allow_raw_sockets)
726164032Srwatson			return (0);
727164032Srwatson		else
728164032Srwatson			return (EPERM);
729164032Srwatson
730164032Srwatson		/*
731164032Srwatson		 * Since jail implements its own visibility limits on netstat
732164032Srwatson		 * sysctls, allow getcred.  This allows identd to work in
733164032Srwatson		 * jail.
734164032Srwatson		 */
735164032Srwatson	case PRIV_NETINET_GETCRED:
736164032Srwatson		return (0);
737164032Srwatson
738164032Srwatson	default:
739164032Srwatson		/*
740164032Srwatson		 * In all remaining cases, deny the privilege request.  This
741164032Srwatson		 * includes almost all network privileges, many system
742164032Srwatson		 * configuration privileges.
743164032Srwatson		 */
744164032Srwatson		return (EPERM);
745164032Srwatson	}
746164032Srwatson}
747164032Srwatson
748168401Spjd/*
749168401Spjd * Register jail service. Provides 'create' and 'destroy' methods.
750168401Spjd * 'create' method will be called for every existing jail and all
751168401Spjd * jails in the future as they beeing created.
752168401Spjd * 'destroy' method will be called for every jail going away and
753168401Spjd * for all existing jails at the time of service deregistration.
754168401Spjd */
755168401Spjdstruct prison_service *
756168401Spjdprison_service_register(const char *name, prison_create_t create,
757168401Spjd    prison_destroy_t destroy)
758168401Spjd{
759168401Spjd	struct prison_service *psrv, *psrv2;
760168401Spjd	struct prison *pr;
761168401Spjd	int reallocate = 1, slotno = 0;
762168401Spjd	void **slots, **oldslots;
763168401Spjd
764168401Spjd	psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON,
765168401Spjd	    M_WAITOK | M_ZERO);
766168401Spjd	psrv->ps_create = create;
767168401Spjd	psrv->ps_destroy = destroy;
768168401Spjd	strcpy(psrv->ps_name, name);
769168401Spjd	/*
770168401Spjd	 * Grab the allprison_lock here, so we won't miss any jail
771168401Spjd	 * creation/destruction.
772168401Spjd	 */
773168401Spjd	sx_xlock(&allprison_lock);
774168401Spjd#ifdef INVARIANTS
775168401Spjd	/*
776168401Spjd	 * Verify if service is not already registered.
777168401Spjd	 */
778168401Spjd	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
779168401Spjd		KASSERT(strcmp(psrv2->ps_name, name) != 0,
780168401Spjd		    ("jail service %s already registered", name));
781168401Spjd	}
782168401Spjd#endif
783168401Spjd	/*
784168401Spjd	 * Find free slot. When there is no existing free slot available,
785168401Spjd	 * allocate one at the end.
786168401Spjd	 */
787168401Spjd	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
788168401Spjd		if (psrv2->ps_slotno != slotno) {
789168401Spjd			KASSERT(slotno < psrv2->ps_slotno,
790168401Spjd			    ("Invalid slotno (slotno=%d >= ps_slotno=%d",
791168401Spjd			    slotno, psrv2->ps_slotno));
792168401Spjd			/* We found free slot. */
793168401Spjd			reallocate = 0;
794168401Spjd			break;
795168401Spjd		}
796168401Spjd		slotno++;
797168401Spjd	}
798168401Spjd	psrv->ps_slotno = slotno;
799168401Spjd	/*
800168401Spjd	 * Keep the list sorted by slot number.
801168401Spjd	 */
802168401Spjd	if (psrv2 != NULL) {
803168401Spjd		KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0"));
804168401Spjd		TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next);
805168401Spjd	} else {
806168401Spjd		KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0"));
807168401Spjd		TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next);
808168401Spjd	}
809168401Spjd	prison_service_slots++;
810168401Spjd	sx_downgrade(&allprison_lock);
811168401Spjd	/*
812168401Spjd	 * Allocate memory for new slot if we didn't found empty one.
813168401Spjd	 * Do not use realloc(9), because pr_slots is protected with a mutex,
814168401Spjd	 * so we can't sleep.
815168401Spjd	 */
816168401Spjd	LIST_FOREACH(pr, &allprison, pr_list) {
817168401Spjd		if (reallocate) {
818168401Spjd			/* First allocate memory with M_WAITOK. */
819168401Spjd			slots = malloc(sizeof(*slots) * prison_service_slots,
820168401Spjd			    M_PRISON, M_WAITOK);
821168401Spjd			/* Now grab the mutex and replace pr_slots. */
822168401Spjd			mtx_lock(&pr->pr_mtx);
823168401Spjd			oldslots = pr->pr_slots;
824168401Spjd			if (psrv->ps_slotno > 0) {
825168401Spjd				bcopy(oldslots, slots,
826168401Spjd				    sizeof(*slots) * (prison_service_slots - 1));
827168401Spjd			}
828168401Spjd			slots[psrv->ps_slotno] = NULL;
829168401Spjd			pr->pr_slots = slots;
830168401Spjd			mtx_unlock(&pr->pr_mtx);
831168401Spjd			if (oldslots != NULL)
832168401Spjd				free(oldslots, M_PRISON);
833168401Spjd		}
834168401Spjd		/*
835168401Spjd		 * Call 'create' method for each existing jail.
836168401Spjd		 */
837168401Spjd		psrv->ps_create(psrv, pr);
838168401Spjd	}
839168401Spjd	sx_sunlock(&allprison_lock);
840168401Spjd
841168401Spjd	return (psrv);
842168401Spjd}
843168401Spjd
844168401Spjdvoid
845168401Spjdprison_service_deregister(struct prison_service *psrv)
846168401Spjd{
847168401Spjd	struct prison *pr;
848168401Spjd	void **slots, **oldslots;
849168401Spjd	int last = 0;
850168401Spjd
851168401Spjd	sx_xlock(&allprison_lock);
852168401Spjd	if (TAILQ_LAST(&prison_services, prison_services_head) == psrv)
853168401Spjd		last = 1;
854168401Spjd	TAILQ_REMOVE(&prison_services, psrv, ps_next);
855168401Spjd	prison_service_slots--;
856168401Spjd	sx_downgrade(&allprison_lock);
857168401Spjd	LIST_FOREACH(pr, &allprison, pr_list) {
858168401Spjd		/*
859168401Spjd		 * Call 'destroy' method for every currently existing jail.
860168401Spjd		 */
861168401Spjd		psrv->ps_destroy(psrv, pr);
862168401Spjd		/*
863168401Spjd		 * If this is the last slot, free the memory allocated for it.
864168401Spjd		 */
865168401Spjd		if (last) {
866168401Spjd			if (prison_service_slots == 0)
867168401Spjd				slots = NULL;
868168401Spjd			else {
869168401Spjd				slots = malloc(sizeof(*slots) * prison_service_slots,
870168401Spjd				    M_PRISON, M_WAITOK);
871168401Spjd			}
872168401Spjd			mtx_lock(&pr->pr_mtx);
873168401Spjd			oldslots = pr->pr_slots;
874168401Spjd			/*
875168401Spjd			 * We require setting slot to NULL after freeing it,
876168401Spjd			 * this way we can check for memory leaks here.
877168401Spjd			 */
878168401Spjd			KASSERT(oldslots[psrv->ps_slotno] == NULL,
879168401Spjd			    ("Slot %d (service %s, jailid=%d) still contains data?",
880168401Spjd			     psrv->ps_slotno, psrv->ps_name, pr->pr_id));
881168401Spjd			if (psrv->ps_slotno > 0) {
882168401Spjd				bcopy(oldslots, slots,
883168401Spjd				    sizeof(*slots) * prison_service_slots);
884168401Spjd			}
885168401Spjd			pr->pr_slots = slots;
886168401Spjd			mtx_unlock(&pr->pr_mtx);
887168401Spjd			KASSERT(oldslots != NULL, ("oldslots == NULL"));
888168401Spjd			free(oldslots, M_PRISON);
889168401Spjd		}
890168401Spjd	}
891168401Spjd	sx_sunlock(&allprison_lock);
892168401Spjd	free(psrv, M_PRISON);
893168401Spjd}
894168401Spjd
895168401Spjd/*
896168401Spjd * Function sets data for the given jail in slot assigned for the given
897168401Spjd * jail service.
898168401Spjd */
899168401Spjdvoid
900168401Spjdprison_service_data_set(struct prison_service *psrv, struct prison *pr,
901168401Spjd    void *data)
902168401Spjd{
903168401Spjd
904168401Spjd	mtx_assert(&pr->pr_mtx, MA_OWNED);
905168401Spjd	pr->pr_slots[psrv->ps_slotno] = data;
906168401Spjd}
907168401Spjd
908168401Spjd/*
909168401Spjd * Function clears slots assigned for the given jail service in the given
910168401Spjd * prison structure and returns current slot data.
911168401Spjd */
912168401Spjdvoid *
913168401Spjdprison_service_data_del(struct prison_service *psrv, struct prison *pr)
914168401Spjd{
915168401Spjd	void *data;
916168401Spjd
917168401Spjd	mtx_assert(&pr->pr_mtx, MA_OWNED);
918168401Spjd	data = pr->pr_slots[psrv->ps_slotno];
919168401Spjd	pr->pr_slots[psrv->ps_slotno] = NULL;
920168401Spjd	return (data);
921168401Spjd}
922168401Spjd
923168401Spjd/*
924168401Spjd * Function returns current data from the slot assigned to the given jail
925168401Spjd * service for the given jail.
926168401Spjd */
927168401Spjdvoid *
928168401Spjdprison_service_data_get(struct prison_service *psrv, struct prison *pr)
929168401Spjd{
930168401Spjd
931168401Spjd	mtx_assert(&pr->pr_mtx, MA_OWNED);
932168401Spjd	return (pr->pr_slots[psrv->ps_slotno]);
933168401Spjd}
934168401Spjd
935113275Smikestatic int
936113275Smikesysctl_jail_list(SYSCTL_HANDLER_ARGS)
937113275Smike{
938113275Smike	struct xprison *xp, *sxp;
939113275Smike	struct prison *pr;
940113275Smike	int count, error;
941113275Smike
942127020Spjd	if (jailed(req->td->td_ucred))
943125806Srwatson		return (0);
944113275Smike
945168401Spjd	sx_slock(&allprison_lock);
946168401Spjd	if ((count = prisoncount) == 0) {
947168401Spjd		sx_sunlock(&allprison_lock);
948113275Smike		return (0);
949168401Spjd	}
950113275Smike
951113275Smike	sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO);
952167309Spjd
953113275Smike	LIST_FOREACH(pr, &allprison, pr_list) {
954113275Smike		xp->pr_version = XPRISON_VERSION;
955113275Smike		xp->pr_id = pr->pr_id;
956168487Spjd		xp->pr_ip = pr->pr_ip;
957113275Smike		strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
958168487Spjd		mtx_lock(&pr->pr_mtx);
959113275Smike		strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
960113275Smike		mtx_unlock(&pr->pr_mtx);
961113275Smike		xp++;
962113275Smike	}
963168401Spjd	sx_sunlock(&allprison_lock);
964113275Smike
965113275Smike	error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count);
966113275Smike	free(sxp, M_TEMP);
967167354Spjd	return (error);
968113275Smike}
969113275Smike
970113275SmikeSYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD,
971113275Smike    NULL, 0, sysctl_jail_list, "S", "List of active jails");
972126004Spjd
973126004Spjdstatic int
974126004Spjdsysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
975126004Spjd{
976126004Spjd	int error, injail;
977126004Spjd
978126004Spjd	injail = jailed(req->td->td_ucred);
979126004Spjd	error = SYSCTL_OUT(req, &injail, sizeof(injail));
980126004Spjd
981126004Spjd	return (error);
982126004Spjd}
983126004SpjdSYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD,
984126004Spjd    NULL, 0, sysctl_jail_jailed, "I", "Process in jail?");
985