1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2011 James Gritton
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/types.h>
30#include <sys/cpuset.h>
31#include <sys/event.h>
32#include <sys/mount.h>
33#include <sys/stat.h>
34#include <sys/sysctl.h>
35#include <sys/user.h>
36#include <sys/wait.h>
37
38#include <err.h>
39#include <errno.h>
40#include <fcntl.h>
41#include <kvm.h>
42#include <login_cap.h>
43#include <paths.h>
44#include <pwd.h>
45#include <signal.h>
46#include <stdio.h>
47#include <stdlib.h>
48#include <string.h>
49#include <unistd.h>
50#include <vis.h>
51
52#include "jailp.h"
53
/* Seconds term_procs() waits after SIGTERM when IP_STOP_TIMEOUT is not set. */
#define DEFAULT_STOP_TIMEOUT	10
/* Number of buckets in the pid hash; a pid lives in bucket pid % PHASH_SIZE. */
#define PHASH_SIZE		256

LIST_HEAD(phhead, phash);

/* One watched process: the pid and the jail it is being tracked for. */
struct phash {
	LIST_ENTRY(phash)	le;
	struct cfjail		*j;
	pid_t			pid;
};

/*
 * Number of further commands that may be waited on at once.  run_command()
 * decrements it when it starts a tracked process, finish_command()
 * increments it again, and next_command() defers a jail while it is zero.
 * Counting down from the initial -1 does not reach zero, so no limit
 * applies unless a different value is stored here (not done in this file).
 */
int paralimit = -1;

extern char **environ;

/* Forward declarations of the file-local helpers defined further down. */
static int run_command(struct cfjail *j);
static int add_proc(struct cfjail *j, pid_t pid);
static void clear_procs(struct cfjail *j);
static struct cfjail *find_proc(pid_t pid);
static int term_procs(struct cfjail *j);
static int get_user_info(struct cfjail *j, const char *username,
    const struct passwd **pwdp, login_cap_t **lcapp);
static int check_path(struct cfjail *j, const char *pname, const char *path,
    int isfile, const char *umount_type);

/*
 * Jails waiting on tracked processes.  add_proc() files jails that have a
 * timeout in deadline order and appends those without one via requeue().
 */
static struct cfjails sleeping = TAILQ_HEAD_INITIALIZER(sleeping);
/* Jails that next_command() deferred because paralimit was zero. */
static struct cfjails runnable = TAILQ_HEAD_INITIALIZER(runnable);
/* Placeholder command string (len 1, no text) for pseudo-commands. */
static struct cfstring dummystring = { .len = 1 };
/* Hash table of watched processes, see add_proc() and find_proc(). */
static struct phhead phash[PHASH_SIZE];
/* kqueue descriptor; zero until add_proc() creates it on first use. */
static int kq;
84
/*
 * Return the id of the root cpuset of the current process, or
 * CPUSET_INVALID if it could not be determined.  The lookup is attempted
 * at most once; both its result and a failure are remembered.
 */
static cpusetid_t
root_cpuset_id(void)
{
	static cpusetid_t cached_id = CPUSET_INVALID;
	static int lookup_error;

	if (lookup_error == 0 && cached_id == CPUSET_INVALID) {
		lookup_error = cpuset_getid(CPU_LEVEL_ROOT, CPU_WHICH_PID, -1,
		    &cached_id);
	}
	return (lookup_error != 0 ? CPUSET_INVALID : cached_id);
}
98
99/*
100 * Run the next command associated with a jail.
101 */
102int
103next_command(struct cfjail *j)
104{
105	enum intparam comparam;
106	int create_failed, stopping;
107
108	if (paralimit == 0) {
109		if (j->flags & JF_FROM_RUNQ)
110			requeue_head(j, &runnable);
111		else
112			requeue(j, &runnable);
113		return 1;
114	}
115	j->flags &= ~JF_FROM_RUNQ;
116	create_failed = (j->flags & (JF_STOP | JF_FAILED)) == JF_FAILED;
117	stopping = (j->flags & JF_STOP) != 0;
118	comparam = *j->comparam;
119	for (;;) {
120		if (j->comstring == NULL) {
121			j->comparam += create_failed ? -1 : 1;
122			switch ((comparam = *j->comparam)) {
123			case IP__NULL:
124				return 0;
125			case IP_MOUNT_DEVFS:
126				if (!bool_param(j->intparams[IP_MOUNT_DEVFS]))
127					continue;
128				j->comstring = &dummystring;
129				break;
130			case IP_MOUNT_FDESCFS:
131				if (!bool_param(j->intparams[IP_MOUNT_FDESCFS]))
132					continue;
133				j->comstring = &dummystring;
134				break;
135			case IP_MOUNT_PROCFS:
136				if (!bool_param(j->intparams[IP_MOUNT_PROCFS]))
137					continue;
138				j->comstring = &dummystring;
139				break;
140			case IP__OP:
141			case IP_STOP_TIMEOUT:
142				j->comstring = &dummystring;
143				break;
144			default:
145				if (j->intparams[comparam] == NULL)
146					continue;
147				j->comstring = create_failed || (stopping &&
148				    (j->intparams[comparam]->flags & PF_REV))
149				    ? TAILQ_LAST(&j->intparams[comparam]->val,
150					cfstrings)
151				    : TAILQ_FIRST(&j->intparams[comparam]->val);
152			}
153		} else {
154			j->comstring = j->comstring == &dummystring ? NULL :
155			    create_failed || (stopping &&
156			    (j->intparams[comparam]->flags & PF_REV))
157			    ? TAILQ_PREV(j->comstring, cfstrings, tq)
158			    : TAILQ_NEXT(j->comstring, tq);
159		}
160		if (j->comstring == NULL || j->comstring->len == 0 ||
161		    (create_failed && (comparam == IP_EXEC_PRESTART ||
162		    comparam == IP_EXEC_CREATED || comparam == IP_EXEC_START ||
163		    comparam == IP_COMMAND || comparam == IP_EXEC_POSTSTART ||
164		    comparam == IP_EXEC_PREPARE)))
165			continue;
166		switch (run_command(j)) {
167		case -1:
168			failed(j);
169			/* FALLTHROUGH */
170		case 1:
171			return 1;
172		}
173	}
174}
175
176/*
177 * Check command exit status
178 */
179int
180finish_command(struct cfjail *j)
181{
182	struct cfjail *rj;
183	int error;
184
185	if (!(j->flags & JF_SLEEPQ))
186		return 0;
187	j->flags &= ~JF_SLEEPQ;
188	if (*j->comparam == IP_STOP_TIMEOUT) {
189		j->flags &= ~JF_TIMEOUT;
190		j->pstatus = 0;
191		return 0;
192	}
193	paralimit++;
194	if (!TAILQ_EMPTY(&runnable)) {
195		rj = TAILQ_FIRST(&runnable);
196		rj->flags |= JF_FROM_RUNQ;
197		requeue(rj, &ready);
198	}
199	error = 0;
200	if (j->flags & JF_TIMEOUT) {
201		j->flags &= ~JF_TIMEOUT;
202		if (*j->comparam != IP_STOP_TIMEOUT) {
203			jail_warnx(j, "%s: timed out", j->comline);
204			failed(j);
205			error = -1;
206		} else if (verbose > 0)
207			jail_note(j, "timed out\n");
208	} else if (j->pstatus != 0) {
209		if (WIFSIGNALED(j->pstatus))
210			jail_warnx(j, "%s: exited on signal %d",
211			    j->comline, WTERMSIG(j->pstatus));
212		else
213			jail_warnx(j, "%s: failed", j->comline);
214		j->pstatus = 0;
215		failed(j);
216		error = -1;
217	}
218	free(j->comline);
219	j->comline = NULL;
220	return error;
221}
222
/*
 * Check for finished processes or timeouts.
 *
 * Returns the jail whose last tracked process has exited (with its
 * pstatus taken from the event data), or the jail at the head of the
 * sleep queue once its deadline has passed (flagged JF_TIMEOUT and with
 * its tracked processes dropped).  Returns NULL when no jail is sleeping,
 * or when nonblock is set and no event is pending.
 */
struct cfjail *
next_proc(int nonblock)
{
	struct kevent kev;
	struct timespec left;
	struct timespec *tmo;
	struct cfjail *j;
	int nev;

	if (TAILQ_EMPTY(&sleeping))
		return NULL;
	for (;;) {
		tmo = NULL;
		j = TAILQ_FIRST(&sleeping);
		if (j != NULL && j->timeout.tv_sec) {
			/* Time remaining until the head jail's deadline. */
			clock_gettime(CLOCK_REALTIME, &left);
			left.tv_sec = j->timeout.tv_sec - left.tv_sec;
			left.tv_nsec = j->timeout.tv_nsec - left.tv_nsec;
			if (left.tv_nsec < 0) {
				left.tv_sec--;
				left.tv_nsec += 1000000000;
			}
			if (left.tv_sec < 0 ||
			    (left.tv_sec == 0 && left.tv_nsec == 0)) {
				/* Already overdue. */
				j->flags |= JF_TIMEOUT;
				clear_procs(j);
				return j;
			}
			tmo = &left;
		}
		if (nonblock) {
			/* Poll only. */
			left.tv_sec = 0;
			left.tv_nsec = 0;
			tmo = &left;
		}
		nev = kevent(kq, NULL, 0, &kev, 1, tmo);
		if (nev == -1) {
			if (errno != EINTR)
				err(1, "kevent");
			continue;
		}
		if (nev == 1) {
			(void)waitpid(kev.ident, NULL, WNOHANG);
			j = find_proc(kev.ident);
			if (j != NULL) {
				j->pstatus = kev.data;
				return j;
			}
			/* Unknown pid, or its jail still has processes left. */
			continue;
		}
		if (nev == 0 && !nonblock) {
			/* The wait for the head jail's deadline ran out. */
			j = TAILQ_FIRST(&sleeping);
			j->flags |= JF_TIMEOUT;
			clear_procs(j);
			return j;
		}
		return NULL;
	}
}
282
/*
 * Run a single command for a jail, possibly inside the jail.
 *
 * The command is selected by *j->comparam and j->comstring.  IP__OP
 * (create/remove the jail) and IP_STOP_TIMEOUT (terminate the jail's
 * processes) are handled directly; everything else is turned into an
 * argument vector and run in a forked child.
 *
 * Returns -1 on error, 0 when there is nothing to wait for (operation
 * done, empty command, background command, or the child could not be
 * tracked), and 1 when the jail has been put to sleep waiting for a
 * process; in that case a paralimit slot is taken and j->comline holds
 * the command line for later messages.
 */
static int
run_command(struct cfjail *j)
{
	const struct passwd *pwd;
	const struct cfstring *comstring, *s;
	login_cap_t *lcap;
	const char **argv;
	char *acs, *cs, *comcs, *devpath;
	const char *jidstr, *conslog, *fmt, *path, *ruleset, *term, *username;
	enum intparam comparam;
	size_t comlen, ret;
	pid_t pid;
	cpusetid_t setid;
	int argc, bg, clean, consfd, down, fib, i, injail, sjuser, timeout;
#if defined(INET) || defined(INET6)
	char *addr, *extrap, *p, *val;
#endif

	/* Empty environment installed in the child when exec.clean is set. */
	static char *cleanenv;

	/* Perform some operations that aren't actually commands */
	comparam = *j->comparam;
	/* Non-zero when the jail is being stopped or a creation is undone. */
	down = j->flags & (JF_STOP | JF_FAILED);
	switch (comparam) {
	case IP_STOP_TIMEOUT:
		return term_procs(j);

	case IP__OP:
		if (down) {
			/* Only EPERM from jail_remove() is treated as fatal. */
			if (jail_remove(j->jid) < 0 && errno == EPERM) {
				jail_warnx(j, "jail_remove: %s",
				    strerror(errno));
				return -1;
			}
			if (verbose > 0 || (verbose == 0 && (j->flags & JF_STOP
			    ? note_remove : j->name != NULL)))
				jail_note(j, "removed\n");
			j->jid = -1;
			if (j->flags & JF_STOP)
				dep_done(j, DF_LIGHT);
			else
				j->flags &= ~JF_PERSIST;
		} else {
			if (create_jail(j) < 0)
				return -1;
			if (iflag)
				printf("%d\n", j->jid);
			if (verbose >= 0 && (j->name || verbose > 0))
				jail_note(j, "created\n");
			dep_done(j, DF_LIGHT);
		}
		return 0;

	default: ;
	}
	/*
	 * Collect exec arguments.  Internal commands for network and
	 * mounting build their own argument lists.
	 */
	comstring = j->comstring;
	bg = 0;
	switch (comparam) {
#ifdef INET
	case IP__IP4_IFADDR:
		/*
		 * The value is "[iface|]addr[/mask] [extra ifconfig args]".
		 * Count the extra words to size argv and cut the string at
		 * the first space.
		 */
		argc = 0;
		val = alloca(strlen(comstring->s) + 1);
		strcpy(val, comstring->s);
		cs = val;
		extrap = NULL;
		while ((p = strchr(cs, ' ')) != NULL && strlen(p) > 1) {
			if (extrap == NULL) {
				*p = '\0';
				extrap = p + 1;
			}
			cs = p + 1;
			argc++;
		}

		argv = alloca((8 + argc) * sizeof(char *));
		argv[0] = _PATH_IFCONFIG;
		if ((cs = strchr(val, '|'))) {
			argv[1] = acs = alloca(cs - val + 1);
			strlcpy(acs, val, cs - val + 1);
			addr = cs + 1;
		} else {
			argv[1] = string_param(j->intparams[IP_INTERFACE]);
			addr = val;
		}
		argv[2] = "inet";
		if (!(cs = strchr(addr, '/'))) {
			/* No mask given: use a host netmask. */
			argv[3] = addr;
			argv[4] = "netmask";
			argv[5] = "255.255.255.255";
			argc = 6;
		} else if (strchr(cs + 1, '.')) {
			/* Dotted mask after the slash: pass it separately. */
			argv[3] = acs = alloca(cs - addr + 1);
			strlcpy(acs, addr, cs - addr + 1);
			argv[4] = "netmask";
			argv[5] = cs + 1;
			argc = 6;
		} else {
			/* Anything else after the slash stays with the address. */
			argv[3] = addr;
			argc = 4;
		}

		/* The extra arguments are only used when adding the address. */
		if (!down && extrap != NULL) {
			for (cs = strtok(extrap, " "); cs;
			     cs = strtok(NULL, " ")) {
				size_t len = strlen(cs) + 1;
				argv[argc++] = acs = alloca(len);
				strlcpy(acs, cs, len);
			}
		}

		argv[argc] = down ? "-alias" : "alias";
		argv[argc + 1] = NULL;
		break;
#endif

#ifdef INET6
	case IP__IP6_IFADDR:
		/* Same layout as the IPv4 case: "[iface|]addr[/plen] [extra]". */
		argc = 0;
		val = alloca(strlen(comstring->s) + 1);
		strcpy(val, comstring->s);
		cs = val;
		extrap = NULL;
		while ((p = strchr(cs, ' ')) != NULL && strlen(p) > 1) {
			if (extrap == NULL) {
				*p = '\0';
				extrap = p + 1;
			}
			cs = p + 1;
			argc++;
		}

		argv = alloca((8 + argc) * sizeof(char *));
		argv[0] = _PATH_IFCONFIG;
		if ((cs = strchr(val, '|'))) {
			argv[1] = acs = alloca(cs - val + 1);
			strlcpy(acs, val, cs - val + 1);
			addr = cs + 1;
		} else {
			argv[1] = string_param(j->intparams[IP_INTERFACE]);
			addr = val;
		}
		argv[2] = "inet6";
		argv[3] = addr;
		if (!(cs = strchr(addr, '/'))) {
			/* No prefix length given: use a host prefix. */
			argv[4] = "prefixlen";
			argv[5] = "128";
			argc = 6;
		} else
			argc = 4;

		if (!down && extrap != NULL) {
			for (cs = strtok(extrap, " "); cs;
			     cs = strtok(NULL, " ")) {
				size_t len = strlen(cs) + 1;
				argv[argc++] = acs = alloca(len);
				strlcpy(acs, cs, len);
			}
		}

		argv[argc] = down ? "-alias" : "alias";
		argv[argc + 1] = NULL;
		break;
#endif

	case IP_VNET_INTERFACE:
		/* ifconfig <iface> vnet|-vnet <jid, or name when no jid> */
		argv = alloca(5 * sizeof(char *));
		argv[0] = _PATH_IFCONFIG;
		argv[1] = comstring->s;
		argv[2] = down ? "-vnet" : "vnet";
		jidstr = string_param(j->intparams[KP_JID]);
		argv[3] = jidstr ? jidstr : string_param(j->intparams[KP_NAME]);
		argv[4] = NULL;
		break;

	case IP_MOUNT:
	case IP__MOUNT_FROM_FSTAB:
		/*
		 * Split an fstab-style line into at most four fields
		 * (device, mount point, type, options).  Only the first two
		 * are vis(3)-decoded.
		 */
		argv = alloca(8 * sizeof(char *));
		comcs = alloca(comstring->len + 1);
		strcpy(comcs, comstring->s);
		argc = 0;
		for (cs = strtok(comcs, " \t\f\v\r\n"); cs && argc < 4;
		     cs = strtok(NULL, " \t\f\v\r\n")) {
			if (argc <= 1 && strunvis(cs, cs) < 0) {
				jail_warnx(j, "%s: %s: fstab parse error",
				    j->intparams[comparam]->name, comstring->s);
				return -1;
			}
			argv[argc++] = cs;
		}
		if (argc == 0)
			return 0;
		if (argc < 3) {
			jail_warnx(j, "%s: %s: missing information",
			    j->intparams[comparam]->name, comstring->s);
			return -1;
		}
		if (check_path(j, j->intparams[comparam]->name, argv[1], 0,
		    down ? argv[2] : NULL) < 0)
			return -1;
		/*
		 * Rearrange the fields in place; argv[2] (the type) stays
		 * where it is and follows the "-t" stored in argv[1] below.
		 */
		if (down) {
			/* umount -t type mountpoint */
			argv[4] = NULL;
			argv[3] = argv[1];
			argv[0] = "/sbin/umount";
		} else {
			if (argc == 4) {
				/* mount -t type -o options device mountpoint */
				argv[7] = NULL;
				argv[6] = argv[1];
				argv[5] = argv[0];
				argv[4] = argv[3];
				argv[3] = "-o";
			} else {
				/* mount -t type device mountpoint */
				argv[5] = NULL;
				argv[4] = argv[1];
				argv[3] = argv[0];
			}
			argv[0] = _PATH_MOUNT;
		}
		argv[1] = "-t";
		break;

	case IP_MOUNT_DEVFS:
		argv = alloca(7 * sizeof(char *));
		path = string_param(j->intparams[KP_PATH]);
		if (path == NULL) {
			jail_warnx(j, "mount.devfs: no jail root path defined");
			return -1;
		}
		/* Room for "/dev" and the terminating NUL. */
		devpath = alloca(strlen(path) + 5);
		sprintf(devpath, "%s/dev", path);
		if (check_path(j, "mount.devfs", devpath, 0,
		    down ? "devfs" : NULL) < 0)
			return -1;
		if (down) {
			argv[0] = "/sbin/umount";
			argv[1] = devpath;
			argv[2] = NULL;
		} else {
			argv[0] = _PATH_MOUNT;
			argv[1] = "-t";
			argv[2] = "devfs";
			ruleset = string_param(j->intparams[KP_DEVFS_RULESET]);
			if (!ruleset)
				ruleset = "4";	/* devfsrules_jail */
			/* Room for "-oruleset=" and the terminating NUL. */
			argv[3] = acs = alloca(11 + strlen(ruleset));
			sprintf(acs, "-oruleset=%s", ruleset);
			argv[4] = ".";
			argv[5] = devpath;
			argv[6] = NULL;
		}
		break;

	case IP_MOUNT_FDESCFS:
		argv = alloca(7 * sizeof(char *));
		path = string_param(j->intparams[KP_PATH]);
		if (path == NULL) {
			jail_warnx(j, "mount.fdescfs: no jail root path defined");
			return -1;
		}
		/* Room for "/dev/fd" and the terminating NUL. */
		devpath = alloca(strlen(path) + 8);
		sprintf(devpath, "%s/dev/fd", path);
		if (check_path(j, "mount.fdescfs", devpath, 0,
		    down ? "fdescfs" : NULL) < 0)
			return -1;
		if (down) {
			argv[0] = "/sbin/umount";
			argv[1] = devpath;
			argv[2] = NULL;
		} else {
			argv[0] = _PATH_MOUNT;
			argv[1] = "-t";
			argv[2] = "fdescfs";
			argv[3] = ".";
			argv[4] = devpath;
			argv[5] = NULL;
		}
		break;

	case IP_MOUNT_PROCFS:
		argv = alloca(7 * sizeof(char *));
		path = string_param(j->intparams[KP_PATH]);
		if (path == NULL) {
			jail_warnx(j, "mount.procfs: no jail root path defined");
			return -1;
		}
		/* Room for "/proc" and the terminating NUL. */
		devpath = alloca(strlen(path) + 6);
		sprintf(devpath, "%s/proc", path);
		if (check_path(j, "mount.procfs", devpath, 0,
		    down ? "procfs" : NULL) < 0)
			return -1;
		if (down) {
			argv[0] = "/sbin/umount";
			argv[1] = devpath;
			argv[2] = NULL;
		} else {
			argv[0] = _PATH_MOUNT;
			argv[1] = "-t";
			argv[2] = "procfs";
			argv[3] = ".";
			argv[4] = devpath;
			argv[5] = NULL;
		}
		break;

	case IP_ZFS_DATASET:
		/* Build a shell snippet that attaches the dataset to the jail. */
		argv = alloca(4 * sizeof(char *));
		jidstr = string_param(j->intparams[KP_JID]) ?
		    string_param(j->intparams[KP_JID]) :
		    string_param(j->intparams[KP_NAME]);
		fmt = "if [ $(/sbin/zfs get -H -o value jailed %s) = on ]; then /sbin/zfs jail %s %s || echo error, attaching %s to jail %s failed; else echo error, you need to set jailed=on for dataset %s; fi";
		/* The format uses the dataset four times and the jid twice. */
		comlen = strlen(fmt)
		    + 2 * strlen(jidstr)
		    + 4 * comstring->len
		    - 6 * 2	/* 6 * "%s" */
		    + 1;
		comcs = alloca(comlen);
		ret = snprintf(comcs, comlen, fmt, comstring->s,
		    jidstr, comstring->s, comstring->s, jidstr,
		    comstring->s);
		if (ret >= comlen) {
			jail_warnx(j, "internal error in ZFS dataset handling");
			exit(1);
		}
		argv[0] = _PATH_BSHELL;
		argv[1] = "-c";
		argv[2] = comcs;
		argv[3] = NULL;
		break;

	case IP_COMMAND:
		if (j->name != NULL)
			goto default_command;
		/*
		 * For a jail without a name, the strings of the command
		 * parameter together form one argument vector.
		 */
		argc = 0;
		TAILQ_FOREACH(s, &j->intparams[IP_COMMAND]->val, tq)
			argc++;
		argv = alloca((argc + 1) * sizeof(char *));
		argc = 0;
		TAILQ_FOREACH(s, &j->intparams[IP_COMMAND]->val, tq)
			argv[argc++] = s->s;
		argv[argc] = NULL;
		/* All strings are consumed; make next_command() move on. */
		j->comstring = &dummystring;
		break;

	default:
	default_command:
		/*
		 * Use the shell if the command contains shell metacharacters,
		 * except for a lone trailing '&', which is handled here by
		 * running the command in the background.
		 */
		if ((cs = strpbrk(comstring->s, "!\"$&'()*;<>?[\\]`{|}~")) &&
		    !(cs[0] == '&' && cs[1] == '\0')) {
			argv = alloca(4 * sizeof(char *));
			argv[0] = _PATH_BSHELL;
			argv[1] = "-c";
			argv[2] = comstring->s;
			argv[3] = NULL;
		} else {
			if (cs) {
				/* Strip the trailing '&'. */
				*cs = 0;
				bg = 1;
			}
			/* First pass counts the words, second pass stores them. */
			comcs = alloca(comstring->len + 1);
			strcpy(comcs, comstring->s);
			argc = 0;
			for (cs = strtok(comcs, " \t\f\v\r\n"); cs;
			     cs = strtok(NULL, " \t\f\v\r\n"))
				argc++;
			argv = alloca((argc + 1) * sizeof(char *));
			strcpy(comcs, comstring->s);
			argc = 0;
			for (cs = strtok(comcs, " \t\f\v\r\n"); cs;
			     cs = strtok(NULL, " \t\f\v\r\n"))
				argv[argc++] = cs;
			argv[argc] = NULL;
		}
	}
	if (argv[0] == NULL)
		return 0;

	/* A zero tv_sec means the command has no timeout. */
	if (int_param(j->intparams[IP_EXEC_TIMEOUT], &timeout) &&
	    timeout != 0) {
		clock_gettime(CLOCK_REALTIME, &j->timeout);
		j->timeout.tv_sec += timeout;
	} else
		j->timeout.tv_sec = 0;

	/* Only these commands run inside the jail. */
	injail = comparam == IP_EXEC_START || comparam == IP_COMMAND ||
	    comparam == IP_EXEC_STOP;
	if (injail)
		setid = root_cpuset_id();
	else
		setid = CPUSET_INVALID;
	clean = bool_param(j->intparams[IP_EXEC_CLEAN]);
	username = string_param(j->intparams[injail
	    ? IP_EXEC_JAIL_USER : IP_EXEC_SYSTEM_USER]);
	sjuser = bool_param(j->intparams[IP_EXEC_SYSTEM_JAIL_USER]);

	/* consfd stays 0 when no console log is in use. */
	consfd = 0;
	if (injail &&
	    (conslog = string_param(j->intparams[IP_EXEC_CONSOLELOG]))) {
		if (check_path(j, "exec.consolelog", conslog, 1, NULL) < 0)
			return -1;
		consfd =
		    open(conslog, O_WRONLY | O_CREAT | O_APPEND, DEFFILEMODE);
		if (consfd < 0) {
			jail_warnx(j, "open %s: %s", conslog, strerror(errno));
			return -1;
		}
	}

	/* Join the arguments with spaces for use in messages. */
	comlen = 0;
	for (i = 0; argv[i]; i++)
		comlen += strlen(argv[i]) + 1;
	j->comline = cs = emalloc(comlen);
	for (i = 0; argv[i]; i++) {
		strcpy(cs, argv[i]);
		if (argv[i + 1]) {
			cs += strlen(argv[i]) + 1;
			cs[-1] = ' ';
		}
	}
	if (verbose > 0)
		jail_note(j, "run command%s%s%s: %s\n",
		    injail ? " in jail" : "", username ? " as " : "",
		    username ? username : "", j->comline);

	pid = fork();
	if (pid < 0)
		err(1, "fork");
	if (pid > 0) {
		/*
		 * The child has its own copy of the console log descriptor;
		 * close ours so it is not leaked once per command.
		 */
		if (consfd != 0)
			(void)close(consfd);
		if (bg || !add_proc(j, pid)) {
			/* Nothing to wait for. */
			free(j->comline);
			j->comline = NULL;
			return 0;
		} else {
			paralimit--;
			return 1;
		}
	}
	/* Child from here on; it never returns to the caller. */
	if (bg)
		setsid();

	/* Set up the environment and run the command */
	pwd = NULL;
	lcap = NULL;
	/*
	 * With exec.system_jail_user the user is looked up before attaching
	 * to the jail, otherwise after (see below).
	 */
	if ((clean || username) && injail && sjuser &&
	    get_user_info(j, username, &pwd, &lcap) < 0)
		exit(1);
	if (injail) {
		/* jail_attach won't chdir along with its chroot. */
		path = string_param(j->intparams[KP_PATH]);
		if (path && chdir(path) < 0) {
			jail_warnx(j, "chdir %s: %s", path, strerror(errno));
			exit(1);
		}
		if (int_param(j->intparams[IP_EXEC_FIB], &fib) &&
		    setfib(fib) < 0) {
			jail_warnx(j, "setfib: %s", strerror(errno));
			exit(1);
		}

		/*
		 * We wouldn't have specialized our affinity, so just setid to
		 * root.  We do this prior to attaching to avoid the kernel
		 * having to create a transient cpuset that we'll promptly
		 * free up with a reset to the jail's cpuset.
		 *
		 * This is just a best-effort to use as wide of mask as
		 * possible.
		 */
		if (setid != CPUSET_INVALID)
			(void)cpuset_setid(CPU_WHICH_PID, -1, setid);

		if (jail_attach(j->jid) < 0) {
			jail_warnx(j, "jail_attach: %s", strerror(errno));
			exit(1);
		}
	}
	if (clean || username) {
		if (!(injail && sjuser) &&
		    get_user_info(j, username, &pwd, &lcap) < 0)
			exit(1);
		if (clean) {
			/* Start from an empty environment, keeping only TERM. */
			term = getenv("TERM");
			environ = &cleanenv;
			setenv("PATH", "/bin:/usr/bin", 0);
			if (term != NULL)
				setenv("TERM", term, 1);
		}
		if (setgid(pwd->pw_gid) < 0) {
			jail_warnx(j, "setgid %d: %s", pwd->pw_gid,
			    strerror(errno));
			exit(1);
		}
		/*
		 * With a user name, apply the whole login context except
		 * groups and login name; without one, only PATH and the
		 * environment.
		 */
		if (setusercontext(lcap, pwd, pwd->pw_uid, username
		    ? LOGIN_SETALL & ~LOGIN_SETGROUP & ~LOGIN_SETLOGIN
		    : LOGIN_SETPATH | LOGIN_SETENV) < 0) {
			jail_warnx(j, "setusercontext %s: %s", pwd->pw_name,
			    strerror(errno));
			exit(1);
		}
		login_close(lcap);
		setenv("USER", pwd->pw_name, 1);
		setenv("HOME", pwd->pw_dir, 1);
		setenv("SHELL",
		    *pwd->pw_shell ? pwd->pw_shell : _PATH_BSHELL, 1);
		if (clean && chdir(pwd->pw_dir) < 0) {
			jail_warnx(j, "chdir %s: %s",
			    pwd->pw_dir, strerror(errno));
			exit(1);
		}
		endpwent();
	}

	/* Send stdout and stderr to the console log, if there is one. */
	if (consfd != 0 && (dup2(consfd, 1) < 0 || dup2(consfd, 2) < 0)) {
		jail_warnx(j, "exec.consolelog: %s", strerror(errno));
		exit(1);
	}
	closefrom(3);
	execvp(argv[0], __DECONST(char *const*, argv));
	jail_warnx(j, "exec %s: %s", argv[0], strerror(errno));
	exit(1);
}
808
/*
 * Add a process to the hash, tied to a jail.
 *
 * Registers an exit event for the pid, records it in the pid hash, and
 * moves the jail to the sleep queue: ordered by deadline if it has a
 * timeout, via requeue() otherwise.  Returns 1 when the process is now
 * tracked, or 0 when it no longer exists (ESRCH).  Any other kqueue or
 * kevent failure is fatal.
 */
static int
add_proc(struct cfjail *j, pid_t pid)
{
	struct kevent kev;
	struct cfjail *later;
	struct phash *entry;

	/* The kqueue is created on first use. */
	if (kq == 0) {
		kq = kqueue();
		if (kq < 0)
			err(1, "kqueue");
	}
	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0) {
		if (errno != ESRCH)
			err(1, "kevent");
		return 0;
	}

	entry = emalloc(sizeof(struct phash));
	entry->j = j;
	entry->pid = pid;
	LIST_INSERT_HEAD(&phash[pid % PHASH_SIZE], entry, le);
	j->nprocs++;
	j->flags |= JF_SLEEPQ;

	if (j->timeout.tv_sec == 0) {
		requeue(j, &sleeping);
		return 1;
	}

	/*
	 * File the jail in the sleep queue according to its timeout: in
	 * front of the first jail that has no timeout or a deadline that is
	 * not earlier than ours.
	 */
	TAILQ_REMOVE(j->queue, j, tq);
	TAILQ_FOREACH(later, &sleeping, tq) {
		if (!later->timeout.tv_sec ||
		    j->timeout.tv_sec < later->timeout.tv_sec ||
		    (j->timeout.tv_sec == later->timeout.tv_sec &&
		    j->timeout.tv_nsec <= later->timeout.tv_nsec))
			break;
	}
	if (later != NULL)
		TAILQ_INSERT_BEFORE(later, j, tq);
	else
		TAILQ_INSERT_TAIL(&sleeping, j, tq);
	j->queue = &sleeping;
	return 1;
}
853
/*
 * Remove any processes from the hash that correspond to a jail.
 *
 * The exit event of each such process is deleted from the kqueue (errors
 * are ignored) and the jail's process count is reset to zero.
 */
static void
clear_procs(struct cfjail *j)
{
	struct kevent kev;
	struct phash *entry, *next;
	struct phhead *bucket;

	j->nprocs = 0;
	for (bucket = phash; bucket < phash + PHASH_SIZE; bucket++) {
		LIST_FOREACH_SAFE(entry, bucket, le, next) {
			if (entry->j != j)
				continue;
			EV_SET(&kev, entry->pid, EVFILT_PROC, EV_DELETE,
			    NOTE_EXIT, 0, NULL);
			(void)kevent(kq, &kev, 1, NULL, 0, NULL);
			LIST_REMOVE(entry, le);
			free(entry);
		}
	}
}
875
876/*
877 * Find the jail that corresponds to an exited process.
878 */
879static struct cfjail *
880find_proc(pid_t pid)
881{
882	struct cfjail *j;
883	struct phash *ph;
884
885	LIST_FOREACH(ph, &phash[pid % PHASH_SIZE], le)
886		if (ph->pid == pid) {
887			j = ph->j;
888			LIST_REMOVE(ph, le);
889			free(ph);
890			return --j->nprocs ? NULL : j;
891		}
892	return NULL;
893}
894
/*
 * Send SIGTERM to all processes in a jail and wait for them to die.
 *
 * Returns 1 when at least one signalled process is being tracked; the
 * jail then sleeps until they have all exited or the stop timeout
 * (IP_STOP_TIMEOUT, default DEFAULT_STOP_TIMEOUT seconds) runs out.
 * Returns 0 when there is nothing to wait for: the timeout is set to
 * zero, the process table could not be read, or no process was tracked.
 */
static int
term_procs(struct cfjail *j)
{
	struct kinfo_proc *ki;
	struct timespec saved;
	int i, noted, pcnt, timeout;

	/* The kvm handle is opened once and kept for later calls. */
	static kvm_t *kd;

	if (!int_param(j->intparams[IP_STOP_TIMEOUT], &timeout))
		timeout = DEFAULT_STOP_TIMEOUT;
	else if (timeout == 0)
		return 0;

	if (kd == NULL) {
		kd = kvm_open(NULL, NULL, NULL, O_RDONLY, NULL);
		if (kd == NULL)
			return 0;
	}

	ki = kvm_getprocs(kd, KERN_PROC_PROC, 0, &pcnt);
	if (ki == NULL)
		return 0;

	/*
	 * Set the deadline before calling add_proc(): it files the jail in
	 * the sleep queue according to j->timeout, and next_proc() only
	 * looks at the head of that queue.  Setting it afterwards would
	 * queue the jail under whatever timeout the previous command left.
	 */
	saved = j->timeout;
	clock_gettime(CLOCK_REALTIME, &j->timeout);
	j->timeout.tv_sec += timeout;

	noted = 0;
	for (i = 0; i < pcnt; i++)
		if (ki[i].ki_jid == j->jid &&
		    kill(ki[i].ki_pid, SIGTERM) == 0) {
			(void)add_proc(j, ki[i].ki_pid);
			if (verbose > 0) {
				if (!noted) {
					noted = 1;
					jail_note(j, "sent SIGTERM to:");
				}
				printf(" %d", ki[i].ki_pid);
			}
		}
	if (noted)
		printf("\n");
	if (j->nprocs > 0)
		return 1;
	/* Nothing to wait for: leave the timeout as it was found. */
	j->timeout = saved;
	return 0;
}
942
/*
 * Look up a user in the passwd and login.conf files.
 *
 * With a NULL username the current real uid is looked up instead.  On
 * success the passwd entry and login class are stored through pwdp and
 * lcapp, the process's group list is initialized for that user, and 0 is
 * returned.  On failure a warning is printed and -1 is returned.
 */
static int
get_user_info(struct cfjail *j, const char *username,
    const struct passwd **pwdp, login_cap_t **lcapp)
{
	const struct passwd *entry;

	/* Clear errno so "no such user" can be told apart from an error. */
	errno = 0;
	if (username != NULL)
		entry = getpwnam(username);
	else
		entry = getpwuid(getuid());
	*pwdp = entry;
	if (entry == NULL) {
		if (errno) {
			jail_warnx(j, "getpwnam%s%s: %s", username ? " " : "",
			    username ? username : "", strerror(errno));
		} else if (username) {
			jail_warnx(j, "%s: no such user", username);
		} else {
			jail_warnx(j, "unknown uid %d", getuid());
		}
		return -1;
	}

	*lcapp = login_getpwclass(entry);
	if (*lcapp == NULL) {
		jail_warnx(j, "getpwclass %s: %s", entry->pw_name,
		    strerror(errno));
		return -1;
	}

	/* Set the groups while the group file is still available */
	if (initgroups(entry->pw_name, entry->pw_gid) < 0) {
		jail_warnx(j, "initgroups %s: %s", entry->pw_name,
		    strerror(errno));
		return -1;
	}
	return 0;
}
978
979/*
980 * Make sure a mount or consolelog path is a valid absolute pathname
981 * with no symlinks.
982 */
983static int
984check_path(struct cfjail *j, const char *pname, const char *path, int isfile,
985    const char *umount_type)
986{
987	struct stat st, mpst;
988	struct statfs stfs;
989	char *tpath, *p;
990	const char *jailpath;
991	size_t jplen;
992
993	if (path[0] != '/') {
994		jail_warnx(j, "%s: %s: not an absolute pathname",
995		    pname, path);
996		return -1;
997	}
998	/*
999	 * Only check for symlinks in components below the jail's path,
1000	 * since that's where the security risk lies.
1001	 */
1002	jailpath = string_param(j->intparams[KP_PATH]);
1003	if (jailpath == NULL)
1004		jailpath = "";
1005	jplen = strlen(jailpath);
1006	if (!strncmp(path, jailpath, jplen) && path[jplen] == '/') {
1007		tpath = alloca(strlen(path) + 1);
1008		strcpy(tpath, path);
1009		for (p = tpath + jplen; p != NULL; ) {
1010			p = strchr(p + 1, '/');
1011			if (p)
1012				*p = '\0';
1013			if (lstat(tpath, &st) < 0) {
1014				if (errno == ENOENT && isfile && !p)
1015					break;
1016				jail_warnx(j, "%s: %s: %s", pname, tpath,
1017				    strerror(errno));
1018				return -1;
1019			}
1020			if (S_ISLNK(st.st_mode)) {
1021				jail_warnx(j, "%s: %s is a symbolic link",
1022				    pname, tpath);
1023				return -1;
1024			}
1025			if (p)
1026				*p = '/';
1027		}
1028	}
1029	if (umount_type != NULL) {
1030		if (stat(path, &st) < 0 || statfs(path, &stfs) < 0) {
1031			jail_warnx(j, "%s: %s: %s", pname, path,
1032			    strerror(errno));
1033			return -1;
1034		}
1035		if (stat(stfs.f_mntonname, &mpst) < 0) {
1036			jail_warnx(j, "%s: %s: %s", pname, stfs.f_mntonname,
1037			    strerror(errno));
1038			return -1;
1039		}
1040		if (st.st_ino != mpst.st_ino) {
1041			jail_warnx(j, "%s: %s: not a mount point",
1042			    pname, path);
1043			return -1;
1044		}
1045		if (strcmp(stfs.f_fstypename, umount_type)) {
1046			jail_warnx(j, "%s: %s: not a %s mount",
1047			    pname, path, umount_type);
1048			return -1;
1049		}
1050	}
1051	return 0;
1052}
1053