/* kern_fork.c — FreeBSD stable/10, svn revision 304905 */
1169689Skan/*-
2169689Skan * Copyright (c) 1982, 1986, 1989, 1991, 1993
3169689Skan *	The Regents of the University of California.  All rights reserved.
4169689Skan * (c) UNIX System Laboratories, Inc.
5169689Skan * All or some portions of this file are derived from material licensed
6169689Skan * to the University of California by American Telephone and Telegraph
7169689Skan * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8169689Skan * the permission of UNIX System Laboratories, Inc.
9169689Skan *
10169689Skan * Redistribution and use in source and binary forms, with or without
11169689Skan * modification, are permitted provided that the following conditions
12169689Skan * are met:
13169689Skan * 1. Redistributions of source code must retain the above copyright
14169689Skan *    notice, this list of conditions and the following disclaimer.
15169689Skan * 2. Redistributions in binary form must reproduce the above copyright
16169689Skan *    notice, this list of conditions and the following disclaimer in the
17169689Skan *    documentation and/or other materials provided with the distribution.
18169689Skan * 4. Neither the name of the University nor the names of its contributors
19169689Skan *    may be used to endorse or promote products derived from this software
20169689Skan *    without specific prior written permission.
21169689Skan *
22169689Skan * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23169689Skan * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24169689Skan * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25169689Skan * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26169689Skan * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27169689Skan * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28169689Skan * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29169689Skan * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30169689Skan * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31169689Skan * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32169689Skan * SUCH DAMAGE.
33169689Skan *
34169689Skan *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
35169689Skan */
36169689Skan
37169689Skan#include <sys/cdefs.h>
38169689Skan__FBSDID("$FreeBSD: stable/10/sys/kern/kern_fork.c 304905 2016-08-27 11:45:05Z kib $");
39169689Skan
40169689Skan#include "opt_kdtrace.h"
41169689Skan#include "opt_ktrace.h"
42169689Skan#include "opt_kstack_pages.h"
43169689Skan#include "opt_procdesc.h"
44169689Skan
45169689Skan#include <sys/param.h>
46169689Skan#include <sys/systm.h>
47169689Skan#include <sys/sysproto.h>
48169689Skan#include <sys/eventhandler.h>
49169689Skan#include <sys/fcntl.h>
50169689Skan#include <sys/filedesc.h>
51169689Skan#include <sys/jail.h>
52169689Skan#include <sys/kernel.h>
53169689Skan#include <sys/kthread.h>
54169689Skan#include <sys/sysctl.h>
55169689Skan#include <sys/lock.h>
56169689Skan#include <sys/malloc.h>
57169689Skan#include <sys/mutex.h>
58169689Skan#include <sys/priv.h>
59169689Skan#include <sys/proc.h>
60169689Skan#include <sys/procdesc.h>
61169689Skan#include <sys/pioctl.h>
62169689Skan#include <sys/ptrace.h>
63169689Skan#include <sys/racct.h>
64169689Skan#include <sys/resourcevar.h>
65169689Skan#include <sys/sched.h>
66169689Skan#include <sys/syscall.h>
67169689Skan#include <sys/vmmeter.h>
68169689Skan#include <sys/vnode.h>
69169689Skan#include <sys/acct.h>
70169689Skan#include <sys/ktr.h>
71169689Skan#include <sys/ktrace.h>
72169689Skan#include <sys/unistd.h>
73169689Skan#include <sys/sdt.h>
74169689Skan#include <sys/sx.h>
75169689Skan#include <sys/sysent.h>
76169689Skan#include <sys/signalvar.h>
77169689Skan
78169689Skan#include <security/audit/audit.h>
79169689Skan#include <security/mac/mac_framework.h>
80169689Skan
81169689Skan#include <vm/vm.h>
82169689Skan#include <vm/pmap.h>
83169689Skan#include <vm/vm_map.h>
84169689Skan#include <vm/vm_extern.h>
85169689Skan#include <vm/uma.h>
86169689Skan
87169689Skan#ifdef KDTRACE_HOOKS
88169689Skan#include <sys/dtrace_bsd.h>
89169689Skandtrace_fork_func_t	dtrace_fasttrap_fork;
90169689Skan#endif
91169689Skan
92169689SkanSDT_PROVIDER_DECLARE(proc);
93169689SkanSDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int");
94169689Skan
95169689Skan#ifndef _SYS_SYSPROTO_H_
96169689Skanstruct fork_args {
97169689Skan	int     dummy;
98169689Skan};
99169689Skan#endif
100169689Skan
101169689Skan/* ARGSUSED */
102169689Skanint
103169689Skansys_fork(struct thread *td, struct fork_args *uap)
104169689Skan{
105169689Skan	int error;
106169689Skan	struct proc *p2;
107169689Skan
108169689Skan	error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0);
109169689Skan	if (error == 0) {
110169689Skan		td->td_retval[0] = p2->p_pid;
111169689Skan		td->td_retval[1] = 0;
112169689Skan	}
113169689Skan	return (error);
114169689Skan}
115169689Skan
116169689Skan/* ARGUSED */
117169689Skanint
118169689Skansys_pdfork(td, uap)
119169689Skan	struct thread *td;
120169689Skan	struct pdfork_args *uap;
121169689Skan{
122169689Skan#ifdef PROCDESC
123169689Skan	int error, fd;
124169689Skan	struct proc *p2;
125169689Skan
126169689Skan	/*
127169689Skan	 * It is necessary to return fd by reference because 0 is a valid file
128169689Skan	 * descriptor number, and the child needs to be able to distinguish
129169689Skan	 * itself from the parent using the return value.
130169689Skan	 */
131169689Skan	error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2,
132169689Skan	    &fd, uap->flags);
133169689Skan	if (error == 0) {
134169689Skan		td->td_retval[0] = p2->p_pid;
135169689Skan		td->td_retval[1] = 0;
136169689Skan		error = copyout(&fd, uap->fdp, sizeof(fd));
137169689Skan	}
138169689Skan	return (error);
139169689Skan#else
140169689Skan	return (ENOSYS);
141169689Skan#endif
142169689Skan}
143169689Skan
144169689Skan/* ARGSUSED */
145169689Skanint
146169689Skansys_vfork(struct thread *td, struct vfork_args *uap)
147169689Skan{
148169689Skan	int error, flags;
149169689Skan	struct proc *p2;
150169689Skan
151169689Skan	flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
152169689Skan	error = fork1(td, flags, 0, &p2, NULL, 0);
153169689Skan	if (error == 0) {
154169689Skan		td->td_retval[0] = p2->p_pid;
155169689Skan		td->td_retval[1] = 0;
156169689Skan	}
157169689Skan	return (error);
158169689Skan}
159169689Skan
160169689Skanint
161169689Skansys_rfork(struct thread *td, struct rfork_args *uap)
162169689Skan{
163169689Skan	struct proc *p2;
164169689Skan	int error;
165169689Skan
166169689Skan	/* Don't allow kernel-only flags. */
167169689Skan	if ((uap->flags & RFKERNELONLY) != 0)
168169689Skan		return (EINVAL);
169169689Skan
170169689Skan	AUDIT_ARG_FFLAGS(uap->flags);
171169689Skan	error = fork1(td, uap->flags, 0, &p2, NULL, 0);
172169689Skan	if (error == 0) {
173169689Skan		td->td_retval[0] = p2 ? p2->p_pid : 0;
174169689Skan		td->td_retval[1] = 0;
175169689Skan	}
176169689Skan	return (error);
177169689Skan}
178169689Skan
/* Global process count; starts at 1 to account for the swapper. */
int	nprocs = 1;		/* process 0 */
/* Most recently allocated pid; exported read-only via kern.lastpid. */
int	lastpid = 0;
SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
    "Last used PID");

/*
 * Random component to lastpid generation.  We mix in a random factor to make
 * it a little harder to predict.  We sanity check the modulus value to avoid
 * doing it in critical paths.  Don't let it be too small or we pointlessly
 * waste randomness entropy, and don't let it be impossibly large.  Using a
 * modulus that is too big causes a LOT more process table scans and slows
 * down fork processing as the pidchecked caching is defeated.
 */
/* 0 disables pid randomization; otherwise the modulus for the random skip. */
static int randompid = 0;
193169689Skan
194169689Skanstatic int
195169689Skansysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
196169689Skan{
197169689Skan	int error, pid;
198169689Skan
199169689Skan	error = sysctl_wire_old_buffer(req, sizeof(int));
200169689Skan	if (error != 0)
201169689Skan		return(error);
202169689Skan	sx_xlock(&allproc_lock);
203169689Skan	pid = randompid;
204169689Skan	error = sysctl_handle_int(oidp, &pid, 0, req);
205169689Skan	if (error == 0 && req->newptr != NULL) {
206169689Skan		if (pid < 0 || pid > pid_max - 100)	/* out of range */
207169689Skan			pid = pid_max - 100;
208169689Skan		else if (pid < 2)			/* NOP */
209169689Skan			pid = 0;
210169689Skan		else if (pid < 100)			/* Make it reasonable */
211169689Skan			pid = 100;
212169689Skan		randompid = pid;
213169689Skan	}
214169689Skan	sx_xunlock(&allproc_lock);
215169689Skan	return (error);
216169689Skan}
217169689Skan
218169689SkanSYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
219169689Skan    0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
220169689Skan
/*
 * Allocate an unused process ID for a new process.
 *
 * Returns the chosen pid.  The caller must hold allproc_lock and
 * proctree_lock (asserted below); the pid remains unique only while
 * those locks are held, so the caller must insert the new process
 * before dropping them.
 */
static int
fork_findpid(int flags)
{
	struct proc *p;
	int trypid;
	/*
	 * Cache of the lowest in-use id above the last allocation:
	 * ids in (lastpid, pidchecked) are known free, letting most
	 * calls skip the full table scan below.
	 */
	static int pidchecked = 0;

	/*
	 * Requires allproc_lock in order to iterate over the list
	 * of processes, and proctree_lock to access p_pgrp.
	 */
	sx_assert(&allproc_lock, SX_LOCKED);
	sx_assert(&proctree_lock, SX_LOCKED);

	/*
	 * Find an unused process ID.  We remember a range of unused IDs
	 * ready to use (from lastpid+1 through pidchecked-1).
	 *
	 * If RFHIGHPID is set (used during system boot), do not allocate
	 * low-numbered pids.
	 */
	trypid = lastpid + 1;
	if (flags & RFHIGHPID) {
		if (trypid < 10)
			trypid = 10;
	} else {
		if (randompid)
			trypid += arc4random() % randompid;
	}
retry:
	/*
	 * If the process ID prototype has wrapped around,
	 * restart somewhat above 0, as the low-numbered procs
	 * tend to include daemons that don't exit.
	 */
	if (trypid >= pid_max) {
		trypid = trypid % pid_max;
		if (trypid < 100)
			trypid += 100;
		pidchecked = 0;
	}
	if (trypid >= pidchecked) {
		/* 0: scanning allproc; 1: scanning zombproc. */
		int doingzomb = 0;

		pidchecked = PID_MAX;
		/*
		 * Scan the active and zombie procs to check whether this pid
		 * is in use.  Remember the lowest pid that's greater
		 * than trypid, so we can avoid checking for a while.
		 *
		 * Avoid reuse of the process group id, session id or
		 * the reaper subtree id.  Note that for process group
		 * and sessions, the amount of reserved pids is
		 * limited by process limit.  For the subtree ids, the
		 * id is kept reserved only while there is a
		 * non-reaped process in the subtree, so amount of
		 * reserved pids is limited by process limit times
		 * two.
		 */
		p = LIST_FIRST(&allproc);
again:
		for (; p != NULL; p = LIST_NEXT(p, p_list)) {
			/*
			 * Bump trypid past every conflicting id; if it
			 * leaves the known-checked window, rescan from
			 * the top since earlier entries may now collide.
			 */
			while (p->p_pid == trypid ||
			    p->p_reapsubtree == trypid ||
			    (p->p_pgrp != NULL &&
			    (p->p_pgrp->pg_id == trypid ||
			    (p->p_session != NULL &&
			    p->p_session->s_sid == trypid)))) {
				trypid++;
				if (trypid >= pidchecked)
					goto retry;
			}
			/* Track the lowest in-use id above trypid. */
			if (p->p_pid > trypid && pidchecked > p->p_pid)
				pidchecked = p->p_pid;
			if (p->p_pgrp != NULL) {
				if (p->p_pgrp->pg_id > trypid &&
				    pidchecked > p->p_pgrp->pg_id)
					pidchecked = p->p_pgrp->pg_id;
				if (p->p_session != NULL &&
				    p->p_session->s_sid > trypid &&
				    pidchecked > p->p_session->s_sid)
					pidchecked = p->p_session->s_sid;
			}
		}
		/* Repeat the same scan over the zombie list once. */
		if (!doingzomb) {
			doingzomb = 1;
			p = LIST_FIRST(&zombproc);
			goto again;
		}
	}

	/*
	 * RFHIGHPID does not mess with the lastpid counter during boot.
	 */
	if (flags & RFHIGHPID)
		pidchecked = 0;
	else
		lastpid = trypid;

	return (trypid);
}
322169689Skan
/*
 * Handle rfork(2)-style requests that do NOT create a new process
 * (RFPROC clear): divorce pieces of the current process from itself,
 * e.g. unshare (RFFDG) or reset (RFCFDG) the file descriptor table.
 * Returns 0 on success or an errno value.
 */
static int
fork_norfproc(struct thread *td, int flags)
{
	int error;
	struct proc *p1;

	KASSERT((flags & RFPROC) == 0,
	    ("fork_norfproc called with RFPROC set"));
	p1 = td->td_proc;

	/*
	 * Quiesce other threads if we are about to change the fd table:
	 * single-thread to the boundary so no peer races the swap.
	 * Only needed for multi-threaded, non-system processes.
	 */
	if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
	    (flags & (RFCFDG | RFFDG))) {
		PROC_LOCK(p1);
		if (thread_single(p1, SINGLE_BOUNDARY)) {
			PROC_UNLOCK(p1);
			/* Interrupted by exit/stop: let caller restart. */
			return (ERESTART);
		}
		PROC_UNLOCK(p1);
	}

	/* With a NULL child, vm_forkproc only applies the divorce flags. */
	error = vm_forkproc(td, NULL, NULL, NULL, flags);
	if (error)
		goto fail;

	/*
	 * Close all file descriptors.
	 */
	if (flags & RFCFDG) {
		struct filedesc *fdtmp;
		fdtmp = fdinit(td->td_proc->p_fd);
		fdescfree(td);
		p1->p_fd = fdtmp;
	}

	/*
	 * Unshare file descriptors (from parent).
	 */
	if (flags & RFFDG)
		fdunshare(td);

fail:
	/* Undo the single-threading from above on both paths. */
	if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
	    (flags & (RFCFDG | RFFDG))) {
		PROC_LOCK(p1);
		thread_single_end(p1, SINGLE_BOUNDARY);
		PROC_UNLOCK(p1);
	}
	return (error);
}
372169689Skan
/*
 * Second half of fork(2): wire the pre-allocated child process p2 (with
 * its first thread td2 and optional vmspace vm2) into the system.
 * Called from fork1() with allproc_lock exclusively locked and
 * proctree_lock share-locked; both are dropped herein.  This stage
 * cannot fail - all fallible allocation was done by the caller.
 */
static void
do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
    struct vmspace *vm2, int pdflags)
{
	struct proc *p1, *pptr;
	int p2_held, trypid;
	struct filedesc *fd;
	struct filedesc_to_leader *fdtol;
	struct sigacts *newsigacts;

	sx_assert(&proctree_lock, SX_SLOCKED);
	sx_assert(&allproc_lock, SX_XLOCKED);

	p2_held = 0;
	p1 = td->td_proc;

	/* Pick a pid while both locks are still held (see fork_findpid). */
	trypid = fork_findpid(flags);

	sx_sunlock(&proctree_lock);

	p2->p_state = PRS_NEW;		/* protect against others */
	p2->p_pid = trypid;
	AUDIT_ARG_PID(p2->p_pid);
	LIST_INSERT_HEAD(&allproc, p2, p_list);
	allproc_gen++;
	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
	tidhash_add(td2);
	PROC_LOCK(p2);
	PROC_LOCK(p1);

	sx_xunlock(&allproc_lock);

	/* Copy the parent's inheritable proc fields. */
	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	    __rangeof(struct proc, p_startcopy, p_endcopy));
	pargs_hold(p2->p_args);
	PROC_UNLOCK(p1);

	/* Zero the per-process fields that must start clean in the child. */
	bzero(&p2->p_startzero,
	    __rangeof(struct proc, p_startzero, p_endzero));
	p2->p_treeflag = 0;
	p2->p_filemon = NULL;
	p2->p_ptevents = 0;

	/* Tell the prison that we exist. */
	prison_proc_hold(p2->p_ucred->cr_prison);

	PROC_UNLOCK(p2);

	/*
	 * Malloc things while we don't hold any locks.
	 */
	if (flags & RFSIGSHARE)
		newsigacts = NULL;
	else
		newsigacts = sigacts_alloc();

	/*
	 * Copy filedesc.
	 */
	if (flags & RFCFDG) {
		/* Fresh, empty descriptor table. */
		fd = fdinit(p1->p_fd);
		fdtol = NULL;
	} else if (flags & RFFDG) {
		/* Private copy of the parent's table. */
		fd = fdcopy(p1->p_fd);
		fdtol = NULL;
	} else {
		/* Share the parent's table; track leadership for it. */
		fd = fdshare(p1->p_fd);
		if (p1->p_fdtol == NULL)
			p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL,
			    p1->p_leader);
		if ((flags & RFTHREAD) != 0) {
			/*
			 * Shared file descriptor table, and shared
			 * process leaders.
			 */
			fdtol = p1->p_fdtol;
			FILEDESC_XLOCK(p1->p_fd);
			fdtol->fdl_refcount++;
			FILEDESC_XUNLOCK(p1->p_fd);
		} else {
			/*
			 * Shared file descriptor table, and different
			 * process leaders.
			 */
			fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
			    p1->p_fd, p2);
		}
	}
	/*
	 * Make a proc table entry for the new process.
	 * Start by zeroing the section of proc that is zero-initialized,
	 * then copy the section that is copied directly from the parent.
	 */

	PROC_LOCK(p2);
	PROC_LOCK(p1);

	bzero(&td2->td_startzero,
	    __rangeof(struct thread, td_startzero, td_endzero));
	td2->td_su = NULL;
	td2->td_sleeptimo = 0;

	bcopy(&td->td_startcopy, &td2->td_startcopy,
	    __rangeof(struct thread, td_startcopy, td_endcopy));

	bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
	td2->td_sigstk = td->td_sigstk;
	td2->td_flags = TDF_INMEM;
	td2->td_lend_user_pri = PRI_MAX;
	/* Preserve in-progress syscall info for the debugger's benefit. */
	td2->td_dbg_sc_code = td->td_dbg_sc_code;
	td2->td_dbg_sc_narg = td->td_dbg_sc_narg;

#ifdef VIMAGE
	td2->td_vnet = NULL;
	td2->td_vnet_lpush = NULL;
#endif

	/*
	 * Allow the scheduler to initialize the child.
	 */
	thread_lock(td);
	sched_fork(td, td2);
	thread_unlock(td);

	/*
	 * Duplicate sub-structures as needed.
	 * Increase reference counts on shared objects.
	 */
	p2->p_flag = P_INMEM;
	p2->p_flag2 = p1->p_flag2 & (P2_NOTRACE | P2_NOTRACE_EXEC);
	p2->p_swtick = ticks;
	if (p1->p_flag & P_PROFIL)
		startprofclock(p2);
	td2->td_ucred = crhold(p2->p_ucred);

	/* Either share the parent's sigacts or install the fresh copy. */
	if (flags & RFSIGSHARE) {
		p2->p_sigacts = sigacts_hold(p1->p_sigacts);
	} else {
		sigacts_copy(newsigacts, p1->p_sigacts);
		p2->p_sigacts = newsigacts;
	}

	/* Signal delivered to the parent when this child exits. */
	if (flags & RFTSIGZMB)
	        p2->p_sigparent = RFTSIGNUM(flags);
	else if (flags & RFLINUXTHPN)
	        p2->p_sigparent = SIGUSR1;
	else
	        p2->p_sigparent = SIGCHLD;

	p2->p_textvp = p1->p_textvp;
	p2->p_fd = fd;
	p2->p_fdtol = fdtol;

	if (p1->p_flag2 & P2_INHERIT_PROTECTED) {
		p2->p_flag |= P_PROTECTED;
		p2->p_flag2 |= P2_INHERIT_PROTECTED;
	}

	/*
	 * p_limit is copy-on-write.  Bump its refcount.
	 */
	lim_fork(p1, p2);

	pstats_fork(p1->p_stats, p2->p_stats);

	PROC_UNLOCK(p1);
	PROC_UNLOCK(p2);

	/* Bump references to the text vnode (for procfs). */
	if (p2->p_textvp)
		vref(p2->p_textvp);

	/*
	 * Set up linkage for kernel based threading.
	 */
	if ((flags & RFTHREAD) != 0) {
		mtx_lock(&ppeers_lock);
		p2->p_peers = p1->p_peers;
		p1->p_peers = p2;
		p2->p_leader = p1->p_leader;
		mtx_unlock(&ppeers_lock);
		PROC_LOCK(p1->p_leader);
		if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(p1->p_leader);
			/*
			 * The task leader is exiting, so process p1 is
			 * going to be killed shortly.  Since p1 obviously
			 * isn't dead yet, we know that the leader is either
			 * sending SIGKILL's to all the processes in this
			 * task or is sleeping waiting for all the peers to
			 * exit.  We let p1 complete the fork, but we need
			 * to go ahead and kill the new process p2 since
			 * the task leader may not get a chance to send
			 * SIGKILL to it.  We leave it on the list so that
			 * the task leader will wait for this new process
			 * to commit suicide.
			 */
			PROC_LOCK(p2);
			kern_psignal(p2, SIGKILL);
			PROC_UNLOCK(p2);
		} else
			PROC_UNLOCK(p1->p_leader);
	} else {
		p2->p_peers = NULL;
		p2->p_leader = p2;
	}

	sx_xlock(&proctree_lock);
	PGRP_LOCK(p1->p_pgrp);
	PROC_LOCK(p2);
	PROC_LOCK(p1);

	/*
	 * Preserve some more flags in subprocess.  P_PROFIL has already
	 * been preserved.
	 */
	p2->p_flag |= p1->p_flag & P_SUGID;
	td2->td_pflags |= td->td_pflags & TDP_ALTSTACK;
	SESS_LOCK(p1->p_session);
	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
		p2->p_flag |= P_CONTROLT;
	SESS_UNLOCK(p1->p_session);
	if (flags & RFPPWAIT)
		p2->p_flag |= P_PPWAIT;

	/* Join the parent's process group and session. */
	p2->p_pgrp = p1->p_pgrp;
	LIST_INSERT_AFTER(p1, p2, p_pglist);
	PGRP_UNLOCK(p1->p_pgrp);
	LIST_INIT(&p2->p_children);
	LIST_INIT(&p2->p_orphans);

	callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0);

	/*
	 * If PF_FORK is set, the child process inherits the
	 * procfs ioctl flags from its parent.
	 */
	if (p1->p_pfsflags & PF_FORK) {
		p2->p_stops = p1->p_stops;
		p2->p_pfsflags = p1->p_pfsflags;
	}

	/*
	 * This begins the section where we must prevent the parent
	 * from being swapped.
	 */
	_PHOLD(p1);
	PROC_UNLOCK(p1);

	/*
	 * Attach the new process to its parent.
	 *
	 * If RFNOWAIT is set, the newly created process becomes a child
	 * of init.  This effectively disassociates the child from the
	 * parent.
	 */
	if ((flags & RFNOWAIT) != 0) {
		pptr = p1->p_reaper;
		p2->p_reaper = pptr;
	} else {
		p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ?
		    p1 : p1->p_reaper;
		pptr = p1;
	}
	p2->p_pptr = pptr;
	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
	LIST_INIT(&p2->p_reaplist);
	LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling);
	/* A reaper starts a new subtree keyed by its own pid. */
	if (p2->p_reaper == p1)
		p2->p_reapsubtree = p2->p_pid;
	else
		p2->p_reapsubtree = p1->p_reapsubtree;
	sx_xunlock(&proctree_lock);

	/* Inform accounting that we have forked. */
	p2->p_acflag = AFORK;
	PROC_UNLOCK(p2);

#ifdef KTRACE
	ktrprocfork(p1, p2);
#endif

	/*
	 * Finish creating the child process.  It will return via a different
	 * execution path later.  (ie: directly into user mode)
	 */
	vm_forkproc(td, p2, td2, vm2, flags);

	/* Attribute this fork to the right vmmeter counter bucket. */
	if (flags == (RFFDG | RFPROC)) {
		PCPU_INC(cnt.v_forks);
		PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
		PCPU_INC(cnt.v_vforks);
		PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else if (p1 == &proc0) {
		PCPU_INC(cnt.v_kthreads);
		PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else {
		PCPU_INC(cnt.v_rforks);
		PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	}

#ifdef PROCDESC
	/*
	 * Associate the process descriptor with the process before anything
	 * can happen that might cause that process to need the descriptor.
	 * However, don't do this until after fork(2) can no longer fail.
	 */
	if (flags & RFPROCDESC)
		procdesc_new(p2, pdflags);
#endif

	/*
	 * Both processes are set up, now check if any loadable modules want
	 * to adjust anything.
	 */
	EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);

	/*
	 * Set the child start time and mark the process as being complete.
	 */
	PROC_LOCK(p2);
	PROC_LOCK(p1);
	microuptime(&p2->p_stats->p_start);
	PROC_SLOCK(p2);
	p2->p_state = PRS_NORMAL;
	PROC_SUNLOCK(p2);

#ifdef KDTRACE_HOOKS
	/*
	 * Tell the DTrace fasttrap provider about the new process so that any
	 * tracepoints inherited from the parent can be removed. We have to do
	 * this only after p_state is PRS_NORMAL since the fasttrap module will
	 * use pfind() later on.
	 */
	if ((flags & RFMEM) == 0 && dtrace_fasttrap_fork)
		dtrace_fasttrap_fork(p1, p2);
#endif
	if (p1->p_ptevents & PTRACE_FORK) {
		/*
		 * Arrange for debugger to receive the fork event.
		 *
		 * We can report PL_FLAG_FORKED regardless of
		 * P_FOLLOWFORK settings, but it does not make a sense
		 * for runaway child.
		 */
		td->td_dbgflags |= TDB_FORK;
		td->td_dbg_forked = p2->p_pid;
		td2->td_dbgflags |= TDB_STOPATFORK;
		/* Hold p2 so it survives until the debugger attaches. */
		_PHOLD(p2);
		p2_held = 1;
	}
	if (flags & RFPPWAIT) {
		/* vfork semantics: parent waits in userret via TDP_RFPPWAIT. */
		td->td_pflags |= TDP_RFPPWAIT;
		td->td_rfppwait_p = p2;
		td->td_dbgflags |= TDB_VFORK;
	}
	PROC_UNLOCK(p2);
	if ((flags & RFSTOPPED) == 0) {
		/*
		 * If RFSTOPPED not requested, make child runnable and
		 * add to run queue.
		 */
		thread_lock(td2);
		TD_SET_CAN_RUN(td2);
		sched_add(td2, SRQ_BORING);
		thread_unlock(td2);
	}

	/*
	 * Now can be swapped.
	 */
	_PRELE(p1);
	PROC_UNLOCK(p1);

	/*
	 * Tell any interested parties about the new process.
	 */
	knote_fork(&p1->p_klist, p2->p_pid);
	SDT_PROBE3(proc, , , create, p2, p1, flags);

	/*
	 * Wait until debugger is attached to child.
	 */
	PROC_LOCK(p2);
	while ((td2->td_dbgflags & TDB_STOPATFORK) != 0)
		cv_wait(&p2->p_dbgwait, &p2->p_mtx);
	if (p2_held)
		_PRELE(p2);
	PROC_UNLOCK(p2);
}
768169689Skan
769169689Skanint
770169689Skanfork1(struct thread *td, int flags, int pages, struct proc **procp,
771169689Skan    int *procdescp, int pdflags)
772169689Skan{
773169689Skan	struct proc *p1, *newproc;
774169689Skan	struct thread *td2;
775169689Skan	struct vmspace *vm2;
776169689Skan#ifdef PROCDESC
777169689Skan	struct file *fp_procdesc;
778169689Skan#endif
779169689Skan	vm_ooffset_t mem_charged;
780169689Skan	int error, nprocs_new, ok;
781169689Skan	static int curfail;
782169689Skan	static struct timeval lastfail;
783169689Skan
784169689Skan	/* Check for the undefined or unimplemented flags. */
785169689Skan	if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
786169689Skan		return (EINVAL);
787169689Skan
788169689Skan	/* Signal value requires RFTSIGZMB. */
789169689Skan	if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
790169689Skan		return (EINVAL);
791169689Skan
792169689Skan	/* Can't copy and clear. */
793169689Skan	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
794169689Skan		return (EINVAL);
795169689Skan
796169689Skan	/* Check the validity of the signal number. */
797169689Skan	if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
798169689Skan		return (EINVAL);
799169689Skan
800169689Skan#ifdef PROCDESC
801169689Skan	if ((flags & RFPROCDESC) != 0) {
802169689Skan		/* Can't not create a process yet get a process descriptor. */
803169689Skan		if ((flags & RFPROC) == 0)
804169689Skan			return (EINVAL);
805169689Skan
806169689Skan		/* Must provide a place to put a procdesc if creating one. */
807169689Skan		if (procdescp == NULL)
808169689Skan			return (EINVAL);
809169689Skan	}
810169689Skan#endif
811169689Skan
812169689Skan	p1 = td->td_proc;
813169689Skan
814169689Skan	/*
815169689Skan	 * Here we don't create a new process, but we divorce
816169689Skan	 * certain parts of a process from itself.
817169689Skan	 */
818169689Skan	if ((flags & RFPROC) == 0) {
819169689Skan		*procp = NULL;
820169689Skan		return (fork_norfproc(td, flags));
821169689Skan	}
822169689Skan
823169689Skan#ifdef PROCDESC
824169689Skan	fp_procdesc = NULL;
825169689Skan#endif
826169689Skan	newproc = NULL;
827169689Skan	vm2 = NULL;
828169689Skan
829169689Skan	/*
830169689Skan	 * Increment the nprocs resource before allocations occur.
831169689Skan	 * Although process entries are dynamically created, we still
832169689Skan	 * keep a global limit on the maximum number we will
833169689Skan	 * create. There are hard-limits as to the number of processes
834169689Skan	 * that can run, established by the KVA and memory usage for
835169689Skan	 * the process data.
836169689Skan	 *
837169689Skan	 * Don't allow a nonprivileged user to use the last ten
838169689Skan	 * processes; don't let root exceed the limit.
839169689Skan	 */
840169689Skan	nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1;
841169689Skan	if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred,
842169689Skan	    PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) {
843169689Skan		sx_xlock(&allproc_lock);
844169689Skan		if (ppsratecheck(&lastfail, &curfail, 1)) {
845169689Skan			printf("maxproc limit exceeded by uid %u (pid %d); "
846169689Skan			    "see tuning(7) and login.conf(5)\n",
847169689Skan			    td->td_ucred->cr_ruid, p1->p_pid);
848169689Skan		}
849169689Skan		sx_xunlock(&allproc_lock);
850169689Skan		error = EAGAIN;
851169689Skan		goto fail1;
852169689Skan	}
853169689Skan
854169689Skan#ifdef PROCDESC
855169689Skan	/*
856169689Skan	 * If required, create a process descriptor in the parent first; we
857169689Skan	 * will abandon it if something goes wrong. We don't finit() until
858169689Skan	 * later.
859169689Skan	 */
860169689Skan	if (flags & RFPROCDESC) {
861169689Skan		error = falloc(td, &fp_procdesc, procdescp, 0);
862169689Skan		if (error != 0)
863169689Skan			goto fail1;
864169689Skan	}
865169689Skan#endif
866169689Skan
867169689Skan	mem_charged = 0;
868169689Skan	if (pages == 0)
869169689Skan		pages = KSTACK_PAGES;
870169689Skan	/* Allocate new proc. */
871169689Skan	newproc = uma_zalloc(proc_zone, M_WAITOK);
872169689Skan	td2 = FIRST_THREAD_IN_PROC(newproc);
873169689Skan	if (td2 == NULL) {
874169689Skan		td2 = thread_alloc(pages);
875169689Skan		if (td2 == NULL) {
876169689Skan			error = ENOMEM;
877169689Skan			goto fail2;
878169689Skan		}
879169689Skan		proc_linkup(newproc, td2);
880169689Skan	} else {
881169689Skan		if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) {
882169689Skan			if (td2->td_kstack != 0)
883169689Skan				vm_thread_dispose(td2);
884169689Skan			if (!thread_alloc_stack(td2, pages)) {
885169689Skan				error = ENOMEM;
886169689Skan				goto fail2;
887169689Skan			}
888169689Skan		}
889169689Skan	}
890169689Skan
891169689Skan	if ((flags & RFMEM) == 0) {
892169689Skan		vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
893169689Skan		if (vm2 == NULL) {
894169689Skan			error = ENOMEM;
895169689Skan			goto fail2;
896169689Skan		}
897169689Skan		if (!swap_reserve(mem_charged)) {
898169689Skan			/*
899169689Skan			 * The swap reservation failed. The accounting
900169689Skan			 * from the entries of the copied vm2 will be
901169689Skan			 * subtracted in vmspace_free(), so force the
902169689Skan			 * reservation there.
903169689Skan			 */
904169689Skan			swap_reserve_force(mem_charged);
905169689Skan			error = ENOMEM;
906169689Skan			goto fail2;
907169689Skan		}
908169689Skan	} else
909169689Skan		vm2 = NULL;
910169689Skan
911169689Skan	/*
912169689Skan	 * XXX: This is ugly; when we copy resource usage, we need to bump
913169689Skan	 *      per-cred resource counters.
914169689Skan	 */
915169689Skan	proc_set_cred_init(newproc, crhold(td->td_ucred));
916169689Skan
917169689Skan	/*
918169689Skan	 * Initialize resource accounting for the child process.
919169689Skan	 */
920169689Skan	error = racct_proc_fork(p1, newproc);
921169689Skan	if (error != 0) {
922169689Skan		error = EAGAIN;
923169689Skan		goto fail1;
924169689Skan	}
925169689Skan
926169689Skan#ifdef MAC
927169689Skan	mac_proc_init(newproc);
928169689Skan#endif
929169689Skan	knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx);
930169689Skan	STAILQ_INIT(&newproc->p_ktr);
931169689Skan
932169689Skan	/* We have to lock the process tree while we look for a pid. */
933169689Skan	sx_slock(&proctree_lock);
934169689Skan	sx_xlock(&allproc_lock);
935169689Skan
936169689Skan	/*
937169689Skan	 * Increment the count of procs running with this uid. Don't allow
938169689Skan	 * a nonprivileged user to exceed their current limit.
939169689Skan	 *
940169689Skan	 * XXXRW: Can we avoid privilege here if it's not needed?
941169689Skan	 */
942169689Skan	error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
943169689Skan	if (error == 0)
944169689Skan		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
945169689Skan	else {
946169689Skan		PROC_LOCK(p1);
947169689Skan		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
948169689Skan		    lim_cur(p1, RLIMIT_NPROC));
949169689Skan		PROC_UNLOCK(p1);
950169689Skan	}
951169689Skan	if (ok) {
952169689Skan		do_fork(td, flags, newproc, td2, vm2, pdflags);
953169689Skan
954169689Skan		/*
955169689Skan		 * Return child proc pointer to parent.
956169689Skan		 */
957169689Skan		*procp = newproc;
958169689Skan#ifdef PROCDESC
959169689Skan		if (flags & RFPROCDESC) {
960169689Skan			procdesc_finit(newproc->p_procdesc, fp_procdesc);
961169689Skan			fdrop(fp_procdesc, td);
962169689Skan		}
963169689Skan#endif
964169689Skan		racct_proc_fork_done(newproc);
965169689Skan		return (0);
966169689Skan	}
967169689Skan
968169689Skan	error = EAGAIN;
969169689Skan	sx_sunlock(&proctree_lock);
970169689Skan	sx_xunlock(&allproc_lock);
971169689Skan#ifdef MAC
972169689Skan	mac_proc_destroy(newproc);
973169689Skan#endif
974169689Skan	racct_proc_exit(newproc);
975169689Skanfail1:
976169689Skan	crfree(newproc->p_ucred);
977169689Skan	newproc->p_ucred = NULL;
978169689Skanfail2:
979169689Skan	if (vm2 != NULL)
980169689Skan		vmspace_free(vm2);
981169689Skan	uma_zfree(proc_zone, newproc);
982169689Skan#ifdef PROCDESC
983169689Skan	if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) {
984169689Skan		fdclose(td->td_proc->p_fd, fp_procdesc, *procdescp, td);
985169689Skan		fdrop(fp_procdesc, td);
986169689Skan	}
987169689Skan#endif
988169689Skan	atomic_add_int(&nprocs, -1);
989169689Skan	pause("fork", hz / 2);
990169689Skan	return (error);
991169689Skan}
992169689Skan
/*
 * Handle the return of a child process from fork1().  This function
 * is called from the MD fork_trampoline() entry point.
 *
 * A newly created thread never returns through mi_switch(); its very
 * first run lands here instead, so this routine performs the post-switch
 * bookkeeping mi_switch() would normally do, then transfers control to
 * the fork handler (callout) chosen for this thread.
 */
void
fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
    struct trapframe *frame)
{
	struct proc *p;
	struct thread *td;
	struct thread *dtd;

	td = curthread;
	p = td->td_proc;
	/* By the time we run, do_fork() must have fully constructed us. */
	KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));

	CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
		td, td->td_sched, p->p_pid, td->td_name);

	sched_fork_exit(td);
	/*
	 * Processes normally resume in mi_switch() after being
	 * cpu_switch()'ed to, but when children start up they arrive here
	 * instead, so we must do much the same things as mi_switch() would.
	 */
	if ((dtd = PCPU_GET(deadthread))) {
		/*
		 * Reclaim the thread whose final context switch landed us
		 * on this CPU — normally mi_switch()'s job.
		 */
		PCPU_SET(deadthread, NULL);
		thread_stash(dtd);
	}
	/* Drop the thread lock carried across the context switch. */
	thread_unlock(td);

	/*
	 * cpu_set_fork_handler intercepts this function call to
	 * have this call a non-return function to stay in kernel mode.
	 * initproc has its own fork handler, but it does return.
	 */
	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
	callout(arg, frame);

	/*
	 * Check if a kernel thread misbehaved and returned from its main
	 * function.  Kernel threads have no userland to return to, so
	 * terminate the thread instead of falling off the trampoline.
	 */
	if (p->p_flag & P_KTHREAD) {
		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
		    td->td_name, p->p_pid);
		kthread_exit();
	}
	mtx_assert(&Giant, MA_NOTOWNED);

	/* Give the process ABI a chance to run its per-thread tail hook. */
	if (p->p_sysent->sv_schedtail != NULL)
		(p->p_sysent->sv_schedtail)(td);
}
1046169689Skan
/*
 * Simplified back end of syscall(), used when returning from fork()
 * directly into user mode.  This function is passed in to fork_exit()
 * as the first parameter and is called when returning to a new
 * userland process.
 *
 * Before handing the new thread to userland, honor any pending
 * debugger requests: auto-attach the new child to the parent's
 * debugger (TDB_STOPATFORK), or report a syscall-exit / new-thread
 * event for an already-traced process.
 */
void
fork_return(struct thread *td, struct trapframe *frame)
{
	struct proc *p, *dbg;

	p = td->td_proc;
	if (td->td_dbgflags & TDB_STOPATFORK) {
		/* Lock order: proctree_lock before the process lock. */
		sx_xlock(&proctree_lock);
		PROC_LOCK(p);
		if (p->p_pptr->p_ptevents & PTRACE_FORK) {
			/*
			 * If debugger still wants auto-attach for the
			 * parent's children, do it now.  The forking parent
			 * is itself traced, so its parent is presumably the
			 * debugger — NOTE(review): confirm the reparenting
			 * invariant here.
			 */
			dbg = p->p_pptr->p_pptr;
			proc_set_traced(p, true);
			CTR2(KTR_PTRACE,
		    "fork_return: attaching to new child pid %d: oppid %d",
			    p->p_pid, p->p_oppid);
			proc_reparent(p, dbg);
			sx_xunlock(&proctree_lock);
			/* Stop in the debugger as if SIGSTOP were taken. */
			td->td_dbgflags |= TDB_CHILD | TDB_SCX | TDB_FSTP;
			ptracestop(td, SIGSTOP);
			td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX);
		} else {
			/*
			 * ... otherwise clear the request.  The parent may
			 * be sleeping on p_dbgwait for our decision; wake it.
			 */
			sx_xunlock(&proctree_lock);
			td->td_dbgflags &= ~TDB_STOPATFORK;
			cv_broadcast(&p->p_dbgwait);
		}
		PROC_UNLOCK(p);
	} else if (p->p_flag & P_TRACED || td->td_dbgflags & TDB_BORN) {
 		/*
		 * This is the start of a new thread in a traced
		 * process.  Report a system call exit event.
		 */
		PROC_LOCK(p);
		td->td_dbgflags |= TDB_SCX;
		_STOPEVENT(p, S_SCX, td->td_dbg_sc_code);
		/* Trap to the debugger only if it asked for SCX/birth events. */
		if ((p->p_ptevents & PTRACE_SCX) != 0 ||
		    (td->td_dbgflags & TDB_BORN) != 0)
			ptracestop(td, SIGTRAP);
		td->td_dbgflags &= ~(TDB_SCX | TDB_BORN);
		PROC_UNLOCK(p);
	}

	/* Standard return-to-userland processing (signals, AST, etc.). */
	userret(td, frame);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSRET))
		ktrsysret(SYS_fork, 0, 0);
#endif
}
1108169689Skan