kern_fork.c revision 321343
/*-
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/kern_fork.c 321343 2017-07-21 18:06:57Z kib $");

#include "opt_ktrace.h"
#include "opt_kstack_pages.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/pioctl.h>
#include <sys/ptrace.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/sdt.h>
#include <sys/sx.h>
#include <sys/sysent.h>
#include <sys/signalvar.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/vm_domain.h>

#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_fork_func_t	dtrace_fasttrap_fork;
#endif

SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int");

#ifndef _SYS_SYSPROTO_H_
struct fork_args {
	int     dummy;
};
#endif

/* ARGSUSED */
int
sys_fork(struct thread *td, struct fork_args *uap)
{
	struct fork_req fr;
	int error, pid;

	bzero(&fr, sizeof(fr));
	fr.fr_flags = RFFDG | RFPROC;
	fr.fr_pidp = &pid;
	error = fork1(td, &fr);
	if (error == 0) {
		td->td_retval[0] = pid;
		td->td_retval[1] = 0;
	}
	return (error);
}
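
/*
 * Note: only the parent's return values are written here.  The child
 * never executes this return path; it starts in fork_exit() and its
 * return value of 0 is arranged by the MD cpu_fork()/fork_return()
 * code.
 */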

/* ARGSUSED */
int
sys_pdfork(struct thread *td, struct pdfork_args *uap)
{
	struct fork_req fr;
	int error, fd, pid;

	bzero(&fr, sizeof(fr));
	fr.fr_flags = RFFDG | RFPROC | RFPROCDESC;
	fr.fr_pidp = &pid;
	fr.fr_pd_fd = &fd;
	fr.fr_pd_flags = uap->flags;
	/*
	 * It is necessary to return fd by reference because 0 is a valid file
	 * descriptor number, and the child needs to be able to distinguish
	 * itself from the parent using the return value.
	 */
	error = fork1(td, &fr);
	if (error == 0) {
		td->td_retval[0] = pid;
		td->td_retval[1] = 0;
		error = copyout(&fd, uap->fdp, sizeof(fd));
	}
	return (error);
}
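
/*
 * A minimal userland sketch of pdfork(2), for illustration only:
 *
 *	int fd;
 *	pid_t pid = pdfork(&fd, 0);
 *
 *	if (pid == 0) {
 *		... child, distinguished by the zero return value ...
 *	} else if (pid > 0) {
 *		... parent: fd now refers to the child and can be
 *		    monitored with kqueue(2) or passed to pdkill(2) ...
 *	}
 */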

/* ARGSUSED */
int
sys_vfork(struct thread *td, struct vfork_args *uap)
{
	struct fork_req fr;
	int error, pid;

	bzero(&fr, sizeof(fr));
	fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
	fr.fr_pidp = &pid;
	error = fork1(td, &fr);
	if (error == 0) {
		td->td_retval[0] = pid;
		td->td_retval[1] = 0;
	}
	return (error);
}
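
/*
 * Note: this flag combination gives the traditional vfork(2)
 * semantics: RFMEM shares the parent's address space with the child,
 * and RFPPWAIT blocks the parent until the child calls execve(2) or
 * exits.
 */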

int
sys_rfork(struct thread *td, struct rfork_args *uap)
{
	struct fork_req fr;
	int error, pid;

	/* Don't allow kernel-only flags. */
	if ((uap->flags & RFKERNELONLY) != 0)
		return (EINVAL);

	AUDIT_ARG_FFLAGS(uap->flags);
	bzero(&fr, sizeof(fr));
	fr.fr_flags = uap->flags;
	fr.fr_pidp = &pid;
	error = fork1(td, &fr);
	if (error == 0) {
		td->td_retval[0] = pid;
		td->td_retval[1] = 0;
	}
	return (error);
}
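
/*
 * Illustrative (not exhaustive) rfork(2) usages: rfork(RFPROC | RFMEM)
 * creates a child that shares the parent's address space, while
 * rfork(RFPROC | RFFDG) copies the descriptor table just as plain
 * fork(2) does.  Kernel-only flags (e.g. RFSTOPPED or RFHIGHPID) are
 * rejected above with EINVAL.
 */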

int	nprocs = 1;		/* process 0 */
int	lastpid = 0;
SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
    "Last used PID");

/*
 * Random component to lastpid generation.  We mix in a random factor to make
 * it a little harder to predict.  We sanity check the modulus value to avoid
 * doing it in critical paths.  Don't let it be too small or we pointlessly
 * waste entropy, and don't let it be impossibly large.  Using a modulus that
 * is too big causes a LOT more process table scans and slows down fork
 * processing as the pidchecked caching is defeated.
 */
static int randompid = 0;

static int
sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
{
	int error, pid;

	error = sysctl_wire_old_buffer(req, sizeof(int));
	if (error != 0)
		return (error);
	sx_xlock(&allproc_lock);
	pid = randompid;
	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error == 0 && req->newptr != NULL) {
		if (pid < 0 || pid > pid_max - 100)	/* out of range */
			pid = pid_max - 100;
		else if (pid < 2)			/* NOP */
			pid = 0;
		else if (pid < 100)			/* Make it reasonable */
			pid = 100;
		randompid = pid;
	}
	sx_xunlock(&allproc_lock);
	return (error);
}

SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
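
/*
 * Example (illustrative): "sysctl kern.randompid=1000" makes each new
 * PID advance past lastpid by a random amount in [0, 1000), subject
 * to the clamping in the handler above; setting it back to 0 restores
 * purely sequential allocation.
 */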

static int
fork_findpid(int flags)
{
	struct proc *p;
	int trypid;
	static int pidchecked = 0;

	/*
	 * Requires allproc_lock in order to iterate over the list
	 * of processes, and proctree_lock to access p_pgrp.
	 */
	sx_assert(&allproc_lock, SX_LOCKED);
	sx_assert(&proctree_lock, SX_LOCKED);

	/*
	 * Find an unused process ID.  We remember a range of unused IDs
	 * ready to use (from lastpid+1 through pidchecked-1).
	 *
	 * If RFHIGHPID is set (used during system boot), do not allocate
	 * low-numbered pids.
	 */
	trypid = lastpid + 1;
	if (flags & RFHIGHPID) {
		if (trypid < 10)
			trypid = 10;
	} else {
		if (randompid)
			trypid += arc4random() % randompid;
	}
retry:
	/*
	 * If the process ID prototype has wrapped around,
	 * restart somewhat above 0, as the low-numbered procs
	 * tend to include daemons that don't exit.
	 */
	if (trypid >= pid_max) {
		trypid = trypid % pid_max;
		if (trypid < 100)
			trypid += 100;
		pidchecked = 0;
	}
	if (trypid >= pidchecked) {
		int doingzomb = 0;

		pidchecked = PID_MAX;
		/*
		 * Scan the active and zombie procs to check whether this pid
		 * is in use.  Remember the lowest pid that's greater
		 * than trypid, so we can avoid checking for a while.
		 *
		 * Avoid reuse of the process group id, session id or
		 * the reaper subtree id.  Note that for process groups
		 * and sessions, the number of reserved pids is
		 * limited by the process limit.  For the subtree ids,
		 * an id is kept reserved only while there is a
		 * non-reaped process in the subtree, so the number of
		 * reserved pids is limited by the process limit times
		 * two.
		 */
		p = LIST_FIRST(&allproc);
again:
		for (; p != NULL; p = LIST_NEXT(p, p_list)) {
			while (p->p_pid == trypid ||
			    p->p_reapsubtree == trypid ||
			    (p->p_pgrp != NULL &&
			    (p->p_pgrp->pg_id == trypid ||
			    (p->p_session != NULL &&
			    p->p_session->s_sid == trypid)))) {
				trypid++;
				if (trypid >= pidchecked)
					goto retry;
			}
			if (p->p_pid > trypid && pidchecked > p->p_pid)
				pidchecked = p->p_pid;
			if (p->p_pgrp != NULL) {
				if (p->p_pgrp->pg_id > trypid &&
				    pidchecked > p->p_pgrp->pg_id)
					pidchecked = p->p_pgrp->pg_id;
				if (p->p_session != NULL &&
				    p->p_session->s_sid > trypid &&
				    pidchecked > p->p_session->s_sid)
					pidchecked = p->p_session->s_sid;
			}
		}
		if (!doingzomb) {
			doingzomb = 1;
			p = LIST_FIRST(&zombproc);
			goto again;
		}
	}

	/*
	 * RFHIGHPID does not mess with the lastpid counter during boot.
	 */
	if (flags & RFHIGHPID)
		pidchecked = 0;
	else
		lastpid = trypid;

	return (trypid);
}
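
/*
 * A worked example of the pidchecked cache, with hypothetical
 * numbers: if trypid is 100 and the lowest in-use ID above it is 150,
 * the scan leaves pidchecked at 150, so subsequent calls can hand out
 * PIDs 101..149 without rescanning the process lists.
 */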

static int
fork_norfproc(struct thread *td, int flags)
{
	int error;
	struct proc *p1;

	KASSERT((flags & RFPROC) == 0,
	    ("fork_norfproc called with RFPROC set"));
	p1 = td->td_proc;

	if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
	    (flags & (RFCFDG | RFFDG))) {
		PROC_LOCK(p1);
		if (thread_single(p1, SINGLE_BOUNDARY)) {
			PROC_UNLOCK(p1);
			return (ERESTART);
		}
		PROC_UNLOCK(p1);
	}

	error = vm_forkproc(td, NULL, NULL, NULL, flags);
	if (error)
		goto fail;

	/*
	 * Close all file descriptors.
	 */
	if (flags & RFCFDG) {
		struct filedesc *fdtmp;
		fdtmp = fdinit(td->td_proc->p_fd, false);
		fdescfree(td);
		p1->p_fd = fdtmp;
	}

	/*
	 * Unshare file descriptors (from parent).
	 */
	if (flags & RFFDG)
		fdunshare(td);

fail:
	if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
	    (flags & (RFCFDG | RFFDG))) {
		PROC_LOCK(p1);
		thread_single_end(p1, SINGLE_BOUNDARY);
		PROC_UNLOCK(p1);
	}
	return (error);
}
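
/*
 * For example, a hypothetical rfork(RFCFDG) call ends up here: no new
 * process is created, but the calling process swaps its file
 * descriptor table for a freshly initialized one.
 */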

static void
do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *td2,
    struct vmspace *vm2, struct file *fp_procdesc)
{
	struct proc *p1, *pptr;
	int trypid;
	struct filedesc *fd;
	struct filedesc_to_leader *fdtol;
	struct sigacts *newsigacts;

	sx_assert(&proctree_lock, SX_SLOCKED);
	sx_assert(&allproc_lock, SX_XLOCKED);

	p1 = td->td_proc;

	trypid = fork_findpid(fr->fr_flags);

	sx_sunlock(&proctree_lock);

	p2->p_state = PRS_NEW;		/* protect against others */
	p2->p_pid = trypid;
	AUDIT_ARG_PID(p2->p_pid);
	LIST_INSERT_HEAD(&allproc, p2, p_list);
	allproc_gen++;
	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
	tidhash_add(td2);
	PROC_LOCK(p2);
	PROC_LOCK(p1);

	sx_xunlock(&allproc_lock);

	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	    __rangeof(struct proc, p_startcopy, p_endcopy));
	p2->p_elf_machine = p1->p_elf_machine;
	p2->p_elf_flags = p1->p_elf_flags;
	pargs_hold(p2->p_args);

	PROC_UNLOCK(p1);

	bzero(&p2->p_startzero,
	    __rangeof(struct proc, p_startzero, p_endzero));
	p2->p_ptevents = 0;

	/* Tell the prison that we exist. */
	prison_proc_hold(p2->p_ucred->cr_prison);

	PROC_UNLOCK(p2);

	/*
	 * Malloc things while we don't hold any locks.
	 */
	if (fr->fr_flags & RFSIGSHARE)
		newsigacts = NULL;
	else
		newsigacts = sigacts_alloc();

	/*
	 * Copy filedesc.
	 */
	if (fr->fr_flags & RFCFDG) {
		fd = fdinit(p1->p_fd, false);
		fdtol = NULL;
	} else if (fr->fr_flags & RFFDG) {
		fd = fdcopy(p1->p_fd);
		fdtol = NULL;
	} else {
		fd = fdshare(p1->p_fd);
		if (p1->p_fdtol == NULL)
			p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL,
			    p1->p_leader);
		if ((fr->fr_flags & RFTHREAD) != 0) {
			/*
			 * Shared file descriptor table, and shared
			 * process leaders.
			 */
			fdtol = p1->p_fdtol;
			FILEDESC_XLOCK(p1->p_fd);
			fdtol->fdl_refcount++;
			FILEDESC_XUNLOCK(p1->p_fd);
		} else {
			/*
			 * Shared file descriptor table, and different
			 * process leaders.
			 */
			fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
			    p1->p_fd, p2);
		}
	}
	/*
	 * Make a proc table entry for the new process.
	 * Start by zeroing the section of proc that is zero-initialized,
	 * then copy the section that is copied directly from the parent.
	 */

	PROC_LOCK(p2);
	PROC_LOCK(p1);

	bzero(&td2->td_startzero,
	    __rangeof(struct thread, td_startzero, td_endzero));
	td2->td_sleeptimo = 0;

	bcopy(&td->td_startcopy, &td2->td_startcopy,
	    __rangeof(struct thread, td_startcopy, td_endcopy));
	td2->td_sa = td->td_sa;

	bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
	td2->td_sigstk = td->td_sigstk;
	td2->td_flags = TDF_INMEM;
	td2->td_lend_user_pri = PRI_MAX;

#ifdef VIMAGE
	td2->td_vnet = NULL;
	td2->td_vnet_lpush = NULL;
#endif

	/*
	 * Allow the scheduler to initialize the child.
	 */
	thread_lock(td);
	sched_fork(td, td2);
	thread_unlock(td);

	/*
	 * Duplicate sub-structures as needed.
	 * Increase reference counts on shared objects.
	 */
	p2->p_flag = P_INMEM;
	p2->p_flag2 = p1->p_flag2 & (P2_NOTRACE | P2_NOTRACE_EXEC | P2_TRAPCAP);
	p2->p_swtick = ticks;
	if (p1->p_flag & P_PROFIL)
		startprofclock(p2);

	/*
	 * Whilst the proc lock is held, copy the VM domain data out
	 * using the VM domain method.
	 */
	vm_domain_policy_init(&p2->p_vm_dom_policy);
	vm_domain_policy_localcopy(&p2->p_vm_dom_policy,
	    &p1->p_vm_dom_policy);

	if (fr->fr_flags & RFSIGSHARE) {
		p2->p_sigacts = sigacts_hold(p1->p_sigacts);
	} else {
		sigacts_copy(newsigacts, p1->p_sigacts);
		p2->p_sigacts = newsigacts;
	}

	if (fr->fr_flags & RFTSIGZMB)
		p2->p_sigparent = RFTSIGNUM(fr->fr_flags);
	else if (fr->fr_flags & RFLINUXTHPN)
		p2->p_sigparent = SIGUSR1;
	else
		p2->p_sigparent = SIGCHLD;

	p2->p_textvp = p1->p_textvp;
	p2->p_fd = fd;
	p2->p_fdtol = fdtol;

	if (p1->p_flag2 & P2_INHERIT_PROTECTED) {
		p2->p_flag |= P_PROTECTED;
		p2->p_flag2 |= P2_INHERIT_PROTECTED;
	}

	/*
	 * p_limit is copy-on-write.  Bump its refcount.
	 */
	lim_fork(p1, p2);

	thread_cow_get_proc(td2, p2);

	pstats_fork(p1->p_stats, p2->p_stats);

	PROC_UNLOCK(p1);
	PROC_UNLOCK(p2);

	/* Bump references to the text vnode (for procfs). */
	if (p2->p_textvp)
		vrefact(p2->p_textvp);

	/*
	 * Set up linkage for kernel based threading.
	 */
	if ((fr->fr_flags & RFTHREAD) != 0) {
		mtx_lock(&ppeers_lock);
		p2->p_peers = p1->p_peers;
		p1->p_peers = p2;
		p2->p_leader = p1->p_leader;
		mtx_unlock(&ppeers_lock);
		PROC_LOCK(p1->p_leader);
		if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(p1->p_leader);
			/*
			 * The task leader is exiting, so process p1 is
			 * going to be killed shortly.  Since p1 obviously
			 * isn't dead yet, we know that the leader is either
			 * sending SIGKILL's to all the processes in this
			 * task or is sleeping waiting for all the peers to
			 * exit.  We let p1 complete the fork, but we need
			 * to go ahead and kill the new process p2 since
			 * the task leader may not get a chance to send
			 * SIGKILL to it.  We leave it on the list so that
			 * the task leader will wait for this new process
			 * to commit suicide.
			 */
			PROC_LOCK(p2);
			kern_psignal(p2, SIGKILL);
			PROC_UNLOCK(p2);
		} else
			PROC_UNLOCK(p1->p_leader);
	} else {
		p2->p_peers = NULL;
		p2->p_leader = p2;
	}

	sx_xlock(&proctree_lock);
	PGRP_LOCK(p1->p_pgrp);
	PROC_LOCK(p2);
	PROC_LOCK(p1);

	/*
	 * Preserve some more flags in subprocess.  P_PROFIL has already
	 * been preserved.
	 */
	p2->p_flag |= p1->p_flag & P_SUGID;
	td2->td_pflags |= (td->td_pflags & TDP_ALTSTACK) | TDP_FORKING;
	SESS_LOCK(p1->p_session);
	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
		p2->p_flag |= P_CONTROLT;
	SESS_UNLOCK(p1->p_session);
	if (fr->fr_flags & RFPPWAIT)
		p2->p_flag |= P_PPWAIT;

	p2->p_pgrp = p1->p_pgrp;
	LIST_INSERT_AFTER(p1, p2, p_pglist);
	PGRP_UNLOCK(p1->p_pgrp);
	LIST_INIT(&p2->p_children);
	LIST_INIT(&p2->p_orphans);

	callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0);

	/*
	 * If PF_FORK is set, the child process inherits the
	 * procfs ioctl flags from its parent.
	 */
	if (p1->p_pfsflags & PF_FORK) {
		p2->p_stops = p1->p_stops;
		p2->p_pfsflags = p1->p_pfsflags;
	}

	/*
	 * This begins the section where we must prevent the parent
	 * from being swapped.
	 */
	_PHOLD(p1);
	PROC_UNLOCK(p1);

	/*
	 * Attach the new process to its parent.
	 *
	 * If RFNOWAIT is set, the newly created process becomes a child
	 * of init.  This effectively disassociates the child from the
	 * parent.
	 */
	if ((fr->fr_flags & RFNOWAIT) != 0) {
		pptr = p1->p_reaper;
		p2->p_reaper = pptr;
	} else {
		p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ?
		    p1 : p1->p_reaper;
		pptr = p1;
	}
	p2->p_pptr = pptr;
	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
	LIST_INIT(&p2->p_reaplist);
	LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling);
	if (p2->p_reaper == p1)
		p2->p_reapsubtree = p2->p_pid;
	sx_xunlock(&proctree_lock);

	/* Inform accounting that we have forked. */
	p2->p_acflag = AFORK;
	PROC_UNLOCK(p2);

#ifdef KTRACE
	ktrprocfork(p1, p2);
#endif

	/*
	 * Finish creating the child process.  It will return via a different
	 * execution path later.  (i.e., directly into user mode)
	 */
	vm_forkproc(td, p2, td2, vm2, fr->fr_flags);

	if (fr->fr_flags == (RFFDG | RFPROC)) {
		PCPU_INC(cnt.v_forks);
		PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else if (fr->fr_flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
		PCPU_INC(cnt.v_vforks);
		PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else if (p1 == &proc0) {
		PCPU_INC(cnt.v_kthreads);
		PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else {
		PCPU_INC(cnt.v_rforks);
		PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	}
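
	/*
	 * The counters above back the vm.stats.vm.v_forks, v_vforks,
	 * v_rforks and v_kthreads sysctls (and their *pages variants),
	 * so e.g. "sysctl vm.stats.vm.v_vforks" should reflect vfork(2)
	 * activity; exact sysctl names can differ between releases.
	 */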

	/*
	 * Associate the process descriptor with the process before anything
	 * can happen that might cause that process to need the descriptor.
	 * However, don't do this until after fork(2) can no longer fail.
	 */
	if (fr->fr_flags & RFPROCDESC)
		procdesc_new(p2, fr->fr_pd_flags);

	/*
	 * Both processes are set up, now check if any loadable modules want
	 * to adjust anything.
	 */
	EVENTHANDLER_INVOKE(process_fork, p1, p2, fr->fr_flags);

	/*
	 * Set the child start time and mark the process as being complete.
	 */
	PROC_LOCK(p2);
	PROC_LOCK(p1);
	microuptime(&p2->p_stats->p_start);
	PROC_SLOCK(p2);
	p2->p_state = PRS_NORMAL;
	PROC_SUNLOCK(p2);

#ifdef KDTRACE_HOOKS
	/*
	 * Tell the DTrace fasttrap provider about the new process so that any
	 * tracepoints inherited from the parent can be removed. We have to do
	 * this only after p_state is PRS_NORMAL since the fasttrap module will
	 * use pfind() later on.
	 */
	if ((fr->fr_flags & RFMEM) == 0 && dtrace_fasttrap_fork)
		dtrace_fasttrap_fork(p1, p2);
#endif
	/*
	 * Hold the process so that it cannot exit after we make it runnable,
	 * but before we wait for the debugger.
	 */
	_PHOLD(p2);
	if (p1->p_ptevents & PTRACE_FORK) {
		/*
		 * Arrange for debugger to receive the fork event.
		 *
		 * We can report PL_FLAG_FORKED regardless of
		 * P_FOLLOWFORK settings, but it does not make sense
		 * for a runaway child.
		 */
		td->td_dbgflags |= TDB_FORK;
		td->td_dbg_forked = p2->p_pid;
		td2->td_dbgflags |= TDB_STOPATFORK;
	}
	if (fr->fr_flags & RFPPWAIT) {
		td->td_pflags |= TDP_RFPPWAIT;
		td->td_rfppwait_p = p2;
		td->td_dbgflags |= TDB_VFORK;
	}
	PROC_UNLOCK(p2);

	/*
	 * Now can be swapped.
	 */
	_PRELE(p1);
	PROC_UNLOCK(p1);

	/*
	 * Tell any interested parties about the new process.
	 */
	knote_fork(p1->p_klist, p2->p_pid);
	SDT_PROBE3(proc, , , create, p2, p1, fr->fr_flags);

	if (fr->fr_flags & RFPROCDESC) {
		procdesc_finit(p2->p_procdesc, fp_procdesc);
		fdrop(fp_procdesc, td);
	}

	if ((fr->fr_flags & RFSTOPPED) == 0) {
		/*
		 * If RFSTOPPED not requested, make child runnable and
		 * add to run queue.
		 */
		thread_lock(td2);
		TD_SET_CAN_RUN(td2);
		sched_add(td2, SRQ_BORING);
		thread_unlock(td2);
		if (fr->fr_pidp != NULL)
			*fr->fr_pidp = p2->p_pid;
	} else {
		*fr->fr_procp = p2;
	}

	PROC_LOCK(p2);
	/*
	 * Wait until debugger is attached to child.
	 */
	while (td2->td_proc == p2 && (td2->td_dbgflags & TDB_STOPATFORK) != 0)
		cv_wait(&p2->p_dbgwait, &p2->p_mtx);
	_PRELE(p2);
	racct_proc_fork_done(p2);
	PROC_UNLOCK(p2);
}

int
fork1(struct thread *td, struct fork_req *fr)
{
	struct proc *p1, *newproc;
	struct thread *td2;
	struct vmspace *vm2;
	struct file *fp_procdesc;
	vm_ooffset_t mem_charged;
	int error, nprocs_new, ok;
	static int curfail;
	static struct timeval lastfail;
	int flags, pages;

	flags = fr->fr_flags;
	pages = fr->fr_pages;

	if ((flags & RFSTOPPED) != 0)
		MPASS(fr->fr_procp != NULL && fr->fr_pidp == NULL);
	else
		MPASS(fr->fr_procp == NULL);

	/* Check for the undefined or unimplemented flags. */
	if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
		return (EINVAL);

	/* Signal value requires RFTSIGZMB. */
	if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
		return (EINVAL);

	/* Can't copy and clear. */
	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
		return (EINVAL);

	/* Check the validity of the signal number. */
	if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
		return (EINVAL);

	if ((flags & RFPROCDESC) != 0) {
		/* Can't get a process descriptor without creating a process. */
		if ((flags & RFPROC) == 0)
			return (EINVAL);

		/* Must provide a place to put a procdesc if creating one. */
		if (fr->fr_pd_fd == NULL)
			return (EINVAL);

		/* Check if we are using supported flags. */
		if ((fr->fr_pd_flags & ~PD_ALLOWED_AT_FORK) != 0)
			return (EINVAL);
	}
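
	/*
	 * Examples of requests rejected above (illustrative): a fork1()
	 * call with RFPROCDESC but without RFPROC fails with EINVAL, as
	 * does RFFDG | RFCFDG, which asks to both copy and clear the
	 * descriptor table.
	 */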

	p1 = td->td_proc;

	/*
	 * Here we don't create a new process, but we divorce
	 * certain parts of a process from itself.
	 */
	if ((flags & RFPROC) == 0) {
		if (fr->fr_procp != NULL)
			*fr->fr_procp = NULL;
		else if (fr->fr_pidp != NULL)
			*fr->fr_pidp = 0;
		return (fork_norfproc(td, flags));
	}

	fp_procdesc = NULL;
	newproc = NULL;
	vm2 = NULL;

	/*
	 * Increment the nprocs resource before allocations occur.
	 * Although process entries are dynamically created, we still
	 * keep a global limit on the maximum number we will
	 * create. There are hard-limits as to the number of processes
	 * that can run, established by the KVA and memory usage for
	 * the process data.
	 *
	 * Don't allow a nonprivileged user to use the last ten
	 * processes; don't let root exceed the limit.
	 */
	nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1;
	if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred,
	    PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) {
		error = EAGAIN;
		sx_xlock(&allproc_lock);
		if (ppsratecheck(&lastfail, &curfail, 1)) {
			printf("maxproc limit exceeded by uid %u (pid %d); "
			    "see tuning(7) and login.conf(5)\n",
			    td->td_ucred->cr_ruid, p1->p_pid);
		}
		sx_xunlock(&allproc_lock);
		goto fail2;
	}

	/*
	 * If required, create a process descriptor in the parent first; we
	 * will abandon it if something goes wrong. We don't finit() until
	 * later.
	 */
	if (flags & RFPROCDESC) {
		error = procdesc_falloc(td, &fp_procdesc, fr->fr_pd_fd,
		    fr->fr_pd_flags, fr->fr_pd_fcaps);
		if (error != 0)
			goto fail2;
	}

	mem_charged = 0;
	if (pages == 0)
		pages = kstack_pages;
	/* Allocate new proc. */
	newproc = uma_zalloc(proc_zone, M_WAITOK);
	td2 = FIRST_THREAD_IN_PROC(newproc);
	if (td2 == NULL) {
		td2 = thread_alloc(pages);
		if (td2 == NULL) {
			error = ENOMEM;
			goto fail2;
		}
		proc_linkup(newproc, td2);
	} else {
		if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) {
			if (td2->td_kstack != 0)
				vm_thread_dispose(td2);
			if (!thread_alloc_stack(td2, pages)) {
				error = ENOMEM;
				goto fail2;
			}
		}
	}

	if ((flags & RFMEM) == 0) {
		vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
		if (vm2 == NULL) {
			error = ENOMEM;
			goto fail2;
		}
		if (!swap_reserve(mem_charged)) {
			/*
			 * The swap reservation failed. The accounting
			 * from the entries of the copied vm2 will be
			 * subtracted in vmspace_free(), so force the
			 * reservation there.
			 */
			swap_reserve_force(mem_charged);
			error = ENOMEM;
			goto fail2;
		}
	} else
		vm2 = NULL;
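
	/*
	 * At this point vm2 is either a copy-on-write duplicate of the
	 * parent's vmspace (plain fork) or NULL (RFMEM), in which case
	 * vm_forkproc() is expected to share the parent's vmspace by
	 * reference instead.
	 */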

	/*
	 * XXX: This is ugly; when we copy resource usage, we need to bump
	 *      per-cred resource counters.
	 */
	proc_set_cred_init(newproc, crhold(td->td_ucred));

	/*
	 * Initialize resource accounting for the child process.
	 */
	error = racct_proc_fork(p1, newproc);
	if (error != 0) {
		error = EAGAIN;
		goto fail1;
	}

#ifdef MAC
	mac_proc_init(newproc);
#endif
	newproc->p_klist = knlist_alloc(&newproc->p_mtx);
	STAILQ_INIT(&newproc->p_ktr);

	/* We have to lock the process tree while we look for a pid. */
	sx_slock(&proctree_lock);
	sx_xlock(&allproc_lock);

	/*
	 * Increment the count of procs running with this uid. Don't allow
	 * a nonprivileged user to exceed their current limit.
	 *
	 * XXXRW: Can we avoid privilege here if it's not needed?
	 */
	error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
	if (error == 0)
		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
	else {
		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
		    lim_cur(td, RLIMIT_NPROC));
	}
	if (ok) {
		do_fork(td, fr, newproc, td2, vm2, fp_procdesc);
		return (0);
	}

	error = EAGAIN;
	sx_sunlock(&proctree_lock);
	sx_xunlock(&allproc_lock);
#ifdef MAC
	mac_proc_destroy(newproc);
#endif
	racct_proc_exit(newproc);
fail1:
	crfree(newproc->p_ucred);
	newproc->p_ucred = NULL;
fail2:
	if (vm2 != NULL)
		vmspace_free(vm2);
	uma_zfree(proc_zone, newproc);
	if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) {
		fdclose(td, fp_procdesc, *fr->fr_pd_fd);
		fdrop(fp_procdesc, td);
	}
	atomic_add_int(&nprocs, -1);
	pause("fork", hz / 2);
	return (error);
}
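
/*
 * Note on the failure path above: fork1() pauses for half a second
 * ("fork", hz / 2) before returning an error, which among other
 * things throttles tight retry loops once a process limit is hit.
 */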

/*
 * Handle the return of a child process from fork1().  This function
 * is called from the MD fork_trampoline() entry point.
 */
void
fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
    struct trapframe *frame)
{
	struct proc *p;
	struct thread *td;
	struct thread *dtd;

	td = curthread;
	p = td->td_proc;
	KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));

	CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
	    td, td_get_sched(td), p->p_pid, td->td_name);

	sched_fork_exit(td);
	/*
	 * Processes normally resume in mi_switch() after being
	 * cpu_switch()'ed to, but when children start up they arrive here
	 * instead, so we must do much the same things as mi_switch() would.
	 */
	if ((dtd = PCPU_GET(deadthread))) {
		PCPU_SET(deadthread, NULL);
		thread_stash(dtd);
	}
	thread_unlock(td);

	/*
	 * cpu_fork_kthread_handler intercepts this function call to
	 * have this call a non-return function to stay in kernel mode.
	 * initproc has its own fork handler, but it does return.
	 */
	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
	callout(arg, frame);

	/*
	 * Check if a kernel thread misbehaved and returned from its main
	 * function.
	 */
	if (p->p_flag & P_KPROC) {
		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
		    td->td_name, p->p_pid);
		kthread_exit();
	}
	mtx_assert(&Giant, MA_NOTOWNED);

	if (p->p_sysent->sv_schedtail != NULL)
		(p->p_sysent->sv_schedtail)(td);
	td->td_pflags &= ~TDP_FORKING;
}

/*
 * Simplified back end of syscall(), used when returning from fork()
 * directly into user mode.  This function is passed in to fork_exit()
 * as the first parameter and is called when returning to a new
 * userland process.
 */
void
fork_return(struct thread *td, struct trapframe *frame)
{
	struct proc *p, *dbg;

	p = td->td_proc;
	if (td->td_dbgflags & TDB_STOPATFORK) {
		sx_xlock(&proctree_lock);
		PROC_LOCK(p);
		if (p->p_pptr->p_ptevents & PTRACE_FORK) {
			/*
			 * If the debugger still wants auto-attach for the
			 * parent's children, do it now.
			 */
			dbg = p->p_pptr->p_pptr;
			proc_set_traced(p, true);
			CTR2(KTR_PTRACE,
		    "fork_return: attaching to new child pid %d: oppid %d",
			    p->p_pid, p->p_oppid);
			proc_reparent(p, dbg);
			sx_xunlock(&proctree_lock);
			td->td_dbgflags |= TDB_CHILD | TDB_SCX | TDB_FSTP;
			ptracestop(td, SIGSTOP, NULL);
			td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX);
		} else {
			/*
			 * ... otherwise clear the request.
			 */
			sx_xunlock(&proctree_lock);
			td->td_dbgflags &= ~TDB_STOPATFORK;
			cv_broadcast(&p->p_dbgwait);
		}
		PROC_UNLOCK(p);
	} else if (p->p_flag & P_TRACED || td->td_dbgflags & TDB_BORN) {
		/*
		 * This is the start of a new thread in a traced
		 * process.  Report a system call exit event.
		 */
		PROC_LOCK(p);
		td->td_dbgflags |= TDB_SCX;
		_STOPEVENT(p, S_SCX, td->td_sa.code);
		if ((p->p_ptevents & PTRACE_SCX) != 0 ||
		    (td->td_dbgflags & TDB_BORN) != 0)
			ptracestop(td, SIGTRAP, NULL);
		td->td_dbgflags &= ~(TDB_SCX | TDB_BORN);
		PROC_UNLOCK(p);
	}

	userret(td, frame);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSRET))
		ktrsysret(SYS_fork, 0, 0);
#endif
}
1122