/*-
 * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
 *  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice(s), this list of conditions and the following disclaimer as
 *    the first lines of this file unmodified other than the possible
 *    addition of one or more copyright notices.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice(s), this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

#include "opt_witness.h"
#include "opt_hwpmc_hooks.h"

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/kern_thread.c 331017 2018-03-15 19:08:33Z kevans $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rangelock.h>
#include <sys/resourcevar.h>
#include <sys/sdt.h>
#include <sys/smp.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/selinfo.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/turnstile.h>
#include <sys/ktr.h>
#include <sys/rwlock.h>
#include <sys/umtx.h>
#include <sys/vmmeter.h>
#include <sys/cpuset.h>
#ifdef	HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#include <security/audit/audit.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/vm_domain.h>
#include <sys/eventhandler.h>

/*
 * Asserts below verify the stability of struct thread and struct proc
 * layout, as exposed by KBI to modules.  On head, the KBI is allowed
 * to drift; changes to the structures must be accompanied by the
 * corresponding assert updates.
 *
 * On the stable branches after KBI freeze, conditions must not be
 * violated.  Typically new fields are moved to the end of the
 * structures.
 */
#ifdef __amd64__
_Static_assert(offsetof(struct thread, td_flags) == 0xe4,
    "struct thread KBI td_flags");
_Static_assert(offsetof(struct thread, td_pflags) == 0xec,
    "struct thread KBI td_pflags");
_Static_assert(offsetof(struct thread, td_frame) == 0x418,
    "struct thread KBI td_frame");
_Static_assert(offsetof(struct thread, td_emuldata) == 0x4c0,
    "struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0xb0,
    "struct proc KBI p_flag");
_Static_assert(offsetof(struct proc, p_pid) == 0xbc,
    "struct proc KBI p_pid");
_Static_assert(offsetof(struct proc, p_filemon) == 0x3c0,
    "struct proc KBI p_filemon");
_Static_assert(offsetof(struct proc, p_comm) == 0x3d0,
    "struct proc KBI p_comm");
_Static_assert(offsetof(struct proc, p_emuldata) == 0x4a0,
    "struct proc KBI p_emuldata");
#endif
#ifdef __i386__
_Static_assert(offsetof(struct thread, td_flags) == 0x8c,
    "struct thread KBI td_flags");
_Static_assert(offsetof(struct thread, td_pflags) == 0x94,
    "struct thread KBI td_pflags");
_Static_assert(offsetof(struct thread, td_frame) == 0x2c0,
    "struct thread KBI td_frame");
_Static_assert(offsetof(struct thread, td_emuldata) == 0x30c,
    "struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0x68,
    "struct proc KBI p_flag");
_Static_assert(offsetof(struct proc, p_pid) == 0x74,
    "struct proc KBI p_pid");
_Static_assert(offsetof(struct proc, p_filemon) == 0x268,
    "struct proc KBI p_filemon");
_Static_assert(offsetof(struct proc, p_comm) == 0x274,
    "struct proc KBI p_comm");
_Static_assert(offsetof(struct proc, p_emuldata) == 0x2f4,
    "struct proc KBI p_emuldata");
#endif

SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, , , lwp__exit);

/*
 * Thread related storage.
 */
static uma_zone_t thread_zone;

TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
static struct mtx zombie_lock;
MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN);

static void thread_zombie(struct thread *);
static int thread_unsuspend_one(struct thread *td, struct proc *p,
    bool boundary);

#define TID_BUFFER_SIZE	1024

struct mtx tid_lock;
static struct unrhdr *tid_unrhdr;
static lwpid_t tid_buffer[TID_BUFFER_SIZE];
static int tid_head, tid_tail;
static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash");

struct	tidhashhead *tidhashtbl;
u_long	tidhash;
struct	rwlock tidhash_lock;

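/*
 * Allocate a thread ID, falling back to the buffer of recently freed
 * TIDs when the unr(9) allocator has none available.
 */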
static lwpid_t
tid_alloc(void)
{
	lwpid_t	tid;

	tid = alloc_unr(tid_unrhdr);
	if (tid != -1)
		return (tid);
	mtx_lock(&tid_lock);
	if (tid_head == tid_tail) {
		mtx_unlock(&tid_lock);
		return (-1);
	}
	tid = tid_buffer[tid_head];
	tid_head = (tid_head + 1) % TID_BUFFER_SIZE;
	mtx_unlock(&tid_lock);
	return (tid);
}

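/*
 * Return a thread ID to the buffer of recently freed TIDs.  If the
 * buffer is full, the oldest entry is released back to the unr(9)
 * allocator.
 */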
static void
tid_free(lwpid_t tid)
{
	lwpid_t tmp_tid = -1;

	mtx_lock(&tid_lock);
	if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) {
		tmp_tid = tid_buffer[tid_head];
		tid_head = (tid_head + 1) % TID_BUFFER_SIZE;
	}
	tid_buffer[tid_tail] = tid;
	tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE;
	mtx_unlock(&tid_lock);
	if (tmp_tid != -1)
		free_unr(tid_unrhdr, tmp_tid);
}

/*
 * Prepare a thread for use.
 */
static int
thread_ctor(void *mem, int size, void *arg, int flags)
{
	struct thread	*td;

	td = (struct thread *)mem;
	td->td_state = TDS_INACTIVE;
	td->td_oncpu = NOCPU;

	td->td_tid = tid_alloc();

	/*
	 * Note that td_critnest begins life as 1 because the thread is not
	 * running and is thereby implicitly waiting to be on the receiving
	 * end of a context switch.
	 */
	td->td_critnest = 1;
	td->td_lend_user_pri = PRI_MAX;
	EVENTHANDLER_INVOKE(thread_ctor, td);
#ifdef AUDIT
	audit_thread_alloc(td);
#endif
	umtx_thread_alloc(td);
	return (0);
}

/*
 * Reclaim a thread after use.
 */
static void
thread_dtor(void *mem, int size, void *arg)
{
	struct thread *td;

	td = (struct thread *)mem;

#ifdef INVARIANTS
	/* Verify that this thread is in a safe state to free. */
	switch (td->td_state) {
	case TDS_INHIBITED:
	case TDS_RUNNING:
	case TDS_CAN_RUN:
	case TDS_RUNQ:
		/*
		 * We must never unlink a thread that is in one of
		 * these states, because it is currently active.
		 */
		panic("bad state for thread unlinking");
		/* NOTREACHED */
	case TDS_INACTIVE:
		break;
	default:
		panic("bad thread state");
		/* NOTREACHED */
	}
#endif
#ifdef AUDIT
	audit_thread_free(td);
#endif
	/* Free all OSD associated to this thread. */
	osd_thread_exit(td);
	td_softdep_cleanup(td);
	MPASS(td->td_su == NULL);

	EVENTHANDLER_INVOKE(thread_dtor, td);
	tid_free(td->td_tid);
}

/*
 * Initialize type-stable parts of a thread (when newly created).
 */
static int
thread_init(void *mem, int size, int flags)
{
	struct thread *td;

	td = (struct thread *)mem;

	td->td_sleepqueue = sleepq_alloc();
	td->td_turnstile = turnstile_alloc();
	td->td_rlqe = NULL;
	EVENTHANDLER_INVOKE(thread_init, td);
	umtx_thread_init(td);
	td->td_kstack = 0;
	td->td_sel = NULL;
	return (0);
}

/*
 * Tear down type-stable parts of a thread (just before being discarded).
 */
static void
thread_fini(void *mem, int size)
{
	struct thread *td;

	td = (struct thread *)mem;
	EVENTHANDLER_INVOKE(thread_fini, td);
	rlqentry_free(td->td_rlqe);
	turnstile_free(td->td_turnstile);
	sleepq_free(td->td_sleepqueue);
	umtx_thread_fini(td);
	seltdfini(td);
}

/*
 * For a newly created process,
 * link up all the structures and its initial threads etc.
 * called from:
 * {arch}/{arch}/machdep.c   {arch}_init(), init386() etc.
 * proc_dtor() (should go away)
 * proc_init()
 */
void
proc_linkup0(struct proc *p, struct thread *td)
{
	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
	proc_linkup(p, td);
}

void
proc_linkup(struct proc *p, struct thread *td)
{

	sigqueue_init(&p->p_sigqueue, p);
	p->p_ksi = ksiginfo_alloc(1);
	if (p->p_ksi != NULL) {
		/* XXX p_ksi may be null if ksiginfo zone is not ready */
		p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
	}
	LIST_INIT(&p->p_mqnotifier);
	p->p_numthreads = 0;
	thread_link(td, p);
}

/*
 * Initialize global thread allocation resources.
 */
void
threadinit(void)
{

	mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF);

	/*
	 * pid_max cannot be greater than PID_MAX.
	 * leave one number for thread0.
	 */
	tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock);

	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
	    thread_ctor, thread_dtor, thread_init, thread_fini,
	    32 - 1, UMA_ZONE_NOFREE);
	tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash);
	rw_init(&tidhash_lock, "tidhash");
}

/*
 * Place an unused thread on the zombie list.
 * Use the slpq as that must be unused by now.
 */
void
thread_zombie(struct thread *td)
{
	mtx_lock_spin(&zombie_lock);
	TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq);
	mtx_unlock_spin(&zombie_lock);
}

/*
 * Release a thread that has exited after cpu_throw().
 */
void
thread_stash(struct thread *td)
{
	atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
	thread_zombie(td);
}

/*
 * Reap zombie resources.
 */
void
thread_reap(void)
{
	struct thread *td_first, *td_next;

	/*
	 * Don't even bother to lock if none at this instant,
	 * we really don't care about the next instant.
	 */
	if (!TAILQ_EMPTY(&zombie_threads)) {
		mtx_lock_spin(&zombie_lock);
		td_first = TAILQ_FIRST(&zombie_threads);
		if (td_first)
			TAILQ_INIT(&zombie_threads);
		mtx_unlock_spin(&zombie_lock);
		while (td_first) {
			td_next = TAILQ_NEXT(td_first, td_slpq);
			thread_cow_free(td_first);
			thread_free(td_first);
			td_first = td_next;
		}
	}
}

/*
 * Allocate a thread.
 */
struct thread *
thread_alloc(int pages)
{
	struct thread *td;

	thread_reap(); /* check if any zombies to get */

	td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK);
	KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack"));
	if (!vm_thread_new(td, pages)) {
		uma_zfree(thread_zone, td);
		return (NULL);
	}
	cpu_thread_alloc(td);
	vm_domain_policy_init(&td->td_vm_dom_policy);
	return (td);
}

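/*
 * Allocate a kernel stack for a thread that does not have one yet.
 * Returns 1 on success and 0 if the stack could not be allocated.
 */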
int
thread_alloc_stack(struct thread *td, int pages)
{

	KASSERT(td->td_kstack == 0,
	    ("thread_alloc_stack called on a thread with kstack"));
	if (!vm_thread_new(td, pages))
		return (0);
	cpu_thread_alloc(td);
	return (1);
}

/*
 * Deallocate a thread.
 */
void
thread_free(struct thread *td)
{

	lock_profile_thread_exit(td);
	if (td->td_cpuset)
		cpuset_rel(td->td_cpuset);
	td->td_cpuset = NULL;
	cpu_thread_free(td);
	if (td->td_kstack != 0)
		vm_thread_dispose(td);
	vm_domain_policy_cleanup(&td->td_vm_dom_policy);
	callout_drain(&td->td_slpcallout);
	uma_zfree(thread_zone, td);
}

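/*
 * Take copy-on-write references to the process credentials and
 * resource limits for a new thread.
 */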
void
thread_cow_get_proc(struct thread *newtd, struct proc *p)
{

	PROC_LOCK_ASSERT(p, MA_OWNED);
	newtd->td_ucred = crhold(p->p_ucred);
	newtd->td_limit = lim_hold(p->p_limit);
	newtd->td_cowgen = p->p_cowgen;
}

void
thread_cow_get(struct thread *newtd, struct thread *td)
{

	newtd->td_ucred = crhold(td->td_ucred);
	newtd->td_limit = lim_hold(td->td_limit);
	newtd->td_cowgen = td->td_cowgen;
}

void
thread_cow_free(struct thread *td)
{

	if (td->td_ucred != NULL)
		crfree(td->td_ucred);
	if (td->td_limit != NULL)
		lim_free(td->td_limit);
}

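/*
 * Refresh the thread's cached credential and resource limit references
 * so that they match the current process copies.
 */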
void
thread_cow_update(struct thread *td)
{
	struct proc *p;
	struct ucred *oldcred;
	struct plimit *oldlimit;

	p = td->td_proc;
	oldcred = NULL;
	oldlimit = NULL;
	PROC_LOCK(p);
	if (td->td_ucred != p->p_ucred) {
		oldcred = td->td_ucred;
		td->td_ucred = crhold(p->p_ucred);
	}
	if (td->td_limit != p->p_limit) {
		oldlimit = td->td_limit;
		td->td_limit = lim_hold(p->p_limit);
	}
	td->td_cowgen = p->p_cowgen;
	PROC_UNLOCK(p);
	if (oldcred != NULL)
		crfree(oldcred);
	if (oldlimit != NULL)
		lim_free(oldlimit);
}

/*
 * Discard the current thread and exit from its context.
 * Always called with scheduler locked.
 *
 * Because we can't free a thread while we're operating under its context,
 * push the current thread into our CPU's deadthread holder. This means
 * we needn't worry about someone else grabbing our context before we
 * do a cpu_throw().
 */
void
thread_exit(void)
{
	uint64_t runtime, new_switchtime;
	struct thread *td;
	struct thread *td2;
	struct proc *p;
	int wakeup_swapper;

	td = curthread;
	p = td->td_proc;

	PROC_SLOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&Giant, MA_NOTOWNED);

	PROC_LOCK_ASSERT(p, MA_OWNED);
	KASSERT(p != NULL, ("thread exiting without a process"));
	CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
	    (long)p->p_pid, td->td_name);
	SDT_PROBE0(proc, , , lwp__exit);
	KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));

#ifdef AUDIT
	AUDIT_SYSCALL_EXIT(0, td);
#endif
	/*
	 * Drop FPU & debug register state storage, or any other
	 * architecture-specific resources that
	 * would not be present in a new, untouched process.
	 */
	cpu_thread_exit(td);

	/*
	 * The last thread is left attached to the process
	 * so that the whole bundle gets recycled. Skip
	 * all this stuff if we never had threads.
	 * EXIT clears all signs of other threads when
	 * it goes to single threading, so the last thread always
	 * takes the short path.
	 */
	if (p->p_flag & P_HADTHREADS) {
		if (p->p_numthreads > 1) {
			atomic_add_int(&td->td_proc->p_exitthreads, 1);
			thread_unlink(td);
			td2 = FIRST_THREAD_IN_PROC(p);
			sched_exit_thread(td2, td);

			/*
			 * The test below is NOT true if we are the
			 * sole exiting thread. P_STOPPED_SINGLE is unset
			 * in exit1() after it is the only survivor.
			 */
			if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
				if (p->p_numthreads == p->p_suspcount) {
					thread_lock(p->p_singlethread);
					wakeup_swapper = thread_unsuspend_one(
						p->p_singlethread, p, false);
					thread_unlock(p->p_singlethread);
					if (wakeup_swapper)
						kick_proc0();
				}
			}

			PCPU_SET(deadthread, td);
		} else {
			/*
			 * The last thread is exiting, but not through exit().
			 */
			panic("thread_exit: Last thread exiting on its own");
		}
	}
#ifdef	HWPMC_HOOKS
	/*
	 * If this thread is part of a process that is being tracked by hwpmc(4),
	 * inform the module of the thread's impending exit.
	 */
	if (PMC_PROC_IS_USING_PMCS(td->td_proc))
		PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
	PROC_UNLOCK(p);
	PROC_STATLOCK(p);
	thread_lock(td);
	PROC_SUNLOCK(p);

	/* Do the same timestamp bookkeeping that mi_switch() would do. */
	new_switchtime = cpu_ticks();
	runtime = new_switchtime - PCPU_GET(switchtime);
	td->td_runtime += runtime;
	td->td_incruntime += runtime;
	PCPU_SET(switchtime, new_switchtime);
	PCPU_SET(switchticks, ticks);
	PCPU_INC(cnt.v_swtch);

	/* Save our resource usage in our process. */
	td->td_ru.ru_nvcsw++;
	ruxagg(p, td);
	rucollect(&p->p_ru, &td->td_ru);
	PROC_STATUNLOCK(p);

	td->td_state = TDS_INACTIVE;
#ifdef WITNESS
	witness_thread_exit(td);
#endif
	CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
	sched_throw(td);
	panic("I'm a teapot!");
	/* NOTREACHED */
}

/*
 * Do any thread-specific cleanups that may be needed in wait().
 * Called with Giant, proc and schedlock not held.
 */
void
thread_wait(struct proc *p)
{
	struct thread *td;

	mtx_assert(&Giant, MA_NOTOWNED);
	KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()"));
	KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking"));
	td = FIRST_THREAD_IN_PROC(p);
	/* Lock the last thread so we spin until it exits cpu_throw(). */
	thread_lock(td);
	thread_unlock(td);
	lock_profile_thread_exit(td);
	cpuset_rel(td->td_cpuset);
	td->td_cpuset = NULL;
	cpu_thread_clean(td);
	thread_cow_free(td);
	callout_drain(&td->td_slpcallout);
	thread_reap();	/* check for zombie threads etc. */
}

/*
 * Link a thread to a process.
 * Set up anything that needs to be initialized for it to
 * be used by the process.
 */
void
thread_link(struct thread *td, struct proc *p)
{

	/*
	 * XXX This can't be enabled because it's called for proc0 before
	 * its lock has been created.
	 * PROC_LOCK_ASSERT(p, MA_OWNED);
	 */
	td->td_state    = TDS_INACTIVE;
	td->td_proc     = p;
	td->td_flags    = TDF_INMEM;

	LIST_INIT(&td->td_contested);
	LIST_INIT(&td->td_lprof[0]);
	LIST_INIT(&td->td_lprof[1]);
	sigqueue_init(&td->td_sigqueue, p);
	callout_init(&td->td_slpcallout, 1);
	TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist);
	p->p_numthreads++;
}

/*
 * Called from:
 *  thread_exit()
 */
void
thread_unlink(struct thread *td)
{
	struct proc *p = td->td_proc;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	TAILQ_REMOVE(&p->p_threads, td, td_plist);
	p->p_numthreads--;
	/* could clear a few other things here */
	/* Must NOT clear links to proc! */
}

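/*
 * Count the threads that still stand in the way of the single-threading
 * request for the given mode.
 */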
static int
calc_remaining(struct proc *p, int mode)
{
	int remaining;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	PROC_SLOCK_ASSERT(p, MA_OWNED);
	if (mode == SINGLE_EXIT)
		remaining = p->p_numthreads;
	else if (mode == SINGLE_BOUNDARY)
		remaining = p->p_numthreads - p->p_boundary_count;
	else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC)
		remaining = p->p_numthreads - p->p_suspcount;
	else
		panic("calc_remaining: wrong mode %d", mode);
	return (remaining);
}

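/*
 * The number of threads that may remain once single-threading is in
 * effect: zero for SINGLE_ALLPROC (the caller belongs to another
 * process), otherwise just the requesting thread.
 */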
static int
remain_for_mode(int mode)
{

	return (mode == SINGLE_ALLPROC ? 0 : 1);
}

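/*
 * Wake up or unsuspend an inhibited thread so that it can proceed to
 * the point required by the single-threading mode.  Returns non-zero
 * if the swapper process needs to be woken up.
 */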
static int
weed_inhib(int mode, struct thread *td2, struct proc *p)
{
	int wakeup_swapper;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	PROC_SLOCK_ASSERT(p, MA_OWNED);
	THREAD_LOCK_ASSERT(td2, MA_OWNED);

	wakeup_swapper = 0;
	switch (mode) {
	case SINGLE_EXIT:
		if (TD_IS_SUSPENDED(td2))
			wakeup_swapper |= thread_unsuspend_one(td2, p, true);
		if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0)
			wakeup_swapper |= sleepq_abort(td2, EINTR);
		break;
	case SINGLE_BOUNDARY:
	case SINGLE_NO_EXIT:
		if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0)
			wakeup_swapper |= thread_unsuspend_one(td2, p, false);
		if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0)
			wakeup_swapper |= sleepq_abort(td2, ERESTART);
		break;
	case SINGLE_ALLPROC:
		/*
		 * ALLPROC suspend tries to avoid spurious EINTR for
		 * threads sleeping interruptibly, by suspending the
		 * thread directly, similarly to sig_suspend_threads().
		 * Since such sleep is not performed at the user
		 * boundary, the TDF_BOUNDARY flag is not set, and
		 * TDF_ALLPROCSUSP is used to avoid immediate un-suspend.
		 */
		if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY |
		    TDF_ALLPROCSUSP)) == 0)
			wakeup_swapper |= thread_unsuspend_one(td2, p, false);
		if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) {
			if ((td2->td_flags & TDF_SBDRY) == 0) {
				thread_suspend_one(td2);
				td2->td_flags |= TDF_ALLPROCSUSP;
			} else {
				wakeup_swapper |= sleepq_abort(td2, ERESTART);
			}
		}
		break;
	}
	return (wakeup_swapper);
}

/*
 * Enforce single-threading.
 *
 * Returns 1 if the caller must abort (another thread is waiting to
 * exit the process or similar). Process is locked!
 * Returns 0 when you are successfully the only thread running.
 * A process has successfully single threaded in the suspend mode when
 * there are no threads in user mode. Threads in the kernel must be
 * allowed to continue until they get to the user boundary. They may even
 * copy out their return values and data before suspending. They may however be
 * accelerated in reaching the user boundary as we will wake up
 * any sleeping threads that are interruptible (PCATCH).
 */
int
thread_single(struct proc *p, int mode)
{
	struct thread *td;
	struct thread *td2;
	int remaining, wakeup_swapper;

	td = curthread;
	KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
	    mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
	    ("invalid mode %d", mode));
	/*
	 * If allowing non-ALLPROC singlethreading for non-curproc
	 * callers, calc_remaining() and remain_for_mode() should be
	 * adjusted to also account for td->td_proc != p.  For now
	 * this is not implemented because it is not used.
	 */
	KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) ||
	    (mode != SINGLE_ALLPROC && td->td_proc == p),
	    ("mode %d proc %p curproc %p", mode, p, td->td_proc));
	mtx_assert(&Giant, MA_NOTOWNED);
	PROC_LOCK_ASSERT(p, MA_OWNED);

	if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC)
		return (0);

	/* Is someone already single threading? */
	if (p->p_singlethread != NULL && p->p_singlethread != td)
		return (1);

	if (mode == SINGLE_EXIT) {
		p->p_flag |= P_SINGLE_EXIT;
		p->p_flag &= ~P_SINGLE_BOUNDARY;
	} else {
		p->p_flag &= ~P_SINGLE_EXIT;
		if (mode == SINGLE_BOUNDARY)
			p->p_flag |= P_SINGLE_BOUNDARY;
		else
			p->p_flag &= ~P_SINGLE_BOUNDARY;
	}
	if (mode == SINGLE_ALLPROC)
		p->p_flag |= P_TOTAL_STOP;
	p->p_flag |= P_STOPPED_SINGLE;
	PROC_SLOCK(p);
	p->p_singlethread = td;
	remaining = calc_remaining(p, mode);
	while (remaining != remain_for_mode(mode)) {
		if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
			goto stopme;
		wakeup_swapper = 0;
		FOREACH_THREAD_IN_PROC(p, td2) {
			if (td2 == td)
				continue;
			thread_lock(td2);
			td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
			if (TD_IS_INHIBITED(td2)) {
				wakeup_swapper |= weed_inhib(mode, td2, p);
#ifdef SMP
			} else if (TD_IS_RUNNING(td2) && td != td2) {
				forward_signal(td2);
#endif
			}
			thread_unlock(td2);
		}
		if (wakeup_swapper)
			kick_proc0();
		remaining = calc_remaining(p, mode);

		/*
		 * Maybe we suspended some threads.. was it enough?
		 */
		if (remaining == remain_for_mode(mode))
			break;

stopme:
		/*
		 * Wake us up when everyone else has suspended.
		 * In the meantime we suspend as well.
		 */
		thread_suspend_switch(td, p);
		remaining = calc_remaining(p, mode);
	}
	if (mode == SINGLE_EXIT) {
		/*
		 * Convert the process to an unthreaded process.  The
		 * SINGLE_EXIT is called by exit1() or execve(), in
		 * both cases other threads must be retired.
		 */
		KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads"));
		p->p_singlethread = NULL;
		p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS);

		/*
		 * Wait for any remaining threads to exit cpu_throw().
		 */
		while (p->p_exitthreads != 0) {
			PROC_SUNLOCK(p);
			PROC_UNLOCK(p);
			sched_relinquish(td);
			PROC_LOCK(p);
			PROC_SLOCK(p);
		}
	} else if (mode == SINGLE_BOUNDARY) {
		/*
		 * Wait until all suspended threads are removed from
		 * the processors.  The thread_suspend_check()
		 * increments p_boundary_count while it is still
		 * running, so without this wait execve() could
		 * destroy the vmspace while our other threads are
		 * still using the address space.
		 *
		 * We lock the thread, which is only allowed to
		 * succeed after context switch code finished using
		 * the address space.
		 */
		FOREACH_THREAD_IN_PROC(p, td2) {
			if (td2 == td)
				continue;
			thread_lock(td2);
			KASSERT((td2->td_flags & TDF_BOUNDARY) != 0,
			    ("td %p not on boundary", td2));
			KASSERT(TD_IS_SUSPENDED(td2),
			    ("td %p is not suspended", td2));
			thread_unlock(td2);
		}
	}
	PROC_SUNLOCK(p);
	return (0);
}

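/*
 * Report whether the current thread has a suspension to handle, either
 * a pending stop on the process or a debugger-requested suspension.
 */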
bool
thread_suspend_check_needed(void)
{
	struct proc *p;
	struct thread *td;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 &&
	    (td->td_dbgflags & TDB_SUSPEND) != 0));
}

/*
 * Called from locations that can safely check to see
 * whether we have to suspend or at least throttle for a
 * single-thread event (e.g. fork).
 *
 * Such locations include userret().
 * If the "return_instead" argument is non-zero, the thread must be able to
 * accept 0 (caller may continue), or 1 (caller must abort) as a result.
 *
 * The 'return_instead' argument tells the function if it may do a
 * thread_exit() or suspend, or whether the caller must abort and back
 * out instead.
 *
 * If the thread that set the single_threading request has set the
 * P_SINGLE_EXIT bit in the process flags then this call will never return
 * if 'return_instead' is false, but will exit.
 *
 * P_SINGLE_EXIT | return_instead == 0 | return_instead != 0
 *---------------+---------------------+---------------------
 *       0       | returns 0           |   returns 0 or 1
 *               | when ST ends        |   immediately
 *---------------+---------------------+---------------------
 *       1       | thread exits        |   returns 1
 *               |                     |   immediately
 * 0 = thread_exit() or suspension ok,
 * other = return error instead of stopping the thread.
 *
 * While a full suspension is under effect, even a single threading
 * thread would be suspended if it made this call (but it shouldn't).
 * This call should only be made from places where
 * thread_exit() would be safe as that may be the outcome unless
 * return_instead is set.
 */
int
thread_suspend_check(int return_instead)
{
	struct thread *td;
	struct proc *p;
	int wakeup_swapper;

	td = curthread;
	p = td->td_proc;
	mtx_assert(&Giant, MA_NOTOWNED);
	PROC_LOCK_ASSERT(p, MA_OWNED);
	while (thread_suspend_check_needed()) {
		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
			KASSERT(p->p_singlethread != NULL,
			    ("singlethread not set"));
			/*
			 * The only suspension in action is a
			 * single-threading. The single threader need not stop.
			 * It is safe to access p->p_singlethread unlocked
			 * because it can only be set to our address by us.
			 */
			if (p->p_singlethread == td)
				return (0);	/* Exempt from stopping. */
		}
		if ((p->p_flag & P_SINGLE_EXIT) && return_instead)
			return (EINTR);

		/* Should we goto user boundary if we didn't come from there? */
		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
		    (p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
			return (ERESTART);

		/*
		 * Ignore suspend requests if they are deferred.
		 */
		if ((td->td_flags & TDF_SBDRY) != 0) {
			KASSERT(return_instead,
			    ("TDF_SBDRY set for unsafe thread_suspend_check"));
			KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) !=
			    (TDF_SEINTR | TDF_SERESTART),
			    ("both TDF_SEINTR and TDF_SERESTART"));
			return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0);
		}

		/*
		 * If the process is waiting for us to exit,
		 * this thread should just suicide.
		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
		 */
		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
			PROC_UNLOCK(p);

			/*
			 * Allow Linux emulation layer to do some work
			 * before thread suicide.
			 */
			if (__predict_false(p->p_sysent->sv_thread_detach != NULL))
				(p->p_sysent->sv_thread_detach)(td);
			umtx_thread_exit(td);
			kern_thr_exit(td);
			panic("stopped thread did not exit");
		}

		PROC_SLOCK(p);
		thread_stopped(p);
		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
			if (p->p_numthreads == p->p_suspcount + 1) {
				thread_lock(p->p_singlethread);
				wakeup_swapper = thread_unsuspend_one(
				    p->p_singlethread, p, false);
				thread_unlock(p->p_singlethread);
				if (wakeup_swapper)
					kick_proc0();
			}
		}
		PROC_UNLOCK(p);
		thread_lock(td);
		/*
		 * When a thread suspends, it just
		 * gets taken off all queues.
		 */
		thread_suspend_one(td);
		if (return_instead == 0) {
			p->p_boundary_count++;
			td->td_flags |= TDF_BOUNDARY;
		}
		PROC_SUNLOCK(p);
		mi_switch(SW_INVOL | SWT_SUSPEND, NULL);
		thread_unlock(td);
		PROC_LOCK(p);
	}
	return (0);
}

void
thread_suspend_switch(struct thread *td, struct proc *p)
{

	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
	PROC_LOCK_ASSERT(p, MA_OWNED);
	PROC_SLOCK_ASSERT(p, MA_OWNED);
	/*
	 * We implement thread_suspend_one in stages here to avoid
	 * dropping the proc lock while the thread lock is owned.
	 */
	if (p == td->td_proc) {
		thread_stopped(p);
		p->p_suspcount++;
	}
	PROC_UNLOCK(p);
	thread_lock(td);
	td->td_flags &= ~TDF_NEEDSUSPCHK;
	TD_SET_SUSPENDED(td);
	sched_sleep(td, 0);
	PROC_SUNLOCK(p);
	DROP_GIANT();
	mi_switch(SW_VOL | SWT_SUSPEND, NULL);
	thread_unlock(td);
	PICKUP_GIANT();
	PROC_LOCK(p);
	PROC_SLOCK(p);
}

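/*
 * Mark a thread as suspended.  Called with the process spin lock and
 * the thread lock held.
 */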
void
thread_suspend_one(struct thread *td)
{
	struct proc *p;

	p = td->td_proc;
	PROC_SLOCK_ASSERT(p, MA_OWNED);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
	p->p_suspcount++;
	td->td_flags &= ~TDF_NEEDSUSPCHK;
	TD_SET_SUSPENDED(td);
	sched_sleep(td, 0);
}

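/*
 * Clear a thread's suspension and make it runnable again.  Returns
 * non-zero if the swapper process needs to be woken up.
 */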
static int
thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary)
{

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
	TD_CLR_SUSPENDED(td);
	td->td_flags &= ~TDF_ALLPROCSUSP;
	if (td->td_proc == p) {
		PROC_SLOCK_ASSERT(p, MA_OWNED);
		p->p_suspcount--;
		if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) {
			td->td_flags &= ~TDF_BOUNDARY;
			p->p_boundary_count--;
		}
	}
	return (setrunnable(td));
}

/*
 * Allow all threads blocked by single threading to continue running.
 */
void
thread_unsuspend(struct proc *p)
{
	struct thread *td;
	int wakeup_swapper;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	PROC_SLOCK_ASSERT(p, MA_OWNED);
	wakeup_swapper = 0;
	if (!P_SHOULDSTOP(p)) {
		FOREACH_THREAD_IN_PROC(p, td) {
			thread_lock(td);
			if (TD_IS_SUSPENDED(td)) {
				wakeup_swapper |= thread_unsuspend_one(td, p,
				    true);
			}
			thread_unlock(td);
		}
	} else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
	    p->p_numthreads == p->p_suspcount) {
		/*
		 * Stopping everything also did the job for the single
		 * threading request. Now we've downgraded to single-threaded,
		 * let it continue.
		 */
		if (p->p_singlethread->td_proc == p) {
			thread_lock(p->p_singlethread);
			wakeup_swapper = thread_unsuspend_one(
			    p->p_singlethread, p, false);
			thread_unlock(p->p_singlethread);
		}
	}
	if (wakeup_swapper)
		kick_proc0();
}

/*
 * End the single threading mode.
 */
void
thread_single_end(struct proc *p, int mode)
{
	struct thread *td;
	int wakeup_swapper;

	KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
	    mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
	    ("invalid mode %d", mode));
	PROC_LOCK_ASSERT(p, MA_OWNED);
	KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) ||
	    (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0),
	    ("mode %d does not match P_TOTAL_STOP", mode));
	KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread,
	    ("thread_single_end from other thread %p %p",
	    curthread, p->p_singlethread));
	KASSERT(mode != SINGLE_BOUNDARY ||
	    (p->p_flag & P_SINGLE_BOUNDARY) != 0,
	    ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag));
	p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY |
	    P_TOTAL_STOP);
	PROC_SLOCK(p);
	p->p_singlethread = NULL;
	wakeup_swapper = 0;
	/*
	 * If there are other threads they may now run,
	 * unless of course there is a blanket 'stop order'
	 * on the process. The single threader must be allowed
	 * to continue however as this is a bad place to stop.
	 */
	if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) {
		FOREACH_THREAD_IN_PROC(p, td) {
			thread_lock(td);
			if (TD_IS_SUSPENDED(td)) {
				wakeup_swapper |= thread_unsuspend_one(td, p,
				    mode == SINGLE_BOUNDARY);
			}
			thread_unlock(td);
		}
	}
	KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0,
	    ("inconsistent boundary count %d", p->p_boundary_count));
	PROC_SUNLOCK(p);
	if (wakeup_swapper)
		kick_proc0();
}

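/*
 * Locate a thread in a process by its thread ID.  The proc lock must
 * be held; NULL is returned if no such thread exists.
 */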
struct thread *
thread_find(struct proc *p, lwpid_t tid)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	FOREACH_THREAD_IN_PROC(p, td) {
		if (td->td_tid == tid)
			break;
	}
	return (td);
}

/* Locate a thread by number; return with proc lock held. */
struct thread *
tdfind(lwpid_t tid, pid_t pid)
{
#define RUN_THRESH	16
	struct thread *td;
	int run = 0;

	rw_rlock(&tidhash_lock);
	LIST_FOREACH(td, TIDHASH(tid), td_hash) {
		if (td->td_tid == tid) {
			if (pid != -1 && td->td_proc->p_pid != pid) {
				td = NULL;
				break;
			}
			PROC_LOCK(td->td_proc);
			if (td->td_proc->p_state == PRS_NEW) {
				PROC_UNLOCK(td->td_proc);
				td = NULL;
				break;
			}
			if (run > RUN_THRESH) {
				if (rw_try_upgrade(&tidhash_lock)) {
					LIST_REMOVE(td, td_hash);
					LIST_INSERT_HEAD(TIDHASH(td->td_tid),
						td, td_hash);
					rw_wunlock(&tidhash_lock);
					return (td);
				}
			}
			break;
		}
		run++;
	}
	rw_runlock(&tidhash_lock);
	return (td);
}

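/*
 * Insert the thread into the global TID hash table.
 */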
void
tidhash_add(struct thread *td)
{
	rw_wlock(&tidhash_lock);
	LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
	rw_wunlock(&tidhash_lock);
}

void
tidhash_remove(struct thread *td)
{
	rw_wlock(&tidhash_lock);
	LIST_REMOVE(td, td_hash);
	rw_wunlock(&tidhash_lock);
}