1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
5 *  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice(s), this list of conditions and the following disclaimer as
12 *    the first lines of this file unmodified other than the possible
13 *    addition of one or more copyright notices.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice(s), this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
28 * DAMAGE.
29 */
30
31#include "opt_witness.h"
32#include "opt_hwpmc_hooks.h"
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD$");
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/lock.h>
41#include <sys/mutex.h>
42#include <sys/proc.h>
43#include <sys/bitstring.h>
44#include <sys/epoch.h>
45#include <sys/rangelock.h>
46#include <sys/resourcevar.h>
47#include <sys/sdt.h>
48#include <sys/smp.h>
49#include <sys/sched.h>
50#include <sys/sleepqueue.h>
51#include <sys/selinfo.h>
52#include <sys/syscallsubr.h>
53#include <sys/dtrace_bsd.h>
54#include <sys/sysent.h>
55#include <sys/turnstile.h>
56#include <sys/taskqueue.h>
57#include <sys/ktr.h>
58#include <sys/rwlock.h>
59#include <sys/umtx.h>
60#include <sys/vmmeter.h>
61#include <sys/cpuset.h>
62#ifdef	HWPMC_HOOKS
63#include <sys/pmckern.h>
64#endif
65#include <sys/priv.h>
66
67#include <security/audit/audit.h>
68
69#include <vm/pmap.h>
70#include <vm/vm.h>
71#include <vm/vm_extern.h>
72#include <vm/uma.h>
73#include <vm/vm_phys.h>
74#include <sys/eventhandler.h>
75
/*
 * The asserts below verify the stability of the struct thread and
 * struct proc layouts, as exposed by the KBI to modules.  On head,
 * the KBI is allowed to drift, but changes to the structures must be
 * accompanied by updates to these asserts.
 *
 * On stable branches, after the KBI freeze, the conditions must not
 * be violated.  Typically new fields are added at the end of the
 * structures.
 */
86#ifdef __amd64__
87_Static_assert(offsetof(struct thread, td_flags) == 0xfc,
88    "struct thread KBI td_flags");
89_Static_assert(offsetof(struct thread, td_pflags) == 0x104,
90    "struct thread KBI td_pflags");
91_Static_assert(offsetof(struct thread, td_frame) == 0x4a0,
92    "struct thread KBI td_frame");
93_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0,
94    "struct thread KBI td_emuldata");
95_Static_assert(offsetof(struct proc, p_flag) == 0xb8,
96    "struct proc KBI p_flag");
97_Static_assert(offsetof(struct proc, p_pid) == 0xc4,
98    "struct proc KBI p_pid");
99_Static_assert(offsetof(struct proc, p_filemon) == 0x3c0,
100    "struct proc KBI p_filemon");
101_Static_assert(offsetof(struct proc, p_comm) == 0x3d8,
102    "struct proc KBI p_comm");
103_Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8,
104    "struct proc KBI p_emuldata");
105#endif
106#ifdef __i386__
107_Static_assert(offsetof(struct thread, td_flags) == 0x98,
108    "struct thread KBI td_flags");
109_Static_assert(offsetof(struct thread, td_pflags) == 0xa0,
110    "struct thread KBI td_pflags");
111_Static_assert(offsetof(struct thread, td_frame) == 0x300,
112    "struct thread KBI td_frame");
113_Static_assert(offsetof(struct thread, td_emuldata) == 0x344,
114    "struct thread KBI td_emuldata");
115_Static_assert(offsetof(struct proc, p_flag) == 0x6c,
116    "struct proc KBI p_flag");
117_Static_assert(offsetof(struct proc, p_pid) == 0x78,
118    "struct proc KBI p_pid");
119_Static_assert(offsetof(struct proc, p_filemon) == 0x26c,
120    "struct proc KBI p_filemon");
121_Static_assert(offsetof(struct proc, p_comm) == 0x280,
122    "struct proc KBI p_comm");
123_Static_assert(offsetof(struct proc, p_emuldata) == 0x30c,
124    "struct proc KBI p_emuldata");
125#endif
126
127SDT_PROVIDER_DECLARE(proc);
128SDT_PROBE_DEFINE(proc, , , lwp__exit);
129
/*
 * Thread-related storage.
 */
133static uma_zone_t thread_zone;
134
135struct thread_domain_data {
136	struct thread	*tdd_zombies;
137	int		tdd_reapticks;
138} __aligned(CACHE_LINE_SIZE);
139
140static struct thread_domain_data thread_domain_data[MAXMEMDOM];
141
142static struct task	thread_reap_task;
143static struct callout  	thread_reap_callout;
144
145static void thread_zombie(struct thread *);
146static void thread_reap(void);
147static void thread_reap_all(void);
148static void thread_reap_task_cb(void *, int);
149static void thread_reap_callout_cb(void *);
150static int thread_unsuspend_one(struct thread *td, struct proc *p,
151    bool boundary);
152static void thread_free_batched(struct thread *td);
153
154static __exclusive_cache_line struct mtx tid_lock;
155static bitstr_t *tid_bitmap;
156
157static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash");
158
159static int maxthread;
160SYSCTL_INT(_kern, OID_AUTO, maxthread, CTLFLAG_RDTUN,
161    &maxthread, 0, "Maximum number of threads");
162
163static __exclusive_cache_line int nthreads;
164
165static LIST_HEAD(tidhashhead, thread) *tidhashtbl;
166static u_long	tidhash;
167static u_long	tidhashlock;
168static struct	rwlock *tidhashtbl_lock;
169#define	TIDHASH(tid)		(&tidhashtbl[(tid) & tidhash])
170#define	TIDHASHLOCK(tid)	(&tidhashtbl_lock[(tid) & tidhashlock])
171
172EVENTHANDLER_LIST_DEFINE(thread_ctor);
173EVENTHANDLER_LIST_DEFINE(thread_dtor);
174EVENTHANDLER_LIST_DEFINE(thread_init);
175EVENTHANDLER_LIST_DEFINE(thread_fini);
176
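/*
 * Bookkeeping for the global thread count.  thread_count_inc_try()
 * speculatively bumps nthreads and backs the increment out if the
 * limit would be exceeded; the last 100 slots below maxthread are
 * reserved for privileged callers (PRIV_MAXPROC).
 */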
177static bool
178thread_count_inc_try(void)
179{
180	int nthreads_new;
181
182	nthreads_new = atomic_fetchadd_int(&nthreads, 1) + 1;
183	if (nthreads_new >= maxthread - 100) {
184		if (priv_check_cred(curthread->td_ucred, PRIV_MAXPROC) != 0 ||
185		    nthreads_new >= maxthread) {
186			atomic_subtract_int(&nthreads, 1);
187			return (false);
188		}
189	}
190	return (true);
191}
192
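/*
 * Account for a new thread.  Zombies are reaped first to make room;
 * if the limit is still hit after a full reap, fail and emit a
 * rate-limited warning.
 */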
193static bool
194thread_count_inc(void)
195{
196	static struct timeval lastfail;
197	static int curfail;
198
199	thread_reap();
200	if (thread_count_inc_try()) {
201		return (true);
202	}
203
204	thread_reap_all();
205	if (thread_count_inc_try()) {
206		return (true);
207	}
208
209	if (ppsratecheck(&lastfail, &curfail, 1)) {
210		printf("maxthread limit exceeded by uid %u "
211		    "(pid %d); consider increasing kern.maxthread\n",
212		    curthread->td_ucred->cr_ruid, curproc->p_pid);
213	}
214	return (false);
215}
216
217static void
218thread_count_sub(int n)
219{
220
221	atomic_subtract_int(&nthreads, n);
222}
223
224static void
225thread_count_dec(void)
226{
227
228	thread_count_sub(1);
229}
230
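/*
 * Allocate a thread ID from the bitmap.  The returned value is offset
 * by NO_PID, which keeps thread IDs out of the PID range.  Allocation
 * scans forward from the last allocated bit and wraps around when
 * needed.
 */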
231static lwpid_t
232tid_alloc(void)
233{
234	static lwpid_t trytid;
235	lwpid_t tid;
236
237	mtx_lock(&tid_lock);
238	/*
239	 * It is an invariant that the bitmap is big enough to hold maxthread
240	 * IDs. If we got to this point there has to be at least one free.
241	 */
242	if (trytid >= maxthread)
243		trytid = 0;
244	bit_ffc_at(tid_bitmap, trytid, maxthread, &tid);
245	if (tid == -1) {
246		KASSERT(trytid != 0, ("unexpectedly ran out of IDs"));
247		trytid = 0;
248		bit_ffc_at(tid_bitmap, trytid, maxthread, &tid);
249		KASSERT(tid != -1, ("unexpectedly ran out of IDs"));
250	}
251	bit_set(tid_bitmap, tid);
252	trytid = tid + 1;
253	mtx_unlock(&tid_lock);
254	return (tid + NO_PID);
255}
256
257static void
258tid_free_locked(lwpid_t rtid)
259{
260	lwpid_t tid;
261
262	mtx_assert(&tid_lock, MA_OWNED);
263	KASSERT(rtid >= NO_PID,
264	    ("%s: invalid tid %d\n", __func__, rtid));
265	tid = rtid - NO_PID;
266	KASSERT(bit_test(tid_bitmap, tid) != 0,
267	    ("thread ID %d not allocated\n", rtid));
268	bit_clear(tid_bitmap, tid);
269}
270
271static void
272tid_free(lwpid_t rtid)
273{
274
275	mtx_lock(&tid_lock);
276	tid_free_locked(rtid);
277	mtx_unlock(&tid_lock);
278}
279
280static void
281tid_free_batch(lwpid_t *batch, int n)
282{
283	int i;
284
285	mtx_lock(&tid_lock);
286	for (i = 0; i < n; i++) {
287		tid_free_locked(batch[i]);
288	}
289	mtx_unlock(&tid_lock);
290}
291
/*
 * Batching for thread reaping.
 */
295struct tidbatch {
296	lwpid_t tab[16];
297	int n;
298};
299
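/*
 * The tidbatch helpers accumulate up to nitems(tab) thread IDs and
 * free them with a single acquisition of tid_lock: _prep initializes
 * the batch, _add appends a TID, _process flushes only when the batch
 * is full, and _final flushes whatever remains.
 */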
300static void
301tidbatch_prep(struct tidbatch *tb)
302{
303
304	tb->n = 0;
305}
306
307static void
308tidbatch_add(struct tidbatch *tb, struct thread *td)
309{
310
311	KASSERT(tb->n < nitems(tb->tab),
312	    ("%s: count too high %d", __func__, tb->n));
313	tb->tab[tb->n] = td->td_tid;
314	tb->n++;
315}
316
317static void
318tidbatch_process(struct tidbatch *tb)
319{
320
321	KASSERT(tb->n <= nitems(tb->tab),
322	    ("%s: count too high %d", __func__, tb->n));
323	if (tb->n == nitems(tb->tab)) {
324		tid_free_batch(tb->tab, tb->n);
325		tb->n = 0;
326	}
327}
328
329static void
330tidbatch_final(struct tidbatch *tb)
331{
332
333	KASSERT(tb->n <= nitems(tb->tab),
334	    ("%s: count too high %d", __func__, tb->n));
335	if (tb->n != 0) {
336		tid_free_batch(tb->tab, tb->n);
337	}
338}
339
340/*
341 * Prepare a thread for use.
342 */
343static int
344thread_ctor(void *mem, int size, void *arg, int flags)
345{
346	struct thread	*td;
347
348	td = (struct thread *)mem;
349	td->td_state = TDS_INACTIVE;
350	td->td_lastcpu = td->td_oncpu = NOCPU;
351
352	/*
353	 * Note that td_critnest begins life as 1 because the thread is not
354	 * running and is thereby implicitly waiting to be on the receiving
355	 * end of a context switch.
356	 */
357	td->td_critnest = 1;
358	td->td_lend_user_pri = PRI_MAX;
359#ifdef AUDIT
360	audit_thread_alloc(td);
361#endif
362#ifdef KDTRACE_HOOKS
363	kdtrace_thread_ctor(td);
364#endif
365	umtx_thread_alloc(td);
366	MPASS(td->td_sel == NULL);
367	return (0);
368}
369
370/*
371 * Reclaim a thread after use.
372 */
373static void
374thread_dtor(void *mem, int size, void *arg)
375{
376	struct thread *td;
377
378	td = (struct thread *)mem;
379
380#ifdef INVARIANTS
381	/* Verify that this thread is in a safe state to free. */
382	switch (td->td_state) {
383	case TDS_INHIBITED:
384	case TDS_RUNNING:
385	case TDS_CAN_RUN:
386	case TDS_RUNQ:
387		/*
388		 * We must never unlink a thread that is in one of
389		 * these states, because it is currently active.
390		 */
391		panic("bad state for thread unlinking");
392		/* NOTREACHED */
393	case TDS_INACTIVE:
394		break;
395	default:
396		panic("bad thread state");
397		/* NOTREACHED */
398	}
399#endif
400#ifdef AUDIT
401	audit_thread_free(td);
402#endif
403#ifdef KDTRACE_HOOKS
404	kdtrace_thread_dtor(td);
405#endif
406	/* Free all OSD associated to this thread. */
407	osd_thread_exit(td);
408	td_softdep_cleanup(td);
409	MPASS(td->td_su == NULL);
410	seltdfini(td);
411}
412
413/*
414 * Initialize type-stable parts of a thread (when newly created).
415 */
416static int
417thread_init(void *mem, int size, int flags)
418{
419	struct thread *td;
420
421	td = (struct thread *)mem;
422
423	td->td_allocdomain = vm_phys_domain(vtophys(td));
424	td->td_sleepqueue = sleepq_alloc();
425	td->td_turnstile = turnstile_alloc();
426	td->td_rlqe = NULL;
427	EVENTHANDLER_DIRECT_INVOKE(thread_init, td);
428	umtx_thread_init(td);
429	td->td_kstack = 0;
430	td->td_sel = NULL;
431	return (0);
432}
433
434/*
435 * Tear down type-stable parts of a thread (just before being discarded).
436 */
437static void
438thread_fini(void *mem, int size)
439{
440	struct thread *td;
441
442	td = (struct thread *)mem;
443	EVENTHANDLER_DIRECT_INVOKE(thread_fini, td);
444	rlqentry_free(td->td_rlqe);
445	turnstile_free(td->td_turnstile);
446	sleepq_free(td->td_sleepqueue);
447	umtx_thread_fini(td);
448	MPASS(td->td_sel == NULL);
449}
450
/*
 * For a newly created process, link up all the structures and its
 * initial thread.
 *
 * Called from:
 * {arch}/{arch}/machdep.c   {arch}_init(), init386() etc.
 * proc_dtor() (should go away)
 * proc_init()
 */
459void
460proc_linkup0(struct proc *p, struct thread *td)
461{
462	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
463	proc_linkup(p, td);
464}
465
466void
467proc_linkup(struct proc *p, struct thread *td)
468{
469
470	sigqueue_init(&p->p_sigqueue, p);
471	p->p_ksi = ksiginfo_alloc(1);
472	if (p->p_ksi != NULL) {
473		/* XXX p_ksi may be null if ksiginfo zone is not ready */
474		p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
475	}
476	LIST_INIT(&p->p_mqnotifier);
477	p->p_numthreads = 0;
478	thread_link(td, p);
479}
480
481extern int max_threads_per_proc;
482
483/*
484 * Initialize global thread allocation resources.
485 */
486void
487threadinit(void)
488{
489	u_long i;
490	lwpid_t tid0;
491	uint32_t flags;
492
	/*
	 * Place an upper limit on the number of threads that can be
	 * allocated.
	 *
	 * Note that other factors may make the de facto limit much lower.
	 *
	 * Platform limits are somewhat arbitrary but deemed "more than good
	 * enough" for the foreseeable future.
	 */
501	if (maxthread == 0) {
502#ifdef _LP64
503		maxthread = MIN(maxproc * max_threads_per_proc, 1000000);
504#else
505		maxthread = MIN(maxproc * max_threads_per_proc, 100000);
506#endif
507	}
508
509	mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF);
510	tid_bitmap = bit_alloc(maxthread, M_TIDHASH, M_WAITOK);
511	/*
512	 * Handle thread0.
513	 */
514	thread_count_inc();
515	tid0 = tid_alloc();
516	if (tid0 != THREAD0_TID)
517		panic("tid0 %d != %d\n", tid0, THREAD0_TID);
518
519	flags = UMA_ZONE_NOFREE;
520#ifdef __aarch64__
521	/*
522	 * Force thread structures to be allocated from the direct map.
523	 * Otherwise, superpage promotions and demotions may temporarily
524	 * invalidate thread structure mappings.  For most dynamically allocated
525	 * structures this is not a problem, but translation faults cannot be
526	 * handled without accessing curthread.
527	 */
528	flags |= UMA_ZONE_CONTIG;
529#endif
530	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
531	    thread_ctor, thread_dtor, thread_init, thread_fini,
532	    32 - 1, flags);
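
	/*
	 * Size the TID hash at roughly maxproc / 2 buckets and protect
	 * it with a smaller array of rwlocks, about one lock per 64
	 * buckets; both tidhash and tidhashlock are stored as masks.
	 */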
533	tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash);
534	tidhashlock = (tidhash + 1) / 64;
535	if (tidhashlock > 0)
536		tidhashlock--;
537	tidhashtbl_lock = malloc(sizeof(*tidhashtbl_lock) * (tidhashlock + 1),
538	    M_TIDHASH, M_WAITOK | M_ZERO);
539	for (i = 0; i < tidhashlock + 1; i++)
540		rw_init(&tidhashtbl_lock[i], "tidhash");
541
542	TASK_INIT(&thread_reap_task, 0, thread_reap_task_cb, NULL);
543	callout_init(&thread_reap_callout, 1);
544	callout_reset(&thread_reap_callout, 5 * hz,
545	    thread_reap_callout_cb, NULL);
546}
547
548/*
549 * Place an unused thread on the zombie list.
550 */
551void
552thread_zombie(struct thread *td)
553{
554	struct thread_domain_data *tdd;
555	struct thread *ztd;
556
557	tdd = &thread_domain_data[td->td_allocdomain];
558	ztd = atomic_load_ptr(&tdd->tdd_zombies);
559	for (;;) {
560		td->td_zombie = ztd;
561		if (atomic_fcmpset_rel_ptr((uintptr_t *)&tdd->tdd_zombies,
562		    (uintptr_t *)&ztd, (uintptr_t)td))
563			break;
564		continue;
565	}
566}
567
568/*
569 * Release a thread that has exited after cpu_throw().
570 */
571void
572thread_stash(struct thread *td)
573{
574	atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
575	thread_zombie(td);
576}
577
578/*
579 * Reap zombies from passed domain.
580 */
581static void
582thread_reap_domain(struct thread_domain_data *tdd)
583{
584	struct thread *itd, *ntd;
585	struct tidbatch tidbatch;
586	struct credbatch credbatch;
587	int tdcount;
588	struct plimit *lim;
589	int limcount;
590
591	/*
592	 * Reading upfront is pessimal if followed by concurrent atomic_swap,
593	 * but most of the time the list is empty.
594	 */
595	if (tdd->tdd_zombies == NULL)
596		return;
597
598	itd = (struct thread *)atomic_swap_ptr((uintptr_t *)&tdd->tdd_zombies,
599	    (uintptr_t)NULL);
600	if (itd == NULL)
601		return;
602
	/*
	 * Multiple CPUs can get here; the race is fine, as ticks is
	 * only advisory.
	 */
607	tdd->tdd_reapticks = ticks;
608
609	tidbatch_prep(&tidbatch);
610	credbatch_prep(&credbatch);
611	tdcount = 0;
612	lim = NULL;
613	limcount = 0;
614
615	while (itd != NULL) {
616		ntd = itd->td_zombie;
617		EVENTHANDLER_DIRECT_INVOKE(thread_dtor, itd);
618		tidbatch_add(&tidbatch, itd);
619		credbatch_add(&credbatch, itd);
620		MPASS(itd->td_limit != NULL);
621		if (lim != itd->td_limit) {
622			if (limcount != 0) {
623				lim_freen(lim, limcount);
624				limcount = 0;
625			}
626		}
627		lim = itd->td_limit;
628		limcount++;
629		thread_free_batched(itd);
630		tidbatch_process(&tidbatch);
631		credbatch_process(&credbatch);
632		tdcount++;
633		if (tdcount == 32) {
634			thread_count_sub(tdcount);
635			tdcount = 0;
636		}
637		itd = ntd;
638	}
639
640	tidbatch_final(&tidbatch);
641	credbatch_final(&credbatch);
642	if (tdcount != 0) {
643		thread_count_sub(tdcount);
644	}
645	MPASS(limcount != 0);
646	lim_freen(lim, limcount);
647}
648
649/*
650 * Reap zombies from all domains.
651 */
652static void
653thread_reap_all(void)
654{
655	struct thread_domain_data *tdd;
656	int i, domain;
657
658	domain = PCPU_GET(domain);
659	for (i = 0; i < vm_ndomains; i++) {
660		tdd = &thread_domain_data[(i + domain) % vm_ndomains];
661		thread_reap_domain(tdd);
662	}
663}
664
665/*
666 * Reap zombies from local domain.
667 */
668static void
669thread_reap(void)
670{
671	struct thread_domain_data *tdd;
672	int domain;
673
674	domain = PCPU_GET(domain);
675	tdd = &thread_domain_data[domain];
676
677	thread_reap_domain(tdd);
678}
679
680static void
681thread_reap_task_cb(void *arg __unused, int pending __unused)
682{
683
684	thread_reap_all();
685}
686
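/*
 * Periodic callout: if any domain has had zombies sitting for more
 * than 5 seconds, schedule a full reap on the taskqueue, then re-arm
 * the callout.
 */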
687static void
688thread_reap_callout_cb(void *arg __unused)
689{
690	struct thread_domain_data *tdd;
691	int i, cticks, lticks;
692	bool wantreap;
693
694	wantreap = false;
695	cticks = atomic_load_int(&ticks);
696	for (i = 0; i < vm_ndomains; i++) {
697		tdd = &thread_domain_data[i];
698		lticks = tdd->tdd_reapticks;
699		if (tdd->tdd_zombies != NULL &&
700		    (u_int)(cticks - lticks) > 5 * hz) {
701			wantreap = true;
702			break;
703		}
704	}
705
706	if (wantreap)
707		taskqueue_enqueue(taskqueue_thread, &thread_reap_task);
708	callout_reset(&thread_reap_callout, 5 * hz,
709	    thread_reap_callout_cb, NULL);
710}
711
/*
 * Calling this function guarantees that any thread that exited before
 * the call is reaped when the function returns.  By 'exited' we mean
 * a thread removed from the process linkage with thread_unlink().
 * Practically, this means that the caller must lock/unlock the
 * corresponding process lock before the call, to synchronize with
 * thread_exit().
 */
719void
720thread_reap_barrier(void)
721{
722	struct task *t;
723
724	/*
725	 * First do context switches to each CPU to ensure that all
726	 * PCPU pc_deadthreads are moved to zombie list.
727	 */
728	quiesce_all_cpus("", PDROP);
729
730	/*
731	 * Second, fire the task in the same thread as normal
732	 * thread_reap() is done, to serialize reaping.
733	 */
734	t = malloc(sizeof(*t), M_TEMP, M_WAITOK);
735	TASK_INIT(t, 0, thread_reap_task_cb, t);
736	taskqueue_enqueue(taskqueue_thread, t);
737	taskqueue_drain(taskqueue_thread, t);
738	free(t, M_TEMP);
739}
740
741/*
742 * Allocate a thread.
743 */
744struct thread *
745thread_alloc(int pages)
746{
747	struct thread *td;
748	lwpid_t tid;
749
750	if (!thread_count_inc()) {
751		return (NULL);
752	}
753
754	tid = tid_alloc();
755	td = uma_zalloc(thread_zone, M_WAITOK);
756	KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack"));
757	if (!vm_thread_new(td, pages)) {
758		uma_zfree(thread_zone, td);
759		tid_free(tid);
760		thread_count_dec();
761		return (NULL);
762	}
763	td->td_tid = tid;
764	cpu_thread_alloc(td);
765	EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td);
766	return (td);
767}
768
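/*
 * Allocate a kernel stack for a thread that does not yet have one.
 * Returns 1 on success and 0 on failure.
 */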
769int
770thread_alloc_stack(struct thread *td, int pages)
771{
772
773	KASSERT(td->td_kstack == 0,
774	    ("thread_alloc_stack called on a thread with kstack"));
775	if (!vm_thread_new(td, pages))
776		return (0);
777	cpu_thread_alloc(td);
778	return (1);
779}
780
781/*
782 * Deallocate a thread.
783 */
784static void
785thread_free_batched(struct thread *td)
786{
787
788	lock_profile_thread_exit(td);
789	if (td->td_cpuset)
790		cpuset_rel(td->td_cpuset);
791	td->td_cpuset = NULL;
792	cpu_thread_free(td);
793	if (td->td_kstack != 0)
794		vm_thread_dispose(td);
795	callout_drain(&td->td_slpcallout);
	/*
	 * Freeing of the tid is handled by the caller.
	 */
799	td->td_tid = -1;
800	uma_zfree(thread_zone, td);
801}
802
803void
804thread_free(struct thread *td)
805{
806	lwpid_t tid;
807
808	EVENTHANDLER_DIRECT_INVOKE(thread_dtor, td);
809	tid = td->td_tid;
810	thread_free_batched(td);
811	tid_free(tid);
812	thread_count_dec();
813}
814
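/*
 * Give a new thread copy-on-write references to the process
 * credentials and resource limits, and record the COW generation so
 * that later changes can be detected.
 */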
815void
816thread_cow_get_proc(struct thread *newtd, struct proc *p)
817{
818
819	PROC_LOCK_ASSERT(p, MA_OWNED);
820	newtd->td_realucred = crcowget(p->p_ucred);
821	newtd->td_ucred = newtd->td_realucred;
822	newtd->td_limit = lim_hold(p->p_limit);
823	newtd->td_cowgen = p->p_cowgen;
824}
825
826void
827thread_cow_get(struct thread *newtd, struct thread *td)
828{
829
830	MPASS(td->td_realucred == td->td_ucred);
831	newtd->td_realucred = crcowget(td->td_realucred);
832	newtd->td_ucred = newtd->td_realucred;
833	newtd->td_limit = lim_hold(td->td_limit);
834	newtd->td_cowgen = td->td_cowgen;
835}
836
837void
838thread_cow_free(struct thread *td)
839{
840
841	if (td->td_realucred != NULL)
842		crcowfree(td);
843	if (td->td_limit != NULL)
844		lim_free(td->td_limit);
845}
846
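/*
 * Resynchronize the thread's cached credential and resource limit
 * references with the process after a COW generation change, freeing
 * the stale references once the process lock is dropped.
 */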
847void
848thread_cow_update(struct thread *td)
849{
850	struct proc *p;
851	struct ucred *oldcred;
852	struct plimit *oldlimit;
853
854	p = td->td_proc;
855	oldlimit = NULL;
856	PROC_LOCK(p);
857	oldcred = crcowsync();
858	if (td->td_limit != p->p_limit) {
859		oldlimit = td->td_limit;
860		td->td_limit = lim_hold(p->p_limit);
861	}
862	td->td_cowgen = p->p_cowgen;
863	PROC_UNLOCK(p);
864	if (oldcred != NULL)
865		crfree(oldcred);
866	if (oldlimit != NULL)
867		lim_free(oldlimit);
868}
869
870/*
871 * Discard the current thread and exit from its context.
872 * Always called with scheduler locked.
873 *
874 * Because we can't free a thread while we're operating under its context,
875 * push the current thread into our CPU's deadthread holder. This means
876 * we needn't worry about someone else grabbing our context before we
877 * do a cpu_throw().
878 */
879void
880thread_exit(void)
881{
882	uint64_t runtime, new_switchtime;
883	struct thread *td;
884	struct thread *td2;
885	struct proc *p;
886	int wakeup_swapper;
887
888	td = curthread;
889	p = td->td_proc;
890
891	PROC_SLOCK_ASSERT(p, MA_OWNED);
892	mtx_assert(&Giant, MA_NOTOWNED);
893
894	PROC_LOCK_ASSERT(p, MA_OWNED);
895	KASSERT(p != NULL, ("thread exiting without a process"));
896	CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
897	    (long)p->p_pid, td->td_name);
898	SDT_PROBE0(proc, , , lwp__exit);
899	KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));
900	MPASS(td->td_realucred == td->td_ucred);
901
	/*
	 * Drop the FPU & debug register state storage, or any other
	 * architecture-specific resources that would not be present
	 * in a new, untouched process.
	 */
907	cpu_thread_exit(td);
908
	/*
	 * The last thread is left attached to the process so that the
	 * whole bundle gets recycled.  Skip all of this if we never
	 * had additional threads.  exit1() clears all signs of other
	 * threads when it goes to single threading, so the last thread
	 * always takes the short path.
	 */
917	if (p->p_flag & P_HADTHREADS) {
918		if (p->p_numthreads > 1) {
919			atomic_add_int(&td->td_proc->p_exitthreads, 1);
920			thread_unlink(td);
921			td2 = FIRST_THREAD_IN_PROC(p);
922			sched_exit_thread(td2, td);
923
924			/*
925			 * The test below is NOT true if we are the
926			 * sole exiting thread. P_STOPPED_SINGLE is unset
927			 * in exit1() after it is the only survivor.
928			 */
929			if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
930				if (p->p_numthreads == p->p_suspcount) {
931					thread_lock(p->p_singlethread);
932					wakeup_swapper = thread_unsuspend_one(
933						p->p_singlethread, p, false);
934					if (wakeup_swapper)
935						kick_proc0();
936				}
937			}
938
939			PCPU_SET(deadthread, td);
940		} else {
			/*
			 * The last thread is exiting, but not through exit().
			 */
			panic("thread_exit: Last thread exiting on its own");
945		}
946	}
947#ifdef	HWPMC_HOOKS
948	/*
949	 * If this thread is part of a process that is being tracked by hwpmc(4),
950	 * inform the module of the thread's impending exit.
951	 */
952	if (PMC_PROC_IS_USING_PMCS(td->td_proc)) {
953		PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
954		PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT, NULL);
955	} else if (PMC_SYSTEM_SAMPLING_ACTIVE())
956		PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT_LOG, NULL);
957#endif
958	PROC_UNLOCK(p);
959	PROC_STATLOCK(p);
960	thread_lock(td);
961	PROC_SUNLOCK(p);
962
963	/* Do the same timestamp bookkeeping that mi_switch() would do. */
964	new_switchtime = cpu_ticks();
965	runtime = new_switchtime - PCPU_GET(switchtime);
966	td->td_runtime += runtime;
967	td->td_incruntime += runtime;
968	PCPU_SET(switchtime, new_switchtime);
969	PCPU_SET(switchticks, ticks);
970	VM_CNT_INC(v_swtch);
971
972	/* Save our resource usage in our process. */
973	td->td_ru.ru_nvcsw++;
974	ruxagg_locked(p, td);
975	rucollect(&p->p_ru, &td->td_ru);
976	PROC_STATUNLOCK(p);
977
978	td->td_state = TDS_INACTIVE;
979#ifdef WITNESS
980	witness_thread_exit(td);
981#endif
982	CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
983	sched_throw(td);
984	panic("I'm a teapot!");
985	/* NOTREACHED */
986}
987
/*
 * Do any thread-specific cleanups that may be needed in wait().
 * Called with Giant, proc and schedlock not held.
 */
992void
993thread_wait(struct proc *p)
994{
995	struct thread *td;
996
997	mtx_assert(&Giant, MA_NOTOWNED);
998	KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()"));
999	KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking"));
1000	td = FIRST_THREAD_IN_PROC(p);
1001	/* Lock the last thread so we spin until it exits cpu_throw(). */
1002	thread_lock(td);
1003	thread_unlock(td);
1004	lock_profile_thread_exit(td);
1005	cpuset_rel(td->td_cpuset);
1006	td->td_cpuset = NULL;
1007	cpu_thread_clean(td);
1008	thread_cow_free(td);
1009	callout_drain(&td->td_slpcallout);
1010	thread_reap();	/* check for zombie threads etc. */
1011}
1012
/*
 * Link a thread to a process.
 * Set up anything that needs to be initialized for it to
 * be used by the process.
 */
1018void
1019thread_link(struct thread *td, struct proc *p)
1020{
1021
1022	/*
1023	 * XXX This can't be enabled because it's called for proc0 before
1024	 * its lock has been created.
1025	 * PROC_LOCK_ASSERT(p, MA_OWNED);
1026	 */
1027	td->td_state    = TDS_INACTIVE;
1028	td->td_proc     = p;
1029	td->td_flags    = TDF_INMEM;
1030
1031	LIST_INIT(&td->td_contested);
1032	LIST_INIT(&td->td_lprof[0]);
1033	LIST_INIT(&td->td_lprof[1]);
1034#ifdef EPOCH_TRACE
1035	SLIST_INIT(&td->td_epochs);
1036#endif
1037	sigqueue_init(&td->td_sigqueue, p);
1038	callout_init(&td->td_slpcallout, 1);
1039	TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist);
1040	p->p_numthreads++;
1041}
1042
1043/*
1044 * Called from:
1045 *  thread_exit()
1046 */
1047void
1048thread_unlink(struct thread *td)
1049{
1050	struct proc *p = td->td_proc;
1051
1052	PROC_LOCK_ASSERT(p, MA_OWNED);
1053#ifdef EPOCH_TRACE
1054	MPASS(SLIST_EMPTY(&td->td_epochs));
1055#endif
1056
1057	TAILQ_REMOVE(&p->p_threads, td, td_plist);
1058	p->p_numthreads--;
	/* We could clear a few other things here. */
	/* Must NOT clear links to the proc! */
1061}
1062
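/*
 * Count the threads that still stand in the way of the requested
 * single-threading mode: all threads for SINGLE_EXIT, threads not yet
 * at the user boundary for SINGLE_BOUNDARY, and threads not yet
 * suspended for SINGLE_NO_EXIT and SINGLE_ALLPROC.
 */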
1063static int
1064calc_remaining(struct proc *p, int mode)
1065{
1066	int remaining;
1067
1068	PROC_LOCK_ASSERT(p, MA_OWNED);
1069	PROC_SLOCK_ASSERT(p, MA_OWNED);
1070	if (mode == SINGLE_EXIT)
1071		remaining = p->p_numthreads;
1072	else if (mode == SINGLE_BOUNDARY)
1073		remaining = p->p_numthreads - p->p_boundary_count;
1074	else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC)
1075		remaining = p->p_numthreads - p->p_suspcount;
1076	else
1077		panic("calc_remaining: wrong mode %d", mode);
1078	return (remaining);
1079}
1080
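/*
 * The target value of calc_remaining(): 0 for SINGLE_ALLPROC, where
 * the requesting thread belongs to another process, and 1 otherwise,
 * since the caller itself keeps running.
 */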
1081static int
1082remain_for_mode(int mode)
1083{
1084
1085	return (mode == SINGLE_ALLPROC ? 0 : 1);
1086}
1087
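/*
 * Nudge an inhibited thread toward the single-threading goal:
 * suspended threads that must exit or reach the user boundary are
 * unsuspended, and abortable sleeps are interrupted, with the exact
 * treatment depending on the mode (see the SINGLE_ALLPROC comment
 * below).  Returns non-zero if the swapper needs a kick.
 */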
1088static int
1089weed_inhib(int mode, struct thread *td2, struct proc *p)
1090{
1091	int wakeup_swapper;
1092
1093	PROC_LOCK_ASSERT(p, MA_OWNED);
1094	PROC_SLOCK_ASSERT(p, MA_OWNED);
1095	THREAD_LOCK_ASSERT(td2, MA_OWNED);
1096
1097	wakeup_swapper = 0;
1098
1099	/*
1100	 * Since the thread lock is dropped by the scheduler we have
1101	 * to retry to check for races.
1102	 */
1103restart:
1104	switch (mode) {
1105	case SINGLE_EXIT:
1106		if (TD_IS_SUSPENDED(td2)) {
1107			wakeup_swapper |= thread_unsuspend_one(td2, p, true);
1108			thread_lock(td2);
1109			goto restart;
1110		}
1111		if (TD_CAN_ABORT(td2)) {
1112			wakeup_swapper |= sleepq_abort(td2, EINTR);
1113			return (wakeup_swapper);
1114		}
1115		break;
1116	case SINGLE_BOUNDARY:
1117	case SINGLE_NO_EXIT:
1118		if (TD_IS_SUSPENDED(td2) &&
1119		    (td2->td_flags & TDF_BOUNDARY) == 0) {
1120			wakeup_swapper |= thread_unsuspend_one(td2, p, false);
1121			thread_lock(td2);
1122			goto restart;
1123		}
1124		if (TD_CAN_ABORT(td2)) {
1125			wakeup_swapper |= sleepq_abort(td2, ERESTART);
1126			return (wakeup_swapper);
1127		}
1128		break;
1129	case SINGLE_ALLPROC:
		/*
		 * ALLPROC suspend tries to avoid spurious EINTR for
		 * threads sleeping interruptibly, by suspending the
		 * thread directly, similarly to sig_suspend_threads().
		 * Since such a sleep is not performed at the user
		 * boundary, the TDF_BOUNDARY flag is not set, and
		 * TDF_ALLPROCSUSP is used to avoid an immediate
		 * un-suspend.
		 */
1138		if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY |
1139		    TDF_ALLPROCSUSP)) == 0) {
1140			wakeup_swapper |= thread_unsuspend_one(td2, p, false);
1141			thread_lock(td2);
1142			goto restart;
1143		}
1144		if (TD_CAN_ABORT(td2)) {
1145			if ((td2->td_flags & TDF_SBDRY) == 0) {
1146				thread_suspend_one(td2);
1147				td2->td_flags |= TDF_ALLPROCSUSP;
1148			} else {
1149				wakeup_swapper |= sleepq_abort(td2, ERESTART);
1150				return (wakeup_swapper);
1151			}
1152		}
1153		break;
1154	default:
1155		break;
1156	}
1157	thread_unlock(td2);
1158	return (wakeup_swapper);
1159}
1160
/*
 * Enforce single-threading.
 *
 * Returns 1 if the caller must abort (another thread is waiting to
 * exit the process or similar).  The process must be locked.
 * Returns 0 when the caller is successfully the only thread running.
 * A process has successfully single-threaded in the suspend mode when
 * there are no threads in user mode.  Threads in the kernel must be
 * allowed to continue until they get to the user boundary.  They may
 * even copy out their return values and data before suspending.  They
 * may, however, be accelerated in reaching the user boundary, as we
 * will wake up any sleeping threads that are interruptible (PCATCH).
 */
1174int
1175thread_single(struct proc *p, int mode)
1176{
1177	struct thread *td;
1178	struct thread *td2;
1179	int remaining, wakeup_swapper;
1180
1181	td = curthread;
1182	KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
1183	    mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
1184	    ("invalid mode %d", mode));
1185	/*
1186	 * If allowing non-ALLPROC singlethreading for non-curproc
1187	 * callers, calc_remaining() and remain_for_mode() should be
1188	 * adjusted to also account for td->td_proc != p.  For now
1189	 * this is not implemented because it is not used.
1190	 */
1191	KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) ||
1192	    (mode != SINGLE_ALLPROC && td->td_proc == p),
1193	    ("mode %d proc %p curproc %p", mode, p, td->td_proc));
1194	mtx_assert(&Giant, MA_NOTOWNED);
1195	PROC_LOCK_ASSERT(p, MA_OWNED);
1196
1197	if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC)
1198		return (0);
1199
1200	/* Is someone already single threading? */
1201	if (p->p_singlethread != NULL && p->p_singlethread != td)
1202		return (1);
1203
1204	if (mode == SINGLE_EXIT) {
1205		p->p_flag |= P_SINGLE_EXIT;
1206		p->p_flag &= ~P_SINGLE_BOUNDARY;
1207	} else {
1208		p->p_flag &= ~P_SINGLE_EXIT;
1209		if (mode == SINGLE_BOUNDARY)
1210			p->p_flag |= P_SINGLE_BOUNDARY;
1211		else
1212			p->p_flag &= ~P_SINGLE_BOUNDARY;
1213	}
1214	if (mode == SINGLE_ALLPROC)
1215		p->p_flag |= P_TOTAL_STOP;
1216	p->p_flag |= P_STOPPED_SINGLE;
1217	PROC_SLOCK(p);
1218	p->p_singlethread = td;
1219	remaining = calc_remaining(p, mode);
1220	while (remaining != remain_for_mode(mode)) {
1221		if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
1222			goto stopme;
1223		wakeup_swapper = 0;
1224		FOREACH_THREAD_IN_PROC(p, td2) {
1225			if (td2 == td)
1226				continue;
1227			thread_lock(td2);
1228			td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
1229			if (TD_IS_INHIBITED(td2)) {
1230				wakeup_swapper |= weed_inhib(mode, td2, p);
1231#ifdef SMP
1232			} else if (TD_IS_RUNNING(td2) && td != td2) {
1233				forward_signal(td2);
1234				thread_unlock(td2);
1235#endif
1236			} else
1237				thread_unlock(td2);
1238		}
1239		if (wakeup_swapper)
1240			kick_proc0();
1241		remaining = calc_remaining(p, mode);
1242
		/*
		 * Maybe we suspended some threads; was it enough?
		 */
1246		if (remaining == remain_for_mode(mode))
1247			break;
1248
1249stopme:
		/*
		 * Wake us up when everyone else has suspended.
		 * In the meantime we suspend as well.
		 */
1254		thread_suspend_switch(td, p);
1255		remaining = calc_remaining(p, mode);
1256	}
1257	if (mode == SINGLE_EXIT) {
		/*
		 * Convert the process to an unthreaded process.
		 * SINGLE_EXIT is requested by exit1() or execve(); in
		 * both cases the other threads must be retired.
		 */
1263		KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads"));
1264		p->p_singlethread = NULL;
1265		p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS);
1266
1267		/*
1268		 * Wait for any remaining threads to exit cpu_throw().
1269		 */
1270		while (p->p_exitthreads != 0) {
1271			PROC_SUNLOCK(p);
1272			PROC_UNLOCK(p);
1273			sched_relinquish(td);
1274			PROC_LOCK(p);
1275			PROC_SLOCK(p);
1276		}
1277	} else if (mode == SINGLE_BOUNDARY) {
		/*
		 * Wait until all suspended threads are removed from
		 * the processors.  The thread_suspend_check()
		 * increments p_boundary_count while it is still
		 * running, which makes it possible for execve()
		 * to destroy the vmspace while our other threads are
		 * still using the address space.
		 *
		 * We lock each thread, which is only allowed to
		 * succeed after the context switch code has finished
		 * using the address space.
		 */
1290		FOREACH_THREAD_IN_PROC(p, td2) {
1291			if (td2 == td)
1292				continue;
1293			thread_lock(td2);
1294			KASSERT((td2->td_flags & TDF_BOUNDARY) != 0,
1295			    ("td %p not on boundary", td2));
1296			KASSERT(TD_IS_SUSPENDED(td2),
1297			    ("td %p is not suspended", td2));
1298			thread_unlock(td2);
1299		}
1300	}
1301	PROC_SUNLOCK(p);
1302	return (0);
1303}
1304
1305bool
1306thread_suspend_check_needed(void)
1307{
1308	struct proc *p;
1309	struct thread *td;
1310
1311	td = curthread;
1312	p = td->td_proc;
1313	PROC_LOCK_ASSERT(p, MA_OWNED);
1314	return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 &&
1315	    (td->td_dbgflags & TDB_SUSPEND) != 0));
1316}
1317
1318/*
1319 * Called in from locations that can safely check to see
1320 * whether we have to suspend or at least throttle for a
1321 * single-thread event (e.g. fork).
1322 *
1323 * Such locations include userret().
 * If the "return_instead" argument is non-zero, the thread must be able to
 * accept 0 (caller may continue) or 1 (caller must abort) as a result.
1326 *
1327 * The 'return_instead' argument tells the function if it may do a
1328 * thread_exit() or suspend, or whether the caller must abort and back
1329 * out instead.
1330 *
1331 * If the thread that set the single_threading request has set the
1332 * P_SINGLE_EXIT bit in the process flags then this call will never return
1333 * if 'return_instead' is false, but will exit.
1334 *
1335 * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
1336 *---------------+--------------------+---------------------
1337 *       0       | returns 0          |   returns 0 or 1
1338 *               | when ST ends       |   immediately
1339 *---------------+--------------------+---------------------
1340 *       1       | thread exits       |   returns 1
1341 *               |                    |  immediately
1342 * 0 = thread_exit() or suspension ok,
1343 * other = return error instead of stopping the thread.
1344 *
 * While a full suspension is in effect, even a single-threading
 * thread would be suspended if it made this call (but it shouldn't).
1347 * This call should only be made from places where
1348 * thread_exit() would be safe as that may be the outcome unless
1349 * return_instead is set.
1350 */
1351int
1352thread_suspend_check(int return_instead)
1353{
1354	struct thread *td;
1355	struct proc *p;
1356	int wakeup_swapper;
1357
1358	td = curthread;
1359	p = td->td_proc;
1360	mtx_assert(&Giant, MA_NOTOWNED);
1361	PROC_LOCK_ASSERT(p, MA_OWNED);
1362	while (thread_suspend_check_needed()) {
1363		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1364			KASSERT(p->p_singlethread != NULL,
1365			    ("singlethread not set"));
			/*
			 * The only suspension in action is
			 * single-threading.  The single threader need
			 * not stop.  It is safe to access
			 * p->p_singlethread unlocked because it can
			 * only be set to our address by us.
			 */
1372			if (p->p_singlethread == td)
1373				return (0);	/* Exempt from stopping. */
1374		}
1375		if ((p->p_flag & P_SINGLE_EXIT) && return_instead)
1376			return (EINTR);
1377
1378		/* Should we goto user boundary if we didn't come from there? */
1379		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
1380		    (p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
1381			return (ERESTART);
1382
1383		/*
1384		 * Ignore suspend requests if they are deferred.
1385		 */
1386		if ((td->td_flags & TDF_SBDRY) != 0) {
1387			KASSERT(return_instead,
1388			    ("TDF_SBDRY set for unsafe thread_suspend_check"));
1389			KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) !=
1390			    (TDF_SEINTR | TDF_SERESTART),
1391			    ("both TDF_SEINTR and TDF_SERESTART"));
1392			return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0);
1393		}
1394
		/*
		 * If the process is waiting for us to exit,
		 * this thread should simply exit.
		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
		 */
1400		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1401			PROC_UNLOCK(p);
1402
			/*
			 * Allow the Linux emulation layer to do some
			 * work before the thread exits.
			 */
1407			if (__predict_false(p->p_sysent->sv_thread_detach != NULL))
1408				(p->p_sysent->sv_thread_detach)(td);
1409			umtx_thread_exit(td);
1410			kern_thr_exit(td);
1411			panic("stopped thread did not exit");
1412		}
1413
1414		PROC_SLOCK(p);
1415		thread_stopped(p);
1416		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1417			if (p->p_numthreads == p->p_suspcount + 1) {
1418				thread_lock(p->p_singlethread);
1419				wakeup_swapper = thread_unsuspend_one(
1420				    p->p_singlethread, p, false);
1421				if (wakeup_swapper)
1422					kick_proc0();
1423			}
1424		}
1425		PROC_UNLOCK(p);
1426		thread_lock(td);
1427		/*
1428		 * When a thread suspends, it just
1429		 * gets taken off all queues.
1430		 */
1431		thread_suspend_one(td);
1432		if (return_instead == 0) {
1433			p->p_boundary_count++;
1434			td->td_flags |= TDF_BOUNDARY;
1435		}
1436		PROC_SUNLOCK(p);
1437		mi_switch(SW_INVOL | SWT_SUSPEND);
1438		PROC_LOCK(p);
1439	}
1440	return (0);
1441}
1442
/*
 * Check for possible stops and suspensions while executing a
 * casueword or similar transiently failing operation.
 *
 * The sleep argument controls whether the function can handle a stop
 * request itself or whether it should return ERESTART and have the
 * request processed at the kernel/user boundary in ast.
 *
 * Typically, when retrying due to casueword(9) failure (rv == 1), we
 * should handle the stop requests there, with the exception of cases
 * when the thread owns a kernel resource, for instance has busied the
 * umtx key, or when functions return immediately if thread_check_susp()
 * returned non-zero.  On the other hand, when retrying the whole lock
 * operation, we had better not stop there but delegate the handling
 * to ast.
 *
 * If the request is for thread termination, P_SINGLE_EXIT, we cannot
 * handle it at all and simply return EINTR.
 */
1462int
1463thread_check_susp(struct thread *td, bool sleep)
1464{
1465	struct proc *p;
1466	int error;
1467
1468	/*
1469	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
1470	 * eventually break the lockstep loop.
1471	 */
1472	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
1473		return (0);
1474	error = 0;
1475	p = td->td_proc;
1476	PROC_LOCK(p);
1477	if (p->p_flag & P_SINGLE_EXIT)
1478		error = EINTR;
1479	else if (P_SHOULDSTOP(p) ||
1480	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND)))
1481		error = sleep ? thread_suspend_check(0) : ERESTART;
1482	PROC_UNLOCK(p);
1483	return (error);
1484}
1485
1486void
1487thread_suspend_switch(struct thread *td, struct proc *p)
1488{
1489
1490	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
1491	PROC_LOCK_ASSERT(p, MA_OWNED);
1492	PROC_SLOCK_ASSERT(p, MA_OWNED);
1493	/*
1494	 * We implement thread_suspend_one in stages here to avoid
1495	 * dropping the proc lock while the thread lock is owned.
1496	 */
1497	if (p == td->td_proc) {
1498		thread_stopped(p);
1499		p->p_suspcount++;
1500	}
1501	PROC_UNLOCK(p);
1502	thread_lock(td);
1503	td->td_flags &= ~TDF_NEEDSUSPCHK;
1504	TD_SET_SUSPENDED(td);
1505	sched_sleep(td, 0);
1506	PROC_SUNLOCK(p);
1507	DROP_GIANT();
1508	mi_switch(SW_VOL | SWT_SUSPEND);
1509	PICKUP_GIANT();
1510	PROC_LOCK(p);
1511	PROC_SLOCK(p);
1512}
1513
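/*
 * Suspend an already-locked thread: bump p_suspcount, mark the thread
 * suspended and record it as sleeping for the scheduler.  The proc
 * spinlock and the thread lock must be held.
 */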
1514void
1515thread_suspend_one(struct thread *td)
1516{
1517	struct proc *p;
1518
1519	p = td->td_proc;
1520	PROC_SLOCK_ASSERT(p, MA_OWNED);
1521	THREAD_LOCK_ASSERT(td, MA_OWNED);
1522	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
1523	p->p_suspcount++;
1524	td->td_flags &= ~TDF_NEEDSUSPCHK;
1525	TD_SET_SUSPENDED(td);
1526	sched_sleep(td, 0);
1527}
1528
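/*
 * Clear a thread's suspension and make it runnable.  When the thread
 * belongs to p, drop p_suspcount and, if requested, its boundary
 * accounting.  Returns non-zero if the swapper must be woken up.
 */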
1529static int
1530thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary)
1531{
1532
1533	THREAD_LOCK_ASSERT(td, MA_OWNED);
1534	KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
1535	TD_CLR_SUSPENDED(td);
1536	td->td_flags &= ~TDF_ALLPROCSUSP;
1537	if (td->td_proc == p) {
1538		PROC_SLOCK_ASSERT(p, MA_OWNED);
1539		p->p_suspcount--;
1540		if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) {
1541			td->td_flags &= ~TDF_BOUNDARY;
1542			p->p_boundary_count--;
1543		}
1544	}
1545	return (setrunnable(td, 0));
1546}
1547
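/*
 * Make a single suspended thread runnable again: pull it off the
 * sleep queue if necessary, clear the suspension and drop
 * p_suspcount, without altering the process-wide stop state.
 */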
1548void
1549thread_run_flash(struct thread *td)
1550{
1551	struct proc *p;
1552
1553	p = td->td_proc;
1554	PROC_LOCK_ASSERT(p, MA_OWNED);
1555
1556	if (TD_ON_SLEEPQ(td))
1557		sleepq_remove_nested(td);
1558	else
1559		thread_lock(td);
1560
1561	THREAD_LOCK_ASSERT(td, MA_OWNED);
1562	KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
1563
1564	TD_CLR_SUSPENDED(td);
1565	PROC_SLOCK(p);
1566	MPASS(p->p_suspcount > 0);
1567	p->p_suspcount--;
1568	PROC_SUNLOCK(p);
1569	if (setrunnable(td, 0))
1570		kick_proc0();
1571}
1572
1573/*
1574 * Allow all threads blocked by single threading to continue running.
1575 */
1576void
1577thread_unsuspend(struct proc *p)
1578{
1579	struct thread *td;
1580	int wakeup_swapper;
1581
1582	PROC_LOCK_ASSERT(p, MA_OWNED);
1583	PROC_SLOCK_ASSERT(p, MA_OWNED);
1584	wakeup_swapper = 0;
1585	if (!P_SHOULDSTOP(p)) {
		FOREACH_THREAD_IN_PROC(p, td) {
1587			thread_lock(td);
1588			if (TD_IS_SUSPENDED(td)) {
1589				wakeup_swapper |= thread_unsuspend_one(td, p,
1590				    true);
1591			} else
1592				thread_unlock(td);
1593		}
1594	} else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
1595	    p->p_numthreads == p->p_suspcount) {
		/*
		 * Stopping everything also did the job for the single-
		 * threading request.  Now that we have downgraded to
		 * single-threaded, let it continue.
		 */
1601		if (p->p_singlethread->td_proc == p) {
1602			thread_lock(p->p_singlethread);
1603			wakeup_swapper = thread_unsuspend_one(
1604			    p->p_singlethread, p, false);
1605		}
1606	}
1607	if (wakeup_swapper)
1608		kick_proc0();
1609}
1610
/*
 * End single-threading mode.
 */
1614void
1615thread_single_end(struct proc *p, int mode)
1616{
1617	struct thread *td;
1618	int wakeup_swapper;
1619
1620	KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
1621	    mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
1622	    ("invalid mode %d", mode));
1623	PROC_LOCK_ASSERT(p, MA_OWNED);
1624	KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) ||
1625	    (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0),
1626	    ("mode %d does not match P_TOTAL_STOP", mode));
1627	KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread,
1628	    ("thread_single_end from other thread %p %p",
1629	    curthread, p->p_singlethread));
1630	KASSERT(mode != SINGLE_BOUNDARY ||
1631	    (p->p_flag & P_SINGLE_BOUNDARY) != 0,
1632	    ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag));
1633	p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY |
1634	    P_TOTAL_STOP);
1635	PROC_SLOCK(p);
1636	p->p_singlethread = NULL;
1637	wakeup_swapper = 0;
	/*
	 * If there are other threads, they may now run,
	 * unless of course there is a blanket 'stop order'
	 * on the process.  The single threader must be allowed
	 * to continue, however, as this is a bad place to stop.
	 */
1644	if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) {
		FOREACH_THREAD_IN_PROC(p, td) {
1646			thread_lock(td);
1647			if (TD_IS_SUSPENDED(td)) {
1648				wakeup_swapper |= thread_unsuspend_one(td, p,
1649				    mode == SINGLE_BOUNDARY);
1650			} else
1651				thread_unlock(td);
1652		}
1653	}
1654	KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0,
1655	    ("inconsistent boundary count %d", p->p_boundary_count));
1656	PROC_SUNLOCK(p);
1657	if (wakeup_swapper)
1658		kick_proc0();
1659}
1660
/*
 * Locate a thread by number and return with the proc lock held.
 *
 * Thread exit establishes proc -> tidhash lock ordering, but lookup
 * takes tidhash first and needs to return a locked proc.
 *
 * The problem is worked around by relying on the type-safety of both
 * structures and doing the work in two steps:
 * - tidhash-locked lookup which saves both thread and proc pointers
 * - proc-locked verification that the found thread still matches
 */
1672static bool
1673tdfind_hash(lwpid_t tid, pid_t pid, struct proc **pp, struct thread **tdp)
1674{
1675#define RUN_THRESH	16
1676	struct proc *p;
1677	struct thread *td;
1678	int run;
1679	bool locked;
1680
1681	run = 0;
1682	rw_rlock(TIDHASHLOCK(tid));
1683	locked = true;
1684	LIST_FOREACH(td, TIDHASH(tid), td_hash) {
1685		if (td->td_tid != tid) {
1686			run++;
1687			continue;
1688		}
1689		p = td->td_proc;
1690		if (pid != -1 && p->p_pid != pid) {
1691			td = NULL;
1692			break;
1693		}
1694		if (run > RUN_THRESH) {
1695			if (rw_try_upgrade(TIDHASHLOCK(tid))) {
1696				LIST_REMOVE(td, td_hash);
1697				LIST_INSERT_HEAD(TIDHASH(td->td_tid),
1698					td, td_hash);
1699				rw_wunlock(TIDHASHLOCK(tid));
1700				locked = false;
1701				break;
1702			}
1703		}
1704		break;
1705	}
1706	if (locked)
1707		rw_runlock(TIDHASHLOCK(tid));
1708	if (td == NULL)
1709		return (false);
1710	*pp = p;
1711	*tdp = td;
1712	return (true);
1713}
1714
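/*
 * Look up a thread by TID (and optionally PID), returning it with its
 * process locked.  The hash lookup is revalidated under the proc lock
 * and retried if the thread was concurrently recycled; processes
 * still being created (PRS_NEW) are not returned.
 */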
1715struct thread *
1716tdfind(lwpid_t tid, pid_t pid)
1717{
1718	struct proc *p;
1719	struct thread *td;
1720
1721	td = curthread;
1722	if (td->td_tid == tid) {
1723		if (pid != -1 && td->td_proc->p_pid != pid)
1724			return (NULL);
1725		PROC_LOCK(td->td_proc);
1726		return (td);
1727	}
1728
1729	for (;;) {
1730		if (!tdfind_hash(tid, pid, &p, &td))
1731			return (NULL);
1732		PROC_LOCK(p);
1733		if (td->td_tid != tid) {
1734			PROC_UNLOCK(p);
1735			continue;
1736		}
1737		if (td->td_proc != p) {
1738			PROC_UNLOCK(p);
1739			continue;
1740		}
1741		if (p->p_state == PRS_NEW) {
1742			PROC_UNLOCK(p);
1743			return (NULL);
1744		}
1745		return (td);
1746	}
1747}
1748
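/*
 * tidhash_add() and tidhash_remove() insert and remove a thread from
 * the TID hash under the corresponding bucket's write lock.
 */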
1749void
1750tidhash_add(struct thread *td)
1751{
1752	rw_wlock(TIDHASHLOCK(td->td_tid));
1753	LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
1754	rw_wunlock(TIDHASHLOCK(td->td_tid));
1755}
1756
1757void
1758tidhash_remove(struct thread *td)
1759{
1760
1761	rw_wlock(TIDHASHLOCK(td->td_tid));
1762	LIST_REMOVE(td, td_hash);
1763	rw_wunlock(TIDHASHLOCK(td->td_tid));
1764}
1765