/*-
 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
 *
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#include <sys/cdefs.h>
#include "opt_vm.h"
#include "opt_kstack_pages.h"
#include "opt_kstack_max_pages.h"
#include "opt_kstack_usage_prof.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/domainset.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/msan.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/shm.h>
#include <sys/smp.h>
#include <sys/vmmeter.h>
#include <sys/vmem.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/unistd.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_domainset.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_phys.h>

#include <machine/cpu.h>

#if VM_NRESERVLEVEL > 0
#define KVA_KSTACK_QUANTUM_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT)
#else
#define KVA_KSTACK_QUANTUM_SHIFT (8 + PAGE_SHIFT)
#endif
#define KVA_KSTACK_QUANTUM (1ul << KVA_KSTACK_QUANTUM_SHIFT)
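
/*
 * KVA_KSTACK_QUANTUM is the granularity at which the per-domain kstack
 * arenas import KVA from the per-domain kernel arenas; see
 * vm_thread_kstack_import_quantum() and vm_thread_kstack_arena_import()
 * below.  When superpage reservations are configured it matches the
 * level-0 reservation size.
 */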
115
116/*
117 * MPSAFE
118 *
119 * WARNING!  This code calls vm_map_check_protection() which only checks
120 * the associated vm_map_entry range.  It does not determine whether the
121 * contents of the memory is actually readable or writable.  In most cases
122 * just checking the vm_map_entry is sufficient within the kernel's address
123 * space.
124 */
bool
kernacc(void *addr, int len, int rw)
{
	boolean_t rv;
	vm_offset_t saddr, eaddr;
	vm_prot_t prot;

	KASSERT((rw & ~VM_PROT_ALL) == 0,
	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));

	if ((vm_offset_t)addr + len > vm_map_max(kernel_map) ||
	    (vm_offset_t)addr + len < (vm_offset_t)addr)
		return (false);

	prot = rw;
	saddr = trunc_page((vm_offset_t)addr);
	eaddr = round_page((vm_offset_t)addr + len);
	vm_map_lock_read(kernel_map);
	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
	vm_map_unlock_read(kernel_map);
	return (rv == TRUE);
}

/*
 * MPSAFE
 *
 * WARNING!  This code calls vm_map_check_protection() which only checks
 * the associated vm_map_entry range.  It does not determine whether the
 * contents of the memory are actually readable or writable.  vmapbuf(),
 * vm_fault_quick(), or copyin()/copyout()/su*()/fu*() functions should be
 * used in conjunction with this call.
 */
bool
useracc(void *addr, int len, int rw)
{
	boolean_t rv;
	vm_prot_t prot;
	vm_map_t map;

	KASSERT((rw & ~VM_PROT_ALL) == 0,
	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
	prot = rw;
	map = &curproc->p_vmspace->vm_map;
	if ((vm_offset_t)addr + len > vm_map_max(map) ||
	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
		return (false);
	}
	vm_map_lock_read(map);
	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
	    round_page((vm_offset_t)addr + len), prot);
	vm_map_unlock_read(map);
	return (rv == TRUE);
}

int
vslock(void *addr, size_t len)
{
	vm_offset_t end, last, start;
	vm_size_t npages;
	int error;

	last = (vm_offset_t)addr + len;
	start = trunc_page((vm_offset_t)addr);
	end = round_page(last);
	if (last < (vm_offset_t)addr || end < (vm_offset_t)addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
	if (error == KERN_SUCCESS) {
		curthread->td_vslock_sz += len;
		return (0);
	}

	/*
	 * Return EFAULT on error to match copy{in,out}() behaviour
	 * rather than returning ENOMEM like mlock() would.
	 */
	return (EFAULT);
}

void
vsunlock(void *addr, size_t len)
{

	/* Rely on the parameter sanity checks performed by vslock(). */
	MPASS(curthread->td_vslock_sz >= len);
	curthread->td_vslock_sz -= len;
	(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
	    trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
}
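
/*
 * Typical use of the pair above (an illustrative sketch; "udata" and "ulen"
 * are placeholders, not taken from a particular caller): wire a user buffer
 * for the duration of an operation that must not fault, then unwire it with
 * the same address and length:
 *
 *	error = vslock(udata, ulen);
 *	if (error != 0)
 *		return (error);
 *	... access the buffer, e.g. with copyin()/copyout() ...
 *	vsunlock(udata, ulen);
 */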

/*
 * Pin the page contained within the given object at the given offset.  If the
 * page is not resident, allocate and load it using the given object's pager.
 * Return the pinned page if successful; otherwise, return NULL.
 */
static vm_page_t
vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
{
	vm_page_t m;
	vm_pindex_t pindex;

	pindex = OFF_TO_IDX(offset);
	(void)vm_page_grab_valid_unlocked(&m, object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED);
	return (m);
}

/*
 * Return a CPU private mapping to the page at the given offset within the
 * given object.  The page is pinned before it is mapped.
 */
struct sf_buf *
vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
{
	vm_page_t m;

	m = vm_imgact_hold_page(object, offset);
	if (m == NULL)
		return (NULL);
	sched_pin();
	return (sf_buf_alloc(m, SFB_CPUPRIVATE));
}
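
/*
 * Illustrative usage of the mapping pair (a sketch, not a specific caller;
 * page_off, dst and len are placeholders): an image activator maps one page
 * of the executable, copies from the per-CPU mapping, and releases it:
 *
 *	sf = vm_imgact_map_page(object, offset);
 *	if (sf == NULL)
 *		return (EIO);
 *	bcopy((char *)sf_buf_kva(sf) + page_off, dst, len);
 *	vm_imgact_unmap_page(sf);
 */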

/*
 * Destroy the given CPU private mapping and unpin the page that it mapped.
 */
void
vm_imgact_unmap_page(struct sf_buf *sf)
{
	vm_page_t m;

	m = sf_buf_page(sf);
	sf_buf_free(sf);
	sched_unpin();
	vm_page_unwire(m, PQ_ACTIVE);
}

void
vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz)
{

	pmap_sync_icache(map->pmap, va, sz);
}

static vm_object_t kstack_object;
static vm_object_t kstack_alt_object;
static uma_zone_t kstack_cache;
static int kstack_cache_size;
static vmem_t *vmd_kstack_arena[MAXMEMDOM];

static int
sysctl_kstack_cache_size(SYSCTL_HANDLER_ARGS)
{
	int error, oldsize;

	oldsize = kstack_cache_size;
	error = sysctl_handle_int(oidp, arg1, arg2, req);
	if (error == 0 && req->newptr && oldsize != kstack_cache_size)
		uma_zone_set_maxcache(kstack_cache, kstack_cache_size);
	return (error);
}
SYSCTL_PROC(_vm, OID_AUTO, kstack_cache_size,
    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &kstack_cache_size, 0,
    sysctl_kstack_cache_size, "IU", "Maximum number of cached kernel stacks");

/*
 *	Allocate a virtual address range for a kernel stack from the kstack
 *	arena of the specified memory domain.
 */
static vm_offset_t
vm_thread_alloc_kstack_kva(vm_size_t size, int domain)
{
#ifndef __ILP32__
	int rv;
	vmem_t *arena;
	vm_offset_t addr = 0;

	size = round_page(size);
	/* Allocate from the kernel arena for non-standard kstack sizes. */
	if (size != ptoa(kstack_pages + KSTACK_GUARD_PAGES)) {
		arena = vm_dom[domain].vmd_kernel_arena;
	} else {
		arena = vmd_kstack_arena[domain];
	}
	rv = vmem_alloc(arena, size, M_BESTFIT | M_NOWAIT, &addr);
	if (rv == ENOMEM)
		return (0);
	KASSERT(atop(addr - VM_MIN_KERNEL_ADDRESS) %
	    (kstack_pages + KSTACK_GUARD_PAGES) == 0,
	    ("%s: allocated kstack KVA not aligned to multiple of kstack size",
	    __func__));

	return (addr);
#else
	return (kva_alloc(size));
#endif
}

/*
 *	Release a region of kernel virtual memory
 *	allocated from the kstack arena.
 */
static __noinline void
vm_thread_free_kstack_kva(vm_offset_t addr, vm_size_t size, int domain)
{
	vmem_t *arena;

	size = round_page(size);
#ifdef __ILP32__
	arena = kernel_arena;
#else
	arena = vmd_kstack_arena[domain];
	if (size != ptoa(kstack_pages + KSTACK_GUARD_PAGES)) {
		arena = vm_dom[domain].vmd_kernel_arena;
	}
#endif
	vmem_free(arena, addr, size);
}

static vmem_size_t
vm_thread_kstack_import_quantum(void)
{
#ifndef __ILP32__
	/*
	 * The kstack_quantum is larger than KVA_QUANTUM to account
	 * for holes induced by guard pages.
	 */
	return (KVA_KSTACK_QUANTUM * (kstack_pages + KSTACK_GUARD_PAGES));
#else
	return (KVA_KSTACK_QUANTUM);
#endif
}

/*
 * Import KVA from a parent arena into the kstack arena.  Imports must be
 * a multiple of kernel stack pages + guard pages in size.
 *
 * Kstack VA allocations need to be aligned so that the linear KVA pindex
 * is divisible by the total number of kstack VA pages.  This is necessary to
 * make vm_kstack_pindex() work properly.
 *
 * We import a KVA_KSTACK_QUANTUM-aligned region that is a multiple of
 * KVA_KSTACK_QUANTUM in size from the parent arena.  The size actually used
 * by the kstack arena is one kstack (stack pages plus guard pages) smaller,
 * to leave room for the alignment adjustment made below.
 */
static int
vm_thread_kstack_arena_import(void *arena, vmem_size_t size, int flags,
    vmem_addr_t *addrp)
{
	int error, rem;
	size_t kpages = kstack_pages + KSTACK_GUARD_PAGES;

	KASSERT(atop(size) % kpages == 0,
	    ("%s: Size %jd is not a multiple of kstack pages (%d)", __func__,
	    (intmax_t)size, (int)kpages));

	error = vmem_xalloc(arena, vm_thread_kstack_import_quantum(),
	    KVA_KSTACK_QUANTUM, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags,
	    addrp);
	if (error) {
		return (error);
	}

	rem = atop(*addrp - VM_MIN_KERNEL_ADDRESS) % kpages;
	if (rem != 0) {
		/* Bump addr to next aligned address */
		*addrp = *addrp + (kpages - rem) * PAGE_SIZE;
	}

	return (0);
}
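
/*
 * Worked example of the alignment adjustment above (hypothetical values:
 * kstack_pages == 4, KSTACK_GUARD_PAGES == 1, so kpages == 5): the import
 * requests 5 * KVA_KSTACK_QUANTUM bytes aligned to KVA_KSTACK_QUANTUM.  If
 * the returned address has a linear page index (relative to
 * VM_MIN_KERNEL_ADDRESS) of 5n + 2, then rem == 2 and the address is bumped
 * by 3 pages so that its index becomes a multiple of 5.
 * vm_thread_kstack_arena_release() undoes this by rounding the address back
 * down to a KVA_KSTACK_QUANTUM boundary before calling vmem_xfree().
 */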

/*
 * Release KVA from the kstack arena back into its parent arena.  Released
 * spans must be a multiple of kernel stack pages + guard pages in size.
 */
static void
vm_thread_kstack_arena_release(void *arena, vmem_addr_t addr, vmem_size_t size)
{
	int rem;
	size_t kpages __diagused = kstack_pages + KSTACK_GUARD_PAGES;

	KASSERT(size % kpages == 0,
	    ("%s: Size %jd is not a multiple of kstack pages (%d)", __func__,
	    (intmax_t)size, (int)kpages));

	KASSERT((addr - VM_MIN_KERNEL_ADDRESS) % kpages == 0,
	    ("%s: Address %p is not properly aligned (%p)", __func__,
		(void *)addr, (void *)VM_MIN_KERNEL_ADDRESS));
	/*
	 * If the address is not KVA_KSTACK_QUANTUM-aligned we have to decrement
	 * it to account for the shift in vm_thread_kstack_arena_import().
	 */
	rem = addr % KVA_KSTACK_QUANTUM;
	if (rem) {
		KASSERT(rem <= ptoa(kpages),
		    ("%s: rem > kpages (%d), (%d)", __func__, rem,
			(int)kpages));
		addr -= rem;
	}
	vmem_xfree(arena, addr, vm_thread_kstack_import_quantum());
}

/*
 * Create the kernel stack for a new thread.
 */
static vm_offset_t
vm_thread_stack_create(struct domainset *ds, int pages)
{
	vm_page_t ma[KSTACK_MAX_PAGES];
	struct vm_domainset_iter di;
	int req = VM_ALLOC_NORMAL;
	vm_object_t obj;
	vm_offset_t ks;
	int domain, i;

	obj = vm_thread_kstack_size_to_obj(pages);
	if (vm_ndomains > 1)
		obj->domain.dr_policy = ds;
	vm_domainset_iter_page_init(&di, obj, 0, &domain, &req);
	do {
		/*
		 * Get a kernel virtual address for this thread's kstack.
		 */
		ks = vm_thread_alloc_kstack_kva(ptoa(pages + KSTACK_GUARD_PAGES),
		    domain);
		if (ks == 0)
			continue;
		ks += ptoa(KSTACK_GUARD_PAGES);

		/*
		 * Allocate physical pages to back the stack.
		 */
		if (vm_thread_stack_back(ks, ma, pages, req, domain) != 0) {
			vm_thread_free_kstack_kva(ks - ptoa(KSTACK_GUARD_PAGES),
			    ptoa(pages + KSTACK_GUARD_PAGES), domain);
			continue;
		}
		if (KSTACK_GUARD_PAGES != 0) {
			pmap_qremove(ks - ptoa(KSTACK_GUARD_PAGES),
			    KSTACK_GUARD_PAGES);
		}
		for (i = 0; i < pages; i++)
			vm_page_valid(ma[i]);
		pmap_qenter(ks, ma, pages);
		return (ks);
	} while (vm_domainset_iter_page(&di, obj, &domain) == 0);

	return (0);
}
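
/*
 * Note on the layout produced by vm_thread_stack_create(): the returned
 * address points just above the guard region; the KSTACK_GUARD_PAGES pages
 * immediately below it are left unmapped so that a stack overflow faults
 * rather than silently corrupting adjacent memory.
 */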

static __noinline void
vm_thread_stack_dispose(vm_offset_t ks, int pages)
{
	vm_page_t m;
	vm_pindex_t pindex;
	int i, domain;
	vm_object_t obj = vm_thread_kstack_size_to_obj(pages);

	pindex = vm_kstack_pindex(ks, pages);
	domain = vm_phys_domain(vtophys(ks));
	pmap_qremove(ks, pages);
	VM_OBJECT_WLOCK(obj);
	for (i = 0; i < pages; i++) {
		m = vm_page_lookup(obj, pindex + i);
		if (m == NULL)
			panic("%s: kstack already missing?", __func__);
		KASSERT(vm_page_domain(m) == domain,
		    ("%s: page %p domain mismatch, expected %d got %d",
		    __func__, m, domain, vm_page_domain(m)));
		vm_page_xbusy_claim(m);
		vm_page_unwire_noq(m);
		vm_page_free(m);
	}
	VM_OBJECT_WUNLOCK(obj);
	kasan_mark((void *)ks, ptoa(pages), ptoa(pages), 0);
	vm_thread_free_kstack_kva(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
	    ptoa(pages + KSTACK_GUARD_PAGES), domain);
}

/*
 * Allocate the kernel stack for a new thread.
 */
int
vm_thread_new(struct thread *td, int pages)
{
	vm_offset_t ks;
	u_short ks_domain;

	/* Bounds check */
	if (pages <= 1)
		pages = kstack_pages;
	else if (pages > KSTACK_MAX_PAGES)
		pages = KSTACK_MAX_PAGES;

	ks = 0;
	if (pages == kstack_pages && kstack_cache != NULL)
		ks = (vm_offset_t)uma_zalloc(kstack_cache, M_NOWAIT);

	/*
	 * Ensure that kstack objects can draw pages from any memory
	 * domain.  Otherwise a local memory shortage can block a process
	 * swap-in.
	 */
	if (ks == 0)
		ks = vm_thread_stack_create(DOMAINSET_PREF(PCPU_GET(domain)),
		    pages);
	if (ks == 0)
		return (0);

	ks_domain = vm_phys_domain(vtophys(ks));
	KASSERT(ks_domain >= 0 && ks_domain < vm_ndomains,
	    ("%s: invalid domain for kstack %p", __func__, (void *)ks));
	td->td_kstack = ks;
	td->td_kstack_pages = pages;
	td->td_kstack_domain = ks_domain;
	return (1);
}

/*
 * Dispose of a thread's kernel stack.
 */
void
vm_thread_dispose(struct thread *td)
{
	vm_offset_t ks;
	int pages;

	pages = td->td_kstack_pages;
	ks = td->td_kstack;
	td->td_kstack = 0;
	td->td_kstack_pages = 0;
	td->td_kstack_domain = MAXMEMDOM;
	if (pages == kstack_pages) {
		kasan_mark((void *)ks, 0, ptoa(pages), KASAN_KSTACK_FREED);
		uma_zfree(kstack_cache, (void *)ks);
	} else {
		vm_thread_stack_dispose(ks, pages);
	}
}

/*
 * Calculate kstack pindex.
 *
 * Uses a non-identity mapping if guard pages are
 * active to avoid pindex holes in the kstack object.
 */
vm_pindex_t
vm_kstack_pindex(vm_offset_t ks, int kpages)
{
	vm_pindex_t pindex = atop(ks - VM_MIN_KERNEL_ADDRESS);

#ifdef __ILP32__
	return (pindex);
#else
	/*
	 * Return the linear pindex if guard pages aren't active or if we are
	 * allocating a non-standard kstack size.
	 */
	if (KSTACK_GUARD_PAGES == 0 || kpages != kstack_pages) {
		return (pindex);
	}
	KASSERT(pindex % (kpages + KSTACK_GUARD_PAGES) >= KSTACK_GUARD_PAGES,
	    ("%s: Attempting to calculate kstack guard page pindex", __func__));

	return (pindex -
	    (pindex / (kpages + KSTACK_GUARD_PAGES) + 1) * KSTACK_GUARD_PAGES);
#endif
}
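
/*
 * Worked example of the mapping above (hypothetical values: kpages == 4,
 * KSTACK_GUARD_PAGES == 1): each stack occupies 5 linear KVA pages, a guard
 * page followed by 4 stack pages, so stack n's pages have linear pindexes
 * 5n + 1 .. 5n + 4.  For such a page, pindex / 5 == n and the formula
 * subtracts (n + 1) guard pages, yielding object pindexes 4n .. 4n + 3.
 * The stacks therefore occupy the kstack object densely, with no holes for
 * the unmapped guard pages.
 */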

/*
 * Allocate physical pages from the specified memory domain to back a
 * kernel stack, using the given allocation class.
 */
int
vm_thread_stack_back(vm_offset_t ks, vm_page_t ma[], int npages, int req_class,
    int domain)
{
	vm_object_t obj = vm_thread_kstack_size_to_obj(npages);
	vm_pindex_t pindex;
	vm_page_t m;
	int n;

	pindex = vm_kstack_pindex(ks, npages);

	VM_OBJECT_WLOCK(obj);
	for (n = 0; n < npages;) {
		m = vm_page_grab(obj, pindex + n,
		    VM_ALLOC_NOCREAT | VM_ALLOC_WIRED);
		if (m == NULL) {
			m = vm_page_alloc_domain(obj, pindex + n, domain,
			    req_class | VM_ALLOC_WIRED);
		}
		if (m == NULL)
			break;
		ma[n++] = m;
	}
	if (n < npages)
		goto cleanup;
	VM_OBJECT_WUNLOCK(obj);

	return (0);
cleanup:
	for (int i = 0; i < n; i++) {
		m = ma[i];
		(void)vm_page_unwire_noq(m);
		vm_page_free(m);
	}
	VM_OBJECT_WUNLOCK(obj);

	return (ENOMEM);
}

vm_object_t
vm_thread_kstack_size_to_obj(int npages)
{
	return (npages == kstack_pages ? kstack_object : kstack_alt_object);
}

static int
kstack_import(void *arg, void **store, int cnt, int domain, int flags)
{
	struct domainset *ds;
	int i;

	if (domain == UMA_ANYDOMAIN)
		ds = DOMAINSET_RR();
	else
		ds = DOMAINSET_PREF(domain);

	for (i = 0; i < cnt; i++) {
		store[i] = (void *)vm_thread_stack_create(ds, kstack_pages);
		if (store[i] == NULL)
			break;
	}
	return (i);
}

static void
kstack_release(void *arg, void **store, int cnt)
{
	vm_offset_t ks;
	int i;

	for (i = 0; i < cnt; i++) {
		ks = (vm_offset_t)store[i];
		vm_thread_stack_dispose(ks, kstack_pages);
	}
}

static void
kstack_cache_init(void *null)
{
	vm_size_t kstack_quantum;
	int domain;

	kstack_object = vm_object_allocate(OBJT_SWAP,
	    atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS));
	kstack_cache = uma_zcache_create("kstack_cache",
	    kstack_pages * PAGE_SIZE, NULL, NULL, NULL, NULL,
	    kstack_import, kstack_release, NULL,
	    UMA_ZONE_FIRSTTOUCH);
	kstack_cache_size = imax(128, mp_ncpus * 4);
	uma_zone_set_maxcache(kstack_cache, kstack_cache_size);

	kstack_alt_object = vm_object_allocate(OBJT_SWAP,
	    atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS));

	kstack_quantum = vm_thread_kstack_import_quantum();
	/*
	 * Reduce the size used by the kstack arena to allow for the
	 * alignment adjustments in vm_thread_kstack_arena_import().
	 */
	kstack_quantum -= (kstack_pages + KSTACK_GUARD_PAGES) * PAGE_SIZE;
	/*
	 * Create the kstack arena for each domain and set the per-domain
	 * kernel arena as its parent.
	 */
	for (domain = 0; domain < vm_ndomains; domain++) {
		vmd_kstack_arena[domain] = vmem_create("kstack arena", 0, 0,
		    PAGE_SIZE, 0, M_WAITOK);
		KASSERT(vmd_kstack_arena[domain] != NULL,
		    ("%s: failed to create domain %d kstack_arena", __func__,
		    domain));
		vmem_set_import(vmd_kstack_arena[domain],
		    vm_thread_kstack_arena_import,
		    vm_thread_kstack_arena_release,
		    vm_dom[domain].vmd_kernel_arena, kstack_quantum);
	}
}
SYSINIT(vm_kstacks, SI_SUB_KMEM, SI_ORDER_ANY, kstack_cache_init, NULL);

#ifdef KSTACK_USAGE_PROF
/*
 * Track maximum stack used by a thread in kernel.
 */
static int max_kstack_used;

SYSCTL_INT(_debug, OID_AUTO, max_kstack_used, CTLFLAG_RD,
    &max_kstack_used, 0,
    "Maximum stack depth used by a thread in kernel");

void
intr_prof_stack_use(struct thread *td, struct trapframe *frame)
{
	vm_offset_t stack_top;
	vm_offset_t current;
	int used, prev_used;

	/*
	 * Testing for interrupted kernel mode isn't strictly needed; it is
	 * an optimization, since interrupts taken from usermode have only
	 * the trap frame on the stack.
	 */
	if (TRAPF_USERMODE(frame))
		return;

	stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE;
	current = (vm_offset_t)(uintptr_t)&stack_top;

	/*
	 * Try to detect whether the interrupt is running on the kernel
	 * thread stack; the hardware may use a dedicated stack for
	 * interrupt handling.
	 */
	if (stack_top <= current || current < td->td_kstack)
		return;

	used = stack_top - current;
	for (;;) {
		prev_used = max_kstack_used;
		if (prev_used >= used)
			break;
		if (atomic_cmpset_int(&max_kstack_used, prev_used, used))
			break;
	}
}
#endif /* KSTACK_USAGE_PROF */

/*
 * Implement fork's actions on an address space.
 * Here we arrange for the address space to be copied or referenced,
 * allocate a user struct (pcb and kernel stack), then call the
 * machine-dependent layer to fill those in and make the new process
 * ready to run.  The new process is set up so that it returns directly
 * to user mode to avoid stack copying and relocation problems.
 */
int
vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2,
    struct vmspace *vm2, int flags)
{
	struct proc *p1 = td->td_proc;
	struct domainset *dset;
	int error;

	if ((flags & RFPROC) == 0) {
		/*
		 * Divorce the memory, if it is shared: essentially this
		 * turns the address space shared amongst threads into a
		 * local copy-on-write copy.
		 */
		if ((flags & RFMEM) == 0) {
			error = vmspace_unshare(p1);
			if (error)
				return (error);
		}
		cpu_fork(td, p2, td2, flags);
		return (0);
	}

	if (flags & RFMEM) {
		p2->p_vmspace = p1->p_vmspace;
		refcount_acquire(&p1->p_vmspace->vm_refcnt);
	}
	dset = td2->td_domain.dr_policy;
	while (vm_page_count_severe_set(&dset->ds_mask)) {
		vm_wait_doms(&dset->ds_mask, 0);
	}

	if ((flags & RFMEM) == 0) {
		p2->p_vmspace = vm2;
		if (p1->p_vmspace->vm_shm)
			shmfork(p1, p2);
	}

	/*
	 * cpu_fork will copy and update the pcb, set up the kernel stack,
	 * and make the child ready to run.
	 */
	cpu_fork(td, p2, td2, flags);
	return (0);
}

/*
 * Called after the process has been wait(2)'ed upon and is being reaped.
 * The idea is to reclaim resources that we could not reclaim while
 * the process was still executing.
 */
void
vm_waitproc(struct proc *p)
{

	vmspace_exitfree(p);		/* and clean-out the vmspace */
}

void
kick_proc0(void)
{

	wakeup(&proc0);
}