vm_mmap.c revision 313991
1/*-
2 * Copyright (c) 1988 University of Utah.
3 * Copyright (c) 1991, 1993
4 *	The Regents of the University of California.  All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the Systems Programming Group of the University of Utah Computer
8 * Science Department.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
35 *
36 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
37 */
38
39/*
40 * Mapped file (mmap) interface to VM
41 */
42
43#include <sys/cdefs.h>
44__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 313991 2017-02-20 10:51:46Z kib $");
45
46#include "opt_compat.h"
47#include "opt_hwpmc_hooks.h"
48
49#include <sys/param.h>
50#include <sys/systm.h>
51#include <sys/capsicum.h>
52#include <sys/kernel.h>
53#include <sys/lock.h>
54#include <sys/mutex.h>
55#include <sys/sysproto.h>
56#include <sys/filedesc.h>
57#include <sys/priv.h>
58#include <sys/proc.h>
59#include <sys/procctl.h>
60#include <sys/racct.h>
61#include <sys/resource.h>
62#include <sys/resourcevar.h>
63#include <sys/rwlock.h>
64#include <sys/sysctl.h>
65#include <sys/vnode.h>
66#include <sys/fcntl.h>
67#include <sys/file.h>
68#include <sys/mman.h>
69#include <sys/mount.h>
70#include <sys/conf.h>
71#include <sys/stat.h>
72#include <sys/syscallsubr.h>
73#include <sys/sysent.h>
74#include <sys/vmmeter.h>
75
76#include <security/mac/mac_framework.h>
77
78#include <vm/vm.h>
79#include <vm/vm_param.h>
80#include <vm/pmap.h>
81#include <vm/vm_map.h>
82#include <vm/vm_object.h>
83#include <vm/vm_page.h>
84#include <vm/vm_pager.h>
85#include <vm/vm_pageout.h>
86#include <vm/vm_extern.h>
87#include <vm/vm_page.h>
88#include <vm/vnode_pager.h>
89
90#ifdef HWPMC_HOOKS
91#include <sys/pmckern.h>
92#endif
93
94int old_mlock = 0;
95SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
96    "Do not apply RLIMIT_MEMLOCK on mlockall");
97TUNABLE_INT("vm.old_mlock", &old_mlock);
98
99#ifdef MAP_32BIT
100#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
101#endif
102
103static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
104    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
105static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
106    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
107static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
108    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);
109
110#ifndef _SYS_SYSPROTO_H_
111struct sbrk_args {
112	int incr;
113};
114#endif
115
116int
117sys_sbrk(struct thread *td, struct sbrk_args *uap)
118{
119	/* Not yet implemented */
120	return (EOPNOTSUPP);
121}
122
123#ifndef _SYS_SYSPROTO_H_
124struct sstk_args {
125	int incr;
126};
127#endif
128
129int
130sys_sstk(struct thread *td, struct sstk_args *uap)
131{
132	/* Not yet implemented */
133	return (EOPNOTSUPP);
134}
135
136#if defined(COMPAT_43)
137#ifndef _SYS_SYSPROTO_H_
138struct getpagesize_args {
139	int dummy;
140};
141#endif
142
143int
144ogetpagesize(struct thread *td, struct getpagesize_args *uap)
145{
146
147	td->td_retval[0] = PAGE_SIZE;
148	return (0);
149}
150#endif				/* COMPAT_43 */
151
152
153/*
154 * Memory Map (mmap) system call.  Note that the file offset
155 * and address are allowed to be NOT page aligned, though if
156 * the MAP_FIXED flag is set, both must have the same remainder
157 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
158 * page-aligned, the actual mapping starts at trunc_page(addr)
159 * and the return value is adjusted up by the page offset.
160 *
161 * Generally speaking, only character devices which are themselves
162 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
163 * there would be no cache coherency between a descriptor and a VM mapping
164 * both to the same character device.
165 */
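/*
 * Illustrative userland sketch (not part of revision 313991; the file name
 * and sizes are placeholders).  A non-page-aligned offset is accepted when
 * MAP_FIXED is not given: the mapping is backed from trunc_page(offset) and
 * the returned pointer is bumped by the page offset, so p below points 100
 * bytes into the first mapped page.  With MAP_FIXED, addr and offset must
 * agree modulo PAGE_SIZE or mmap() fails with EINVAL.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/path/to/file", O_RDONLY);
 *	char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 100);
 */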
166#ifndef _SYS_SYSPROTO_H_
167struct mmap_args {
168	void *addr;
169	size_t len;
170	int prot;
171	int flags;
172	int fd;
173	long pad;
174	off_t pos;
175};
176#endif
177
178int
179sys_mmap(td, uap)
180	struct thread *td;
181	struct mmap_args *uap;
182{
183#ifdef HWPMC_HOOKS
184	struct pmckern_map_in pkm;
185#endif
186	struct file *fp;
187	struct vnode *vp;
188	vm_offset_t addr;
189	vm_size_t size, pageoff;
190	vm_prot_t cap_maxprot, prot, maxprot;
191	void *handle;
192	objtype_t handle_type;
193	int align, error, flags;
194	off_t pos;
195	struct vmspace *vms = td->td_proc->p_vmspace;
196	cap_rights_t rights;
197
198	addr = (vm_offset_t) uap->addr;
199	size = uap->len;
200	prot = uap->prot & VM_PROT_ALL;
201	flags = uap->flags;
202	pos = uap->pos;
203
204	fp = NULL;
205
206	/*
207	 * Enforce the constraints.
208	 * Mapping of length 0 is only allowed for old binaries.
209	 * Anonymous mapping shall specify -1 as file descriptor and
210	 * zero position for new code. Be nice to ancient a.out
211	 * binaries and correct pos for anonymous mapping, since old
212	 * ld.so sometimes issues anonymous map requests with non-zero
213	 * pos.
214	 */
215	if (!SV_CURPROC_FLAG(SV_AOUT)) {
216		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
217		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
218			return (EINVAL);
219	} else {
220		if ((flags & MAP_ANON) != 0)
221			pos = 0;
222	}
223
224	if (flags & MAP_STACK) {
225		if ((uap->fd != -1) ||
226		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
227			return (EINVAL);
228		flags |= MAP_ANON;
229		pos = 0;
230	}
231	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
232		return (EINVAL);
233
234	/*
235	 * Align the file position to a page boundary,
236	 * and save its page offset component.
237	 */
238	pageoff = (pos & PAGE_MASK);
239	pos -= pageoff;
240
241	/* Adjust size for rounding (on both ends). */
242	size += pageoff;			/* low end... */
243	size = (vm_size_t) round_page(size);	/* hi end */
244
245	/* Ensure alignment is at least a page and fits in a pointer. */
246	align = flags & MAP_ALIGNMENT_MASK;
247	if (align != 0 && align != MAP_ALIGNED_SUPER &&
248	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
249	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
250		return (EINVAL);
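	/*
	 * Illustrative sketch (not part of revision 313991): the
	 * MAP_ALIGNMENT_MASK bits carry a log2 alignment request, e.g.
	 *
	 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	 *	    MAP_ANON | MAP_ALIGNED(21), -1, 0);
	 *
	 * asks for a 2MB-aligned region.  A request below PAGE_SHIFT, or
	 * one that is at least the pointer width in bits, is rejected here
	 * with EINVAL; the special value MAP_ALIGNED_SUPER lets the kernel
	 * pick a superpage-friendly alignment (see the VMFS_* selection in
	 * vm_mmap() below).
	 */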
251
252	/*
253	 * Check for illegal addresses.  Watch out for address wrap... Note
254	 * that VM_*_ADDRESS are not constants due to casts (argh).
255	 */
256	if (flags & MAP_FIXED) {
257		/*
258		 * The specified address must have the same remainder
259		 * as the file offset taken modulo PAGE_SIZE, so it
260		 * should be aligned after adjustment by pageoff.
261		 */
262		addr -= pageoff;
263		if (addr & PAGE_MASK)
264			return (EINVAL);
265
266		/* Address range must be all in user VM space. */
267		if (addr < vm_map_min(&vms->vm_map) ||
268		    addr + size > vm_map_max(&vms->vm_map))
269			return (EINVAL);
270		if (addr + size < addr)
271			return (EINVAL);
272#ifdef MAP_32BIT
273		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
274			return (EINVAL);
275	} else if (flags & MAP_32BIT) {
276		/*
277		 * For MAP_32BIT, override the hint if it is too high and
278		 * do not bother moving the mapping past the heap (since
279		 * the heap is usually above 2GB).
280		 */
281		if (addr + size > MAP_32BIT_MAX_ADDR)
282			addr = 0;
283#endif
284	} else {
285		/*
286		 * XXX for non-fixed mappings where no hint is provided or
287		 * the hint would fall in the potential heap space,
288		 * place it after the end of the largest possible heap.
289		 *
290		 * There should really be a pmap call to determine a reasonable
291		 * location.
292		 */
293		PROC_LOCK(td->td_proc);
294		if (addr == 0 ||
295		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
296		    addr < round_page((vm_offset_t)vms->vm_daddr +
297		    lim_max(td->td_proc, RLIMIT_DATA))))
298			addr = round_page((vm_offset_t)vms->vm_daddr +
299			    lim_max(td->td_proc, RLIMIT_DATA));
300		PROC_UNLOCK(td->td_proc);
301	}
302	if (flags & MAP_ANON) {
303		/*
304		 * Mapping blank space is trivial.
305		 */
306		handle = NULL;
307		handle_type = OBJT_DEFAULT;
308		maxprot = VM_PROT_ALL;
309		cap_maxprot = VM_PROT_ALL;
310	} else {
311		/*
312		 * Mapping file, get fp for validation and don't let the
313		 * descriptor disappear on us if we block. Check capability
314		 * rights, but also return the maximum rights to be combined
315		 * with maxprot later.
316		 */
317		cap_rights_init(&rights, CAP_MMAP);
318		if (prot & PROT_READ)
319			cap_rights_set(&rights, CAP_MMAP_R);
320		if ((flags & MAP_SHARED) != 0) {
321			if (prot & PROT_WRITE)
322				cap_rights_set(&rights, CAP_MMAP_W);
323		}
324		if (prot & PROT_EXEC)
325			cap_rights_set(&rights, CAP_MMAP_X);
326		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
327		if (error != 0)
328			goto done;
329		if (fp->f_type == DTYPE_SHM) {
330			handle = fp->f_data;
331			handle_type = OBJT_SWAP;
332			maxprot = VM_PROT_NONE;
333
334			/* FREAD should always be set. */
335			if (fp->f_flag & FREAD)
336				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
337			if (fp->f_flag & FWRITE)
338				maxprot |= VM_PROT_WRITE;
339			goto map;
340		}
341		if (fp->f_type != DTYPE_VNODE) {
342			error = ENODEV;
343			goto done;
344		}
345#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
346    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
347		/*
348		 * POSIX shared-memory objects are defined to have
349		 * kernel persistence, and are not defined to support
350		 * read(2)/write(2) -- or even open(2).  Thus, we can
351		 * use MAP_ASYNC to trade on-disk coherence for speed.
352		 * The shm_open(3) library routine turns on the FPOSIXSHM
353		 * flag to request this behavior.
354		 */
355		if (fp->f_flag & FPOSIXSHM)
356			flags |= MAP_NOSYNC;
357#endif
358		vp = fp->f_vnode;
359		/*
360		 * Ensure that file and memory protections are
361		 * compatible.  Note that we only worry about
362		 * writability if mapping is shared; in this case,
363		 * current and max prot are dictated by the open file.
364		 * XXX use the vnode instead?  Problem is: what
365		 * credentials do we use for determination? What if
366		 * proc does a setuid?
367		 */
368		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
369			maxprot = VM_PROT_NONE;
370		else
371			maxprot = VM_PROT_EXECUTE;
372		if (fp->f_flag & FREAD) {
373			maxprot |= VM_PROT_READ;
374		} else if (prot & PROT_READ) {
375			error = EACCES;
376			goto done;
377		}
378		/*
379		 * If we are sharing potential changes (either via
380		 * MAP_SHARED or via the implicit sharing of character
381		 * device mappings), and we are trying to get write
382		 * permission although we opened it without asking
383		 * for it, bail out.
384		 */
385		if ((flags & MAP_SHARED) != 0) {
386			if ((fp->f_flag & FWRITE) != 0) {
387				maxprot |= VM_PROT_WRITE;
388			} else if ((prot & PROT_WRITE) != 0) {
389				error = EACCES;
390				goto done;
391			}
392		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
393			maxprot |= VM_PROT_WRITE;
394			cap_maxprot |= VM_PROT_WRITE;
395		}
396		handle = (void *)vp;
397		handle_type = OBJT_VNODE;
398	}
399map:
400	td->td_fpop = fp;
401	maxprot &= cap_maxprot;
402	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
403	    flags, handle_type, handle, pos);
404	td->td_fpop = NULL;
405#ifdef HWPMC_HOOKS
406	/* inform hwpmc(4) if an executable is being mapped */
407	if (error == 0 && handle_type == OBJT_VNODE &&
408	    (prot & PROT_EXEC)) {
409		pkm.pm_file = handle;
410		pkm.pm_address = (uintptr_t) addr;
411		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
412	}
413#endif
414	if (error == 0)
415		td->td_retval[0] = (register_t) (addr + pageoff);
416done:
417	if (fp)
418		fdrop(fp, td);
419
420	return (error);
421}
422
423int
424freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
425{
426	struct mmap_args oargs;
427
428	oargs.addr = uap->addr;
429	oargs.len = uap->len;
430	oargs.prot = uap->prot;
431	oargs.flags = uap->flags;
432	oargs.fd = uap->fd;
433	oargs.pos = uap->pos;
434	return (sys_mmap(td, &oargs));
435}
436
437#ifdef COMPAT_43
438#ifndef _SYS_SYSPROTO_H_
439struct ommap_args {
440	caddr_t addr;
441	int len;
442	int prot;
443	int flags;
444	int fd;
445	long pos;
446};
447#endif
448int
449ommap(td, uap)
450	struct thread *td;
451	struct ommap_args *uap;
452{
453	struct mmap_args nargs;
454	static const char cvtbsdprot[8] = {
455		0,
456		PROT_EXEC,
457		PROT_WRITE,
458		PROT_EXEC | PROT_WRITE,
459		PROT_READ,
460		PROT_EXEC | PROT_READ,
461		PROT_WRITE | PROT_READ,
462		PROT_EXEC | PROT_WRITE | PROT_READ,
463	};
464
465#define	OMAP_ANON	0x0002
466#define	OMAP_COPY	0x0020
467#define	OMAP_SHARED	0x0010
468#define	OMAP_FIXED	0x0100
469
470	nargs.addr = uap->addr;
471	nargs.len = uap->len;
472	nargs.prot = cvtbsdprot[uap->prot & 0x7];
473#ifdef COMPAT_FREEBSD32
474#if defined(__amd64__) || defined(__ia64__)
475	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
476	    nargs.prot != 0)
477		nargs.prot |= PROT_EXEC;
478#endif
479#endif
480	nargs.flags = 0;
481	if (uap->flags & OMAP_ANON)
482		nargs.flags |= MAP_ANON;
483	if (uap->flags & OMAP_COPY)
484		nargs.flags |= MAP_COPY;
485	if (uap->flags & OMAP_SHARED)
486		nargs.flags |= MAP_SHARED;
487	else
488		nargs.flags |= MAP_PRIVATE;
489	if (uap->flags & OMAP_FIXED)
490		nargs.flags |= MAP_FIXED;
491	nargs.fd = uap->fd;
492	nargs.pos = uap->pos;
493	return (sys_mmap(td, &nargs));
494}
495#endif				/* COMPAT_43 */
496
497
498#ifndef _SYS_SYSPROTO_H_
499struct msync_args {
500	void *addr;
501	size_t len;
502	int flags;
503};
504#endif
505int
506sys_msync(td, uap)
507	struct thread *td;
508	struct msync_args *uap;
509{
510	vm_offset_t addr;
511	vm_size_t size, pageoff;
512	int flags;
513	vm_map_t map;
514	int rv;
515
516	addr = (vm_offset_t) uap->addr;
517	size = uap->len;
518	flags = uap->flags;
519
520	pageoff = (addr & PAGE_MASK);
521	addr -= pageoff;
522	size += pageoff;
523	size = (vm_size_t) round_page(size);
524	if (addr + size < addr)
525		return (EINVAL);
526
527	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
528		return (EINVAL);
529
530	map = &td->td_proc->p_vmspace->vm_map;
531
532	/*
533	 * Clean the pages and interpret the return value.
534	 */
535	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
536	    (flags & MS_INVALIDATE) != 0);
537	switch (rv) {
538	case KERN_SUCCESS:
539		return (0);
540	case KERN_INVALID_ADDRESS:
541		return (ENOMEM);
542	case KERN_INVALID_ARGUMENT:
543		return (EBUSY);
544	case KERN_FAILURE:
545		return (EIO);
546	default:
547		return (EINVAL);
548	}
549}
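/*
 * Illustrative userland sketch (not part of revision 313991): a typical
 * caller flushes a shared file mapping with MS_SYNC.  Per the switch above,
 * MS_ASYNC combined with MS_INVALIDATE is rejected with EINVAL, and a range
 * that is not fully mapped reports ENOMEM.
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	if (msync(p, len, MS_SYNC) == -1)
 *		err(1, "msync");
 */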
550
551#ifndef _SYS_SYSPROTO_H_
552struct munmap_args {
553	void *addr;
554	size_t len;
555};
556#endif
557int
558sys_munmap(td, uap)
559	struct thread *td;
560	struct munmap_args *uap;
561{
562#ifdef HWPMC_HOOKS
563	struct pmckern_map_out pkm;
564	vm_map_entry_t entry;
565#endif
566	vm_offset_t addr;
567	vm_size_t size, pageoff;
568	vm_map_t map;
569
570	addr = (vm_offset_t) uap->addr;
571	size = uap->len;
572	if (size == 0)
573		return (EINVAL);
574
575	pageoff = (addr & PAGE_MASK);
576	addr -= pageoff;
577	size += pageoff;
578	size = (vm_size_t) round_page(size);
579	if (addr + size < addr)
580		return (EINVAL);
581
582	/*
583	 * Check for illegal addresses.  Watch out for address wrap...
584	 */
585	map = &td->td_proc->p_vmspace->vm_map;
586	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
587		return (EINVAL);
588	vm_map_lock(map);
589#ifdef HWPMC_HOOKS
590	/*
591	 * Inform hwpmc if the address range being unmapped contains
592	 * an executable region.
593	 */
594	pkm.pm_address = (uintptr_t) NULL;
595	if (vm_map_lookup_entry(map, addr, &entry)) {
596		for (;
597		     entry != &map->header && entry->start < addr + size;
598		     entry = entry->next) {
599			if (vm_map_check_protection(map, entry->start,
600				entry->end, VM_PROT_EXECUTE) == TRUE) {
601				pkm.pm_address = (uintptr_t) addr;
602				pkm.pm_size = (size_t) size;
603				break;
604			}
605		}
606	}
607#endif
608	vm_map_delete(map, addr, addr + size);
609
610#ifdef HWPMC_HOOKS
611	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
612	vm_map_lock_downgrade(map);
613	if (pkm.pm_address != (uintptr_t) NULL)
614		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
615	vm_map_unlock_read(map);
616#else
617	vm_map_unlock(map);
618#endif
619	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
620	return (0);
621}
622
623#ifndef _SYS_SYSPROTO_H_
624struct mprotect_args {
625	const void *addr;
626	size_t len;
627	int prot;
628};
629#endif
630int
631sys_mprotect(td, uap)
632	struct thread *td;
633	struct mprotect_args *uap;
634{
635	vm_offset_t addr;
636	vm_size_t size, pageoff;
637	vm_prot_t prot;
638
639	addr = (vm_offset_t) uap->addr;
640	size = uap->len;
641	prot = uap->prot & VM_PROT_ALL;
642
643	pageoff = (addr & PAGE_MASK);
644	addr -= pageoff;
645	size += pageoff;
646	size = (vm_size_t) round_page(size);
647	if (addr + size < addr)
648		return (EINVAL);
649
650	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
651	    addr + size, prot, FALSE)) {
652	case KERN_SUCCESS:
653		return (0);
654	case KERN_PROTECTION_FAILURE:
655		return (EACCES);
656	case KERN_RESOURCE_SHORTAGE:
657		return (ENOMEM);
658	}
659	return (EINVAL);
660}
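/*
 * Illustrative sketch (not part of revision 313991; rdonly_fd is a
 * placeholder descriptor opened O_RDONLY): upgrading protection beyond the
 * mapping's maximum fails, because sys_mmap() above never adds
 * VM_PROT_WRITE to maxprot for a shared mapping of a read-only descriptor.
 *
 *	char *p = mmap(NULL, len, PROT_READ, MAP_SHARED, rdonly_fd, 0);
 *	int rc = mprotect(p, len, PROT_READ | PROT_WRITE);
 *
 * rc is -1 with errno set to EACCES (KERN_PROTECTION_FAILURE above).
 */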
661
662#ifndef _SYS_SYSPROTO_H_
663struct minherit_args {
664	void *addr;
665	size_t len;
666	int inherit;
667};
668#endif
669int
670sys_minherit(struct thread *td, struct minherit_args *uap)
671{
672	vm_offset_t addr;
673	vm_size_t size, pageoff;
674	vm_inherit_t inherit;
675
676	addr = (vm_offset_t)uap->addr;
677	size = uap->len;
678	inherit = uap->inherit;
679
680	pageoff = (addr & PAGE_MASK);
681	addr -= pageoff;
682	size += pageoff;
683	size = (vm_size_t) round_page(size);
684	if (addr + size < addr)
685		return (EINVAL);
686
687	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
688	    addr + size, inherit)) {
689	case KERN_SUCCESS:
690		return (0);
691	case KERN_PROTECTION_FAILURE:
692		return (EACCES);
693	}
694	return (EINVAL);
695}
696
697#ifndef _SYS_SYSPROTO_H_
698struct madvise_args {
699	void *addr;
700	size_t len;
701	int behav;
702};
703#endif
704
705int
706sys_madvise(struct thread *td, struct madvise_args *uap)
707{
708	vm_offset_t start, end;
709	vm_map_t map;
710	int flags;
711
712	/*
713	 * Check for our special case, advising the swap pager we are
714	 * "immortal."
715	 */
716	if (uap->behav == MADV_PROTECT) {
717		flags = PPROT_SET;
718		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
719		    PROC_SPROTECT, &flags));
720	}
721
722	/*
723	 * Check for illegal behavior
724	 */
725	if (uap->behav < 0 || uap->behav > MADV_CORE)
726		return (EINVAL);
727	/*
728	 * Check for illegal addresses.  Watch out for address wrap... Note
729	 * that VM_*_ADDRESS are not constants due to casts (argh).
730	 */
731	map = &td->td_proc->p_vmspace->vm_map;
732	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
733	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
734		return (EINVAL);
735	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
736		return (EINVAL);
737
738	/*
739	 * Since this routine is only advisory, we default to conservative
740	 * behavior.
741	 */
742	start = trunc_page((vm_offset_t) uap->addr);
743	end = round_page((vm_offset_t) uap->addr + uap->len);
744
745	if (vm_map_madvise(map, start, end, uap->behav))
746		return (EINVAL);
747	return (0);
748}
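/*
 * Illustrative sketch (not part of revision 313991): MADV_PROTECT is a
 * process-wide, privileged request, so madvise(addr, len, MADV_PROTECT)
 * ignores addr and len and is handled like the procctl(2) form below.
 *
 *	#include <sys/procctl.h>
 *
 *	int f = PPROT_SET;
 *	procctl(P_PID, getpid(), PROC_SPROTECT, &f);
 */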
749
750#ifndef _SYS_SYSPROTO_H_
751struct mincore_args {
752	const void *addr;
753	size_t len;
754	char *vec;
755};
756#endif
757
758int
759sys_mincore(struct thread *td, struct mincore_args *uap)
760{
761	vm_offset_t addr, first_addr;
762	vm_offset_t end, cend;
763	pmap_t pmap;
764	vm_map_t map;
765	char *vec;
766	int error = 0;
767	int vecindex, lastvecindex;
768	vm_map_entry_t current;
769	vm_map_entry_t entry;
770	vm_object_t object;
771	vm_paddr_t locked_pa;
772	vm_page_t m;
773	vm_pindex_t pindex;
774	int mincoreinfo;
775	unsigned int timestamp;
776	boolean_t locked;
777
778	/*
779	 * Make sure that the addresses presented are valid for user
780	 * mode.
781	 */
782	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
783	end = addr + (vm_size_t)round_page(uap->len);
784	map = &td->td_proc->p_vmspace->vm_map;
785	if (end > vm_map_max(map) || end < addr)
786		return (ENOMEM);
787
788	/*
789	 * Address of byte vector
790	 */
791	vec = uap->vec;
792
793	pmap = vmspace_pmap(td->td_proc->p_vmspace);
794
795	vm_map_lock_read(map);
796RestartScan:
797	timestamp = map->timestamp;
798
799	if (!vm_map_lookup_entry(map, addr, &entry)) {
800		vm_map_unlock_read(map);
801		return (ENOMEM);
802	}
803
804	/*
805	 * Do this on a map entry basis so that if the pages are not
806	 * in the current process's address space, we can easily look
807	 * up the pages elsewhere.
808	 */
809	lastvecindex = -1;
810	for (current = entry;
811	    (current != &map->header) && (current->start < end);
812	    current = current->next) {
813
814		/*
815		 * check for contiguity
816		 */
817		if (current->end < end &&
818		    (entry->next == &map->header ||
819		     current->next->start > current->end)) {
820			vm_map_unlock_read(map);
821			return (ENOMEM);
822		}
823
824		/*
825		 * ignore submaps (for now) or null objects
826		 */
827		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
828			current->object.vm_object == NULL)
829			continue;
830
831		/*
832		 * limit this scan to the current map entry and the
833		 * limits for the mincore call
834		 */
835		if (addr < current->start)
836			addr = current->start;
837		cend = current->end;
838		if (cend > end)
839			cend = end;
840
841		/*
842		 * scan this entry one page at a time
843		 */
844		while (addr < cend) {
845			/*
846			 * Check pmap first, it is likely faster, also
847			 * it can provide info as to whether we are the
848			 * one referencing or modifying the page.
849			 */
850			object = NULL;
851			locked_pa = 0;
852		retry:
853			m = NULL;
854			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
855			if (locked_pa != 0) {
856				/*
857				 * The page is mapped by this process but not
858				 * both accessed and modified.  It is also
859				 * managed.  Acquire the object lock so that
860				 * other mappings might be examined.
861				 */
862				m = PHYS_TO_VM_PAGE(locked_pa);
863				if (m->object != object) {
864					if (object != NULL)
865						VM_OBJECT_WUNLOCK(object);
866					object = m->object;
867					locked = VM_OBJECT_TRYWLOCK(object);
868					vm_page_unlock(m);
869					if (!locked) {
870						VM_OBJECT_WLOCK(object);
871						vm_page_lock(m);
872						goto retry;
873					}
874				} else
875					vm_page_unlock(m);
876				KASSERT(m->valid == VM_PAGE_BITS_ALL,
877				    ("mincore: page %p is mapped but invalid",
878				    m));
879			} else if (mincoreinfo == 0) {
880				/*
881				 * The page is not mapped by this process.  If
882				 * the object implements managed pages, then
883				 * determine if the page is resident so that
884				 * the mappings might be examined.
885				 */
886				if (current->object.vm_object != object) {
887					if (object != NULL)
888						VM_OBJECT_WUNLOCK(object);
889					object = current->object.vm_object;
890					VM_OBJECT_WLOCK(object);
891				}
892				if (object->type == OBJT_DEFAULT ||
893				    object->type == OBJT_SWAP ||
894				    object->type == OBJT_VNODE) {
895					pindex = OFF_TO_IDX(current->offset +
896					    (addr - current->start));
897					m = vm_page_lookup(object, pindex);
898					if (m == NULL &&
899					    vm_page_is_cached(object, pindex))
900						mincoreinfo = MINCORE_INCORE;
901					if (m != NULL && m->valid == 0)
902						m = NULL;
903					if (m != NULL)
904						mincoreinfo = MINCORE_INCORE;
905				}
906			}
907			if (m != NULL) {
908				/* Examine other mappings to the page. */
909				if (m->dirty == 0 && pmap_is_modified(m))
910					vm_page_dirty(m);
911				if (m->dirty != 0)
912					mincoreinfo |= MINCORE_MODIFIED_OTHER;
913				/*
914				 * The first test for PGA_REFERENCED is an
915				 * optimization.  The second test is
916				 * required because a concurrent pmap
917				 * operation could clear the last reference
918				 * and set PGA_REFERENCED before the call to
919				 * pmap_is_referenced().
920				 */
921				if ((m->aflags & PGA_REFERENCED) != 0 ||
922				    pmap_is_referenced(m) ||
923				    (m->aflags & PGA_REFERENCED) != 0)
924					mincoreinfo |= MINCORE_REFERENCED_OTHER;
925			}
926			if (object != NULL)
927				VM_OBJECT_WUNLOCK(object);
928
929			/*
930			 * subyte may page fault.  In case it needs to modify
931			 * the map, we release the lock.
932			 */
933			vm_map_unlock_read(map);
934
935			/*
936			 * calculate index into user supplied byte vector
937			 */
938			vecindex = OFF_TO_IDX(addr - first_addr);
939
940			/*
941			 * If we have skipped map entries, we need to make sure that
942			 * the byte vector is zeroed for those skipped entries.
943			 */
944			while ((lastvecindex + 1) < vecindex) {
945				++lastvecindex;
946				error = subyte(vec + lastvecindex, 0);
947				if (error) {
948					error = EFAULT;
949					goto done2;
950				}
951			}
952
953			/*
954			 * Pass the page information to the user
955			 */
956			error = subyte(vec + vecindex, mincoreinfo);
957			if (error) {
958				error = EFAULT;
959				goto done2;
960			}
961
962			/*
963			 * If the map has changed, due to the subyte, the previous
964			 * output may be invalid.
965			 */
966			vm_map_lock_read(map);
967			if (timestamp != map->timestamp)
968				goto RestartScan;
969
970			lastvecindex = vecindex;
971			addr += PAGE_SIZE;
972		}
973	}
974
975	/*
976	 * subyte may page fault.  In case it needs to modify
977	 * the map, we release the lock.
978	 */
979	vm_map_unlock_read(map);
980
981	/*
982	 * Zero the last entries in the byte vector.
983	 */
984	vecindex = OFF_TO_IDX(end - first_addr);
985	while ((lastvecindex + 1) < vecindex) {
986		++lastvecindex;
987		error = subyte(vec + lastvecindex, 0);
988		if (error) {
989			error = EFAULT;
990			goto done2;
991		}
992	}
993
994	/*
995	 * If the map has changed, due to the subyte, the previous
996	 * output may be invalid.
997	 */
998	vm_map_lock_read(map);
999	if (timestamp != map->timestamp)
1000		goto RestartScan;
1001	vm_map_unlock_read(map);
1002done2:
1003	return (error);
1004}
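/*
 * Illustrative userland sketch (not part of revision 313991): one status
 * byte is written per page, carrying the MINCORE_* bits computed above.
 *
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *
 *	size_t pgsz = getpagesize();
 *	size_t npages = (len + pgsz - 1) / pgsz;
 *	char *vec = malloc(npages);
 *	if (mincore(addr, len, vec) == 0 && (vec[0] & MINCORE_INCORE) != 0)
 *		printf("first page of the range is resident\n");
 */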
1005
1006#ifndef _SYS_SYSPROTO_H_
1007struct mlock_args {
1008	const void *addr;
1009	size_t len;
1010};
1011#endif
1012int
1013sys_mlock(struct thread *td, struct mlock_args *uap)
1014{
1015
1016	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
1017}
1018
1019int
1020vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
1021{
1022	vm_offset_t addr, end, last, start;
1023	vm_size_t npages, size;
1024	vm_map_t map;
1025	unsigned long nsize;
1026	int error;
1027
1028	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
1029	if (error)
1030		return (error);
1031	addr = (vm_offset_t)addr0;
1032	size = len;
1033	last = addr + size;
1034	start = trunc_page(addr);
1035	end = round_page(last);
1036	if (last < addr || end < addr)
1037		return (EINVAL);
1038	npages = atop(end - start);
1039	if (npages > vm_page_max_wired)
1040		return (ENOMEM);
1041	map = &proc->p_vmspace->vm_map;
1042	PROC_LOCK(proc);
1043	nsize = ptoa(npages + pmap_wired_count(map->pmap));
1044	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
1045		PROC_UNLOCK(proc);
1046		return (ENOMEM);
1047	}
1048	PROC_UNLOCK(proc);
1049	if (npages + cnt.v_wire_count > vm_page_max_wired)
1050		return (EAGAIN);
1051#ifdef RACCT
1052	if (racct_enable) {
1053		PROC_LOCK(proc);
1054		error = racct_set(proc, RACCT_MEMLOCK, nsize);
1055		PROC_UNLOCK(proc);
1056		if (error != 0)
1057			return (ENOMEM);
1058	}
1059#endif
1060	error = vm_map_wire(map, start, end,
1061	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1062#ifdef RACCT
1063	if (racct_enable && error != KERN_SUCCESS) {
1064		PROC_LOCK(proc);
1065		racct_set(proc, RACCT_MEMLOCK,
1066		    ptoa(pmap_wired_count(map->pmap)));
1067		PROC_UNLOCK(proc);
1068	}
1069#endif
1070	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1071}
1072
1073#ifndef _SYS_SYSPROTO_H_
1074struct mlockall_args {
1075	int	how;
1076};
1077#endif
1078
1079int
1080sys_mlockall(struct thread *td, struct mlockall_args *uap)
1081{
1082	vm_map_t map;
1083	int error;
1084
1085	map = &td->td_proc->p_vmspace->vm_map;
1086	error = priv_check(td, PRIV_VM_MLOCK);
1087	if (error)
1088		return (error);
1089
1090	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
1091		return (EINVAL);
1092
1093	/*
1094	 * If wiring all pages in the process would cause it to exceed
1095	 * a hard resource limit, return ENOMEM.
1096	 */
1097	if (!old_mlock && uap->how & MCL_CURRENT) {
1098		PROC_LOCK(td->td_proc);
1099		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
1100			PROC_UNLOCK(td->td_proc);
1101			return (ENOMEM);
1102		}
1103		PROC_UNLOCK(td->td_proc);
1104	}
1105#ifdef RACCT
1106	if (racct_enable) {
1107		PROC_LOCK(td->td_proc);
1108		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
1109		PROC_UNLOCK(td->td_proc);
1110		if (error != 0)
1111			return (ENOMEM);
1112	}
1113#endif
1114
1115	if (uap->how & MCL_FUTURE) {
1116		vm_map_lock(map);
1117		vm_map_modflags(map, MAP_WIREFUTURE, 0);
1118		vm_map_unlock(map);
1119		error = 0;
1120	}
1121
1122	if (uap->how & MCL_CURRENT) {
1123		/*
1124		 * P1003.1-2001 mandates that all currently mapped pages
1125		 * will be memory resident and locked (wired) upon return
1126		 * from mlockall(). vm_map_wire() will wire pages, by
1127		 * calling vm_fault_wire() for each page in the region.
1128		 */
1129		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
1130		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1131		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
1132	}
1133#ifdef RACCT
1134	if (racct_enable && error != KERN_SUCCESS) {
1135		PROC_LOCK(td->td_proc);
1136		racct_set(td->td_proc, RACCT_MEMLOCK,
1137		    ptoa(pmap_wired_count(map->pmap)));
1138		PROC_UNLOCK(td->td_proc);
1139	}
1140#endif
1141
1142	return (error);
1143}
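/*
 * Illustrative userland sketch (not part of revision 313991): a process
 * that needs its whole address space wired typically asks for both flags;
 * with vm.old_mlock left at 0 the call fails with ENOMEM once the total
 * mapping size exceeds RLIMIT_MEMLOCK, as checked above.
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *		err(1, "mlockall");
 */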
1144
1145#ifndef _SYS_SYSPROTO_H_
1146struct munlockall_args {
1147	register_t dummy;
1148};
1149#endif
1150
1151int
1152sys_munlockall(struct thread *td, struct munlockall_args *uap)
1153{
1154	vm_map_t map;
1155	int error;
1156
1157	map = &td->td_proc->p_vmspace->vm_map;
1158	error = priv_check(td, PRIV_VM_MUNLOCK);
1159	if (error)
1160		return (error);
1161
1162	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
1163	vm_map_lock(map);
1164	vm_map_modflags(map, 0, MAP_WIREFUTURE);
1165	vm_map_unlock(map);
1166
1167	/* Forcibly unwire all pages. */
1168	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1169	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1170#ifdef RACCT
1171	if (racct_enable && error == KERN_SUCCESS) {
1172		PROC_LOCK(td->td_proc);
1173		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
1174		PROC_UNLOCK(td->td_proc);
1175	}
1176#endif
1177
1178	return (error);
1179}
1180
1181#ifndef _SYS_SYSPROTO_H_
1182struct munlock_args {
1183	const void *addr;
1184	size_t len;
1185};
1186#endif
1187int
1188sys_munlock(td, uap)
1189	struct thread *td;
1190	struct munlock_args *uap;
1191{
1192	vm_offset_t addr, end, last, start;
1193	vm_size_t size;
1194#ifdef RACCT
1195	vm_map_t map;
1196#endif
1197	int error;
1198
1199	error = priv_check(td, PRIV_VM_MUNLOCK);
1200	if (error)
1201		return (error);
1202	addr = (vm_offset_t)uap->addr;
1203	size = uap->len;
1204	last = addr + size;
1205	start = trunc_page(addr);
1206	end = round_page(last);
1207	if (last < addr || end < addr)
1208		return (EINVAL);
1209	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
1210	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1211#ifdef RACCT
1212	if (racct_enable && error == KERN_SUCCESS) {
1213		PROC_LOCK(td->td_proc);
1214		map = &td->td_proc->p_vmspace->vm_map;
1215		racct_set(td->td_proc, RACCT_MEMLOCK,
1216		    ptoa(pmap_wired_count(map->pmap)));
1217		PROC_UNLOCK(td->td_proc);
1218	}
1219#endif
1220	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1221}
1222
1223/*
1224 * vm_mmap_vnode()
1225 *
1226 * Helper function for vm_mmap.  Performs the sanity checks specific to
1227 * mmap operations on vnodes.
1228 *
1229 * For VCHR vnodes, the vnode lock is held over the call to
1230 * vm_mmap_cdev() to keep vp->v_rdev valid.
1231 */
1232int
1233vm_mmap_vnode(struct thread *td, vm_size_t objsize,
1234    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1235    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
1236    boolean_t *writecounted)
1237{
1238	struct vattr va;
1239	vm_object_t obj;
1240	vm_offset_t foff;
1241	struct ucred *cred;
1242	int error, flags, locktype;
1243
1244	cred = td->td_ucred;
1245	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
1246		locktype = LK_EXCLUSIVE;
1247	else
1248		locktype = LK_SHARED;
1249	if ((error = vget(vp, locktype, td)) != 0)
1250		return (error);
1251	foff = *foffp;
1252	flags = *flagsp;
1253	obj = vp->v_object;
1254	if (vp->v_type == VREG) {
1255		/*
1256		 * Get the proper underlying object
1257		 */
1258		if (obj == NULL) {
1259			error = EINVAL;
1260			goto done;
1261		}
1262		if (obj->type == OBJT_VNODE && obj->handle != vp) {
1263			vput(vp);
1264			vp = (struct vnode *)obj->handle;
1265			/*
1266			 * Bypass filesystems obey the mpsafety of the
1267			 * underlying fs.  Tmpfs never bypasses.
1268			 */
1269			error = vget(vp, locktype, td);
1270			if (error != 0)
1271				return (error);
1272		}
1273		if (locktype == LK_EXCLUSIVE) {
1274			*writecounted = TRUE;
1275			vnode_pager_update_writecount(obj, 0, objsize);
1276		}
1277	} else if (vp->v_type == VCHR) {
1278		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
1279		    vp->v_rdev, foffp, objp);
1280		if (error == 0)
1281			goto mark_atime;
1282		goto done;
1283	} else {
1284		error = EINVAL;
1285		goto done;
1286	}
1287	if ((error = VOP_GETATTR(vp, &va, cred)))
1288		goto done;
1289#ifdef MAC
1290	error = mac_vnode_check_mmap(cred, vp, prot, flags);
1291	if (error != 0)
1292		goto done;
1293#endif
1294	if ((flags & MAP_SHARED) != 0) {
1295		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
1296			if (prot & PROT_WRITE) {
1297				error = EPERM;
1298				goto done;
1299			}
1300			*maxprotp &= ~VM_PROT_WRITE;
1301		}
1302	}
1303	/*
1304	 * If it is a regular file without any references
1305	 * we do not need to sync it.
1306	 * Adjust object size to be the size of actual file.
1307	 * Adjust object size to be the size of the actual file.
1308	objsize = round_page(va.va_size);
1309	if (va.va_nlink == 0)
1310		flags |= MAP_NOSYNC;
1311	if (obj->type == OBJT_VNODE)
1312		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
1313		    cred);
1314	else {
1315		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
1316		    ("wrong object type"));
1317		vm_object_reference(obj);
1318	}
1319	if (obj == NULL) {
1320		error = ENOMEM;
1321		goto done;
1322	}
1323	*objp = obj;
1324	*flagsp = flags;
1325
1326mark_atime:
1327	vfs_mark_atime(vp, cred);
1328
1329done:
1330	if (error != 0 && *writecounted) {
1331		*writecounted = FALSE;
1332		vnode_pager_update_writecount(obj, objsize, 0);
1333	}
1334	vput(vp);
1335	return (error);
1336}
1337
1338/*
1339 * vm_mmap_cdev()
1340 *
1341 * Helper function for vm_mmap.  Performs the sanity checks specific to
1342 * mmap operations on cdevs.
1343 */
1344int
1345vm_mmap_cdev(struct thread *td, vm_size_t objsize,
1346    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1347    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
1348{
1349	vm_object_t obj;
1350	struct cdevsw *dsw;
1351	int error, flags, ref;
1352
1353	flags = *flagsp;
1354
1355	dsw = dev_refthread(cdev, &ref);
1356	if (dsw == NULL)
1357		return (ENXIO);
1358	if (dsw->d_flags & D_MMAP_ANON) {
1359		dev_relthread(cdev, ref);
1360		*maxprotp = VM_PROT_ALL;
1361		*flagsp |= MAP_ANON;
1362		return (0);
1363	}
1364	/*
1365	 * cdevs do not provide private mappings of any kind.
1366	 */
1367	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
1368	    (prot & PROT_WRITE) != 0) {
1369		dev_relthread(cdev, ref);
1370		return (EACCES);
1371	}
1372	if (flags & (MAP_PRIVATE|MAP_COPY)) {
1373		dev_relthread(cdev, ref);
1374		return (EINVAL);
1375	}
1376	/*
1377	 * Force device mappings to be shared.
1378	 */
1379	flags |= MAP_SHARED;
1380#ifdef MAC_XXX
1381	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
1382	if (error != 0) {
1383		dev_relthread(cdev, ref);
1384		return (error);
1385	}
1386#endif
1387	/*
1388	 * First, try d_mmap_single().  If that is not implemented
1389	 * (returns ENODEV), fall back to using the device pager.
1390	 * Note that d_mmap_single() must return a reference to the
1391	 * object (it needs to bump the reference count of the object
1392	 * it returns somehow).
1393	 *
1394	 * XXX assumes VM_PROT_* == PROT_*
1395	 */
1396	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
1397	dev_relthread(cdev, ref);
1398	if (error != ENODEV)
1399		return (error);
1400	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
1401	    td->td_ucred);
1402	if (obj == NULL)
1403		return (EINVAL);
1404	*objp = obj;
1405	*flagsp = flags;
1406	return (0);
1407}
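/*
 * Illustrative driver-side sketch (not part of revision 313991; the "foo"
 * names are placeholders): a character device opts into the single-object
 * path tried above by providing d_mmap_single in its cdevsw and returning
 * either a referenced VM object or ENODEV to fall back to the device pager.
 *
 *	static d_mmap_single_t foo_mmap_single;
 *
 *	static struct cdevsw foo_cdevsw = {
 *		.d_version =	D_VERSION,
 *		.d_name =	"foo",
 *		.d_mmap_single = foo_mmap_single,
 *	};
 *
 *	static int
 *	foo_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
 *	    vm_size_t size, struct vm_object **object, int nprot)
 *	{
 *
 *		return (ENODEV);
 *	}
 */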
1408
1409/*
1410 * vm_mmap_shm()
1411 *
1412 * MPSAFE
1413 *
1414 * Helper function for vm_mmap.  Performs the sanity checks specific to
1415 * mmap operations on shm file descriptors.
1416 */
1417int
1418vm_mmap_shm(struct thread *td, vm_size_t objsize,
1419    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1420    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
1421{
1422	int error;
1423
1424	if ((*flagsp & MAP_SHARED) != 0 &&
1425	    (*maxprotp & VM_PROT_WRITE) == 0 &&
1426	    (prot & PROT_WRITE) != 0)
1427		return (EACCES);
1428#ifdef MAC
1429	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
1430	if (error != 0)
1431		return (error);
1432#endif
1433	error = shm_mmap(shmfd, objsize, foff, objp);
1434	if (error)
1435		return (error);
1436	return (0);
1437}
1438
1439/*
1440 * vm_mmap()
1441 *
1442 * MPSAFE
1443 *
1444 * Internal version of mmap.  Currently used by mmap, exec, and sys5
1445 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
1446 */
1447int
1448vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1449	vm_prot_t maxprot, int flags,
1450	objtype_t handle_type, void *handle,
1451	vm_ooffset_t foff)
1452{
1453	boolean_t fitit;
1454	vm_object_t object = NULL;
1455	struct thread *td = curthread;
1456	int docow, error, findspace, rv;
1457	boolean_t writecounted;
1458
1459	if (size == 0)
1460		return (0);
1461
1462	size = round_page(size);
1463
1464	if (map == &td->td_proc->p_vmspace->vm_map) {
1465		PROC_LOCK(td->td_proc);
1466		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
1467			PROC_UNLOCK(td->td_proc);
1468			return (ENOMEM);
1469		}
1470		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
1471			PROC_UNLOCK(td->td_proc);
1472			return (ENOMEM);
1473		}
1474		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
1475			if (ptoa(pmap_wired_count(map->pmap)) + size >
1476			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
1477				racct_set_force(td->td_proc, RACCT_VMEM,
1478				    map->size);
1479				PROC_UNLOCK(td->td_proc);
1480				return (ENOMEM);
1481			}
1482			error = racct_set(td->td_proc, RACCT_MEMLOCK,
1483			    ptoa(pmap_wired_count(map->pmap)) + size);
1484			if (error != 0) {
1485				racct_set_force(td->td_proc, RACCT_VMEM,
1486				    map->size);
1487				PROC_UNLOCK(td->td_proc);
1488				return (error);
1489			}
1490		}
1491		PROC_UNLOCK(td->td_proc);
1492	}
1493
1494	/*
1495	 * We currently can only deal with page aligned file offsets.
1496	 * The check is here rather than in the syscall because the
1497	 * kernel calls this function internally for other mmaping
1498	 * kernel calls this function internally for other mmapping
1499	 * cause pmap inconsistencies...so we want to be sure to
1500	 * disallow this in all cases.
1501	 */
1502	if (foff & PAGE_MASK)
1503		return (EINVAL);
1504
1505	if ((flags & MAP_FIXED) == 0) {
1506		fitit = TRUE;
1507		*addr = round_page(*addr);
1508	} else {
1509		if (*addr != trunc_page(*addr))
1510			return (EINVAL);
1511		fitit = FALSE;
1512	}
1513	writecounted = FALSE;
1514
1515	/*
1516	 * Lookup/allocate object.
1517	 */
1518	switch (handle_type) {
1519	case OBJT_DEVICE:
1520		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
1521		    handle, &foff, &object);
1522		break;
1523	case OBJT_VNODE:
1524		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
1525		    handle, &foff, &object, &writecounted);
1526		break;
1527	case OBJT_SWAP:
1528		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
1529		    handle, foff, &object);
1530		break;
1531	case OBJT_DEFAULT:
1532		if (handle == NULL) {
1533			error = 0;
1534			break;
1535		}
1536		/* FALLTHROUGH */
1537	default:
1538		error = EINVAL;
1539		break;
1540	}
1541	if (error)
1542		return (error);
1543	if (flags & MAP_ANON) {
1544		object = NULL;
1545		docow = 0;
1546		/*
1547		 * Unnamed anonymous regions always start at 0.
1548		 */
1549		if (handle == 0)
1550			foff = 0;
1551	} else if (flags & MAP_PREFAULT_READ)
1552		docow = MAP_PREFAULT;
1553	else
1554		docow = MAP_PREFAULT_PARTIAL;
1555
1556	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
1557		docow |= MAP_COPY_ON_WRITE;
1558	if (flags & MAP_NOSYNC)
1559		docow |= MAP_DISABLE_SYNCER;
1560	if (flags & MAP_NOCORE)
1561		docow |= MAP_DISABLE_COREDUMP;
1562	/* Shared memory is also shared with children. */
1563	if (flags & MAP_SHARED)
1564		docow |= MAP_INHERIT_SHARE;
1565	if (writecounted)
1566		docow |= MAP_VN_WRITECOUNT;
1567	if (flags & MAP_STACK) {
1568		if (object != NULL)
1569			return (EINVAL);
1570		docow |= MAP_STACK_GROWS_DOWN;
1571	}
1572	if ((flags & MAP_EXCL) != 0)
1573		docow |= MAP_CHECK_EXCL;
1574
1575	if (fitit) {
1576		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
1577			findspace = VMFS_SUPER_SPACE;
1578		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
1579			findspace = VMFS_ALIGNED_SPACE(flags >>
1580			    MAP_ALIGNMENT_SHIFT);
1581		else
1582			findspace = VMFS_OPTIMAL_SPACE;
1583		rv = vm_map_find(map, object, foff, addr, size,
1584#ifdef MAP_32BIT
1585		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
1586#endif
1587		    0, findspace, prot, maxprot, docow);
1588	} else {
1589		rv = vm_map_fixed(map, object, foff, *addr, size,
1590		    prot, maxprot, docow);
1591	}
1592
1593	if (rv == KERN_SUCCESS) {
1594		/*
1595		 * If the process has requested that all future mappings
1596		 * be wired, then heed this.
1597		 */
1598		if (map->flags & MAP_WIREFUTURE) {
1599			vm_map_wire(map, *addr, *addr + size,
1600			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
1601			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
1602		}
1603	} else {
1604		/*
1605		 * If this mapping was accounted for in the vnode's
1606		 * writecount, then undo that now.
1607		 */
1608		if (writecounted)
1609			vnode_pager_release_writecount(object, 0, size);
1610		/*
1611		 * Lose the object reference.  Will destroy the
1612		 * object if it's an unnamed anonymous mapping
1613		 * or named anonymous without other references.
1614		 */
1615		vm_object_deallocate(object);
1616	}
1617	return (vm_mmap_to_errno(rv));
1618}
1619
1620/*
1621 * Translate a Mach VM return code to zero on success or the appropriate errno
1622 * on failure.
1623 */
1624int
1625vm_mmap_to_errno(int rv)
1626{
1627
1628	switch (rv) {
1629	case KERN_SUCCESS:
1630		return (0);
1631	case KERN_INVALID_ADDRESS:
1632	case KERN_NO_SPACE:
1633		return (ENOMEM);
1634	case KERN_PROTECTION_FAILURE:
1635		return (EACCES);
1636	default:
1637		return (EINVAL);
1638	}
1639}
1640