vm_mmap.c revision 321717
/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 321717 2017-07-30 10:36:20Z kib $");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
TUNABLE_INT("vm.old_mlock", &old_mlock);

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

int
sys_sbrk(struct thread *td, struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

int
sys_sstk(struct thread *td, struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}
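
/*
 * The old_mlock knob declared above is both a loader tunable (CTLFLAG_TUN
 * plus the matching TUNABLE_INT) and a read-write sysctl, so it can be set
 * at boot or changed at run time.  A minimal sketch of both forms:
 *
 *	# /boot/loader.conf
 *	vm.old_mlock="1"
 *
 *	# at run time
 *	sysctl vm.old_mlock=1
 *
 * A non-zero value restores the historical behavior of not applying
 * RLIMIT_MEMLOCK in mlockall(2), as the description string says.
 */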

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(struct thread *td, struct getpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif				/* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
sys_mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int align, error, flags;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as filedescriptor and
	 * zero position for new code. Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);
	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || uap->fd != -1 ||
	    pos != 0 || (flags & (MAP_SHARED | MAP_PRIVATE | MAP_PREFAULT |
	    MAP_PREFAULT_READ | MAP_ANON | MAP_STACK)) != 0))
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if ((flags & MAP_GUARD) != 0) {
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_NONE;
		cap_maxprot = VM_PROT_NONE;
	} else if ((flags & MAP_ANON) != 0) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block. Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set(&rights, CAP_MMAP_X);
		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination? What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__) || defined(__ia64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif				/* COMPAT_43 */
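
/*
 * As the comment above sys_mmap() notes, a non-page-aligned file offset is
 * legal for a non-fixed mapping: the mapping itself starts at the truncated
 * page boundary and the returned pointer is bumped up by the page offset.
 * A minimal userland sketch (the file name, length and offset are
 * hypothetical):
 *
 *	int fd = open("/tmp/example", O_RDONLY);
 *	off_t off = 100;			// not page aligned
 *	char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, off);
 *	// *p is byte 100 of the file; the underlying mapping begins at
 *	// p - (off % getpagesize()).
 *
 * With MAP_FIXED, the address and the offset must share the same remainder
 * modulo the page size, otherwise sys_mmap() returns EINVAL.
 */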

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
int
sys_msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
sys_munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
				entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}
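
/*
 * sys_munmap() applies the same trunc/round treatment to its arguments that
 * sys_mmap() applies to a request, so unmapping with the original pointer
 * and length releases every page the mapping touched.  Sketch (len is
 * hypothetical):
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	...
 *	if (munmap(p, len) == -1)
 *		warn("munmap");
 *
 * A zero length, a wrapping range, or a range outside the user map is
 * rejected with EINVAL, as checked above.
 */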

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
sys_mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}
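
/*
 * For instance, a process that wants a region to not appear in its children
 * after fork(2) can change the inheritance attribute handled by
 * sys_minherit() above.  Sketch (buf and len are hypothetical):
 *
 *	if (minherit(buf, len, INHERIT_NONE) == -1)
 *		warn("minherit");
 *
 * The attribute is applied to the page-rounded range, the same treatment
 * mprotect(2) gives its arguments.
 */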

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{
	vm_offset_t start, end;
	vm_map_t map;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}
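
/*
 * The MADV_PROTECT special case above is handled before the address checks
 * and is routed to kern_procctl(PROC_SPROTECT), marking the process as not
 * killable when swap space is exhausted; it is a privileged request.
 * Ordinary advice applies to a page-rounded range.  Sketch (buf and len are
 * hypothetical):
 *
 *	madvise(NULL, 0, MADV_PROTECT);		// address range is ignored
 *	madvise(buf, len, MADV_DONTNEED);	// regular range advice
 *
 * Any behav value outside 0..MADV_CORE fails with EINVAL.
 */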

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current processes address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		    current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYWLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_WLOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m == NULL &&
					    vm_page_is_cached(object, pindex))
						mincoreinfo = MINCORE_INCORE;
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced(). 
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
}

int
vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
	if (error)
		return (error);
	addr = (vm_offset_t)addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
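
/*
 * For example, wiring a buffer that must never touch swap (key material,
 * say) is a matched pair of calls from userland.  Sketch (buf and len are
 * hypothetical):
 *
 *	if (mlock(buf, len) == -1)
 *		err(1, "mlock");
 *	...use the buffer...
 *	munlock(buf, len);
 *
 * vm_mlock() above fails with ENOMEM when the request would exceed
 * RLIMIT_MEMLOCK or vm_page_max_wired, and with EAGAIN when the system-wide
 * wired page count is already at the limit.
 */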

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int	how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall(). vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

int
sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
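
/*
 * A latency-sensitive process usually wires present and future mappings in
 * one call.  Sketch:
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *		err(1, "mlockall");
 *
 * MCL_FUTURE sets MAP_WIREFUTURE on the map, which vm_mmap() below honors by
 * wiring each new mapping; MCL_CURRENT wires everything already mapped,
 * subject to RLIMIT_MEMLOCK unless vm.old_mlock is set.
 */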

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct ucred *cred;
	int error, flags, locktype;

	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE)
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
	else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		vm_object_reference(obj);
	}
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vnode_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev, ref);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*flagsp & MAP_SHARED) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}
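
/*
 * The shm descriptors handled by vm_mmap_shm() above are what userland gets
 * from shm_open(2); mapping one is the usual combination.  Sketch (the
 * object name and size are hypothetical):
 *
 *	int fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, 65536);
 *	void *p = mmap(NULL, 65536, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *
 * sys_mmap() recognizes such a descriptor as DTYPE_SHM and passes it down
 * with handle_type OBJT_SWAP, so vm_mmap() dispatches to this helper.
 */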

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t curmap, fitit;
	vm_offset_t max_addr;
	vm_object_t object = NULL;
	struct thread *td = curthread;
	int docow, error, findspace, rv;
	boolean_t writecounted;

	if (size == 0)
		return (0);

	size = round_page(size);

	curmap = map == &td->td_proc->p_vmspace->vm_map;
	if (curmap) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
#ifdef MAP_32BIT
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
#endif
		if (curmap) {
			vm_offset_t min_addr;

			PROC_LOCK(td->td_proc);
			min_addr = round_page((vm_offset_t)td->td_proc->
			    p_vmspace->vm_daddr + lim_max(td->td_proc,
			    RLIMIT_DATA));
			PROC_UNLOCK(td->td_proc);
			rv = vm_map_find_min(map, object, foff, addr, size,
			    min_addr, max_addr,
			    findspace, prot, maxprot, docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	} else {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}