/* vm_mmap.c revision 266492 */
1279377Simp/*- 2279377Simp * Copyright (c) 1988 University of Utah. 3279377Simp * Copyright (c) 1991, 1993 4279377Simp * The Regents of the University of California. All rights reserved. 5279377Simp * 6279377Simp * This code is derived from software contributed to Berkeley by 7279377Simp * the Systems Programming Group of the University of Utah Computer 8279377Simp * Science Department. 9279377Simp * 10279377Simp * Redistribution and use in source and binary forms, with or without 11279377Simp * modification, are permitted provided that the following conditions 12279377Simp * are met: 13279377Simp * 1. Redistributions of source code must retain the above copyright 14279377Simp * notice, this list of conditions and the following disclaimer. 15279377Simp * 2. Redistributions in binary form must reproduce the above copyright 16279377Simp * notice, this list of conditions and the following disclaimer in the 17279377Simp * documentation and/or other materials provided with the distribution. 18279377Simp * 4. Neither the name of the University nor the names of its contributors 19279377Simp * may be used to endorse or promote products derived from this software 20279377Simp * without specific prior written permission. 21279377Simp * 22279377Simp * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23279377Simp * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24279377Simp * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25279377Simp * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26279377Simp * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27279377Simp * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28279377Simp * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29279377Simp * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30279377Simp * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31279377Simp * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32279377Simp * SUCH DAMAGE. 33279377Simp * 34279377Simp * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 35279377Simp * 36279377Simp * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 37279377Simp */ 38279377Simp 39279377Simp/* 40279377Simp * Mapped file (mmap) interface to VM 41279377Simp */ 42279377Simp 43279377Simp#include <sys/cdefs.h> 44279377Simp__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 266492 2014-05-21 09:19:05Z pho $"); 45279377Simp 46279377Simp#include "opt_compat.h" 47279377Simp#include "opt_hwpmc_hooks.h" 48279377Simp 49279377Simp#include <sys/param.h> 50279377Simp#include <sys/systm.h> 51279377Simp#include <sys/capability.h> 52279377Simp#include <sys/kernel.h> 53279377Simp#include <sys/lock.h> 54279377Simp#include <sys/mutex.h> 55279377Simp#include <sys/sysproto.h> 56279377Simp#include <sys/filedesc.h> 57279377Simp#include <sys/priv.h> 58279377Simp#include <sys/proc.h> 59279377Simp#include <sys/procctl.h> 60279377Simp#include <sys/racct.h> 61279377Simp#include <sys/resource.h> 62279377Simp#include <sys/resourcevar.h> 63279377Simp#include <sys/rwlock.h> 64279377Simp#include <sys/sysctl.h> 65279377Simp#include <sys/vnode.h> 66279377Simp#include <sys/fcntl.h> 67279377Simp#include <sys/file.h> 68279377Simp#include <sys/mman.h> 69279377Simp#include <sys/mount.h> 70279377Simp#include <sys/conf.h> 71279377Simp#include <sys/stat.h> 72279377Simp#include <sys/syscallsubr.h> 73279377Simp#include 
<sys/sysent.h> 74279377Simp#include <sys/vmmeter.h> 75279377Simp 76279377Simp#include <security/mac/mac_framework.h> 77279377Simp 78279377Simp#include <vm/vm.h> 79279377Simp#include <vm/vm_param.h> 80279377Simp#include <vm/pmap.h> 81279377Simp#include <vm/vm_map.h> 82279377Simp#include <vm/vm_object.h> 83279377Simp#include <vm/vm_page.h> 84279377Simp#include <vm/vm_pager.h> 85279377Simp#include <vm/vm_pageout.h> 86279377Simp#include <vm/vm_extern.h> 87279377Simp#include <vm/vm_page.h> 88279377Simp#include <vm/vnode_pager.h> 89279377Simp 90279377Simp#ifdef HWPMC_HOOKS 91279377Simp#include <sys/pmckern.h> 92279377Simp#endif 93279377Simp 94279377Simpint old_mlock = 0; 95279377SimpSYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0, 96279377Simp "Do not apply RLIMIT_MEMLOCK on mlockall"); 97279377SimpTUNABLE_INT("vm.old_mlock", &old_mlock); 98279377Simp 99279377Simp#ifdef MAP_32BIT 100279377Simp#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) 101279377Simp#endif 102279377Simp 103279377Simpstatic int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 104279377Simp int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *); 105279377Simpstatic int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 106279377Simp int *, struct cdev *, vm_ooffset_t *, vm_object_t *); 107279377Simpstatic int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 108279377Simp int *, struct shmfd *, vm_ooffset_t, vm_object_t *); 109279377Simp 110279377Simp#ifndef _SYS_SYSPROTO_H_ 111279377Simpstruct sbrk_args { 112279377Simp int incr; 113279377Simp}; 114279377Simp#endif 115279377Simp 116279377Simp/* 117279377Simp * MPSAFE 118279377Simp */ 119279377Simp/* ARGSUSED */ 120279377Simpint 121279377Simpsys_sbrk(td, uap) 122279377Simp struct thread *td; 123279377Simp struct sbrk_args *uap; 124279377Simp{ 125279377Simp /* Not yet implemented */ 126279377Simp return (EOPNOTSUPP); 127279377Simp} 128279377Simp 129279377Simp#ifndef 
_SYS_SYSPROTO_H_ 130279377Simpstruct sstk_args { 131279377Simp int incr; 132279377Simp}; 133279377Simp#endif 134279377Simp 135279377Simp/* 136279377Simp * MPSAFE 137279377Simp */ 138279377Simp/* ARGSUSED */ 139279377Simpint 140279377Simpsys_sstk(td, uap) 141279377Simp struct thread *td; 142279377Simp struct sstk_args *uap; 143279377Simp{ 144279377Simp /* Not yet implemented */ 145279377Simp return (EOPNOTSUPP); 146279377Simp} 147279377Simp 148279377Simp#if defined(COMPAT_43) 149279377Simp#ifndef _SYS_SYSPROTO_H_ 150279377Simpstruct getpagesize_args { 151279377Simp int dummy; 152279377Simp}; 153279377Simp#endif 154279377Simp 155279377Simpint 156279377Simpogetpagesize(td, uap) 157279377Simp struct thread *td; 158279377Simp struct getpagesize_args *uap; 159279377Simp{ 160279377Simp /* MP SAFE */ 161279377Simp td->td_retval[0] = PAGE_SIZE; 162279377Simp return (0); 163279377Simp} 164279377Simp#endif /* COMPAT_43 */ 165279377Simp 166279377Simp 167279377Simp/* 168279377Simp * Memory Map (mmap) system call. Note that the file offset 169279377Simp * and address are allowed to be NOT page aligned, though if 170279377Simp * the MAP_FIXED flag it set, both must have the same remainder 171279377Simp * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not 172279377Simp * page-aligned, the actual mapping starts at trunc_page(addr) 173279377Simp * and the return value is adjusted up by the page offset. 174279377Simp * 175279377Simp * Generally speaking, only character devices which are themselves 176279377Simp * memory-based, such as a video framebuffer, can be mmap'd. Otherwise 177279377Simp * there would be no cache coherency between a descriptor and a VM mapping 178279377Simp * both to the same character device. 
179279377Simp */ 180279377Simp#ifndef _SYS_SYSPROTO_H_ 181279377Simpstruct mmap_args { 182279377Simp void *addr; 183279377Simp size_t len; 184279377Simp int prot; 185279377Simp int flags; 186279377Simp int fd; 187279377Simp long pad; 188279377Simp off_t pos; 189279377Simp}; 190279377Simp#endif 191279377Simp 192279377Simp/* 193279377Simp * MPSAFE 194279377Simp */ 195279377Simpint 196279377Simpsys_mmap(td, uap) 197279377Simp struct thread *td; 198279377Simp struct mmap_args *uap; 199279377Simp{ 200279377Simp#ifdef HWPMC_HOOKS 201279377Simp struct pmckern_map_in pkm; 202279377Simp#endif 203279377Simp struct file *fp; 204279377Simp struct vnode *vp; 205279377Simp vm_offset_t addr; 206279377Simp vm_size_t size, pageoff; 207279377Simp vm_prot_t cap_maxprot, prot, maxprot; 208279377Simp void *handle; 209279377Simp objtype_t handle_type; 210279377Simp int align, error, flags; 211279377Simp off_t pos; 212279377Simp struct vmspace *vms = td->td_proc->p_vmspace; 213279377Simp cap_rights_t rights; 214279377Simp 215279377Simp addr = (vm_offset_t) uap->addr; 216279377Simp size = uap->len; 217279377Simp prot = uap->prot & VM_PROT_ALL; 218279377Simp flags = uap->flags; 219279377Simp pos = uap->pos; 220279377Simp 221279377Simp fp = NULL; 222279377Simp 223279377Simp /* 224279377Simp * Enforce the constraints. 225279377Simp * Mapping of length 0 is only allowed for old binaries. 226279377Simp * Anonymous mapping shall specify -1 as filedescriptor and 227279377Simp * zero position for new code. Be nice to ancient a.out 228279377Simp * binaries and correct pos for anonymous mapping, since old 229279377Simp * ld.so sometimes issues anonymous map requests with non-zero 230279377Simp * pos. 
231279377Simp */ 232279377Simp if (!SV_CURPROC_FLAG(SV_AOUT)) { 233279377Simp if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) || 234279377Simp ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0))) 235279377Simp return (EINVAL); 236279377Simp } else { 237279377Simp if ((flags & MAP_ANON) != 0) 238279377Simp pos = 0; 239279377Simp } 240279377Simp 241279377Simp if (flags & MAP_STACK) { 242279377Simp if ((uap->fd != -1) || 243279377Simp ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) 244279377Simp return (EINVAL); 245279377Simp flags |= MAP_ANON; 246279377Simp pos = 0; 247279377Simp } 248279377Simp 249279377Simp /* 250279377Simp * Align the file position to a page boundary, 251279377Simp * and save its page offset component. 252279377Simp */ 253279377Simp pageoff = (pos & PAGE_MASK); 254279377Simp pos -= pageoff; 255279377Simp 256279377Simp /* Adjust size for rounding (on both ends). */ 257279377Simp size += pageoff; /* low end... */ 258279377Simp size = (vm_size_t) round_page(size); /* hi end */ 259279377Simp 260279377Simp /* Ensure alignment is at least a page and fits in a pointer. */ 261279377Simp align = flags & MAP_ALIGNMENT_MASK; 262279377Simp if (align != 0 && align != MAP_ALIGNED_SUPER && 263279377Simp (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY || 264279377Simp align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) 265279377Simp return (EINVAL); 266279377Simp 267279377Simp /* 268279377Simp * Check for illegal addresses. Watch out for address wrap... Note 269279377Simp * that VM_*_ADDRESS are not constants due to casts (argh). 270279377Simp */ 271279377Simp if (flags & MAP_FIXED) { 272279377Simp /* 273279377Simp * The specified address must have the same remainder 274279377Simp * as the file offset taken modulo PAGE_SIZE, so it 275279377Simp * should be aligned after adjustment by pageoff. 
276279377Simp */ 277279377Simp addr -= pageoff; 278279377Simp if (addr & PAGE_MASK) 279279377Simp return (EINVAL); 280279377Simp 281279377Simp /* Address range must be all in user VM space. */ 282279377Simp if (addr < vm_map_min(&vms->vm_map) || 283279377Simp addr + size > vm_map_max(&vms->vm_map)) 284279377Simp return (EINVAL); 285279377Simp if (addr + size < addr) 286279377Simp return (EINVAL); 287279377Simp#ifdef MAP_32BIT 288279377Simp if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) 289279377Simp return (EINVAL); 290279377Simp } else if (flags & MAP_32BIT) { 291279377Simp /* 292279377Simp * For MAP_32BIT, override the hint if it is too high and 293279377Simp * do not bother moving the mapping past the heap (since 294279377Simp * the heap is usually above 2GB). 295279377Simp */ 296279377Simp if (addr + size > MAP_32BIT_MAX_ADDR) 297279377Simp addr = 0; 298279377Simp#endif 299279377Simp } else { 300279377Simp /* 301279377Simp * XXX for non-fixed mappings where no hint is provided or 302279377Simp * the hint would fall in the potential heap space, 303279377Simp * place it after the end of the largest possible heap. 304279377Simp * 305279377Simp * There should really be a pmap call to determine a reasonable 306279377Simp * location. 307279377Simp */ 308279377Simp PROC_LOCK(td->td_proc); 309279377Simp if (addr == 0 || 310279377Simp (addr >= round_page((vm_offset_t)vms->vm_taddr) && 311279377Simp addr < round_page((vm_offset_t)vms->vm_daddr + 312279377Simp lim_max(td->td_proc, RLIMIT_DATA)))) 313279377Simp addr = round_page((vm_offset_t)vms->vm_daddr + 314279377Simp lim_max(td->td_proc, RLIMIT_DATA)); 315279377Simp PROC_UNLOCK(td->td_proc); 316279377Simp } 317279377Simp if (flags & MAP_ANON) { 318279377Simp /* 319279377Simp * Mapping blank space is trivial. 
320279377Simp */ 321279377Simp handle = NULL; 322279377Simp handle_type = OBJT_DEFAULT; 323279377Simp maxprot = VM_PROT_ALL; 324279377Simp cap_maxprot = VM_PROT_ALL; 325279377Simp } else { 326279377Simp /* 327279377Simp * Mapping file, get fp for validation and don't let the 328279377Simp * descriptor disappear on us if we block. Check capability 329279377Simp * rights, but also return the maximum rights to be combined 330279377Simp * with maxprot later. 331279377Simp */ 332279377Simp cap_rights_init(&rights, CAP_MMAP); 333279377Simp if (prot & PROT_READ) 334279377Simp cap_rights_set(&rights, CAP_MMAP_R); 335279377Simp if ((flags & MAP_SHARED) != 0) { 336279377Simp if (prot & PROT_WRITE) 337279377Simp cap_rights_set(&rights, CAP_MMAP_W); 338279377Simp } 339279377Simp if (prot & PROT_EXEC) 340279377Simp cap_rights_set(&rights, CAP_MMAP_X); 341279377Simp error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp); 342279377Simp if (error != 0) 343279377Simp goto done; 344279377Simp if (fp->f_type == DTYPE_SHM) { 345279377Simp handle = fp->f_data; 346279377Simp handle_type = OBJT_SWAP; 347279377Simp maxprot = VM_PROT_NONE; 348279377Simp 349279377Simp /* FREAD should always be set. */ 350279377Simp if (fp->f_flag & FREAD) 351279377Simp maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; 352279377Simp if (fp->f_flag & FWRITE) 353279377Simp maxprot |= VM_PROT_WRITE; 354279377Simp goto map; 355279377Simp } 356279377Simp if (fp->f_type != DTYPE_VNODE) { 357279377Simp error = ENODEV; 358279377Simp goto done; 359279377Simp } 360279377Simp#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ 361279377Simp defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) 362279377Simp /* 363279377Simp * POSIX shared-memory objects are defined to have 364279377Simp * kernel persistence, and are not defined to support 365279377Simp * read(2)/write(2) -- or even open(2). Thus, we can 366279377Simp * use MAP_ASYNC to trade on-disk coherence for speed. 
367279377Simp * The shm_open(3) library routine turns on the FPOSIXSHM 368279377Simp * flag to request this behavior. 369279377Simp */ 370279377Simp if (fp->f_flag & FPOSIXSHM) 371279377Simp flags |= MAP_NOSYNC; 372279377Simp#endif 373279377Simp vp = fp->f_vnode; 374279377Simp /* 375279377Simp * Ensure that file and memory protections are 376279377Simp * compatible. Note that we only worry about 377279377Simp * writability if mapping is shared; in this case, 378279377Simp * current and max prot are dictated by the open file. 379279377Simp * XXX use the vnode instead? Problem is: what 380279377Simp * credentials do we use for determination? What if 381279377Simp * proc does a setuid? 382279377Simp */ 383279377Simp if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC) 384279377Simp maxprot = VM_PROT_NONE; 385279377Simp else 386279377Simp maxprot = VM_PROT_EXECUTE; 387279377Simp if (fp->f_flag & FREAD) { 388279377Simp maxprot |= VM_PROT_READ; 389279377Simp } else if (prot & PROT_READ) { 390279377Simp error = EACCES; 391279377Simp goto done; 392279377Simp } 393279377Simp /* 394279377Simp * If we are sharing potential changes (either via 395279377Simp * MAP_SHARED or via the implicit sharing of character 396279377Simp * device mappings), and we are trying to get write 397279377Simp * permission although we opened it without asking 398279377Simp * for it, bail out. 
399279377Simp */ 400279377Simp if ((flags & MAP_SHARED) != 0) { 401279377Simp if ((fp->f_flag & FWRITE) != 0) { 402279377Simp maxprot |= VM_PROT_WRITE; 403279377Simp } else if ((prot & PROT_WRITE) != 0) { 404279377Simp error = EACCES; 405279377Simp goto done; 406279377Simp } 407279377Simp } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { 408279377Simp maxprot |= VM_PROT_WRITE; 409279377Simp cap_maxprot |= VM_PROT_WRITE; 410279377Simp } 411279377Simp handle = (void *)vp; 412279377Simp handle_type = OBJT_VNODE; 413279377Simp } 414279377Simpmap: 415279377Simp td->td_fpop = fp; 416279377Simp maxprot &= cap_maxprot; 417279377Simp error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, 418279377Simp flags, handle_type, handle, pos); 419279377Simp td->td_fpop = NULL; 420279377Simp#ifdef HWPMC_HOOKS 421279377Simp /* inform hwpmc(4) if an executable is being mapped */ 422279377Simp if (error == 0 && handle_type == OBJT_VNODE && 423279377Simp (prot & PROT_EXEC)) { 424279377Simp pkm.pm_file = handle; 425279377Simp pkm.pm_address = (uintptr_t) addr; 426279377Simp PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); 427279377Simp } 428279377Simp#endif 429279377Simp if (error == 0) 430279377Simp td->td_retval[0] = (register_t) (addr + pageoff); 431279377Simpdone: 432279377Simp if (fp) 433279377Simp fdrop(fp, td); 434279377Simp 435279377Simp return (error); 436279377Simp} 437279377Simp 438279377Simpint 439279377Simpfreebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) 440279377Simp{ 441279377Simp struct mmap_args oargs; 442279377Simp 443279377Simp oargs.addr = uap->addr; 444279377Simp oargs.len = uap->len; 445279377Simp oargs.prot = uap->prot; 446279377Simp oargs.flags = uap->flags; 447279377Simp oargs.fd = uap->fd; 448279377Simp oargs.pos = uap->pos; 449279377Simp return (sys_mmap(td, &oargs)); 450279377Simp} 451279377Simp 452279377Simp#ifdef COMPAT_43 453279377Simp#ifndef _SYS_SYSPROTO_H_ 454279377Simpstruct ommap_args { 455279377Simp caddr_t addr; 
456279377Simp int len; 457279377Simp int prot; 458279377Simp int flags; 459279377Simp int fd; 460279377Simp long pos; 461279377Simp}; 462279377Simp#endif 463279377Simpint 464279377Simpommap(td, uap) 465279377Simp struct thread *td; 466279377Simp struct ommap_args *uap; 467279377Simp{ 468279377Simp struct mmap_args nargs; 469279377Simp static const char cvtbsdprot[8] = { 470279377Simp 0, 471279377Simp PROT_EXEC, 472279377Simp PROT_WRITE, 473279377Simp PROT_EXEC | PROT_WRITE, 474279377Simp PROT_READ, 475279377Simp PROT_EXEC | PROT_READ, 476279377Simp PROT_WRITE | PROT_READ, 477279377Simp PROT_EXEC | PROT_WRITE | PROT_READ, 478279377Simp }; 479279377Simp 480279377Simp#define OMAP_ANON 0x0002 481279377Simp#define OMAP_COPY 0x0020 482279377Simp#define OMAP_SHARED 0x0010 483279377Simp#define OMAP_FIXED 0x0100 484279377Simp 485279377Simp nargs.addr = uap->addr; 486279377Simp nargs.len = uap->len; 487279377Simp nargs.prot = cvtbsdprot[uap->prot & 0x7]; 488279377Simp#ifdef COMPAT_FREEBSD32 489279377Simp#if defined(__amd64__) || defined(__ia64__) 490279377Simp if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && 491279377Simp nargs.prot != 0) 492279377Simp nargs.prot |= PROT_EXEC; 493279377Simp#endif 494279377Simp#endif 495279377Simp nargs.flags = 0; 496279377Simp if (uap->flags & OMAP_ANON) 497279377Simp nargs.flags |= MAP_ANON; 498279377Simp if (uap->flags & OMAP_COPY) 499279377Simp nargs.flags |= MAP_COPY; 500279377Simp if (uap->flags & OMAP_SHARED) 501279377Simp nargs.flags |= MAP_SHARED; 502279377Simp else 503279377Simp nargs.flags |= MAP_PRIVATE; 504279377Simp if (uap->flags & OMAP_FIXED) 505279377Simp nargs.flags |= MAP_FIXED; 506279377Simp nargs.fd = uap->fd; 507279377Simp nargs.pos = uap->pos; 508279377Simp return (sys_mmap(td, &nargs)); 509279377Simp} 510279377Simp#endif /* COMPAT_43 */ 511279377Simp 512279377Simp 513279377Simp#ifndef _SYS_SYSPROTO_H_ 514279377Simpstruct msync_args { 515279377Simp void *addr; 516279377Simp size_t len; 517279377Simp int 
flags; 518279377Simp}; 519279377Simp#endif 520279377Simp/* 521279377Simp * MPSAFE 522279377Simp */ 523279377Simpint 524279377Simpsys_msync(td, uap) 525279377Simp struct thread *td; 526279377Simp struct msync_args *uap; 527279377Simp{ 528279377Simp vm_offset_t addr; 529279377Simp vm_size_t size, pageoff; 530279377Simp int flags; 531279377Simp vm_map_t map; 532279377Simp int rv; 533279377Simp 534279377Simp addr = (vm_offset_t) uap->addr; 535279377Simp size = uap->len; 536279377Simp flags = uap->flags; 537279377Simp 538279377Simp pageoff = (addr & PAGE_MASK); 539279377Simp addr -= pageoff; 540279377Simp size += pageoff; 541279377Simp size = (vm_size_t) round_page(size); 542279377Simp if (addr + size < addr) 543279377Simp return (EINVAL); 544279377Simp 545279377Simp if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 546279377Simp return (EINVAL); 547279377Simp 548279377Simp map = &td->td_proc->p_vmspace->vm_map; 549279377Simp 550279377Simp /* 551279377Simp * Clean the pages and interpret the return value. 
552279377Simp */ 553279377Simp rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, 554279377Simp (flags & MS_INVALIDATE) != 0); 555279377Simp switch (rv) { 556279377Simp case KERN_SUCCESS: 557279377Simp return (0); 558279377Simp case KERN_INVALID_ADDRESS: 559279377Simp return (ENOMEM); 560279377Simp case KERN_INVALID_ARGUMENT: 561279377Simp return (EBUSY); 562279377Simp case KERN_FAILURE: 563279377Simp return (EIO); 564279377Simp default: 565279377Simp return (EINVAL); 566279377Simp } 567279377Simp} 568279377Simp 569279377Simp#ifndef _SYS_SYSPROTO_H_ 570279377Simpstruct munmap_args { 571279377Simp void *addr; 572279377Simp size_t len; 573279377Simp}; 574279377Simp#endif 575279377Simp/* 576 * MPSAFE 577 */ 578int 579sys_munmap(td, uap) 580 struct thread *td; 581 struct munmap_args *uap; 582{ 583#ifdef HWPMC_HOOKS 584 struct pmckern_map_out pkm; 585 vm_map_entry_t entry; 586#endif 587 vm_offset_t addr; 588 vm_size_t size, pageoff; 589 vm_map_t map; 590 591 addr = (vm_offset_t) uap->addr; 592 size = uap->len; 593 if (size == 0) 594 return (EINVAL); 595 596 pageoff = (addr & PAGE_MASK); 597 addr -= pageoff; 598 size += pageoff; 599 size = (vm_size_t) round_page(size); 600 if (addr + size < addr) 601 return (EINVAL); 602 603 /* 604 * Check for illegal addresses. Watch out for address wrap... 605 */ 606 map = &td->td_proc->p_vmspace->vm_map; 607 if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) 608 return (EINVAL); 609 vm_map_lock(map); 610#ifdef HWPMC_HOOKS 611 /* 612 * Inform hwpmc if the address range being unmapped contains 613 * an executable region. 
614 */ 615 pkm.pm_address = (uintptr_t) NULL; 616 if (vm_map_lookup_entry(map, addr, &entry)) { 617 for (; 618 entry != &map->header && entry->start < addr + size; 619 entry = entry->next) { 620 if (vm_map_check_protection(map, entry->start, 621 entry->end, VM_PROT_EXECUTE) == TRUE) { 622 pkm.pm_address = (uintptr_t) addr; 623 pkm.pm_size = (size_t) size; 624 break; 625 } 626 } 627 } 628#endif 629 vm_map_delete(map, addr, addr + size); 630 631#ifdef HWPMC_HOOKS 632 /* downgrade the lock to prevent a LOR with the pmc-sx lock */ 633 vm_map_lock_downgrade(map); 634 if (pkm.pm_address != (uintptr_t) NULL) 635 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); 636 vm_map_unlock_read(map); 637#else 638 vm_map_unlock(map); 639#endif 640 /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ 641 return (0); 642} 643 644#ifndef _SYS_SYSPROTO_H_ 645struct mprotect_args { 646 const void *addr; 647 size_t len; 648 int prot; 649}; 650#endif 651/* 652 * MPSAFE 653 */ 654int 655sys_mprotect(td, uap) 656 struct thread *td; 657 struct mprotect_args *uap; 658{ 659 vm_offset_t addr; 660 vm_size_t size, pageoff; 661 vm_prot_t prot; 662 663 addr = (vm_offset_t) uap->addr; 664 size = uap->len; 665 prot = uap->prot & VM_PROT_ALL; 666 667 pageoff = (addr & PAGE_MASK); 668 addr -= pageoff; 669 size += pageoff; 670 size = (vm_size_t) round_page(size); 671 if (addr + size < addr) 672 return (EINVAL); 673 674 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, 675 addr + size, prot, FALSE)) { 676 case KERN_SUCCESS: 677 return (0); 678 case KERN_PROTECTION_FAILURE: 679 return (EACCES); 680 case KERN_RESOURCE_SHORTAGE: 681 return (ENOMEM); 682 } 683 return (EINVAL); 684} 685 686#ifndef _SYS_SYSPROTO_H_ 687struct minherit_args { 688 void *addr; 689 size_t len; 690 int inherit; 691}; 692#endif 693/* 694 * MPSAFE 695 */ 696int 697sys_minherit(td, uap) 698 struct thread *td; 699 struct minherit_args *uap; 700{ 701 vm_offset_t addr; 702 vm_size_t size, pageoff; 703 vm_inherit_t inherit; 
704 705 addr = (vm_offset_t)uap->addr; 706 size = uap->len; 707 inherit = uap->inherit; 708 709 pageoff = (addr & PAGE_MASK); 710 addr -= pageoff; 711 size += pageoff; 712 size = (vm_size_t) round_page(size); 713 if (addr + size < addr) 714 return (EINVAL); 715 716 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, 717 addr + size, inherit)) { 718 case KERN_SUCCESS: 719 return (0); 720 case KERN_PROTECTION_FAILURE: 721 return (EACCES); 722 } 723 return (EINVAL); 724} 725 726#ifndef _SYS_SYSPROTO_H_ 727struct madvise_args { 728 void *addr; 729 size_t len; 730 int behav; 731}; 732#endif 733 734/* 735 * MPSAFE 736 */ 737int 738sys_madvise(td, uap) 739 struct thread *td; 740 struct madvise_args *uap; 741{ 742 vm_offset_t start, end; 743 vm_map_t map; 744 int flags; 745 746 /* 747 * Check for our special case, advising the swap pager we are 748 * "immortal." 749 */ 750 if (uap->behav == MADV_PROTECT) { 751 flags = PPROT_SET; 752 return (kern_procctl(td, P_PID, td->td_proc->p_pid, 753 PROC_SPROTECT, &flags)); 754 } 755 756 /* 757 * Check for illegal behavior 758 */ 759 if (uap->behav < 0 || uap->behav > MADV_CORE) 760 return (EINVAL); 761 /* 762 * Check for illegal addresses. Watch out for address wrap... Note 763 * that VM_*_ADDRESS are not constants due to casts (argh). 764 */ 765 map = &td->td_proc->p_vmspace->vm_map; 766 if ((vm_offset_t)uap->addr < vm_map_min(map) || 767 (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) 768 return (EINVAL); 769 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) 770 return (EINVAL); 771 772 /* 773 * Since this routine is only advisory, we default to conservative 774 * behavior. 
775 */ 776 start = trunc_page((vm_offset_t) uap->addr); 777 end = round_page((vm_offset_t) uap->addr + uap->len); 778 779 if (vm_map_madvise(map, start, end, uap->behav)) 780 return (EINVAL); 781 return (0); 782} 783 784#ifndef _SYS_SYSPROTO_H_ 785struct mincore_args { 786 const void *addr; 787 size_t len; 788 char *vec; 789}; 790#endif 791 792/* 793 * MPSAFE 794 */ 795int 796sys_mincore(td, uap) 797 struct thread *td; 798 struct mincore_args *uap; 799{ 800 vm_offset_t addr, first_addr; 801 vm_offset_t end, cend; 802 pmap_t pmap; 803 vm_map_t map; 804 char *vec; 805 int error = 0; 806 int vecindex, lastvecindex; 807 vm_map_entry_t current; 808 vm_map_entry_t entry; 809 vm_object_t object; 810 vm_paddr_t locked_pa; 811 vm_page_t m; 812 vm_pindex_t pindex; 813 int mincoreinfo; 814 unsigned int timestamp; 815 boolean_t locked; 816 817 /* 818 * Make sure that the addresses presented are valid for user 819 * mode. 820 */ 821 first_addr = addr = trunc_page((vm_offset_t) uap->addr); 822 end = addr + (vm_size_t)round_page(uap->len); 823 map = &td->td_proc->p_vmspace->vm_map; 824 if (end > vm_map_max(map) || end < addr) 825 return (ENOMEM); 826 827 /* 828 * Address of byte vector 829 */ 830 vec = uap->vec; 831 832 pmap = vmspace_pmap(td->td_proc->p_vmspace); 833 834 vm_map_lock_read(map); 835RestartScan: 836 timestamp = map->timestamp; 837 838 if (!vm_map_lookup_entry(map, addr, &entry)) { 839 vm_map_unlock_read(map); 840 return (ENOMEM); 841 } 842 843 /* 844 * Do this on a map entry basis so that if the pages are not 845 * in the current processes address space, we can easily look 846 * up the pages elsewhere. 
847 */ 848 lastvecindex = -1; 849 for (current = entry; 850 (current != &map->header) && (current->start < end); 851 current = current->next) { 852 853 /* 854 * check for contiguity 855 */ 856 if (current->end < end && 857 (entry->next == &map->header || 858 current->next->start > current->end)) { 859 vm_map_unlock_read(map); 860 return (ENOMEM); 861 } 862 863 /* 864 * ignore submaps (for now) or null objects 865 */ 866 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 867 current->object.vm_object == NULL) 868 continue; 869 870 /* 871 * limit this scan to the current map entry and the 872 * limits for the mincore call 873 */ 874 if (addr < current->start) 875 addr = current->start; 876 cend = current->end; 877 if (cend > end) 878 cend = end; 879 880 /* 881 * scan this entry one page at a time 882 */ 883 while (addr < cend) { 884 /* 885 * Check pmap first, it is likely faster, also 886 * it can provide info as to whether we are the 887 * one referencing or modifying the page. 888 */ 889 object = NULL; 890 locked_pa = 0; 891 retry: 892 m = NULL; 893 mincoreinfo = pmap_mincore(pmap, addr, &locked_pa); 894 if (locked_pa != 0) { 895 /* 896 * The page is mapped by this process but not 897 * both accessed and modified. It is also 898 * managed. Acquire the object lock so that 899 * other mappings might be examined. 900 */ 901 m = PHYS_TO_VM_PAGE(locked_pa); 902 if (m->object != object) { 903 if (object != NULL) 904 VM_OBJECT_WUNLOCK(object); 905 object = m->object; 906 locked = VM_OBJECT_TRYWLOCK(object); 907 vm_page_unlock(m); 908 if (!locked) { 909 VM_OBJECT_WLOCK(object); 910 vm_page_lock(m); 911 goto retry; 912 } 913 } else 914 vm_page_unlock(m); 915 KASSERT(m->valid == VM_PAGE_BITS_ALL, 916 ("mincore: page %p is mapped but invalid", 917 m)); 918 } else if (mincoreinfo == 0) { 919 /* 920 * The page is not mapped by this process. If 921 * the object implements managed pages, then 922 * determine if the page is resident so that 923 * the mappings might be examined. 
924 */ 925 if (current->object.vm_object != object) { 926 if (object != NULL) 927 VM_OBJECT_WUNLOCK(object); 928 object = current->object.vm_object; 929 VM_OBJECT_WLOCK(object); 930 } 931 if (object->type == OBJT_DEFAULT || 932 object->type == OBJT_SWAP || 933 object->type == OBJT_VNODE) { 934 pindex = OFF_TO_IDX(current->offset + 935 (addr - current->start)); 936 m = vm_page_lookup(object, pindex); 937 if (m == NULL && 938 vm_page_is_cached(object, pindex)) 939 mincoreinfo = MINCORE_INCORE; 940 if (m != NULL && m->valid == 0) 941 m = NULL; 942 if (m != NULL) 943 mincoreinfo = MINCORE_INCORE; 944 } 945 } 946 if (m != NULL) { 947 /* Examine other mappings to the page. */ 948 if (m->dirty == 0 && pmap_is_modified(m)) 949 vm_page_dirty(m); 950 if (m->dirty != 0) 951 mincoreinfo |= MINCORE_MODIFIED_OTHER; 952 /* 953 * The first test for PGA_REFERENCED is an 954 * optimization. The second test is 955 * required because a concurrent pmap 956 * operation could clear the last reference 957 * and set PGA_REFERENCED before the call to 958 * pmap_is_referenced(). 959 */ 960 if ((m->aflags & PGA_REFERENCED) != 0 || 961 pmap_is_referenced(m) || 962 (m->aflags & PGA_REFERENCED) != 0) 963 mincoreinfo |= MINCORE_REFERENCED_OTHER; 964 } 965 if (object != NULL) 966 VM_OBJECT_WUNLOCK(object); 967 968 /* 969 * subyte may page fault. In case it needs to modify 970 * the map, we release the lock. 971 */ 972 vm_map_unlock_read(map); 973 974 /* 975 * calculate index into user supplied byte vector 976 */ 977 vecindex = OFF_TO_IDX(addr - first_addr); 978 979 /* 980 * If we have skipped map entries, we need to make sure that 981 * the byte vector is zeroed for those skipped entries. 
982 */ 983 while ((lastvecindex + 1) < vecindex) { 984 ++lastvecindex; 985 error = subyte(vec + lastvecindex, 0); 986 if (error) { 987 error = EFAULT; 988 goto done2; 989 } 990 } 991 992 /* 993 * Pass the page information to the user 994 */ 995 error = subyte(vec + vecindex, mincoreinfo); 996 if (error) { 997 error = EFAULT; 998 goto done2; 999 } 1000 1001 /* 1002 * If the map has changed, due to the subyte, the previous 1003 * output may be invalid. 1004 */ 1005 vm_map_lock_read(map); 1006 if (timestamp != map->timestamp) 1007 goto RestartScan; 1008 1009 lastvecindex = vecindex; 1010 addr += PAGE_SIZE; 1011 } 1012 } 1013 1014 /* 1015 * subyte may page fault. In case it needs to modify 1016 * the map, we release the lock. 1017 */ 1018 vm_map_unlock_read(map); 1019 1020 /* 1021 * Zero the last entries in the byte vector. 1022 */ 1023 vecindex = OFF_TO_IDX(end - first_addr); 1024 while ((lastvecindex + 1) < vecindex) { 1025 ++lastvecindex; 1026 error = subyte(vec + lastvecindex, 0); 1027 if (error) { 1028 error = EFAULT; 1029 goto done2; 1030 } 1031 } 1032 1033 /* 1034 * If the map has changed, due to the subyte, the previous 1035 * output may be invalid. 
1036 */ 1037 vm_map_lock_read(map); 1038 if (timestamp != map->timestamp) 1039 goto RestartScan; 1040 vm_map_unlock_read(map); 1041done2: 1042 return (error); 1043} 1044 1045#ifndef _SYS_SYSPROTO_H_ 1046struct mlock_args { 1047 const void *addr; 1048 size_t len; 1049}; 1050#endif 1051/* 1052 * MPSAFE 1053 */ 1054int 1055sys_mlock(td, uap) 1056 struct thread *td; 1057 struct mlock_args *uap; 1058{ 1059 1060 return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len)); 1061} 1062 1063int 1064vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len) 1065{ 1066 vm_offset_t addr, end, last, start; 1067 vm_size_t npages, size; 1068 vm_map_t map; 1069 unsigned long nsize; 1070 int error; 1071 1072 error = priv_check_cred(cred, PRIV_VM_MLOCK, 0); 1073 if (error) 1074 return (error); 1075 addr = (vm_offset_t)addr0; 1076 size = len; 1077 last = addr + size; 1078 start = trunc_page(addr); 1079 end = round_page(last); 1080 if (last < addr || end < addr) 1081 return (EINVAL); 1082 npages = atop(end - start); 1083 if (npages > vm_page_max_wired) 1084 return (ENOMEM); 1085 map = &proc->p_vmspace->vm_map; 1086 PROC_LOCK(proc); 1087 nsize = ptoa(npages + pmap_wired_count(map->pmap)); 1088 if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) { 1089 PROC_UNLOCK(proc); 1090 return (ENOMEM); 1091 } 1092 PROC_UNLOCK(proc); 1093 if (npages + cnt.v_wire_count > vm_page_max_wired) 1094 return (EAGAIN); 1095#ifdef RACCT 1096 PROC_LOCK(proc); 1097 error = racct_set(proc, RACCT_MEMLOCK, nsize); 1098 PROC_UNLOCK(proc); 1099 if (error != 0) 1100 return (ENOMEM); 1101#endif 1102 error = vm_map_wire(map, start, end, 1103 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1104#ifdef RACCT 1105 if (error != KERN_SUCCESS) { 1106 PROC_LOCK(proc); 1107 racct_set(proc, RACCT_MEMLOCK, 1108 ptoa(pmap_wired_count(map->pmap))); 1109 PROC_UNLOCK(proc); 1110 } 1111#endif 1112 return (error == KERN_SUCCESS ? 
0 : ENOMEM); 1113} 1114 1115#ifndef _SYS_SYSPROTO_H_ 1116struct mlockall_args { 1117 int how; 1118}; 1119#endif 1120 1121/* 1122 * MPSAFE 1123 */ 1124int 1125sys_mlockall(td, uap) 1126 struct thread *td; 1127 struct mlockall_args *uap; 1128{ 1129 vm_map_t map; 1130 int error; 1131 1132 map = &td->td_proc->p_vmspace->vm_map; 1133 error = priv_check(td, PRIV_VM_MLOCK); 1134 if (error) 1135 return (error); 1136 1137 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) 1138 return (EINVAL); 1139 1140 /* 1141 * If wiring all pages in the process would cause it to exceed 1142 * a hard resource limit, return ENOMEM. 1143 */ 1144 if (!old_mlock && uap->how & MCL_CURRENT) { 1145 PROC_LOCK(td->td_proc); 1146 if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { 1147 PROC_UNLOCK(td->td_proc); 1148 return (ENOMEM); 1149 } 1150 PROC_UNLOCK(td->td_proc); 1151 } 1152#ifdef RACCT 1153 PROC_LOCK(td->td_proc); 1154 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); 1155 PROC_UNLOCK(td->td_proc); 1156 if (error != 0) 1157 return (ENOMEM); 1158#endif 1159 1160 if (uap->how & MCL_FUTURE) { 1161 vm_map_lock(map); 1162 vm_map_modflags(map, MAP_WIREFUTURE, 0); 1163 vm_map_unlock(map); 1164 error = 0; 1165 } 1166 1167 if (uap->how & MCL_CURRENT) { 1168 /* 1169 * P1003.1-2001 mandates that all currently mapped pages 1170 * will be memory resident and locked (wired) upon return 1171 * from mlockall(). vm_map_wire() will wire pages, by 1172 * calling vm_fault_wire() for each page in the region. 1173 */ 1174 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1175 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1176 error = (error == KERN_SUCCESS ? 
0 : EAGAIN); 1177 } 1178#ifdef RACCT 1179 if (error != KERN_SUCCESS) { 1180 PROC_LOCK(td->td_proc); 1181 racct_set(td->td_proc, RACCT_MEMLOCK, 1182 ptoa(pmap_wired_count(map->pmap))); 1183 PROC_UNLOCK(td->td_proc); 1184 } 1185#endif 1186 1187 return (error); 1188} 1189 1190#ifndef _SYS_SYSPROTO_H_ 1191struct munlockall_args { 1192 register_t dummy; 1193}; 1194#endif 1195 1196/* 1197 * MPSAFE 1198 */ 1199int 1200sys_munlockall(td, uap) 1201 struct thread *td; 1202 struct munlockall_args *uap; 1203{ 1204 vm_map_t map; 1205 int error; 1206 1207 map = &td->td_proc->p_vmspace->vm_map; 1208 error = priv_check(td, PRIV_VM_MUNLOCK); 1209 if (error) 1210 return (error); 1211 1212 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1213 vm_map_lock(map); 1214 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1215 vm_map_unlock(map); 1216 1217 /* Forcibly unwire all pages. */ 1218 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1219 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1220#ifdef RACCT 1221 if (error == KERN_SUCCESS) { 1222 PROC_LOCK(td->td_proc); 1223 racct_set(td->td_proc, RACCT_MEMLOCK, 0); 1224 PROC_UNLOCK(td->td_proc); 1225 } 1226#endif 1227 1228 return (error); 1229} 1230 1231#ifndef _SYS_SYSPROTO_H_ 1232struct munlock_args { 1233 const void *addr; 1234 size_t len; 1235}; 1236#endif 1237/* 1238 * MPSAFE 1239 */ 1240int 1241sys_munlock(td, uap) 1242 struct thread *td; 1243 struct munlock_args *uap; 1244{ 1245 vm_offset_t addr, end, last, start; 1246 vm_size_t size; 1247#ifdef RACCT 1248 vm_map_t map; 1249#endif 1250 int error; 1251 1252 error = priv_check(td, PRIV_VM_MUNLOCK); 1253 if (error) 1254 return (error); 1255 addr = (vm_offset_t)uap->addr; 1256 size = uap->len; 1257 last = addr + size; 1258 start = trunc_page(addr); 1259 end = round_page(last); 1260 if (last < addr || end < addr) 1261 return (EINVAL); 1262 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, 1263 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1264#ifdef RACCT 1265 if (error == 
KERN_SUCCESS) { 1266 PROC_LOCK(td->td_proc); 1267 map = &td->td_proc->p_vmspace->vm_map; 1268 racct_set(td->td_proc, RACCT_MEMLOCK, 1269 ptoa(pmap_wired_count(map->pmap))); 1270 PROC_UNLOCK(td->td_proc); 1271 } 1272#endif 1273 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1274} 1275 1276/* 1277 * vm_mmap_vnode() 1278 * 1279 * Helper function for vm_mmap. Perform sanity check specific for mmap 1280 * operations on vnodes. 1281 * 1282 * For VCHR vnodes, the vnode lock is held over the call to 1283 * vm_mmap_cdev() to keep vp->v_rdev valid. 1284 */ 1285int 1286vm_mmap_vnode(struct thread *td, vm_size_t objsize, 1287 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1288 struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, 1289 boolean_t *writecounted) 1290{ 1291 struct vattr va; 1292 vm_object_t obj; 1293 vm_offset_t foff; 1294 struct mount *mp; 1295 struct ucred *cred; 1296 int error, flags, locktype; 1297 1298 mp = vp->v_mount; 1299 cred = td->td_ucred; 1300 if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED)) 1301 locktype = LK_EXCLUSIVE; 1302 else 1303 locktype = LK_SHARED; 1304 if ((error = vget(vp, locktype, td)) != 0) 1305 return (error); 1306 foff = *foffp; 1307 flags = *flagsp; 1308 obj = vp->v_object; 1309 if (vp->v_type == VREG) { 1310 /* 1311 * Get the proper underlying object 1312 */ 1313 if (obj == NULL) { 1314 error = EINVAL; 1315 goto done; 1316 } 1317 if (obj->type == OBJT_VNODE && obj->handle != vp) { 1318 vput(vp); 1319 vp = (struct vnode *)obj->handle; 1320 /* 1321 * Bypass filesystems obey the mpsafety of the 1322 * underlying fs. Tmpfs never bypasses. 
1323 */ 1324 error = vget(vp, locktype, td); 1325 if (error != 0) 1326 return (error); 1327 } 1328 if (locktype == LK_EXCLUSIVE) { 1329 *writecounted = TRUE; 1330 vnode_pager_update_writecount(obj, 0, objsize); 1331 } 1332 } else if (vp->v_type == VCHR) { 1333 error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp, 1334 vp->v_rdev, foffp, objp); 1335 if (error == 0) 1336 goto mark_atime; 1337 goto done; 1338 } else { 1339 error = EINVAL; 1340 goto done; 1341 } 1342 if ((error = VOP_GETATTR(vp, &va, cred))) 1343 goto done; 1344#ifdef MAC 1345 error = mac_vnode_check_mmap(cred, vp, prot, flags); 1346 if (error != 0) 1347 goto done; 1348#endif 1349 if ((flags & MAP_SHARED) != 0) { 1350 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { 1351 if (prot & PROT_WRITE) { 1352 error = EPERM; 1353 goto done; 1354 } 1355 *maxprotp &= ~VM_PROT_WRITE; 1356 } 1357 } 1358 /* 1359 * If it is a regular file without any references 1360 * we do not need to sync it. 1361 * Adjust object size to be the size of actual file. 1362 */ 1363 objsize = round_page(va.va_size); 1364 if (va.va_nlink == 0) 1365 flags |= MAP_NOSYNC; 1366 if (obj->type == OBJT_VNODE) 1367 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, 1368 cred); 1369 else { 1370 KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP, 1371 ("wrong object type")); 1372 vm_object_reference(obj); 1373 } 1374 if (obj == NULL) { 1375 error = ENOMEM; 1376 goto done; 1377 } 1378 *objp = obj; 1379 *flagsp = flags; 1380 1381mark_atime: 1382 vfs_mark_atime(vp, cred); 1383 1384done: 1385 if (error != 0 && *writecounted) { 1386 *writecounted = FALSE; 1387 vnode_pager_update_writecount(obj, objsize, 0); 1388 } 1389 vput(vp); 1390 return (error); 1391} 1392 1393/* 1394 * vm_mmap_cdev() 1395 * 1396 * MPSAFE 1397 * 1398 * Helper function for vm_mmap. Perform sanity check specific for mmap 1399 * operations on cdevs. 
1400 */ 1401int 1402vm_mmap_cdev(struct thread *td, vm_size_t objsize, 1403 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1404 struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp) 1405{ 1406 vm_object_t obj; 1407 struct cdevsw *dsw; 1408 int error, flags, ref; 1409 1410 flags = *flagsp; 1411 1412 dsw = dev_refthread(cdev, &ref); 1413 if (dsw == NULL) 1414 return (ENXIO); 1415 if (dsw->d_flags & D_MMAP_ANON) { 1416 dev_relthread(cdev, ref); 1417 *maxprotp = VM_PROT_ALL; 1418 *flagsp |= MAP_ANON; 1419 return (0); 1420 } 1421 /* 1422 * cdevs do not provide private mappings of any kind. 1423 */ 1424 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1425 (prot & PROT_WRITE) != 0) { 1426 dev_relthread(cdev, ref); 1427 return (EACCES); 1428 } 1429 if (flags & (MAP_PRIVATE|MAP_COPY)) { 1430 dev_relthread(cdev, ref); 1431 return (EINVAL); 1432 } 1433 /* 1434 * Force device mappings to be shared. 1435 */ 1436 flags |= MAP_SHARED; 1437#ifdef MAC_XXX 1438 error = mac_cdev_check_mmap(td->td_ucred, cdev, prot); 1439 if (error != 0) { 1440 dev_relthread(cdev, ref); 1441 return (error); 1442 } 1443#endif 1444 /* 1445 * First, try d_mmap_single(). If that is not implemented 1446 * (returns ENODEV), fall back to using the device pager. 1447 * Note that d_mmap_single() must return a reference to the 1448 * object (it needs to bump the reference count of the object 1449 * it returns somehow). 1450 * 1451 * XXX assumes VM_PROT_* == PROT_* 1452 */ 1453 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); 1454 dev_relthread(cdev, ref); 1455 if (error != ENODEV) 1456 return (error); 1457 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, 1458 td->td_ucred); 1459 if (obj == NULL) 1460 return (EINVAL); 1461 *objp = obj; 1462 *flagsp = flags; 1463 return (0); 1464} 1465 1466/* 1467 * vm_mmap_shm() 1468 * 1469 * MPSAFE 1470 * 1471 * Helper function for vm_mmap. Perform sanity check specific for mmap 1472 * operations on shm file descriptors. 
1473 */ 1474int 1475vm_mmap_shm(struct thread *td, vm_size_t objsize, 1476 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1477 struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp) 1478{ 1479 int error; 1480 1481 if ((*flagsp & MAP_SHARED) != 0 && 1482 (*maxprotp & VM_PROT_WRITE) == 0 && 1483 (prot & PROT_WRITE) != 0) 1484 return (EACCES); 1485#ifdef MAC 1486 error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp); 1487 if (error != 0) 1488 return (error); 1489#endif 1490 error = shm_mmap(shmfd, objsize, foff, objp); 1491 if (error) 1492 return (error); 1493 return (0); 1494} 1495 1496/* 1497 * vm_mmap() 1498 * 1499 * MPSAFE 1500 * 1501 * Internal version of mmap. Currently used by mmap, exec, and sys5 1502 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. 1503 */ 1504int 1505vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1506 vm_prot_t maxprot, int flags, 1507 objtype_t handle_type, void *handle, 1508 vm_ooffset_t foff) 1509{ 1510 boolean_t fitit; 1511 vm_object_t object = NULL; 1512 struct thread *td = curthread; 1513 int docow, error, findspace, rv; 1514 boolean_t writecounted; 1515 1516 if (size == 0) 1517 return (0); 1518 1519 size = round_page(size); 1520 1521 if (map == &td->td_proc->p_vmspace->vm_map) { 1522 PROC_LOCK(td->td_proc); 1523 if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) { 1524 PROC_UNLOCK(td->td_proc); 1525 return (ENOMEM); 1526 } 1527 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { 1528 PROC_UNLOCK(td->td_proc); 1529 return (ENOMEM); 1530 } 1531 if (!old_mlock && map->flags & MAP_WIREFUTURE) { 1532 if (ptoa(pmap_wired_count(map->pmap)) + size > 1533 lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { 1534 racct_set_force(td->td_proc, RACCT_VMEM, 1535 map->size); 1536 PROC_UNLOCK(td->td_proc); 1537 return (ENOMEM); 1538 } 1539 error = racct_set(td->td_proc, RACCT_MEMLOCK, 1540 ptoa(pmap_wired_count(map->pmap)) + size); 1541 if (error != 0) { 1542 
racct_set_force(td->td_proc, RACCT_VMEM, 1543 map->size); 1544 PROC_UNLOCK(td->td_proc); 1545 return (error); 1546 } 1547 } 1548 PROC_UNLOCK(td->td_proc); 1549 } 1550 1551 /* 1552 * We currently can only deal with page aligned file offsets. 1553 * The check is here rather than in the syscall because the 1554 * kernel calls this function internally for other mmaping 1555 * operations (such as in exec) and non-aligned offsets will 1556 * cause pmap inconsistencies...so we want to be sure to 1557 * disallow this in all cases. 1558 */ 1559 if (foff & PAGE_MASK) 1560 return (EINVAL); 1561 1562 if ((flags & MAP_FIXED) == 0) { 1563 fitit = TRUE; 1564 *addr = round_page(*addr); 1565 } else { 1566 if (*addr != trunc_page(*addr)) 1567 return (EINVAL); 1568 fitit = FALSE; 1569 } 1570 writecounted = FALSE; 1571 1572 /* 1573 * Lookup/allocate object. 1574 */ 1575 switch (handle_type) { 1576 case OBJT_DEVICE: 1577 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, 1578 handle, &foff, &object); 1579 break; 1580 case OBJT_VNODE: 1581 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1582 handle, &foff, &object, &writecounted); 1583 break; 1584 case OBJT_SWAP: 1585 error = vm_mmap_shm(td, size, prot, &maxprot, &flags, 1586 handle, foff, &object); 1587 break; 1588 case OBJT_DEFAULT: 1589 if (handle == NULL) { 1590 error = 0; 1591 break; 1592 } 1593 /* FALLTHROUGH */ 1594 default: 1595 error = EINVAL; 1596 break; 1597 } 1598 if (error) 1599 return (error); 1600 if (flags & MAP_ANON) { 1601 object = NULL; 1602 docow = 0; 1603 /* 1604 * Unnamed anonymous regions always start at 0. 
1605 */ 1606 if (handle == 0) 1607 foff = 0; 1608 } else if (flags & MAP_PREFAULT_READ) 1609 docow = MAP_PREFAULT; 1610 else 1611 docow = MAP_PREFAULT_PARTIAL; 1612 1613 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1614 docow |= MAP_COPY_ON_WRITE; 1615 if (flags & MAP_NOSYNC) 1616 docow |= MAP_DISABLE_SYNCER; 1617 if (flags & MAP_NOCORE) 1618 docow |= MAP_DISABLE_COREDUMP; 1619 /* Shared memory is also shared with children. */ 1620 if (flags & MAP_SHARED) 1621 docow |= MAP_INHERIT_SHARE; 1622 if (writecounted) 1623 docow |= MAP_VN_WRITECOUNT; 1624 1625 if (flags & MAP_STACK) 1626 rv = vm_map_stack(map, *addr, size, prot, maxprot, 1627 docow | MAP_STACK_GROWS_DOWN); 1628 else if (fitit) { 1629 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) 1630 findspace = VMFS_SUPER_SPACE; 1631 else if ((flags & MAP_ALIGNMENT_MASK) != 0) 1632 findspace = VMFS_ALIGNED_SPACE(flags >> 1633 MAP_ALIGNMENT_SHIFT); 1634 else 1635 findspace = VMFS_OPTIMAL_SPACE; 1636 rv = vm_map_find(map, object, foff, addr, size, 1637#ifdef MAP_32BIT 1638 flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR : 1639#endif 1640 0, findspace, prot, maxprot, docow); 1641 } else 1642 rv = vm_map_fixed(map, object, foff, *addr, size, 1643 prot, maxprot, docow); 1644 1645 if (rv == KERN_SUCCESS) { 1646 /* 1647 * If the process has requested that all future mappings 1648 * be wired, then heed this. 1649 */ 1650 if (map->flags & MAP_WIREFUTURE) { 1651 vm_map_wire(map, *addr, *addr + size, 1652 VM_MAP_WIRE_USER | ((flags & MAP_STACK) ? 1653 VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES)); 1654 } 1655 } else { 1656 /* 1657 * If this mapping was accounted for in the vnode's 1658 * writecount, then undo that now. 1659 */ 1660 if (writecounted) 1661 vnode_pager_release_writecount(object, 0, size); 1662 /* 1663 * Lose the object reference. Will destroy the 1664 * object if it's an unnamed anonymous mapping 1665 * or named anonymous without other references. 
1666 */ 1667 vm_object_deallocate(object); 1668 } 1669 return (vm_mmap_to_errno(rv)); 1670} 1671 1672/* 1673 * Translate a Mach VM return code to zero on success or the appropriate errno 1674 * on failure. 1675 */ 1676int 1677vm_mmap_to_errno(int rv) 1678{ 1679 1680 switch (rv) { 1681 case KERN_SUCCESS: 1682 return (0); 1683 case KERN_INVALID_ADDRESS: 1684 case KERN_NO_SPACE: 1685 return (ENOMEM); 1686 case KERN_PROTECTION_FAILURE: 1687 return (EACCES); 1688 default: 1689 return (EINVAL); 1690 } 1691} 1692