vm_mmap.c revision 284665
/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 284665 2015-06-21 06:28:26Z trasz $");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
TUNABLE_INT("vm.old_mlock", &old_mlock);

#ifdef MAP_32BIT
#define MAP_32BIT_MAX_ADDR      ((vm_offset_t)1 << 31)
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
        int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
        struct thread *td;
        struct sbrk_args *uap;
{
        /* Not yet implemented */
        return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
        int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
        struct thread *td;
        struct sstk_args *uap;
{
        /* Not yet implemented */
        return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
        int dummy;
};
#endif

int
ogetpagesize(td, uap)
        struct thread *td;
        struct getpagesize_args *uap;
{
        /* MP SAFE */
        td->td_retval[0] = PAGE_SIZE;
        return (0);
}
#endif /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
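
/*
 * Illustrative userland sketch (not part of the original source): the
 * page-offset behavior described above, as a caller sees it.  The file
 * name is hypothetical.
 *
 *      #include <sys/mman.h>
 *      #include <fcntl.h>
 *
 *      int fd = open("/tmp/data", O_RDONLY);
 *      off_t off = 100;        // deliberately not page aligned
 *      char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, off);
 *      // The kernel maps from trunc_page(off) and returns a pointer
 *      // bumped by (off & PAGE_MASK), so *p is byte 100 of the file.
 */
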
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
        void *addr;
        size_t len;
        int prot;
        int flags;
        int fd;
        long pad;
        off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
        struct thread *td;
        struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
        struct pmckern_map_in pkm;
#endif
        struct file *fp;
        struct vnode *vp;
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t cap_maxprot, prot, maxprot;
        void *handle;
        objtype_t handle_type;
        int align, error, flags;
        off_t pos;
        struct vmspace *vms = td->td_proc->p_vmspace;
        cap_rights_t rights;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;
        flags = uap->flags;
        pos = uap->pos;

        fp = NULL;

        /*
         * Enforce the constraints.
         * Mapping of length 0 is only allowed for old binaries.
         * Anonymous mapping shall specify -1 as filedescriptor and
         * zero position for new code. Be nice to ancient a.out
         * binaries and correct pos for anonymous mapping, since old
         * ld.so sometimes issues anonymous map requests with non-zero
         * pos.
         */
        if (!SV_CURPROC_FLAG(SV_AOUT)) {
                if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
                    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
                        return (EINVAL);
        } else {
                if ((flags & MAP_ANON) != 0)
                        pos = 0;
        }

        if (flags & MAP_STACK) {
                if ((uap->fd != -1) ||
                    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
                        return (EINVAL);
                flags |= MAP_ANON;
                pos = 0;
        }
        if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
                return (EINVAL);

        /*
         * Align the file position to a page boundary,
         * and save its page offset component.
         */
        pageoff = (pos & PAGE_MASK);
        pos -= pageoff;

        /* Adjust size for rounding (on both ends). */
        size += pageoff;                        /* low end... */
        size = (vm_size_t) round_page(size);    /* hi end */

        /* Ensure alignment is at least a page and fits in a pointer. */
        align = flags & MAP_ALIGNMENT_MASK;
        if (align != 0 && align != MAP_ALIGNED_SUPER &&
            (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
            align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
                return (EINVAL);
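
        /*
         * Illustrative userland sketch (not part of the original source):
         * the MAP_ALIGNED() request validated just above.  The flag
         * argument is log2 of the desired alignment, so it must be at
         * least PAGE_SHIFT and smaller than the number of bits in a
         * pointer:
         *
         *      // ask for a 2 MB (1 << 21) aligned anonymous region;
         *      // MAP_ALIGNED_SUPER asks for superpage alignment instead
         *      p = mmap(NULL, len, PROT_READ | PROT_WRITE,
         *          MAP_ANON | MAP_ALIGNED(21), -1, 0);
         */
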
        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (flags & MAP_FIXED) {
                /*
                 * The specified address must have the same remainder
                 * as the file offset taken modulo PAGE_SIZE, so it
                 * should be aligned after adjustment by pageoff.
                 */
                addr -= pageoff;
                if (addr & PAGE_MASK)
                        return (EINVAL);

                /* Address range must be all in user VM space. */
                if (addr < vm_map_min(&vms->vm_map) ||
                    addr + size > vm_map_max(&vms->vm_map))
                        return (EINVAL);
                if (addr + size < addr)
                        return (EINVAL);
#ifdef MAP_32BIT
                if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
                        return (EINVAL);
        } else if (flags & MAP_32BIT) {
                /*
                 * For MAP_32BIT, override the hint if it is too high and
                 * do not bother moving the mapping past the heap (since
                 * the heap is usually above 2GB).
                 */
                if (addr + size > MAP_32BIT_MAX_ADDR)
                        addr = 0;
#endif
        } else {
                /*
                 * XXX for non-fixed mappings where no hint is provided or
                 * the hint would fall in the potential heap space,
                 * place it after the end of the largest possible heap.
                 *
                 * There should really be a pmap call to determine a reasonable
                 * location.
                 */
                PROC_LOCK(td->td_proc);
                if (addr == 0 ||
                    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
                    addr < round_page((vm_offset_t)vms->vm_daddr +
                    lim_max(td->td_proc, RLIMIT_DATA))))
                        addr = round_page((vm_offset_t)vms->vm_daddr +
                            lim_max(td->td_proc, RLIMIT_DATA));
                PROC_UNLOCK(td->td_proc);
        }
        if (flags & MAP_ANON) {
                /*
                 * Mapping blank space is trivial.
                 */
                handle = NULL;
                handle_type = OBJT_DEFAULT;
                maxprot = VM_PROT_ALL;
                cap_maxprot = VM_PROT_ALL;
        } else {
                /*
                 * Mapping file, get fp for validation and don't let the
                 * descriptor disappear on us if we block. Check capability
                 * rights, but also return the maximum rights to be combined
                 * with maxprot later.
                 */
                cap_rights_init(&rights, CAP_MMAP);
                if (prot & PROT_READ)
                        cap_rights_set(&rights, CAP_MMAP_R);
                if ((flags & MAP_SHARED) != 0) {
                        if (prot & PROT_WRITE)
                                cap_rights_set(&rights, CAP_MMAP_W);
                }
                if (prot & PROT_EXEC)
                        cap_rights_set(&rights, CAP_MMAP_X);
                error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
                if (error != 0)
                        goto done;
                if (fp->f_type == DTYPE_SHM) {
                        handle = fp->f_data;
                        handle_type = OBJT_SWAP;
                        maxprot = VM_PROT_NONE;

                        /* FREAD should always be set. */
                        if (fp->f_flag & FREAD)
                                maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
                        if (fp->f_flag & FWRITE)
                                maxprot |= VM_PROT_WRITE;
                        goto map;
                }
                if (fp->f_type != DTYPE_VNODE) {
                        error = ENODEV;
                        goto done;
                }
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
                /*
                 * POSIX shared-memory objects are defined to have
                 * kernel persistence, and are not defined to support
                 * read(2)/write(2) -- or even open(2).  Thus, we can
                 * use MAP_ASYNC to trade on-disk coherence for speed.
                 * The shm_open(3) library routine turns on the FPOSIXSHM
                 * flag to request this behavior.
                 */
                if (fp->f_flag & FPOSIXSHM)
                        flags |= MAP_NOSYNC;
#endif
                vp = fp->f_vnode;
                /*
                 * Ensure that file and memory protections are
                 * compatible.  Note that we only worry about
                 * writability if mapping is shared; in this case,
                 * current and max prot are dictated by the open file.
                 * XXX use the vnode instead?  Problem is: what
                 * credentials do we use for determination? What if
                 * proc does a setuid?
                 */
                if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
                        maxprot = VM_PROT_NONE;
                else
                        maxprot = VM_PROT_EXECUTE;
                if (fp->f_flag & FREAD) {
                        maxprot |= VM_PROT_READ;
                } else if (prot & PROT_READ) {
                        error = EACCES;
                        goto done;
                }
                /*
                 * If we are sharing potential changes (either via
                 * MAP_SHARED or via the implicit sharing of character
                 * device mappings), and we are trying to get write
                 * permission although we opened it without asking
                 * for it, bail out.
                 */
                if ((flags & MAP_SHARED) != 0) {
                        if ((fp->f_flag & FWRITE) != 0) {
                                maxprot |= VM_PROT_WRITE;
                        } else if ((prot & PROT_WRITE) != 0) {
                                error = EACCES;
                                goto done;
                        }
                } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
                        maxprot |= VM_PROT_WRITE;
                        cap_maxprot |= VM_PROT_WRITE;
                }
                handle = (void *)vp;
                handle_type = OBJT_VNODE;
        }
map:
        td->td_fpop = fp;
        maxprot &= cap_maxprot;
        error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
            flags, handle_type, handle, pos);
        td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
        /* inform hwpmc(4) if an executable is being mapped */
        if (error == 0 && handle_type == OBJT_VNODE &&
            (prot & PROT_EXEC)) {
                pkm.pm_file = handle;
                pkm.pm_address = (uintptr_t) addr;
                PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
        }
#endif
        if (error == 0)
                td->td_retval[0] = (register_t) (addr + pageoff);
done:
        if (fp)
                fdrop(fp, td);

        return (error);
}
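
/*
 * Illustrative userland sketch (not part of the original source): the
 * capability-rights checks built in sys_mmap() above.  Once a descriptor
 * has been limited to CAP_MMAP_R, mapping it read-only still works, but
 * asking for PROT_WRITE on a shared mapping makes fget_mmap() fail.
 *
 *      #include <sys/capsicum.h>
 *      #include <sys/mman.h>
 *
 *      cap_rights_t rights;
 *      cap_rights_limit(fd, cap_rights_init(&rights, CAP_MMAP_R));
 *      p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);      // OK
 *      p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *          MAP_SHARED, fd, 0);                                 // fails
 */
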
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
        struct mmap_args oargs;

        oargs.addr = uap->addr;
        oargs.len = uap->len;
        oargs.prot = uap->prot;
        oargs.flags = uap->flags;
        oargs.fd = uap->fd;
        oargs.pos = uap->pos;
        return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
        caddr_t addr;
        int len;
        int prot;
        int flags;
        int fd;
        long pos;
};
#endif
int
ommap(td, uap)
        struct thread *td;
        struct ommap_args *uap;
{
        struct mmap_args nargs;
        static const char cvtbsdprot[8] = {
                0,
                PROT_EXEC,
                PROT_WRITE,
                PROT_EXEC | PROT_WRITE,
                PROT_READ,
                PROT_EXEC | PROT_READ,
                PROT_WRITE | PROT_READ,
                PROT_EXEC | PROT_WRITE | PROT_READ,
        };

#define OMAP_ANON       0x0002
#define OMAP_COPY       0x0020
#define OMAP_SHARED     0x0010
#define OMAP_FIXED      0x0100

        nargs.addr = uap->addr;
        nargs.len = uap->len;
        nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__) || defined(__ia64__)
        if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
            nargs.prot != 0)
                nargs.prot |= PROT_EXEC;
#endif
#endif
        nargs.flags = 0;
        if (uap->flags & OMAP_ANON)
                nargs.flags |= MAP_ANON;
        if (uap->flags & OMAP_COPY)
                nargs.flags |= MAP_COPY;
        if (uap->flags & OMAP_SHARED)
                nargs.flags |= MAP_SHARED;
        else
                nargs.flags |= MAP_PRIVATE;
        if (uap->flags & OMAP_FIXED)
                nargs.flags |= MAP_FIXED;
        nargs.fd = uap->fd;
        nargs.pos = uap->pos;
        return (sys_mmap(td, &nargs));
}
#endif /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
        void *addr;
        size_t len;
        int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
        struct thread *td;
        struct msync_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int flags;
        vm_map_t map;
        int rv;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        flags = uap->flags;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
                return (EINVAL);

        map = &td->td_proc->p_vmspace->vm_map;

        /*
         * Clean the pages and interpret the return value.
         */
        rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
            (flags & MS_INVALIDATE) != 0);
        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
                return (ENOMEM);
        case KERN_INVALID_ARGUMENT:
                return (EBUSY);
        case KERN_FAILURE:
                return (EIO);
        default:
                return (EINVAL);
        }
}
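
/*
 * Illustrative userland sketch (not part of the original source): the
 * flag handling above.  MS_ASYNC and MS_INVALIDATE together are rejected
 * with EINVAL, and the KERN_INVALID_ARGUMENT case from vm_map_sync()
 * surfaces to the caller as EBUSY.
 *
 *      p[0] = 1;                       // dirty a shared file mapping
 *      if (msync(p, len, MS_SYNC) == -1)       // synchronous flush
 *              err(1, "msync");
 */
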
#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
        void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
        struct thread *td;
        struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
        struct pmckern_map_out pkm;
        vm_map_entry_t entry;
#endif
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_map_t map;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        if (size == 0)
                return (EINVAL);

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        /*
         * Check for illegal addresses.  Watch out for address wrap...
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
                return (EINVAL);
        vm_map_lock(map);
#ifdef HWPMC_HOOKS
        /*
         * Inform hwpmc if the address range being unmapped contains
         * an executable region.
         */
        pkm.pm_address = (uintptr_t) NULL;
        if (vm_map_lookup_entry(map, addr, &entry)) {
                for (;
                    entry != &map->header && entry->start < addr + size;
                    entry = entry->next) {
                        if (vm_map_check_protection(map, entry->start,
                            entry->end, VM_PROT_EXECUTE) == TRUE) {
                                pkm.pm_address = (uintptr_t) addr;
                                pkm.pm_size = (size_t) size;
                                break;
                        }
                }
        }
#endif
        vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
        /* downgrade the lock to prevent a LOR with the pmc-sx lock */
        vm_map_lock_downgrade(map);
        if (pkm.pm_address != (uintptr_t) NULL)
                PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
        vm_map_unlock_read(map);
#else
        vm_map_unlock(map);
#endif
        /* vm_map_delete returns nothing but KERN_SUCCESS anyway */
        return (0);
}
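
/*
 * Illustrative userland sketch (not part of the original source): the
 * trunc/round arithmetic above means a sub-page request removes the
 * whole containing page.
 *
 *      char *p = mmap(NULL, 2 * getpagesize(), PROT_READ | PROT_WRITE,
 *          MAP_ANON, -1, 0);
 *      munmap(p + 10, 1);      // rounds to [p, p + pagesize):
 *                              // the first page is gone, the second stays
 */
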
#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
        const void *addr;
        size_t len;
        int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
        struct thread *td;
        struct mprotect_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, prot, FALSE)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        case KERN_RESOURCE_SHORTAGE:
                return (ENOMEM);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
        void *addr;
        size_t len;
        int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
        struct thread *td;
        struct minherit_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_inherit_t inherit;

        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        inherit = uap->inherit;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, inherit)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}
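
/*
 * Illustrative userland sketch (not part of the original source): the
 * inheritance modes passed through to vm_map_inherit() above.
 * INHERIT_SHARE keeps parent and child looking at the same pages;
 * INHERIT_NONE leaves the range unmapped in the child.
 *
 *      minherit(p, len, INHERIT_SHARE);
 *      if (fork() == 0) {
 *              p[0] = 1;       // store is visible to the parent, too
 *              _exit(0);
 *      }
 */
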
#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
        void *addr;
        size_t len;
        int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
        struct thread *td;
        struct madvise_args *uap;
{
        vm_offset_t start, end;
        vm_map_t map;
        int flags;

        /*
         * Check for our special case, advising the swap pager we are
         * "immortal."
         */
        if (uap->behav == MADV_PROTECT) {
                flags = PPROT_SET;
                return (kern_procctl(td, P_PID, td->td_proc->p_pid,
                    PROC_SPROTECT, &flags));
        }

        /*
         * Check for illegal behavior
         */
        if (uap->behav < 0 || uap->behav > MADV_CORE)
                return (EINVAL);
        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if ((vm_offset_t)uap->addr < vm_map_min(map) ||
            (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
                return (EINVAL);
        if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
                return (EINVAL);

        /*
         * Since this routine is only advisory, we default to conservative
         * behavior.
         */
        start = trunc_page((vm_offset_t) uap->addr);
        end = round_page((vm_offset_t) uap->addr + uap->len);

        if (vm_map_madvise(map, start, end, uap->behav))
                return (EINVAL);
        return (0);
}
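
/*
 * Illustrative userland sketch (not part of the original source): the
 * advice values handled above.  MADV_WILLNEED asks for readahead,
 * MADV_FREE says the contents may be discarded, and the privileged
 * MADV_PROTECT (special-cased above via PROC_SPROTECT) shields the
 * process from being killed when swap space runs out.
 *
 *      madvise(p, len, MADV_WILLNEED); // about to touch this region
 */
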
#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
        const void *addr;
        size_t len;
        char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
        struct thread *td;
        struct mincore_args *uap;
{
        vm_offset_t addr, first_addr;
        vm_offset_t end, cend;
        pmap_t pmap;
        vm_map_t map;
        char *vec;
        int error = 0;
        int vecindex, lastvecindex;
        vm_map_entry_t current;
        vm_map_entry_t entry;
        vm_object_t object;
        vm_paddr_t locked_pa;
        vm_page_t m;
        vm_pindex_t pindex;
        int mincoreinfo;
        unsigned int timestamp;
        boolean_t locked;

        /*
         * Make sure that the addresses presented are valid for user
         * mode.
         */
        first_addr = addr = trunc_page((vm_offset_t) uap->addr);
        end = addr + (vm_size_t)round_page(uap->len);
        map = &td->td_proc->p_vmspace->vm_map;
        if (end > vm_map_max(map) || end < addr)
                return (ENOMEM);

        /*
         * Address of byte vector
         */
        vec = uap->vec;

        pmap = vmspace_pmap(td->td_proc->p_vmspace);

        vm_map_lock_read(map);
RestartScan:
        timestamp = map->timestamp;

        if (!vm_map_lookup_entry(map, addr, &entry)) {
                vm_map_unlock_read(map);
                return (ENOMEM);
        }

        /*
         * Do this on a map entry basis so that if the pages are not
         * in the current processes address space, we can easily look
         * up the pages elsewhere.
         */
        lastvecindex = -1;
        for (current = entry;
            (current != &map->header) && (current->start < end);
            current = current->next) {

                /*
                 * check for contiguity
                 */
                if (current->end < end &&
                    (current->next == &map->header ||
                    current->next->start > current->end)) {
                        vm_map_unlock_read(map);
                        return (ENOMEM);
                }

                /*
                 * ignore submaps (for now) or null objects
                 */
                if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
                    current->object.vm_object == NULL)
                        continue;

                /*
                 * limit this scan to the current map entry and the
                 * limits for the mincore call
                 */
                if (addr < current->start)
                        addr = current->start;
                cend = current->end;
                if (cend > end)
                        cend = end;

                /*
                 * scan this entry one page at a time
                 */
                while (addr < cend) {
                        /*
                         * Check pmap first, it is likely faster, also
                         * it can provide info as to whether we are the
                         * one referencing or modifying the page.
                         */
                        object = NULL;
                        locked_pa = 0;
                retry:
                        m = NULL;
                        mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
                        if (locked_pa != 0) {
                                /*
                                 * The page is mapped by this process but not
                                 * both accessed and modified.  It is also
                                 * managed.  Acquire the object lock so that
                                 * other mappings might be examined.
                                 */
                                m = PHYS_TO_VM_PAGE(locked_pa);
                                if (m->object != object) {
                                        if (object != NULL)
                                                VM_OBJECT_WUNLOCK(object);
                                        object = m->object;
                                        locked = VM_OBJECT_TRYWLOCK(object);
                                        vm_page_unlock(m);
                                        if (!locked) {
                                                VM_OBJECT_WLOCK(object);
                                                vm_page_lock(m);
                                                goto retry;
                                        }
                                } else
                                        vm_page_unlock(m);
                                KASSERT(m->valid == VM_PAGE_BITS_ALL,
                                    ("mincore: page %p is mapped but invalid",
                                    m));
                        } else if (mincoreinfo == 0) {
                                /*
                                 * The page is not mapped by this process.  If
                                 * the object implements managed pages, then
                                 * determine if the page is resident so that
                                 * the mappings might be examined.
                                 */
                                if (current->object.vm_object != object) {
                                        if (object != NULL)
                                                VM_OBJECT_WUNLOCK(object);
                                        object = current->object.vm_object;
                                        VM_OBJECT_WLOCK(object);
                                }
                                if (object->type == OBJT_DEFAULT ||
                                    object->type == OBJT_SWAP ||
                                    object->type == OBJT_VNODE) {
                                        pindex = OFF_TO_IDX(current->offset +
                                            (addr - current->start));
                                        m = vm_page_lookup(object, pindex);
                                        if (m == NULL &&
                                            vm_page_is_cached(object, pindex))
                                                mincoreinfo = MINCORE_INCORE;
                                        if (m != NULL && m->valid == 0)
                                                m = NULL;
                                        if (m != NULL)
                                                mincoreinfo = MINCORE_INCORE;
                                }
                        }
                        if (m != NULL) {
                                /* Examine other mappings to the page. */
                                if (m->dirty == 0 && pmap_is_modified(m))
                                        vm_page_dirty(m);
                                if (m->dirty != 0)
                                        mincoreinfo |= MINCORE_MODIFIED_OTHER;
                                /*
                                 * The first test for PGA_REFERENCED is an
                                 * optimization.  The second test is
                                 * required because a concurrent pmap
                                 * operation could clear the last reference
                                 * and set PGA_REFERENCED before the call to
                                 * pmap_is_referenced().
                                 */
                                if ((m->aflags & PGA_REFERENCED) != 0 ||
                                    pmap_is_referenced(m) ||
                                    (m->aflags & PGA_REFERENCED) != 0)
                                        mincoreinfo |= MINCORE_REFERENCED_OTHER;
                        }
                        if (object != NULL)
                                VM_OBJECT_WUNLOCK(object);

                        /*
                         * subyte may page fault.  In case it needs to modify
                         * the map, we release the lock.
                         */
                        vm_map_unlock_read(map);

                        /*
                         * calculate index into user supplied byte vector
                         */
                        vecindex = OFF_TO_IDX(addr - first_addr);

                        /*
                         * If we have skipped map entries, we need to make sure that
                         * the byte vector is zeroed for those skipped entries.
                         */
                        while ((lastvecindex + 1) < vecindex) {
                                ++lastvecindex;
                                error = subyte(vec + lastvecindex, 0);
                                if (error) {
                                        error = EFAULT;
                                        goto done2;
                                }
                        }

                        /*
                         * Pass the page information to the user
                         */
                        error = subyte(vec + vecindex, mincoreinfo);
                        if (error) {
                                error = EFAULT;
                                goto done2;
                        }

                        /*
                         * If the map has changed, due to the subyte, the previous
                         * output may be invalid.
                         */
                        vm_map_lock_read(map);
                        if (timestamp != map->timestamp)
                                goto RestartScan;

                        lastvecindex = vecindex;
                        addr += PAGE_SIZE;
                }
        }

        /*
         * subyte may page fault.  In case it needs to modify
         * the map, we release the lock.
         */
        vm_map_unlock_read(map);

        /*
         * Zero the last entries in the byte vector.
         */
        vecindex = OFF_TO_IDX(end - first_addr);
        while ((lastvecindex + 1) < vecindex) {
                ++lastvecindex;
                error = subyte(vec + lastvecindex, 0);
                if (error) {
                        error = EFAULT;
                        goto done2;
                }
        }

        /*
         * If the map has changed, due to the subyte, the previous
         * output may be invalid.
         */
        vm_map_lock_read(map);
        if (timestamp != map->timestamp)
                goto RestartScan;
        vm_map_unlock_read(map);
done2:
        return (error);
}
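
/*
 * Illustrative userland sketch (not part of the original source): reading
 * the per-page status bytes produced by the subyte() loop above.
 *
 *      size_t npages = (len + getpagesize() - 1) / getpagesize();
 *      char *vec = malloc(npages);
 *      if (mincore(p, len, vec) == 0 && (vec[0] & MINCORE_INCORE))
 *              printf("first page is resident\n");
 */
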
#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_mlock(td, uap)
        struct thread *td;
        struct mlock_args *uap;
{

        return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
}

int
vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
{
        vm_offset_t addr, end, last, start;
        vm_size_t npages, size;
        vm_map_t map;
        unsigned long nsize;
        int error;

        error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
        if (error)
                return (error);
        addr = (vm_offset_t)addr0;
        size = len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
        npages = atop(end - start);
        if (npages > vm_page_max_wired)
                return (ENOMEM);
        map = &proc->p_vmspace->vm_map;
        PROC_LOCK(proc);
        nsize = ptoa(npages + pmap_wired_count(map->pmap));
        if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
                PROC_UNLOCK(proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(proc);
        if (npages + cnt.v_wire_count > vm_page_max_wired)
                return (EAGAIN);
#ifdef RACCT
        if (racct_enable) {
                PROC_LOCK(proc);
                error = racct_set(proc, RACCT_MEMLOCK, nsize);
                PROC_UNLOCK(proc);
                if (error != 0)
                        return (ENOMEM);
        }
#endif
        error = vm_map_wire(map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
        if (racct_enable && error != KERN_SUCCESS) {
                PROC_LOCK(proc);
                racct_set(proc, RACCT_MEMLOCK,
                    ptoa(pmap_wired_count(map->pmap)));
                PROC_UNLOCK(proc);
        }
#endif
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
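
/*
 * Illustrative userland sketch (not part of the original source): wiring
 * a buffer through the checks above (vm_page_max_wired, RLIMIT_MEMLOCK,
 * and the racct accounting).
 *
 *      if (mlock(buf, buflen) == -1)
 *              err(1, "mlock");        // ENOMEM if a limit is exceeded
 *      // ... touch buf without risk of page faults ...
 *      munlock(buf, buflen);
 */
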
#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
        int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
        struct thread *td;
        struct mlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = priv_check(td, PRIV_VM_MLOCK);
        if (error)
                return (error);

        if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
                return (EINVAL);

        /*
         * If wiring all pages in the process would cause it to exceed
         * a hard resource limit, return ENOMEM.
         */
        if (!old_mlock && uap->how & MCL_CURRENT) {
                PROC_LOCK(td->td_proc);
                if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
                        PROC_UNLOCK(td->td_proc);
                        return (ENOMEM);
                }
                PROC_UNLOCK(td->td_proc);
        }
#ifdef RACCT
        if (racct_enable) {
                PROC_LOCK(td->td_proc);
                error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
                PROC_UNLOCK(td->td_proc);
                if (error != 0)
                        return (ENOMEM);
        }
#endif

        if (uap->how & MCL_FUTURE) {
                vm_map_lock(map);
                vm_map_modflags(map, MAP_WIREFUTURE, 0);
                vm_map_unlock(map);
                error = 0;
        }

        if (uap->how & MCL_CURRENT) {
                /*
                 * P1003.1-2001 mandates that all currently mapped pages
                 * will be memory resident and locked (wired) upon return
                 * from mlockall(). vm_map_wire() will wire pages, by
                 * calling vm_fault_wire() for each page in the region.
                 */
                error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
                    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
                error = (error == KERN_SUCCESS ? 0 : EAGAIN);
        }
#ifdef RACCT
        if (racct_enable && error != KERN_SUCCESS) {
                PROC_LOCK(td->td_proc);
                racct_set(td->td_proc, RACCT_MEMLOCK,
                    ptoa(pmap_wired_count(map->pmap)));
                PROC_UNLOCK(td->td_proc);
        }
#endif

        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
        register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
        struct thread *td;
        struct munlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = priv_check(td, PRIV_VM_MUNLOCK);
        if (error)
                return (error);

        /* Clear the MAP_WIREFUTURE flag from this vm_map. */
        vm_map_lock(map);
        vm_map_modflags(map, 0, MAP_WIREFUTURE);
        vm_map_unlock(map);

        /* Forcibly unwire all pages. */
        error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
            VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
        if (racct_enable && error == KERN_SUCCESS) {
                PROC_LOCK(td->td_proc);
                racct_set(td->td_proc, RACCT_MEMLOCK, 0);
                PROC_UNLOCK(td->td_proc);
        }
#endif

        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
        struct thread *td;
        struct munlock_args *uap;
{
        vm_offset_t addr, end, last, start;
        vm_size_t size;
#ifdef RACCT
        vm_map_t map;
#endif
        int error;

        error = priv_check(td, PRIV_VM_MUNLOCK);
        if (error)
                return (error);
        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
        error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
        if (racct_enable && error == KERN_SUCCESS) {
                PROC_LOCK(td->td_proc);
                map = &td->td_proc->p_vmspace->vm_map;
                racct_set(td->td_proc, RACCT_MEMLOCK,
                    ptoa(pmap_wired_count(map->pmap)));
                PROC_UNLOCK(td->td_proc);
        }
#endif
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
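
/*
 * Illustrative userland sketch (not part of the original source): a
 * real-time style process wiring its whole address space.  MCL_FUTURE
 * sets MAP_WIREFUTURE on the map (see sys_mlockall() above), so later
 * mappings come back wired as well.
 *
 *      if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *              err(1, "mlockall");
 */
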
/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
        struct vattr va;
        vm_object_t obj;
        vm_offset_t foff;
        struct ucred *cred;
        int error, flags, locktype;

        cred = td->td_ucred;
        if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
                locktype = LK_EXCLUSIVE;
        else
                locktype = LK_SHARED;
        if ((error = vget(vp, locktype, td)) != 0)
                return (error);
        foff = *foffp;
        flags = *flagsp;
        obj = vp->v_object;
        if (vp->v_type == VREG) {
                /*
                 * Get the proper underlying object
                 */
                if (obj == NULL) {
                        error = EINVAL;
                        goto done;
                }
                if (obj->type == OBJT_VNODE && obj->handle != vp) {
                        vput(vp);
                        vp = (struct vnode *)obj->handle;
                        /*
                         * Bypass filesystems obey the mpsafety of the
                         * underlying fs.  Tmpfs never bypasses.
                         */
                        error = vget(vp, locktype, td);
                        if (error != 0)
                                return (error);
                }
                if (locktype == LK_EXCLUSIVE) {
                        *writecounted = TRUE;
                        vnode_pager_update_writecount(obj, 0, objsize);
                }
        } else if (vp->v_type == VCHR) {
                error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
                    vp->v_rdev, foffp, objp);
                if (error == 0)
                        goto mark_atime;
                goto done;
        } else {
                error = EINVAL;
                goto done;
        }
        if ((error = VOP_GETATTR(vp, &va, cred)))
                goto done;
#ifdef MAC
        error = mac_vnode_check_mmap(cred, vp, prot, flags);
        if (error != 0)
                goto done;
#endif
        if ((flags & MAP_SHARED) != 0) {
                if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
                        if (prot & PROT_WRITE) {
                                error = EPERM;
                                goto done;
                        }
                        *maxprotp &= ~VM_PROT_WRITE;
                }
        }
        /*
         * If it is a regular file without any references
         * we do not need to sync it.
         * Adjust object size to be the size of actual file.
         */
        objsize = round_page(va.va_size);
        if (va.va_nlink == 0)
                flags |= MAP_NOSYNC;
        if (obj->type == OBJT_VNODE)
                obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
                    cred);
        else {
                KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
                    ("wrong object type"));
                vm_object_reference(obj);
        }
        if (obj == NULL) {
                error = ENOMEM;
                goto done;
        }
        *objp = obj;
        *flagsp = flags;

mark_atime:
        vfs_mark_atime(vp, cred);

done:
        if (error != 0 && *writecounted) {
                *writecounted = FALSE;
                vnode_pager_update_writecount(obj, objsize, 0);
        }
        vput(vp);
        return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
        vm_object_t obj;
        struct cdevsw *dsw;
        int error, flags, ref;

        flags = *flagsp;

        dsw = dev_refthread(cdev, &ref);
        if (dsw == NULL)
                return (ENXIO);
        if (dsw->d_flags & D_MMAP_ANON) {
                dev_relthread(cdev, ref);
                *maxprotp = VM_PROT_ALL;
                *flagsp |= MAP_ANON;
                return (0);
        }
        /*
         * cdevs do not provide private mappings of any kind.
         */
        if ((*maxprotp & VM_PROT_WRITE) == 0 &&
            (prot & PROT_WRITE) != 0) {
                dev_relthread(cdev, ref);
                return (EACCES);
        }
        if (flags & (MAP_PRIVATE|MAP_COPY)) {
                dev_relthread(cdev, ref);
                return (EINVAL);
        }
        /*
         * Force device mappings to be shared.
         */
        flags |= MAP_SHARED;
#ifdef MAC_XXX
        error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
        if (error != 0) {
                dev_relthread(cdev, ref);
                return (error);
        }
#endif
        /*
         * First, try d_mmap_single().  If that is not implemented
         * (returns ENODEV), fall back to using the device pager.
         * Note that d_mmap_single() must return a reference to the
         * object (it needs to bump the reference count of the object
         * it returns somehow).
         *
         * XXX assumes VM_PROT_* == PROT_*
         */
        error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
        dev_relthread(cdev, ref);
        if (error != ENODEV)
                return (error);
        obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
            td->td_ucred);
        if (obj == NULL)
                return (EINVAL);
        *objp = obj;
        *flagsp = flags;
        return (0);
}
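
/*
 * Illustrative kernel-side sketch (hypothetical driver, not part of the
 * original source): the paths taken in vm_mmap_cdev() above.  A driver
 * that sets D_MMAP_ANON gets plain anonymous memory; one that implements
 * d_mmap_single() supplies its own VM object; otherwise the device pager
 * is used and calls the driver's d_mmap() per page.  MYDEV_BASE_PA is a
 * made-up physical base address.
 *
 *      static d_mmap_t mydev_mmap;
 *
 *      static struct cdevsw mydev_cdevsw = {
 *              .d_version =    D_VERSION,
 *              .d_name =       "mydev",
 *              .d_mmap =       mydev_mmap,     // used by the device pager
 *      };
 *
 *      static int
 *      mydev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
 *          int nprot, vm_memattr_t *memattr)
 *      {
 *              *paddr = MYDEV_BASE_PA + offset;        // hypothetical
 *              return (0);
 *      }
 */
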
/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
        int error;

        if ((*flagsp & MAP_SHARED) != 0 &&
            (*maxprotp & VM_PROT_WRITE) == 0 &&
            (prot & PROT_WRITE) != 0)
                return (EACCES);
#ifdef MAC
        error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
        if (error != 0)
                return (error);
#endif
        error = shm_mmap(shmfd, objsize, foff, objp);
        if (error)
                return (error);
        return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
        boolean_t fitit;
        vm_object_t object = NULL;
        struct thread *td = curthread;
        int docow, error, findspace, rv;
        boolean_t writecounted;

        if (size == 0)
                return (0);

        size = round_page(size);

        if (map == &td->td_proc->p_vmspace->vm_map) {
                PROC_LOCK(td->td_proc);
                if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
                        PROC_UNLOCK(td->td_proc);
                        return (ENOMEM);
                }
                if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
                        PROC_UNLOCK(td->td_proc);
                        return (ENOMEM);
                }
                if (!old_mlock && map->flags & MAP_WIREFUTURE) {
                        if (ptoa(pmap_wired_count(map->pmap)) + size >
                            lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
                                racct_set_force(td->td_proc, RACCT_VMEM,
                                    map->size);
                                PROC_UNLOCK(td->td_proc);
                                return (ENOMEM);
                        }
                        error = racct_set(td->td_proc, RACCT_MEMLOCK,
                            ptoa(pmap_wired_count(map->pmap)) + size);
                        if (error != 0) {
                                racct_set_force(td->td_proc, RACCT_VMEM,
                                    map->size);
                                PROC_UNLOCK(td->td_proc);
                                return (error);
                        }
                }
                PROC_UNLOCK(td->td_proc);
        }

        /*
         * We currently can only deal with page aligned file offsets.
         * The check is here rather than in the syscall because the
         * kernel calls this function internally for other mmaping
         * operations (such as in exec) and non-aligned offsets will
         * cause pmap inconsistencies...so we want to be sure to
         * disallow this in all cases.
         */
        if (foff & PAGE_MASK)
                return (EINVAL);

        if ((flags & MAP_FIXED) == 0) {
                fitit = TRUE;
                *addr = round_page(*addr);
        } else {
                if (*addr != trunc_page(*addr))
                        return (EINVAL);
                fitit = FALSE;
        }
        writecounted = FALSE;

        /*
         * Lookup/allocate object.
         */
        switch (handle_type) {
        case OBJT_DEVICE:
                error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
                    handle, &foff, &object);
                break;
        case OBJT_VNODE:
                error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
                    handle, &foff, &object, &writecounted);
                break;
        case OBJT_SWAP:
                error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
                    handle, foff, &object);
                break;
        case OBJT_DEFAULT:
                if (handle == NULL) {
                        error = 0;
                        break;
                }
                /* FALLTHROUGH */
        default:
                error = EINVAL;
                break;
        }
        if (error)
                return (error);
        if (flags & MAP_ANON) {
                object = NULL;
                docow = 0;
                /*
                 * Unnamed anonymous regions always start at 0.
                 */
                if (handle == 0)
                        foff = 0;
        } else if (flags & MAP_PREFAULT_READ)
                docow = MAP_PREFAULT;
        else
                docow = MAP_PREFAULT_PARTIAL;

        if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
                docow |= MAP_COPY_ON_WRITE;
        if (flags & MAP_NOSYNC)
                docow |= MAP_DISABLE_SYNCER;
        if (flags & MAP_NOCORE)
                docow |= MAP_DISABLE_COREDUMP;
        /* Shared memory is also shared with children. */
        if (flags & MAP_SHARED)
                docow |= MAP_INHERIT_SHARE;
        if (writecounted)
                docow |= MAP_VN_WRITECOUNT;
        if (flags & MAP_STACK) {
                if (object != NULL)
                        return (EINVAL);
                docow |= MAP_STACK_GROWS_DOWN;
        }
        if ((flags & MAP_EXCL) != 0)
                docow |= MAP_CHECK_EXCL;

        if (fitit) {
                if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
                        findspace = VMFS_SUPER_SPACE;
                else if ((flags & MAP_ALIGNMENT_MASK) != 0)
                        findspace = VMFS_ALIGNED_SPACE(flags >>
                            MAP_ALIGNMENT_SHIFT);
                else
                        findspace = VMFS_OPTIMAL_SPACE;
                rv = vm_map_find(map, object, foff, addr, size,
#ifdef MAP_32BIT
                    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
#endif
                    0, findspace, prot, maxprot, docow);
        } else {
                rv = vm_map_fixed(map, object, foff, *addr, size,
                    prot, maxprot, docow);
        }

        if (rv == KERN_SUCCESS) {
                /*
                 * If the process has requested that all future mappings
                 * be wired, then heed this.
                 */
                if (map->flags & MAP_WIREFUTURE) {
                        vm_map_wire(map, *addr, *addr + size,
                            VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
                            VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
                }
        } else {
                /*
                 * If this mapping was accounted for in the vnode's
                 * writecount, then undo that now.
                 */
                if (writecounted)
                        vnode_pager_release_writecount(object, 0, size);
                /*
                 * Lose the object reference.  Will destroy the
                 * object if it's an unnamed anonymous mapping
                 * or named anonymous without other references.
                 */
                vm_object_deallocate(object);
        }
        return (vm_mmap_to_errno(rv));
}
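
/*
 * Illustrative userland sketch (not part of the original source): the
 * MAP_CHECK_EXCL translation above.  With MAP_FIXED | MAP_EXCL a mapping
 * is created only if the range is free, instead of silently replacing
 * whatever is already there.
 *
 *      p = mmap(hint, len, PROT_READ | PROT_WRITE,
 *          MAP_ANON | MAP_FIXED | MAP_EXCL, -1, 0);
 *      // fails if [hint, hint + len) overlaps an existing mapping
 */
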
/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
        case KERN_NO_SPACE:
                return (ENOMEM);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        default:
                return (EINVAL);
        }
}