/* vm_mmap.c revision 281776 */
/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
33 * 34 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 35 * 36 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 37 */ 38 39/* 40 * Mapped file (mmap) interface to VM 41 */ 42 43#include <sys/cdefs.h> 44__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 281776 2015-04-20 17:36:43Z alc $"); 45 46#include "opt_compat.h" 47#include "opt_hwpmc_hooks.h" 48 49#include <sys/param.h> 50#include <sys/systm.h> 51#include <sys/capsicum.h> 52#include <sys/kernel.h> 53#include <sys/lock.h> 54#include <sys/mutex.h> 55#include <sys/sysproto.h> 56#include <sys/filedesc.h> 57#include <sys/priv.h> 58#include <sys/proc.h> 59#include <sys/procctl.h> 60#include <sys/racct.h> 61#include <sys/resource.h> 62#include <sys/resourcevar.h> 63#include <sys/rwlock.h> 64#include <sys/sysctl.h> 65#include <sys/vnode.h> 66#include <sys/fcntl.h> 67#include <sys/file.h> 68#include <sys/mman.h> 69#include <sys/mount.h> 70#include <sys/conf.h> 71#include <sys/stat.h> 72#include <sys/syscallsubr.h> 73#include <sys/sysent.h> 74#include <sys/vmmeter.h> 75 76#include <security/mac/mac_framework.h> 77 78#include <vm/vm.h> 79#include <vm/vm_param.h> 80#include <vm/pmap.h> 81#include <vm/vm_map.h> 82#include <vm/vm_object.h> 83#include <vm/vm_page.h> 84#include <vm/vm_pager.h> 85#include <vm/vm_pageout.h> 86#include <vm/vm_extern.h> 87#include <vm/vm_page.h> 88#include <vm/vnode_pager.h> 89 90#ifdef HWPMC_HOOKS 91#include <sys/pmckern.h> 92#endif 93 94int old_mlock = 0; 95SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0, 96 "Do not apply RLIMIT_MEMLOCK on mlockall"); 97TUNABLE_INT("vm.old_mlock", &old_mlock); 98 99#ifdef MAP_32BIT 100#define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) 101#endif 102 103static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 104 int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *); 105static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 106 int *, struct cdev *, vm_ooffset_t *, vm_object_t *); 107static int 
vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, 108 int *, struct shmfd *, vm_ooffset_t, vm_object_t *); 109 110#ifndef _SYS_SYSPROTO_H_ 111struct sbrk_args { 112 int incr; 113}; 114#endif 115 116/* 117 * MPSAFE 118 */ 119/* ARGSUSED */ 120int 121sys_sbrk(td, uap) 122 struct thread *td; 123 struct sbrk_args *uap; 124{ 125 /* Not yet implemented */ 126 return (EOPNOTSUPP); 127} 128 129#ifndef _SYS_SYSPROTO_H_ 130struct sstk_args { 131 int incr; 132}; 133#endif 134 135/* 136 * MPSAFE 137 */ 138/* ARGSUSED */ 139int 140sys_sstk(td, uap) 141 struct thread *td; 142 struct sstk_args *uap; 143{ 144 /* Not yet implemented */ 145 return (EOPNOTSUPP); 146} 147 148#if defined(COMPAT_43) 149#ifndef _SYS_SYSPROTO_H_ 150struct getpagesize_args { 151 int dummy; 152}; 153#endif 154 155int 156ogetpagesize(td, uap) 157 struct thread *td; 158 struct getpagesize_args *uap; 159{ 160 /* MP SAFE */ 161 td->td_retval[0] = PAGE_SIZE; 162 return (0); 163} 164#endif /* COMPAT_43 */ 165 166 167/* 168 * Memory Map (mmap) system call. Note that the file offset 169 * and address are allowed to be NOT page aligned, though if 170 * the MAP_FIXED flag it set, both must have the same remainder 171 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not 172 * page-aligned, the actual mapping starts at trunc_page(addr) 173 * and the return value is adjusted up by the page offset. 174 * 175 * Generally speaking, only character devices which are themselves 176 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise 177 * there would be no cache coherency between a descriptor and a VM mapping 178 * both to the same character device. 
179 */ 180#ifndef _SYS_SYSPROTO_H_ 181struct mmap_args { 182 void *addr; 183 size_t len; 184 int prot; 185 int flags; 186 int fd; 187 long pad; 188 off_t pos; 189}; 190#endif 191 192/* 193 * MPSAFE 194 */ 195int 196sys_mmap(td, uap) 197 struct thread *td; 198 struct mmap_args *uap; 199{ 200#ifdef HWPMC_HOOKS 201 struct pmckern_map_in pkm; 202#endif 203 struct file *fp; 204 struct vnode *vp; 205 vm_offset_t addr; 206 vm_size_t size, pageoff; 207 vm_prot_t cap_maxprot, prot, maxprot; 208 void *handle; 209 objtype_t handle_type; 210 int align, error, flags; 211 off_t pos; 212 struct vmspace *vms = td->td_proc->p_vmspace; 213 cap_rights_t rights; 214 215 addr = (vm_offset_t) uap->addr; 216 size = uap->len; 217 prot = uap->prot & VM_PROT_ALL; 218 flags = uap->flags; 219 pos = uap->pos; 220 221 fp = NULL; 222 223 /* 224 * Enforce the constraints. 225 * Mapping of length 0 is only allowed for old binaries. 226 * Anonymous mapping shall specify -1 as filedescriptor and 227 * zero position for new code. Be nice to ancient a.out 228 * binaries and correct pos for anonymous mapping, since old 229 * ld.so sometimes issues anonymous map requests with non-zero 230 * pos. 231 */ 232 if (!SV_CURPROC_FLAG(SV_AOUT)) { 233 if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) || 234 ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0))) 235 return (EINVAL); 236 } else { 237 if ((flags & MAP_ANON) != 0) 238 pos = 0; 239 } 240 241 if (flags & MAP_STACK) { 242 if ((uap->fd != -1) || 243 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) 244 return (EINVAL); 245 flags |= MAP_ANON; 246 pos = 0; 247 } 248 if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) 249 return (EINVAL); 250 251 /* 252 * Align the file position to a page boundary, 253 * and save its page offset component. 254 */ 255 pageoff = (pos & PAGE_MASK); 256 pos -= pageoff; 257 258 /* Adjust size for rounding (on both ends). */ 259 size += pageoff; /* low end... 
*/ 260 size = (vm_size_t) round_page(size); /* hi end */ 261 262 /* Ensure alignment is at least a page and fits in a pointer. */ 263 align = flags & MAP_ALIGNMENT_MASK; 264 if (align != 0 && align != MAP_ALIGNED_SUPER && 265 (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY || 266 align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) 267 return (EINVAL); 268 269 /* 270 * Check for illegal addresses. Watch out for address wrap... Note 271 * that VM_*_ADDRESS are not constants due to casts (argh). 272 */ 273 if (flags & MAP_FIXED) { 274 /* 275 * The specified address must have the same remainder 276 * as the file offset taken modulo PAGE_SIZE, so it 277 * should be aligned after adjustment by pageoff. 278 */ 279 addr -= pageoff; 280 if (addr & PAGE_MASK) 281 return (EINVAL); 282 283 /* Address range must be all in user VM space. */ 284 if (addr < vm_map_min(&vms->vm_map) || 285 addr + size > vm_map_max(&vms->vm_map)) 286 return (EINVAL); 287 if (addr + size < addr) 288 return (EINVAL); 289#ifdef MAP_32BIT 290 if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) 291 return (EINVAL); 292 } else if (flags & MAP_32BIT) { 293 /* 294 * For MAP_32BIT, override the hint if it is too high and 295 * do not bother moving the mapping past the heap (since 296 * the heap is usually above 2GB). 297 */ 298 if (addr + size > MAP_32BIT_MAX_ADDR) 299 addr = 0; 300#endif 301 } else { 302 /* 303 * XXX for non-fixed mappings where no hint is provided or 304 * the hint would fall in the potential heap space, 305 * place it after the end of the largest possible heap. 306 * 307 * There should really be a pmap call to determine a reasonable 308 * location. 
309 */ 310 PROC_LOCK(td->td_proc); 311 if (addr == 0 || 312 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 313 addr < round_page((vm_offset_t)vms->vm_daddr + 314 lim_max(td->td_proc, RLIMIT_DATA)))) 315 addr = round_page((vm_offset_t)vms->vm_daddr + 316 lim_max(td->td_proc, RLIMIT_DATA)); 317 PROC_UNLOCK(td->td_proc); 318 } 319 if (flags & MAP_ANON) { 320 /* 321 * Mapping blank space is trivial. 322 */ 323 handle = NULL; 324 handle_type = OBJT_DEFAULT; 325 maxprot = VM_PROT_ALL; 326 cap_maxprot = VM_PROT_ALL; 327 } else { 328 /* 329 * Mapping file, get fp for validation and don't let the 330 * descriptor disappear on us if we block. Check capability 331 * rights, but also return the maximum rights to be combined 332 * with maxprot later. 333 */ 334 cap_rights_init(&rights, CAP_MMAP); 335 if (prot & PROT_READ) 336 cap_rights_set(&rights, CAP_MMAP_R); 337 if ((flags & MAP_SHARED) != 0) { 338 if (prot & PROT_WRITE) 339 cap_rights_set(&rights, CAP_MMAP_W); 340 } 341 if (prot & PROT_EXEC) 342 cap_rights_set(&rights, CAP_MMAP_X); 343 error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp); 344 if (error != 0) 345 goto done; 346 if (fp->f_type == DTYPE_SHM) { 347 handle = fp->f_data; 348 handle_type = OBJT_SWAP; 349 maxprot = VM_PROT_NONE; 350 351 /* FREAD should always be set. */ 352 if (fp->f_flag & FREAD) 353 maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; 354 if (fp->f_flag & FWRITE) 355 maxprot |= VM_PROT_WRITE; 356 goto map; 357 } 358 if (fp->f_type != DTYPE_VNODE) { 359 error = ENODEV; 360 goto done; 361 } 362#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ 363 defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) 364 /* 365 * POSIX shared-memory objects are defined to have 366 * kernel persistence, and are not defined to support 367 * read(2)/write(2) -- or even open(2). Thus, we can 368 * use MAP_ASYNC to trade on-disk coherence for speed. 369 * The shm_open(3) library routine turns on the FPOSIXSHM 370 * flag to request this behavior. 
371 */ 372 if (fp->f_flag & FPOSIXSHM) 373 flags |= MAP_NOSYNC; 374#endif 375 vp = fp->f_vnode; 376 /* 377 * Ensure that file and memory protections are 378 * compatible. Note that we only worry about 379 * writability if mapping is shared; in this case, 380 * current and max prot are dictated by the open file. 381 * XXX use the vnode instead? Problem is: what 382 * credentials do we use for determination? What if 383 * proc does a setuid? 384 */ 385 if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC) 386 maxprot = VM_PROT_NONE; 387 else 388 maxprot = VM_PROT_EXECUTE; 389 if (fp->f_flag & FREAD) { 390 maxprot |= VM_PROT_READ; 391 } else if (prot & PROT_READ) { 392 error = EACCES; 393 goto done; 394 } 395 /* 396 * If we are sharing potential changes (either via 397 * MAP_SHARED or via the implicit sharing of character 398 * device mappings), and we are trying to get write 399 * permission although we opened it without asking 400 * for it, bail out. 401 */ 402 if ((flags & MAP_SHARED) != 0) { 403 if ((fp->f_flag & FWRITE) != 0) { 404 maxprot |= VM_PROT_WRITE; 405 } else if ((prot & PROT_WRITE) != 0) { 406 error = EACCES; 407 goto done; 408 } 409 } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { 410 maxprot |= VM_PROT_WRITE; 411 cap_maxprot |= VM_PROT_WRITE; 412 } 413 handle = (void *)vp; 414 handle_type = OBJT_VNODE; 415 } 416map: 417 td->td_fpop = fp; 418 maxprot &= cap_maxprot; 419 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, 420 flags, handle_type, handle, pos); 421 td->td_fpop = NULL; 422#ifdef HWPMC_HOOKS 423 /* inform hwpmc(4) if an executable is being mapped */ 424 if (error == 0 && handle_type == OBJT_VNODE && 425 (prot & PROT_EXEC)) { 426 pkm.pm_file = handle; 427 pkm.pm_address = (uintptr_t) addr; 428 PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); 429 } 430#endif 431 if (error == 0) 432 td->td_retval[0] = (register_t) (addr + pageoff); 433done: 434 if (fp) 435 fdrop(fp, td); 436 437 return (error); 438} 439 440int 
441freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) 442{ 443 struct mmap_args oargs; 444 445 oargs.addr = uap->addr; 446 oargs.len = uap->len; 447 oargs.prot = uap->prot; 448 oargs.flags = uap->flags; 449 oargs.fd = uap->fd; 450 oargs.pos = uap->pos; 451 return (sys_mmap(td, &oargs)); 452} 453 454#ifdef COMPAT_43 455#ifndef _SYS_SYSPROTO_H_ 456struct ommap_args { 457 caddr_t addr; 458 int len; 459 int prot; 460 int flags; 461 int fd; 462 long pos; 463}; 464#endif 465int 466ommap(td, uap) 467 struct thread *td; 468 struct ommap_args *uap; 469{ 470 struct mmap_args nargs; 471 static const char cvtbsdprot[8] = { 472 0, 473 PROT_EXEC, 474 PROT_WRITE, 475 PROT_EXEC | PROT_WRITE, 476 PROT_READ, 477 PROT_EXEC | PROT_READ, 478 PROT_WRITE | PROT_READ, 479 PROT_EXEC | PROT_WRITE | PROT_READ, 480 }; 481 482#define OMAP_ANON 0x0002 483#define OMAP_COPY 0x0020 484#define OMAP_SHARED 0x0010 485#define OMAP_FIXED 0x0100 486 487 nargs.addr = uap->addr; 488 nargs.len = uap->len; 489 nargs.prot = cvtbsdprot[uap->prot & 0x7]; 490#ifdef COMPAT_FREEBSD32 491#if defined(__amd64__) || defined(__ia64__) 492 if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && 493 nargs.prot != 0) 494 nargs.prot |= PROT_EXEC; 495#endif 496#endif 497 nargs.flags = 0; 498 if (uap->flags & OMAP_ANON) 499 nargs.flags |= MAP_ANON; 500 if (uap->flags & OMAP_COPY) 501 nargs.flags |= MAP_COPY; 502 if (uap->flags & OMAP_SHARED) 503 nargs.flags |= MAP_SHARED; 504 else 505 nargs.flags |= MAP_PRIVATE; 506 if (uap->flags & OMAP_FIXED) 507 nargs.flags |= MAP_FIXED; 508 nargs.fd = uap->fd; 509 nargs.pos = uap->pos; 510 return (sys_mmap(td, &nargs)); 511} 512#endif /* COMPAT_43 */ 513 514 515#ifndef _SYS_SYSPROTO_H_ 516struct msync_args { 517 void *addr; 518 size_t len; 519 int flags; 520}; 521#endif 522/* 523 * MPSAFE 524 */ 525int 526sys_msync(td, uap) 527 struct thread *td; 528 struct msync_args *uap; 529{ 530 vm_offset_t addr; 531 vm_size_t size, pageoff; 532 int flags; 533 vm_map_t map; 534 
int rv; 535 536 addr = (vm_offset_t) uap->addr; 537 size = uap->len; 538 flags = uap->flags; 539 540 pageoff = (addr & PAGE_MASK); 541 addr -= pageoff; 542 size += pageoff; 543 size = (vm_size_t) round_page(size); 544 if (addr + size < addr) 545 return (EINVAL); 546 547 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 548 return (EINVAL); 549 550 map = &td->td_proc->p_vmspace->vm_map; 551 552 /* 553 * Clean the pages and interpret the return value. 554 */ 555 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, 556 (flags & MS_INVALIDATE) != 0); 557 switch (rv) { 558 case KERN_SUCCESS: 559 return (0); 560 case KERN_INVALID_ADDRESS: 561 return (ENOMEM); 562 case KERN_INVALID_ARGUMENT: 563 return (EBUSY); 564 case KERN_FAILURE: 565 return (EIO); 566 default: 567 return (EINVAL); 568 } 569} 570 571#ifndef _SYS_SYSPROTO_H_ 572struct munmap_args { 573 void *addr; 574 size_t len; 575}; 576#endif 577/* 578 * MPSAFE 579 */ 580int 581sys_munmap(td, uap) 582 struct thread *td; 583 struct munmap_args *uap; 584{ 585#ifdef HWPMC_HOOKS 586 struct pmckern_map_out pkm; 587 vm_map_entry_t entry; 588#endif 589 vm_offset_t addr; 590 vm_size_t size, pageoff; 591 vm_map_t map; 592 593 addr = (vm_offset_t) uap->addr; 594 size = uap->len; 595 if (size == 0) 596 return (EINVAL); 597 598 pageoff = (addr & PAGE_MASK); 599 addr -= pageoff; 600 size += pageoff; 601 size = (vm_size_t) round_page(size); 602 if (addr + size < addr) 603 return (EINVAL); 604 605 /* 606 * Check for illegal addresses. Watch out for address wrap... 607 */ 608 map = &td->td_proc->p_vmspace->vm_map; 609 if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) 610 return (EINVAL); 611 vm_map_lock(map); 612#ifdef HWPMC_HOOKS 613 /* 614 * Inform hwpmc if the address range being unmapped contains 615 * an executable region. 
616 */ 617 pkm.pm_address = (uintptr_t) NULL; 618 if (vm_map_lookup_entry(map, addr, &entry)) { 619 for (; 620 entry != &map->header && entry->start < addr + size; 621 entry = entry->next) { 622 if (vm_map_check_protection(map, entry->start, 623 entry->end, VM_PROT_EXECUTE) == TRUE) { 624 pkm.pm_address = (uintptr_t) addr; 625 pkm.pm_size = (size_t) size; 626 break; 627 } 628 } 629 } 630#endif 631 vm_map_delete(map, addr, addr + size); 632 633#ifdef HWPMC_HOOKS 634 /* downgrade the lock to prevent a LOR with the pmc-sx lock */ 635 vm_map_lock_downgrade(map); 636 if (pkm.pm_address != (uintptr_t) NULL) 637 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); 638 vm_map_unlock_read(map); 639#else 640 vm_map_unlock(map); 641#endif 642 /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ 643 return (0); 644} 645 646#ifndef _SYS_SYSPROTO_H_ 647struct mprotect_args { 648 const void *addr; 649 size_t len; 650 int prot; 651}; 652#endif 653/* 654 * MPSAFE 655 */ 656int 657sys_mprotect(td, uap) 658 struct thread *td; 659 struct mprotect_args *uap; 660{ 661 vm_offset_t addr; 662 vm_size_t size, pageoff; 663 vm_prot_t prot; 664 665 addr = (vm_offset_t) uap->addr; 666 size = uap->len; 667 prot = uap->prot & VM_PROT_ALL; 668 669 pageoff = (addr & PAGE_MASK); 670 addr -= pageoff; 671 size += pageoff; 672 size = (vm_size_t) round_page(size); 673 if (addr + size < addr) 674 return (EINVAL); 675 676 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, 677 addr + size, prot, FALSE)) { 678 case KERN_SUCCESS: 679 return (0); 680 case KERN_PROTECTION_FAILURE: 681 return (EACCES); 682 case KERN_RESOURCE_SHORTAGE: 683 return (ENOMEM); 684 } 685 return (EINVAL); 686} 687 688#ifndef _SYS_SYSPROTO_H_ 689struct minherit_args { 690 void *addr; 691 size_t len; 692 int inherit; 693}; 694#endif 695/* 696 * MPSAFE 697 */ 698int 699sys_minherit(td, uap) 700 struct thread *td; 701 struct minherit_args *uap; 702{ 703 vm_offset_t addr; 704 vm_size_t size, pageoff; 705 vm_inherit_t inherit; 
706 707 addr = (vm_offset_t)uap->addr; 708 size = uap->len; 709 inherit = uap->inherit; 710 711 pageoff = (addr & PAGE_MASK); 712 addr -= pageoff; 713 size += pageoff; 714 size = (vm_size_t) round_page(size); 715 if (addr + size < addr) 716 return (EINVAL); 717 718 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, 719 addr + size, inherit)) { 720 case KERN_SUCCESS: 721 return (0); 722 case KERN_PROTECTION_FAILURE: 723 return (EACCES); 724 } 725 return (EINVAL); 726} 727 728#ifndef _SYS_SYSPROTO_H_ 729struct madvise_args { 730 void *addr; 731 size_t len; 732 int behav; 733}; 734#endif 735 736/* 737 * MPSAFE 738 */ 739int 740sys_madvise(td, uap) 741 struct thread *td; 742 struct madvise_args *uap; 743{ 744 vm_offset_t start, end; 745 vm_map_t map; 746 int flags; 747 748 /* 749 * Check for our special case, advising the swap pager we are 750 * "immortal." 751 */ 752 if (uap->behav == MADV_PROTECT) { 753 flags = PPROT_SET; 754 return (kern_procctl(td, P_PID, td->td_proc->p_pid, 755 PROC_SPROTECT, &flags)); 756 } 757 758 /* 759 * Check for illegal behavior 760 */ 761 if (uap->behav < 0 || uap->behav > MADV_CORE) 762 return (EINVAL); 763 /* 764 * Check for illegal addresses. Watch out for address wrap... Note 765 * that VM_*_ADDRESS are not constants due to casts (argh). 766 */ 767 map = &td->td_proc->p_vmspace->vm_map; 768 if ((vm_offset_t)uap->addr < vm_map_min(map) || 769 (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) 770 return (EINVAL); 771 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) 772 return (EINVAL); 773 774 /* 775 * Since this routine is only advisory, we default to conservative 776 * behavior. 
777 */ 778 start = trunc_page((vm_offset_t) uap->addr); 779 end = round_page((vm_offset_t) uap->addr + uap->len); 780 781 if (vm_map_madvise(map, start, end, uap->behav)) 782 return (EINVAL); 783 return (0); 784} 785 786#ifndef _SYS_SYSPROTO_H_ 787struct mincore_args { 788 const void *addr; 789 size_t len; 790 char *vec; 791}; 792#endif 793 794/* 795 * MPSAFE 796 */ 797int 798sys_mincore(td, uap) 799 struct thread *td; 800 struct mincore_args *uap; 801{ 802 vm_offset_t addr, first_addr; 803 vm_offset_t end, cend; 804 pmap_t pmap; 805 vm_map_t map; 806 char *vec; 807 int error = 0; 808 int vecindex, lastvecindex; 809 vm_map_entry_t current; 810 vm_map_entry_t entry; 811 vm_object_t object; 812 vm_paddr_t locked_pa; 813 vm_page_t m; 814 vm_pindex_t pindex; 815 int mincoreinfo; 816 unsigned int timestamp; 817 boolean_t locked; 818 819 /* 820 * Make sure that the addresses presented are valid for user 821 * mode. 822 */ 823 first_addr = addr = trunc_page((vm_offset_t) uap->addr); 824 end = addr + (vm_size_t)round_page(uap->len); 825 map = &td->td_proc->p_vmspace->vm_map; 826 if (end > vm_map_max(map) || end < addr) 827 return (ENOMEM); 828 829 /* 830 * Address of byte vector 831 */ 832 vec = uap->vec; 833 834 pmap = vmspace_pmap(td->td_proc->p_vmspace); 835 836 vm_map_lock_read(map); 837RestartScan: 838 timestamp = map->timestamp; 839 840 if (!vm_map_lookup_entry(map, addr, &entry)) { 841 vm_map_unlock_read(map); 842 return (ENOMEM); 843 } 844 845 /* 846 * Do this on a map entry basis so that if the pages are not 847 * in the current processes address space, we can easily look 848 * up the pages elsewhere. 
849 */ 850 lastvecindex = -1; 851 for (current = entry; 852 (current != &map->header) && (current->start < end); 853 current = current->next) { 854 855 /* 856 * check for contiguity 857 */ 858 if (current->end < end && 859 (entry->next == &map->header || 860 current->next->start > current->end)) { 861 vm_map_unlock_read(map); 862 return (ENOMEM); 863 } 864 865 /* 866 * ignore submaps (for now) or null objects 867 */ 868 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 869 current->object.vm_object == NULL) 870 continue; 871 872 /* 873 * limit this scan to the current map entry and the 874 * limits for the mincore call 875 */ 876 if (addr < current->start) 877 addr = current->start; 878 cend = current->end; 879 if (cend > end) 880 cend = end; 881 882 /* 883 * scan this entry one page at a time 884 */ 885 while (addr < cend) { 886 /* 887 * Check pmap first, it is likely faster, also 888 * it can provide info as to whether we are the 889 * one referencing or modifying the page. 890 */ 891 object = NULL; 892 locked_pa = 0; 893 retry: 894 m = NULL; 895 mincoreinfo = pmap_mincore(pmap, addr, &locked_pa); 896 if (locked_pa != 0) { 897 /* 898 * The page is mapped by this process but not 899 * both accessed and modified. It is also 900 * managed. Acquire the object lock so that 901 * other mappings might be examined. 902 */ 903 m = PHYS_TO_VM_PAGE(locked_pa); 904 if (m->object != object) { 905 if (object != NULL) 906 VM_OBJECT_WUNLOCK(object); 907 object = m->object; 908 locked = VM_OBJECT_TRYWLOCK(object); 909 vm_page_unlock(m); 910 if (!locked) { 911 VM_OBJECT_WLOCK(object); 912 vm_page_lock(m); 913 goto retry; 914 } 915 } else 916 vm_page_unlock(m); 917 KASSERT(m->valid == VM_PAGE_BITS_ALL, 918 ("mincore: page %p is mapped but invalid", 919 m)); 920 } else if (mincoreinfo == 0) { 921 /* 922 * The page is not mapped by this process. If 923 * the object implements managed pages, then 924 * determine if the page is resident so that 925 * the mappings might be examined. 
926 */ 927 if (current->object.vm_object != object) { 928 if (object != NULL) 929 VM_OBJECT_WUNLOCK(object); 930 object = current->object.vm_object; 931 VM_OBJECT_WLOCK(object); 932 } 933 if (object->type == OBJT_DEFAULT || 934 object->type == OBJT_SWAP || 935 object->type == OBJT_VNODE) { 936 pindex = OFF_TO_IDX(current->offset + 937 (addr - current->start)); 938 m = vm_page_lookup(object, pindex); 939 if (m == NULL && 940 vm_page_is_cached(object, pindex)) 941 mincoreinfo = MINCORE_INCORE; 942 if (m != NULL && m->valid == 0) 943 m = NULL; 944 if (m != NULL) 945 mincoreinfo = MINCORE_INCORE; 946 } 947 } 948 if (m != NULL) { 949 /* Examine other mappings to the page. */ 950 if (m->dirty == 0 && pmap_is_modified(m)) 951 vm_page_dirty(m); 952 if (m->dirty != 0) 953 mincoreinfo |= MINCORE_MODIFIED_OTHER; 954 /* 955 * The first test for PGA_REFERENCED is an 956 * optimization. The second test is 957 * required because a concurrent pmap 958 * operation could clear the last reference 959 * and set PGA_REFERENCED before the call to 960 * pmap_is_referenced(). 961 */ 962 if ((m->aflags & PGA_REFERENCED) != 0 || 963 pmap_is_referenced(m) || 964 (m->aflags & PGA_REFERENCED) != 0) 965 mincoreinfo |= MINCORE_REFERENCED_OTHER; 966 } 967 if (object != NULL) 968 VM_OBJECT_WUNLOCK(object); 969 970 /* 971 * subyte may page fault. In case it needs to modify 972 * the map, we release the lock. 973 */ 974 vm_map_unlock_read(map); 975 976 /* 977 * calculate index into user supplied byte vector 978 */ 979 vecindex = OFF_TO_IDX(addr - first_addr); 980 981 /* 982 * If we have skipped map entries, we need to make sure that 983 * the byte vector is zeroed for those skipped entries. 
984 */ 985 while ((lastvecindex + 1) < vecindex) { 986 ++lastvecindex; 987 error = subyte(vec + lastvecindex, 0); 988 if (error) { 989 error = EFAULT; 990 goto done2; 991 } 992 } 993 994 /* 995 * Pass the page information to the user 996 */ 997 error = subyte(vec + vecindex, mincoreinfo); 998 if (error) { 999 error = EFAULT; 1000 goto done2; 1001 } 1002 1003 /* 1004 * If the map has changed, due to the subyte, the previous 1005 * output may be invalid. 1006 */ 1007 vm_map_lock_read(map); 1008 if (timestamp != map->timestamp) 1009 goto RestartScan; 1010 1011 lastvecindex = vecindex; 1012 addr += PAGE_SIZE; 1013 } 1014 } 1015 1016 /* 1017 * subyte may page fault. In case it needs to modify 1018 * the map, we release the lock. 1019 */ 1020 vm_map_unlock_read(map); 1021 1022 /* 1023 * Zero the last entries in the byte vector. 1024 */ 1025 vecindex = OFF_TO_IDX(end - first_addr); 1026 while ((lastvecindex + 1) < vecindex) { 1027 ++lastvecindex; 1028 error = subyte(vec + lastvecindex, 0); 1029 if (error) { 1030 error = EFAULT; 1031 goto done2; 1032 } 1033 } 1034 1035 /* 1036 * If the map has changed, due to the subyte, the previous 1037 * output may be invalid. 
1038 */ 1039 vm_map_lock_read(map); 1040 if (timestamp != map->timestamp) 1041 goto RestartScan; 1042 vm_map_unlock_read(map); 1043done2: 1044 return (error); 1045} 1046 1047#ifndef _SYS_SYSPROTO_H_ 1048struct mlock_args { 1049 const void *addr; 1050 size_t len; 1051}; 1052#endif 1053/* 1054 * MPSAFE 1055 */ 1056int 1057sys_mlock(td, uap) 1058 struct thread *td; 1059 struct mlock_args *uap; 1060{ 1061 1062 return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len)); 1063} 1064 1065int 1066vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len) 1067{ 1068 vm_offset_t addr, end, last, start; 1069 vm_size_t npages, size; 1070 vm_map_t map; 1071 unsigned long nsize; 1072 int error; 1073 1074 error = priv_check_cred(cred, PRIV_VM_MLOCK, 0); 1075 if (error) 1076 return (error); 1077 addr = (vm_offset_t)addr0; 1078 size = len; 1079 last = addr + size; 1080 start = trunc_page(addr); 1081 end = round_page(last); 1082 if (last < addr || end < addr) 1083 return (EINVAL); 1084 npages = atop(end - start); 1085 if (npages > vm_page_max_wired) 1086 return (ENOMEM); 1087 map = &proc->p_vmspace->vm_map; 1088 PROC_LOCK(proc); 1089 nsize = ptoa(npages + pmap_wired_count(map->pmap)); 1090 if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) { 1091 PROC_UNLOCK(proc); 1092 return (ENOMEM); 1093 } 1094 PROC_UNLOCK(proc); 1095 if (npages + cnt.v_wire_count > vm_page_max_wired) 1096 return (EAGAIN); 1097#ifdef RACCT 1098 PROC_LOCK(proc); 1099 error = racct_set(proc, RACCT_MEMLOCK, nsize); 1100 PROC_UNLOCK(proc); 1101 if (error != 0) 1102 return (ENOMEM); 1103#endif 1104 error = vm_map_wire(map, start, end, 1105 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1106#ifdef RACCT 1107 if (error != KERN_SUCCESS) { 1108 PROC_LOCK(proc); 1109 racct_set(proc, RACCT_MEMLOCK, 1110 ptoa(pmap_wired_count(map->pmap))); 1111 PROC_UNLOCK(proc); 1112 } 1113#endif 1114 return (error == KERN_SUCCESS ? 
0 : ENOMEM); 1115} 1116 1117#ifndef _SYS_SYSPROTO_H_ 1118struct mlockall_args { 1119 int how; 1120}; 1121#endif 1122 1123/* 1124 * MPSAFE 1125 */ 1126int 1127sys_mlockall(td, uap) 1128 struct thread *td; 1129 struct mlockall_args *uap; 1130{ 1131 vm_map_t map; 1132 int error; 1133 1134 map = &td->td_proc->p_vmspace->vm_map; 1135 error = priv_check(td, PRIV_VM_MLOCK); 1136 if (error) 1137 return (error); 1138 1139 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) 1140 return (EINVAL); 1141 1142 /* 1143 * If wiring all pages in the process would cause it to exceed 1144 * a hard resource limit, return ENOMEM. 1145 */ 1146 if (!old_mlock && uap->how & MCL_CURRENT) { 1147 PROC_LOCK(td->td_proc); 1148 if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { 1149 PROC_UNLOCK(td->td_proc); 1150 return (ENOMEM); 1151 } 1152 PROC_UNLOCK(td->td_proc); 1153 } 1154#ifdef RACCT 1155 PROC_LOCK(td->td_proc); 1156 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); 1157 PROC_UNLOCK(td->td_proc); 1158 if (error != 0) 1159 return (ENOMEM); 1160#endif 1161 1162 if (uap->how & MCL_FUTURE) { 1163 vm_map_lock(map); 1164 vm_map_modflags(map, MAP_WIREFUTURE, 0); 1165 vm_map_unlock(map); 1166 error = 0; 1167 } 1168 1169 if (uap->how & MCL_CURRENT) { 1170 /* 1171 * P1003.1-2001 mandates that all currently mapped pages 1172 * will be memory resident and locked (wired) upon return 1173 * from mlockall(). vm_map_wire() will wire pages, by 1174 * calling vm_fault_wire() for each page in the region. 1175 */ 1176 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1177 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1178 error = (error == KERN_SUCCESS ? 
0 : EAGAIN); 1179 } 1180#ifdef RACCT 1181 if (error != KERN_SUCCESS) { 1182 PROC_LOCK(td->td_proc); 1183 racct_set(td->td_proc, RACCT_MEMLOCK, 1184 ptoa(pmap_wired_count(map->pmap))); 1185 PROC_UNLOCK(td->td_proc); 1186 } 1187#endif 1188 1189 return (error); 1190} 1191 1192#ifndef _SYS_SYSPROTO_H_ 1193struct munlockall_args { 1194 register_t dummy; 1195}; 1196#endif 1197 1198/* 1199 * MPSAFE 1200 */ 1201int 1202sys_munlockall(td, uap) 1203 struct thread *td; 1204 struct munlockall_args *uap; 1205{ 1206 vm_map_t map; 1207 int error; 1208 1209 map = &td->td_proc->p_vmspace->vm_map; 1210 error = priv_check(td, PRIV_VM_MUNLOCK); 1211 if (error) 1212 return (error); 1213 1214 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1215 vm_map_lock(map); 1216 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1217 vm_map_unlock(map); 1218 1219 /* Forcibly unwire all pages. */ 1220 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1221 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1222#ifdef RACCT 1223 if (error == KERN_SUCCESS) { 1224 PROC_LOCK(td->td_proc); 1225 racct_set(td->td_proc, RACCT_MEMLOCK, 0); 1226 PROC_UNLOCK(td->td_proc); 1227 } 1228#endif 1229 1230 return (error); 1231} 1232 1233#ifndef _SYS_SYSPROTO_H_ 1234struct munlock_args { 1235 const void *addr; 1236 size_t len; 1237}; 1238#endif 1239/* 1240 * MPSAFE 1241 */ 1242int 1243sys_munlock(td, uap) 1244 struct thread *td; 1245 struct munlock_args *uap; 1246{ 1247 vm_offset_t addr, end, last, start; 1248 vm_size_t size; 1249#ifdef RACCT 1250 vm_map_t map; 1251#endif 1252 int error; 1253 1254 error = priv_check(td, PRIV_VM_MUNLOCK); 1255 if (error) 1256 return (error); 1257 addr = (vm_offset_t)uap->addr; 1258 size = uap->len; 1259 last = addr + size; 1260 start = trunc_page(addr); 1261 end = round_page(last); 1262 if (last < addr || end < addr) 1263 return (EINVAL); 1264 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, 1265 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1266#ifdef RACCT 1267 if (error == 
KERN_SUCCESS) { 1268 PROC_LOCK(td->td_proc); 1269 map = &td->td_proc->p_vmspace->vm_map; 1270 racct_set(td->td_proc, RACCT_MEMLOCK, 1271 ptoa(pmap_wired_count(map->pmap))); 1272 PROC_UNLOCK(td->td_proc); 1273 } 1274#endif 1275 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1276} 1277 1278/* 1279 * vm_mmap_vnode() 1280 * 1281 * Helper function for vm_mmap. Perform sanity check specific for mmap 1282 * operations on vnodes. 1283 * 1284 * For VCHR vnodes, the vnode lock is held over the call to 1285 * vm_mmap_cdev() to keep vp->v_rdev valid. 1286 */ 1287int 1288vm_mmap_vnode(struct thread *td, vm_size_t objsize, 1289 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1290 struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, 1291 boolean_t *writecounted) 1292{ 1293 struct vattr va; 1294 vm_object_t obj; 1295 vm_offset_t foff; 1296 struct ucred *cred; 1297 int error, flags, locktype; 1298 1299 cred = td->td_ucred; 1300 if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED)) 1301 locktype = LK_EXCLUSIVE; 1302 else 1303 locktype = LK_SHARED; 1304 if ((error = vget(vp, locktype, td)) != 0) 1305 return (error); 1306 foff = *foffp; 1307 flags = *flagsp; 1308 obj = vp->v_object; 1309 if (vp->v_type == VREG) { 1310 /* 1311 * Get the proper underlying object 1312 */ 1313 if (obj == NULL) { 1314 error = EINVAL; 1315 goto done; 1316 } 1317 if (obj->type == OBJT_VNODE && obj->handle != vp) { 1318 vput(vp); 1319 vp = (struct vnode *)obj->handle; 1320 /* 1321 * Bypass filesystems obey the mpsafety of the 1322 * underlying fs. Tmpfs never bypasses. 
1323 */ 1324 error = vget(vp, locktype, td); 1325 if (error != 0) 1326 return (error); 1327 } 1328 if (locktype == LK_EXCLUSIVE) { 1329 *writecounted = TRUE; 1330 vnode_pager_update_writecount(obj, 0, objsize); 1331 } 1332 } else if (vp->v_type == VCHR) { 1333 error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp, 1334 vp->v_rdev, foffp, objp); 1335 if (error == 0) 1336 goto mark_atime; 1337 goto done; 1338 } else { 1339 error = EINVAL; 1340 goto done; 1341 } 1342 if ((error = VOP_GETATTR(vp, &va, cred))) 1343 goto done; 1344#ifdef MAC 1345 error = mac_vnode_check_mmap(cred, vp, prot, flags); 1346 if (error != 0) 1347 goto done; 1348#endif 1349 if ((flags & MAP_SHARED) != 0) { 1350 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { 1351 if (prot & PROT_WRITE) { 1352 error = EPERM; 1353 goto done; 1354 } 1355 *maxprotp &= ~VM_PROT_WRITE; 1356 } 1357 } 1358 /* 1359 * If it is a regular file without any references 1360 * we do not need to sync it. 1361 * Adjust object size to be the size of actual file. 1362 */ 1363 objsize = round_page(va.va_size); 1364 if (va.va_nlink == 0) 1365 flags |= MAP_NOSYNC; 1366 if (obj->type == OBJT_VNODE) 1367 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, 1368 cred); 1369 else { 1370 KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP, 1371 ("wrong object type")); 1372 vm_object_reference(obj); 1373 } 1374 if (obj == NULL) { 1375 error = ENOMEM; 1376 goto done; 1377 } 1378 *objp = obj; 1379 *flagsp = flags; 1380 1381mark_atime: 1382 vfs_mark_atime(vp, cred); 1383 1384done: 1385 if (error != 0 && *writecounted) { 1386 *writecounted = FALSE; 1387 vnode_pager_update_writecount(obj, objsize, 0); 1388 } 1389 vput(vp); 1390 return (error); 1391} 1392 1393/* 1394 * vm_mmap_cdev() 1395 * 1396 * MPSAFE 1397 * 1398 * Helper function for vm_mmap. Perform sanity check specific for mmap 1399 * operations on cdevs. 
1400 */ 1401int 1402vm_mmap_cdev(struct thread *td, vm_size_t objsize, 1403 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1404 struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp) 1405{ 1406 vm_object_t obj; 1407 struct cdevsw *dsw; 1408 int error, flags, ref; 1409 1410 flags = *flagsp; 1411 1412 dsw = dev_refthread(cdev, &ref); 1413 if (dsw == NULL) 1414 return (ENXIO); 1415 if (dsw->d_flags & D_MMAP_ANON) { 1416 dev_relthread(cdev, ref); 1417 *maxprotp = VM_PROT_ALL; 1418 *flagsp |= MAP_ANON; 1419 return (0); 1420 } 1421 /* 1422 * cdevs do not provide private mappings of any kind. 1423 */ 1424 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1425 (prot & PROT_WRITE) != 0) { 1426 dev_relthread(cdev, ref); 1427 return (EACCES); 1428 } 1429 if (flags & (MAP_PRIVATE|MAP_COPY)) { 1430 dev_relthread(cdev, ref); 1431 return (EINVAL); 1432 } 1433 /* 1434 * Force device mappings to be shared. 1435 */ 1436 flags |= MAP_SHARED; 1437#ifdef MAC_XXX 1438 error = mac_cdev_check_mmap(td->td_ucred, cdev, prot); 1439 if (error != 0) { 1440 dev_relthread(cdev, ref); 1441 return (error); 1442 } 1443#endif 1444 /* 1445 * First, try d_mmap_single(). If that is not implemented 1446 * (returns ENODEV), fall back to using the device pager. 1447 * Note that d_mmap_single() must return a reference to the 1448 * object (it needs to bump the reference count of the object 1449 * it returns somehow). 1450 * 1451 * XXX assumes VM_PROT_* == PROT_* 1452 */ 1453 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); 1454 dev_relthread(cdev, ref); 1455 if (error != ENODEV) 1456 return (error); 1457 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, 1458 td->td_ucred); 1459 if (obj == NULL) 1460 return (EINVAL); 1461 *objp = obj; 1462 *flagsp = flags; 1463 return (0); 1464} 1465 1466/* 1467 * vm_mmap_shm() 1468 * 1469 * MPSAFE 1470 * 1471 * Helper function for vm_mmap. Perform sanity check specific for mmap 1472 * operations on shm file descriptors. 
1473 */ 1474int 1475vm_mmap_shm(struct thread *td, vm_size_t objsize, 1476 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1477 struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp) 1478{ 1479 int error; 1480 1481 if ((*flagsp & MAP_SHARED) != 0 && 1482 (*maxprotp & VM_PROT_WRITE) == 0 && 1483 (prot & PROT_WRITE) != 0) 1484 return (EACCES); 1485#ifdef MAC 1486 error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp); 1487 if (error != 0) 1488 return (error); 1489#endif 1490 error = shm_mmap(shmfd, objsize, foff, objp); 1491 if (error) 1492 return (error); 1493 return (0); 1494} 1495 1496/* 1497 * vm_mmap() 1498 * 1499 * MPSAFE 1500 * 1501 * Internal version of mmap. Currently used by mmap, exec, and sys5 1502 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. 1503 */ 1504int 1505vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1506 vm_prot_t maxprot, int flags, 1507 objtype_t handle_type, void *handle, 1508 vm_ooffset_t foff) 1509{ 1510 boolean_t fitit; 1511 vm_object_t object = NULL; 1512 struct thread *td = curthread; 1513 int docow, error, findspace, rv; 1514 boolean_t writecounted; 1515 1516 if (size == 0) 1517 return (0); 1518 1519 size = round_page(size); 1520 1521 if (map == &td->td_proc->p_vmspace->vm_map) { 1522 PROC_LOCK(td->td_proc); 1523 if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) { 1524 PROC_UNLOCK(td->td_proc); 1525 return (ENOMEM); 1526 } 1527 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { 1528 PROC_UNLOCK(td->td_proc); 1529 return (ENOMEM); 1530 } 1531 if (!old_mlock && map->flags & MAP_WIREFUTURE) { 1532 if (ptoa(pmap_wired_count(map->pmap)) + size > 1533 lim_cur(td->td_proc, RLIMIT_MEMLOCK)) { 1534 racct_set_force(td->td_proc, RACCT_VMEM, 1535 map->size); 1536 PROC_UNLOCK(td->td_proc); 1537 return (ENOMEM); 1538 } 1539 error = racct_set(td->td_proc, RACCT_MEMLOCK, 1540 ptoa(pmap_wired_count(map->pmap)) + size); 1541 if (error != 0) { 1542 
racct_set_force(td->td_proc, RACCT_VMEM, 1543 map->size); 1544 PROC_UNLOCK(td->td_proc); 1545 return (error); 1546 } 1547 } 1548 PROC_UNLOCK(td->td_proc); 1549 } 1550 1551 /* 1552 * We currently can only deal with page aligned file offsets. 1553 * The check is here rather than in the syscall because the 1554 * kernel calls this function internally for other mmaping 1555 * operations (such as in exec) and non-aligned offsets will 1556 * cause pmap inconsistencies...so we want to be sure to 1557 * disallow this in all cases. 1558 */ 1559 if (foff & PAGE_MASK) 1560 return (EINVAL); 1561 1562 if ((flags & MAP_FIXED) == 0) { 1563 fitit = TRUE; 1564 *addr = round_page(*addr); 1565 } else { 1566 if (*addr != trunc_page(*addr)) 1567 return (EINVAL); 1568 fitit = FALSE; 1569 } 1570 writecounted = FALSE; 1571 1572 /* 1573 * Lookup/allocate object. 1574 */ 1575 switch (handle_type) { 1576 case OBJT_DEVICE: 1577 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, 1578 handle, &foff, &object); 1579 break; 1580 case OBJT_VNODE: 1581 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1582 handle, &foff, &object, &writecounted); 1583 break; 1584 case OBJT_SWAP: 1585 error = vm_mmap_shm(td, size, prot, &maxprot, &flags, 1586 handle, foff, &object); 1587 break; 1588 case OBJT_DEFAULT: 1589 if (handle == NULL) { 1590 error = 0; 1591 break; 1592 } 1593 /* FALLTHROUGH */ 1594 default: 1595 error = EINVAL; 1596 break; 1597 } 1598 if (error) 1599 return (error); 1600 if (flags & MAP_ANON) { 1601 object = NULL; 1602 docow = 0; 1603 /* 1604 * Unnamed anonymous regions always start at 0. 
1605 */ 1606 if (handle == 0) 1607 foff = 0; 1608 } else if (flags & MAP_PREFAULT_READ) 1609 docow = MAP_PREFAULT; 1610 else 1611 docow = MAP_PREFAULT_PARTIAL; 1612 1613 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1614 docow |= MAP_COPY_ON_WRITE; 1615 if (flags & MAP_NOSYNC) 1616 docow |= MAP_DISABLE_SYNCER; 1617 if (flags & MAP_NOCORE) 1618 docow |= MAP_DISABLE_COREDUMP; 1619 /* Shared memory is also shared with children. */ 1620 if (flags & MAP_SHARED) 1621 docow |= MAP_INHERIT_SHARE; 1622 if (writecounted) 1623 docow |= MAP_VN_WRITECOUNT; 1624 if (flags & MAP_STACK) { 1625 if (object != NULL) 1626 return (EINVAL); 1627 docow |= MAP_STACK_GROWS_DOWN; 1628 } 1629 if ((flags & MAP_EXCL) != 0) 1630 docow |= MAP_CHECK_EXCL; 1631 1632 if (fitit) { 1633 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) 1634 findspace = VMFS_SUPER_SPACE; 1635 else if ((flags & MAP_ALIGNMENT_MASK) != 0) 1636 findspace = VMFS_ALIGNED_SPACE(flags >> 1637 MAP_ALIGNMENT_SHIFT); 1638 else 1639 findspace = VMFS_OPTIMAL_SPACE; 1640 rv = vm_map_find(map, object, foff, addr, size, 1641#ifdef MAP_32BIT 1642 flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR : 1643#endif 1644 0, findspace, prot, maxprot, docow); 1645 } else { 1646 rv = vm_map_fixed(map, object, foff, *addr, size, 1647 prot, maxprot, docow); 1648 } 1649 1650 if (rv == KERN_SUCCESS) { 1651 /* 1652 * If the process has requested that all future mappings 1653 * be wired, then heed this. 1654 */ 1655 if (map->flags & MAP_WIREFUTURE) { 1656 vm_map_wire(map, *addr, *addr + size, 1657 VM_MAP_WIRE_USER | ((flags & MAP_STACK) ? 1658 VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES)); 1659 } 1660 } else { 1661 /* 1662 * If this mapping was accounted for in the vnode's 1663 * writecount, then undo that now. 1664 */ 1665 if (writecounted) 1666 vnode_pager_release_writecount(object, 0, size); 1667 /* 1668 * Lose the object reference. Will destroy the 1669 * object if it's an unnamed anonymous mapping 1670 * or named anonymous without other references. 
1671 */ 1672 vm_object_deallocate(object); 1673 } 1674 return (vm_mmap_to_errno(rv)); 1675} 1676 1677/* 1678 * Translate a Mach VM return code to zero on success or the appropriate errno 1679 * on failure. 1680 */ 1681int 1682vm_mmap_to_errno(int rv) 1683{ 1684 1685 switch (rv) { 1686 case KERN_SUCCESS: 1687 return (0); 1688 case KERN_INVALID_ADDRESS: 1689 case KERN_NO_SPACE: 1690 return (ENOMEM); 1691 case KERN_PROTECTION_FAILURE: 1692 return (EACCES); 1693 default: 1694 return (EINVAL); 1695 } 1696} 1697