vm_mmap.c revision 313991
/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 313991 2017-02-20 10:51:46Z kib $");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
TUNABLE_INT("vm.old_mlock", &old_mlock);

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

int
sys_sbrk(struct thread *td, struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

int
sys_sstk(struct thread *td, struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(struct thread *td, struct getpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
sys_mmap(struct thread *td, struct mmap_args *uap)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int align, error, flags;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) !=
		    (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set(&rights, CAP_MMAP_X);
		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination?  What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(struct thread *td, struct ommap_args *uap)
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__) || defined(__ia64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif /* COMPAT_43 */

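/*
 * Example (illustrative only, not compiled into the kernel): the comment
 * above sys_mmap() notes that a non-page-aligned file offset is accepted
 * for non-MAP_FIXED mappings, that the mapping actually starts at
 * trunc_page of the offset, and that the returned pointer is bumped by
 * the page offset.  A minimal userspace sketch of that behavior,
 * assuming an existing readable file "/tmp/f" at least 8192 bytes long
 * (the path and length are assumptions for the example):
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd = open("/tmp/f", O_RDONLY);
 *		if (fd == -1)
 *			return (1);
 *		// Offset 100 is not page aligned; the kernel maps from
 *		// trunc_page(100) == 0 and returns base + 100.
 *		char *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 100);
 *		if (p == MAP_FAILED)
 *			return (1);
 *		printf("%c\n", p[0]);	// the byte at file offset 100
 *		return (0);
 *	}
 */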
#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
int
sys_msync(struct thread *td, struct msync_args *uap)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
			    entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

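/*
 * Example (illustrative only): sys_mprotect() above maps
 * KERN_PROTECTION_FAILURE to EACCES, which userland hits when it asks
 * for more than the mapping's maxprot allows, e.g. requesting
 * PROT_WRITE on a MAP_SHARED mapping of a file opened O_RDONLY (see
 * the maxprot computation in sys_mmap()).  A userspace sketch, assuming
 * any readable file path:
 *
 *	#include <sys/mman.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd = open("/etc/motd", O_RDONLY);	// path assumed
 *		if (fd == -1)
 *			return (1);
 *		void *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
 *		if (p == MAP_FAILED)
 *			return (1);
 *		// maxprot lacks VM_PROT_WRITE, so this fails with EACCES.
 *		if (mprotect(p, 4096, PROT_READ | PROT_WRITE) == -1 &&
 *		    errno == EACCES)
 *			return (0);
 *		return (1);
 *	}
 */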
#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{
	vm_offset_t start, end;
	vm_map_t map;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

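/*
 * Example (illustrative only): sys_madvise() above rejects behavior
 * values outside [0, MADV_CORE], page-aligns the range conservatively,
 * and hands the advice to vm_map_madvise().  A userspace sketch:
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	int
 *	main(void)
 *	{
 *		size_t len = 1 << 20;
 *		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		    MAP_ANON | MAP_PRIVATE, -1, 0);
 *		if (p == MAP_FAILED)
 *			err(1, "mmap");
 *		// Advise sequential access; this is only a hint, so the
 *		// call succeeds even if the kernel ignores it.
 *		if (madvise(p, len, MADV_SEQUENTIAL) == -1)
 *			err(1, "madvise");
 *		return (0);
 *	}
 */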
#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current processes address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (current->next == &map->header ||
		    current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYWLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_WLOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m == NULL &&
					    vm_page_is_cached(object, pindex))
						mincoreinfo = MINCORE_INCORE;
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
}

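/*
 * Example (illustrative only): vm_mlock() below enforces both the
 * per-process RLIMIT_MEMLOCK limit (ENOMEM) and the global
 * vm_page_max_wired cap (EAGAIN), so callers should expect failure on
 * constrained systems.  A userspace sketch that wires one page, e.g. to
 * keep key material resident:
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *	#include <stdlib.h>
 *
 *	int
 *	main(void)
 *	{
 *		char *key = malloc(4096);
 *		if (key == NULL)
 *			err(1, "malloc");
 *		// Fails with ENOMEM if the request would push the
 *		// process past RLIMIT_MEMLOCK.
 *		if (mlock(key, 4096) == -1)
 *			err(1, "mlock");
 *		// ... use the wired buffer here ...
 *		munlock(key, 4096);
 *		return (0);
 *	}
 */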
int
vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
	if (error)
		return (error);
	addr = (vm_offset_t)addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

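/*
 * Example (illustrative only): per sys_mlockall() above, MCL_CURRENT
 * wires everything currently mapped, while MCL_FUTURE only sets
 * MAP_WIREFUTURE so that later mappings are wired as they are created
 * (see the MAP_WIREFUTURE handling in vm_mmap() below).  A userspace
 * sketch for a latency-sensitive process:
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	int
 *	main(void)
 *	{
 *		// Wire current pages and request wiring of future ones;
 *		// needs PRIV_VM_MLOCK and, unless vm.old_mlock is set,
 *		// enough RLIMIT_MEMLOCK headroom for the whole map.
 *		if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *			err(1, "mlockall");
 *		// ... time-critical work runs without page faults ...
 *		munlockall();
 *		return (0);
 *	}
 */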
#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

int
sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_munlock(struct thread *td, struct munlock_args *uap)
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to
 * mmap operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct ucred *cred;
	int error, flags, locktype;

	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE)
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
	else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		vm_object_reference(obj);
	}
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vnode_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to
 * mmap operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev, ref);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

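/*
 * Example (illustrative only): a driver opts in to the d_mmap_single()
 * path tried first in vm_mmap_cdev() above by setting the method in its
 * cdevsw; returning ENODEV falls back to the OBJT_DEVICE pager.  A
 * minimal sketch for a hypothetical driver "foo" exporting a
 * preallocated VM object foo_obj of foo_obj_size bytes (all of these
 * names are assumptions, not part of this file):
 *
 *	static int
 *	foo_mmap_single(struct cdev *cdev, vm_ooffset_t *offset,
 *	    vm_size_t size, struct vm_object **object, int nprot)
 *	{
 *		if (*offset + size > foo_obj_size)
 *			return (EINVAL);
 *		// Hand back a referenced object, as required by the
 *		// comment above the d_mmap_single() call.
 *		vm_object_reference(foo_obj);
 *		*object = foo_obj;
 *		return (0);
 *	}
 *
 *	static struct cdevsw foo_cdevsw = {
 *		.d_version =	D_VERSION,
 *		.d_name =	"foo",
 *		.d_mmap_single = foo_mmap_single,
 *	};
 */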
/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to
 * mmap operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*flagsp & MAP_SHARED) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

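/*
 * Example (illustrative only): the DTYPE_SHM/OBJT_SWAP path in
 * sys_mmap() that lands in vm_mmap_shm() above is what a POSIX shared
 * memory mapping goes through.  A userspace sketch:
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd = shm_open(SHM_ANON, O_RDWR, 0600);
 *		if (fd == -1)
 *			err(1, "shm_open");
 *		if (ftruncate(fd, 4096) == -1)
 *			err(1, "ftruncate");
 *		// MAP_SHARED with PROT_WRITE requires a writable fd;
 *		// vm_mmap_shm() returns EACCES otherwise.
 *		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED, fd, 0);
 *		if (p == MAP_FAILED)
 *			err(1, "mmap");
 *		p[0] = 1;
 *		return (0);
 *	}
 */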
/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer, a shmfd pointer,
 * a cdev pointer (per handle_type), or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	struct thread *td = curthread;
	int docow, error, findspace, rv;
	boolean_t writecounted;

	if (size == 0)
		return (0);

	size = round_page(size);

	if (map == &td->td_proc->p_vmspace->vm_map) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == NULL)
			foff = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		rv = vm_map_find(map, object, foff, addr, size,
#ifdef MAP_32BIT
		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
#endif
		    0, findspace, prot, maxprot, docow);
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	} else {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
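
/*
 * Example (illustrative only): other kernel code either pairs a raw
 * vm_map operation with vm_mmap_to_errno() to produce a userland errno,
 * or calls vm_mmap() directly, as exec and System V shared memory do.
 * A hedged sketch of a kernel-initiated anonymous mapping into the
 * current process (the helper name is an assumption):
 *
 *	static int
 *	foo_map_scratch(vm_offset_t *addrp)
 *	{
 *		vm_map_t map = &curproc->p_vmspace->vm_map;
 *
 *		*addrp = 0;
 *		// A NULL handle with OBJT_DEFAULT and MAP_ANON yields an
 *		// anonymous mapping; the Mach status is already
 *		// translated by vm_mmap_to_errno() inside vm_mmap().
 *		return (vm_mmap(map, addrp, PAGE_SIZE,
 *		    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL,
 *		    MAP_ANON, OBJT_DEFAULT, NULL, 0));
 *	}
 */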