vm_map.c revision 330897
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * Virtual memory mapping module.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/vm/vm_map.c 330897 2018-03-14 03:19:51Z eadler $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/file.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/shm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vnode_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>

/*
 * Virtual memory maps provide for the mapping, protection,
 * and sharing of virtual memory objects.  In addition,
 * this module provides for an efficient virtual copy of
 * memory from one map to another.
 *
 * Synchronization is required prior to most operations.
 *
 * Maps consist of an ordered doubly-linked list of simple
 * entries; a self-adjusting binary search tree of these
 * entries is used to speed up lookups.
 *
 * Since portions of maps are specified by start/end addresses,
 * which may not align with existing map entries, all
 * routines merely "clip" entries to these start/end values.
 * [That is, an entry is split into two, bordering at a
 * start or end value.]  Note that these clippings may not
 * always be necessary (as the two resulting entries are then
 * not changed); however, the clipping is done for convenience.
 *
 * As mentioned above, virtual copy operations are performed
 * by copying VM object references from one map to
 * another, and then marking both regions as copy-on-write.
 */

static struct mtx map_sleep_mtx;
static uma_zone_t mapentzone;
static uma_zone_t kmapentzone;
static uma_zone_t mapzone;
static uma_zone_t vmspace_zone;
static int vmspace_zinit(void *mem, int size, int flags);
static int vm_map_zinit(void *mem, int size, int flags);
static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
    vm_offset_t max);
static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
    vm_map_entry_t gap_entry);
static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
#ifdef INVARIANTS
static void vm_map_zdtor(void *mem, int size, void *arg);
static void vmspace_zdtor(void *mem, int size, void *arg);
#endif
static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
    vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
    int cow);
static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
    vm_offset_t failed_addr);

#define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
    ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
    !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))

/*
 * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
 * stable.
 */
#define	PROC_VMSPACE_LOCK(p) do { } while (0)
#define	PROC_VMSPACE_UNLOCK(p) do { } while (0)
/*
 * VM_MAP_RANGE_CHECK:	[ internal use only ]
 *
 * Asserts that the starting and ending region
 * addresses fall within the valid range of the map.
 */
#define	VM_MAP_RANGE_CHECK(map, start, end)		\
	{						\
		if (start < vm_map_min(map))		\
			start = vm_map_min(map);	\
		if (end > vm_map_max(map))		\
			end = vm_map_max(map);		\
		if (start > end)			\
			start = end;			\
	}
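/*
 * Illustrative sketch (not part of the original file): how a range-based
 * operation typically applies VM_MAP_RANGE_CHECK before walking entries.
 * The function name example_range_op() is hypothetical.
 */
#if 0
static int
example_range_op(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

	vm_map_lock(map);
	/* Clamp [start, end) to the map's bounds; may leave start == end. */
	VM_MAP_RANGE_CHECK(map, start, end);
	/* ... operate on the clipped range under the write lock ... */
	vm_map_unlock(map);
	return (KERN_SUCCESS);
}
#endif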
/*
 * vm_map_startup:
 *
 * Initialize the vm_map module.  Must be called before
 * any other vm_map routines.
 *
 * Map and entry structures are allocated from the general
 * purpose memory pool with some exceptions:
 *
 *	- The kernel map and kmem submap are allocated statically.
 *	- Kernel map entries are allocated out of a static pool.
 *
 * These restrictions are necessary since malloc() uses the
 * maps and requires map entries.
 */

void
vm_map_startup(void)
{
	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
	mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
#ifdef INVARIANTS
	    vm_map_zdtor,
#else
	    NULL,
#endif
	    vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_prealloc(mapzone, MAX_KMAP);
	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
#ifdef INVARIANTS
	    vmspace_zdtor,
#else
	    NULL,
#endif
	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}

static int
vmspace_zinit(void *mem, int size, int flags)
{
	struct vmspace *vm;

	vm = (struct vmspace *)mem;

	vm->vm_map.pmap = NULL;
	(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
	PMAP_LOCK_INIT(vmspace_pmap(vm));
	return (0);
}

static int
vm_map_zinit(void *mem, int size, int flags)
{
	vm_map_t map;

	map = (vm_map_t)mem;
	memset(map, 0, sizeof(*map));
	mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
	sx_init(&map->lock, "vm map (user)");
	return (0);
}

#ifdef INVARIANTS
static void
vmspace_zdtor(void *mem, int size, void *arg)
{
	struct vmspace *vm;

	vm = (struct vmspace *)mem;

	vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
}
static void
vm_map_zdtor(void *mem, int size, void *arg)
{
	vm_map_t map;

	map = (vm_map_t)mem;
	KASSERT(map->nentries == 0,
	    ("map %p nentries == %d on free.",
	    map, map->nentries));
	KASSERT(map->size == 0,
	    ("map %p size == %lu on free.",
	    map, (unsigned long)map->size));
}
#endif	/* INVARIANTS */

/*
 * Allocate a vmspace structure, including a vm_map and pmap,
 * and initialize those structures.  The refcnt is set to 1.
 *
 * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
 */
struct vmspace *
vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
{
	struct vmspace *vm;

	vm = uma_zalloc(vmspace_zone, M_WAITOK);

	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));

	if (pinit == NULL)
		pinit = &pmap_pinit;

	if (!pinit(vmspace_pmap(vm))) {
		uma_zfree(vmspace_zone, vm);
		return (NULL);
	}
	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
	vm->vm_refcnt = 1;
	vm->vm_shm = NULL;
	vm->vm_swrss = 0;
	vm->vm_tsize = 0;
	vm->vm_dsize = 0;
	vm->vm_ssize = 0;
	vm->vm_taddr = 0;
	vm->vm_daddr = 0;
	vm->vm_maxsaddr = 0;
	return (vm);
}
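/*
 * Illustrative sketch (not part of the original file): creating and
 * releasing a vmspace with the default pmap initialization.  The bounds
 * VM_MIN_ADDRESS and VM_MAXUSER_ADDRESS are the usual user-map limits
 * from vmparam.h; treat this as a hedged usage sketch, not a prescribed
 * call sequence.
 */
#if 0
	struct vmspace *vm;

	vm = vmspace_alloc(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, NULL);
	if (vm == NULL)
		return (ENOMEM);	/* pmap_pinit() failed */
	/* ... use vm->vm_map ... */
	vmspace_free(vm);		/* drops the refcnt set to 1 above */
#endif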
#ifdef RACCT
static void
vmspace_container_reset(struct proc *p)
{

	PROC_LOCK(p);
	racct_set(p, RACCT_DATA, 0);
	racct_set(p, RACCT_STACK, 0);
	racct_set(p, RACCT_RSS, 0);
	racct_set(p, RACCT_MEMLOCK, 0);
	racct_set(p, RACCT_VMEM, 0);
	PROC_UNLOCK(p);
}
#endif

static inline void
vmspace_dofree(struct vmspace *vm)
{

	CTR1(KTR_VM, "vmspace_free: %p", vm);

	/*
	 * Make sure any SysV shm is freed, it might not have been in
	 * exit1().
	 */
	shmexit(vm);

	/*
	 * Lock the map, to wait out all other references to it.
	 * Delete all of the mappings and pages they hold, then call
	 * the pmap module to reclaim anything left.
	 */
	(void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
	    vm->vm_map.max_offset);

	pmap_release(vmspace_pmap(vm));
	vm->vm_map.pmap = NULL;
	uma_zfree(vmspace_zone, vm);
}

void
vmspace_free(struct vmspace *vm)
{

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "vmspace_free() called");

	if (vm->vm_refcnt == 0)
		panic("vmspace_free: attempt to free already freed vmspace");

	if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
		vmspace_dofree(vm);
}

void
vmspace_exitfree(struct proc *p)
{
	struct vmspace *vm;

	PROC_VMSPACE_LOCK(p);
	vm = p->p_vmspace;
	p->p_vmspace = NULL;
	PROC_VMSPACE_UNLOCK(p);
	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
	vmspace_free(vm);
}

void
vmspace_exit(struct thread *td)
{
	int refcnt;
	struct vmspace *vm;
	struct proc *p;

	/*
	 * Release user portion of address space.
	 * This releases references to vnodes,
	 * which could cause I/O if the file has been unlinked.
	 * Need to do this early enough that we can still sleep.
	 *
	 * The last exiting process to reach this point releases as
	 * much of the environment as it can.  vmspace_dofree() is the
	 * slower fallback in case another process had a temporary
	 * reference to the vmspace.
	 */

	p = td->td_proc;
	vm = p->p_vmspace;
	atomic_add_int(&vmspace0.vm_refcnt, 1);
	do {
		refcnt = vm->vm_refcnt;
		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
			/* Switch now since other proc might free vmspace */
			PROC_VMSPACE_LOCK(p);
			p->p_vmspace = &vmspace0;
			PROC_VMSPACE_UNLOCK(p);
			pmap_activate(td);
		}
	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
	if (refcnt == 1) {
		if (p->p_vmspace != vm) {
			/* vmspace not yet freed, switch back */
			PROC_VMSPACE_LOCK(p);
			p->p_vmspace = vm;
			PROC_VMSPACE_UNLOCK(p);
			pmap_activate(td);
		}
		pmap_remove_pages(vmspace_pmap(vm));
		/* Switch now since this proc will free vmspace */
		PROC_VMSPACE_LOCK(p);
		p->p_vmspace = &vmspace0;
		PROC_VMSPACE_UNLOCK(p);
		pmap_activate(td);
		vmspace_dofree(vm);
	}
#ifdef RACCT
	if (racct_enable)
		vmspace_container_reset(p);
#endif
}

/* Acquire reference to vmspace owned by another process. */

struct vmspace *
vmspace_acquire_ref(struct proc *p)
{
	struct vmspace *vm;
	int refcnt;

	PROC_VMSPACE_LOCK(p);
	vm = p->p_vmspace;
	if (vm == NULL) {
		PROC_VMSPACE_UNLOCK(p);
		return (NULL);
	}
	do {
		refcnt = vm->vm_refcnt;
		if (refcnt <= 0) {	/* Avoid 0->1 transition */
			PROC_VMSPACE_UNLOCK(p);
			return (NULL);
		}
	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1));
	if (vm != p->p_vmspace) {
		PROC_VMSPACE_UNLOCK(p);
		vmspace_free(vm);
		return (NULL);
	}
	PROC_VMSPACE_UNLOCK(p);
	return (vm);
}
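/*
 * Illustrative sketch (not part of the original file): the usual pattern
 * for examining another process's address space.  vmspace_acquire_ref()
 * only succeeds while the reference count is already non-zero, so the
 * caller can never resurrect a vmspace that is being torn down.
 */
#if 0
	struct vmspace *vm;

	vm = vmspace_acquire_ref(p);
	if (vm == NULL)
		return (ESRCH);		/* process has no vmspace (exiting) */
	/* ... inspect vm->vm_map under its own lock ... */
	vmspace_free(vm);
#endif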
/*
 * Switch between vmspaces in an AIO kernel process.
 *
 * The AIO kernel processes switch to and from a user process's
 * vmspace while performing an I/O operation on behalf of a user
 * process.  The new vmspace is either the vmspace of a user process
 * obtained from an active AIO request or the initial vmspace of the
 * AIO kernel process (when it is idling).  Because user processes
 * will block to drain any active AIO requests before proceeding in
 * exit() or execve(), the vmspace reference count for these vmspaces
 * can never be 0.  This allows for a much simpler implementation than
 * the loop in vmspace_acquire_ref() above.  Similarly, AIO kernel
 * processes hold an extra reference on their initial vmspace for the
 * life of the process so that this guarantee is true for any vmspace
 * passed as 'newvm'.
 */
void
vmspace_switch_aio(struct vmspace *newvm)
{
	struct vmspace *oldvm;

	/* XXX: Need some way to assert that this is an aio daemon. */

	KASSERT(newvm->vm_refcnt > 0,
	    ("vmspace_switch_aio: newvm unreferenced"));

	oldvm = curproc->p_vmspace;
	if (oldvm == newvm)
		return;

	/*
	 * Point to the new address space and refer to it.
	 */
	curproc->p_vmspace = newvm;
	atomic_add_int(&newvm->vm_refcnt, 1);

	/* Activate the new mapping. */
	pmap_activate(curthread);

	/* Remove the daemon's reference to the old address space. */
	KASSERT(oldvm->vm_refcnt > 1,
	    ("vmspace_switch_aio: oldvm dropping last reference"));
	vmspace_free(oldvm);
}

void
_vm_map_lock(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_lock_flags_(&map->system_mtx, 0, file, line);
	else
		sx_xlock_(&map->lock, file, line);
	map->timestamp++;
}

static void
vm_map_process_deferred(void)
{
	struct thread *td;
	vm_map_entry_t entry, next;
	vm_object_t object;

	td = curthread;
	entry = td->td_map_def_user;
	td->td_map_def_user = NULL;
	while (entry != NULL) {
		next = entry->next;
		if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
			/*
			 * Decrement the object's writemappings and
			 * possibly the vnode's v_writecount.
			 */
			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
			    ("Submap with writecount"));
			object = entry->object.vm_object;
			KASSERT(object != NULL, ("No object for writecount"));
			vnode_pager_release_writecount(object, entry->start,
			    entry->end);
		}
		vm_map_entry_deallocate(entry, FALSE);
		entry = next;
	}
}

void
_vm_map_unlock(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
	else {
		sx_xunlock_(&map->lock, file, line);
		vm_map_process_deferred();
	}
}

void
_vm_map_lock_read(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_lock_flags_(&map->system_mtx, 0, file, line);
	else
		sx_slock_(&map->lock, file, line);
}

void
_vm_map_unlock_read(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
	else {
		sx_sunlock_(&map->lock, file, line);
		vm_map_process_deferred();
	}
}

int
_vm_map_trylock(vm_map_t map, const char *file, int line)
{
	int error;

	error = map->system_map ?
	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
	    !sx_try_xlock_(&map->lock, file, line);
	if (error == 0)
		map->timestamp++;
	return (error == 0);
}

int
_vm_map_trylock_read(vm_map_t map, const char *file, int line)
{
	int error;

	error = map->system_map ?
	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
	    !sx_try_slock_(&map->lock, file, line);
	return (error == 0);
}

/*
 * _vm_map_lock_upgrade:	[ internal use only ]
 *
 * Tries to upgrade a read (shared) lock on the specified map to a write
 * (exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
 * non-zero value if the upgrade fails.  If the upgrade fails, the map is
 * returned without a read or write lock held.
 *
 * Requires that the map be read locked.
 */
int
_vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
{
	unsigned int last_timestamp;

	if (map->system_map) {
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	} else {
		if (!sx_try_upgrade_(&map->lock, file, line)) {
			last_timestamp = map->timestamp;
			sx_sunlock_(&map->lock, file, line);
			vm_map_process_deferred();
			/*
			 * If the map's timestamp does not change while the
			 * map is unlocked, then the upgrade succeeds.
			 */
			sx_xlock_(&map->lock, file, line);
			if (last_timestamp != map->timestamp) {
				sx_xunlock_(&map->lock, file, line);
				return (1);
			}
		}
	}
	map->timestamp++;
	return (0);
}
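/*
 * Illustrative sketch (not part of the original file): callers of the
 * vm_map_lock_upgrade() wrapper must be prepared to retry, because a
 * failed upgrade drops the lock entirely and the map may have changed.
 */
#if 0
retry:
	vm_map_lock_read(map);
	/* ... read-only inspection decides a modification is needed ... */
	if (vm_map_lock_upgrade(map) != 0) {
		/* Lock was lost and the map may have changed; start over. */
		goto retry;
	}
	/* ... modify the map with the write lock held ... */
	vm_map_unlock(map);
#endif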
void
_vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
{

	if (map->system_map) {
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	} else
		sx_downgrade_(&map->lock, file, line);
}

/*
 * vm_map_locked:
 *
 * Returns a non-zero value if the caller holds a write (exclusive) lock
 * on the specified map and the value "0" otherwise.
 */
int
vm_map_locked(vm_map_t map)
{

	if (map->system_map)
		return (mtx_owned(&map->system_mtx));
	else
		return (sx_xlocked(&map->lock));
}

#ifdef INVARIANTS
static void
_vm_map_assert_locked(vm_map_t map, const char *file, int line)
{

	if (map->system_map)
		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
	else
		sx_assert_(&map->lock, SA_XLOCKED, file, line);
}

#define	VM_MAP_ASSERT_LOCKED(map) \
    _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
#else
#define	VM_MAP_ASSERT_LOCKED(map)
#endif

/*
 * _vm_map_unlock_and_wait:
 *
 * Atomically releases the lock on the specified map and puts the calling
 * thread to sleep.  The calling thread will remain asleep until either
 * vm_map_wakeup() is performed on the map or the specified timeout is
 * exceeded.
 *
 * WARNING!  This function does not perform deferred deallocations of
 * objects and map entries.  Therefore, the calling thread is expected to
 * reacquire the map lock after reawakening and later perform an ordinary
 * unlock operation, such as vm_map_unlock(), before completing its
 * operation on the map.
 */
int
_vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
{

	mtx_lock(&map_sleep_mtx);
	if (map->system_map)
		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
	else
		sx_xunlock_(&map->lock, file, line);
	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
	    timo));
}

/*
 * vm_map_wakeup:
 *
 * Awaken any threads that have slept on the map using
 * vm_map_unlock_and_wait().
 */
void
vm_map_wakeup(vm_map_t map)
{

	/*
	 * Acquire and release map_sleep_mtx to prevent a wakeup()
	 * from being performed (and lost) between the map unlock
	 * and the msleep() in _vm_map_unlock_and_wait().
	 */
	mtx_lock(&map_sleep_mtx);
	mtx_unlock(&map_sleep_mtx);
	wakeup(&map->root);
}
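/*
 * Illustrative sketch (not part of the original file): the sleep side
 * pairs vm_map_unlock_and_wait() with a later vm_map_wakeup(), using the
 * map's needs_wakeup flag, and must retake the map lock itself after
 * waking, per the WARNING above.
 */
#if 0
	/* Waiter, with the map write locked: */
	map->needs_wakeup = TRUE;
	(void)vm_map_unlock_and_wait(map, 0);
	vm_map_lock(map);	/* reacquire; deferred frees run on unlock */

	/* Waker, with the map write locked: */
	if (map->needs_wakeup) {
		map->needs_wakeup = FALSE;
		vm_map_wakeup(map);
	}
#endif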
void
vm_map_busy(vm_map_t map)
{

	VM_MAP_ASSERT_LOCKED(map);
	map->busy++;
}

void
vm_map_unbusy(vm_map_t map)
{

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(map->busy, ("vm_map_unbusy: not busy"));
	if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
		vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
		wakeup(&map->busy);
	}
}

void
vm_map_wait_busy(vm_map_t map)
{

	VM_MAP_ASSERT_LOCKED(map);
	while (map->busy) {
		vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
		if (map->system_map)
			msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
		else
			sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
	}
	map->timestamp++;
}

long
vmspace_resident_count(struct vmspace *vmspace)
{
	return pmap_resident_count(vmspace_pmap(vmspace));
}

/*
 * vm_map_create:
 *
 * Creates and returns a new empty VM map with
 * the given physical map structure, and having
 * the given lower and upper address bounds.
 */
vm_map_t
vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
{
	vm_map_t result;

	result = uma_zalloc(mapzone, M_WAITOK);
	CTR1(KTR_VM, "vm_map_create: %p", result);
	_vm_map_init(result, pmap, min, max);
	return (result);
}

/*
 * Initialize an existing vm_map structure
 * such as that in the vmspace structure.
 */
static void
_vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
{

	map->header.next = map->header.prev = &map->header;
	map->needs_wakeup = FALSE;
	map->system_map = 0;
	map->pmap = pmap;
	map->min_offset = min;
	map->max_offset = max;
	map->flags = 0;
	map->root = NULL;
	map->timestamp = 0;
	map->busy = 0;
}

void
vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
{

	_vm_map_init(map, pmap, min, max);
	mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
	sx_init(&map->lock, "user map");
}

/*
 * vm_map_entry_dispose:	[ internal use only ]
 *
 * Inverse of vm_map_entry_create.
 */
static void
vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
{
	uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
}

/*
 * vm_map_entry_create:	[ internal use only ]
 *
 * Allocates a VM map entry for insertion.
 * No entry fields are filled in.
 */
static vm_map_entry_t
vm_map_entry_create(vm_map_t map)
{
	vm_map_entry_t new_entry;

	if (map->system_map)
		new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
	else
		new_entry = uma_zalloc(mapentzone, M_WAITOK);
	if (new_entry == NULL)
		panic("vm_map_entry_create: kernel resources exhausted");
	return (new_entry);
}

/*
 * vm_map_entry_set_behavior:
 *
 * Set the expected access behavior, either normal, random, or
 * sequential.
 */
static inline void
vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
{
	entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
	    (behavior & MAP_ENTRY_BEHAV_MASK);
}

/*
 * vm_map_entry_set_max_free:
 *
 * Set the max_free field in a vm_map_entry.
 */
static inline void
vm_map_entry_set_max_free(vm_map_entry_t entry)
{

	entry->max_free = entry->adj_free;
	if (entry->left != NULL && entry->left->max_free > entry->max_free)
		entry->max_free = entry->left->max_free;
	if (entry->right != NULL && entry->right->max_free > entry->max_free)
		entry->max_free = entry->right->max_free;
}
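/*
 * Editorial note on the invariant maintained above (not part of the
 * original file): after vm_map_entry_set_max_free() runs, every node
 * satisfies
 *
 *	entry->max_free == max(entry->adj_free,
 *	    entry->left->max_free, entry->right->max_free)
 *
 * with NULL children contributing nothing.  This is what lets
 * vm_map_findspace() below prune entire subtrees in one comparison.
 */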
/*
 * vm_map_entry_splay:
 *
 * The Sleator and Tarjan top-down splay algorithm with the
 * following variation.  Max_free must be computed bottom-up, so
 * on the downward pass, maintain the left and right spines in
 * reverse order.  Then, make a second pass up each side to fix
 * the pointers and compute max_free.  The time bound is O(log n)
 * amortized.
 *
 * The new root is the vm_map_entry containing "addr", or else an
 * adjacent entry (lower or higher) if addr is not in the tree.
 *
 * The map must be locked, and leaves it so.
 *
 * Returns: the new root.
 */
static vm_map_entry_t
vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
{
	vm_map_entry_t llist, rlist;
	vm_map_entry_t ltree, rtree;
	vm_map_entry_t y;

	/* Special case of empty tree. */
	if (root == NULL)
		return (root);

	/*
	 * Pass One: Splay down the tree until we find addr or a NULL
	 * pointer where addr would go.  llist and rlist are the two
	 * sides in reverse order (bottom-up), with llist linked by
	 * the right pointer and rlist linked by the left pointer in
	 * the vm_map_entry.  Wait until Pass Two to set max_free on
	 * the two spines.
	 */
	llist = NULL;
	rlist = NULL;
	for (;;) {
		/* root is never NULL in here. */
		if (addr < root->start) {
			y = root->left;
			if (y == NULL)
				break;
			if (addr < y->start && y->left != NULL) {
				/* Rotate right and put y on rlist. */
				root->left = y->right;
				y->right = root;
				vm_map_entry_set_max_free(root);
				root = y->left;
				y->left = rlist;
				rlist = y;
			} else {
				/* Put root on rlist. */
				root->left = rlist;
				rlist = root;
				root = y;
			}
		} else if (addr >= root->end) {
			y = root->right;
			if (y == NULL)
				break;
			if (addr >= y->end && y->right != NULL) {
				/* Rotate left and put y on llist. */
				root->right = y->left;
				y->left = root;
				vm_map_entry_set_max_free(root);
				root = y->right;
				y->right = llist;
				llist = y;
			} else {
				/* Put root on llist. */
				root->right = llist;
				llist = root;
				root = y;
			}
		} else
			break;
	}

	/*
	 * Pass Two: Walk back up the two spines, flip the pointers
	 * and set max_free.  The subtrees of the root go at the
	 * bottom of llist and rlist.
	 */
	ltree = root->left;
	while (llist != NULL) {
		y = llist->right;
		llist->right = ltree;
		vm_map_entry_set_max_free(llist);
		ltree = llist;
		llist = y;
	}
	rtree = root->right;
	while (rlist != NULL) {
		y = rlist->left;
		rlist->left = rtree;
		vm_map_entry_set_max_free(rlist);
		rtree = rlist;
		rlist = y;
	}

	/*
	 * Final assembly: add ltree and rtree as subtrees of root.
	 */
	root->left = ltree;
	root->right = rtree;
	vm_map_entry_set_max_free(root);

	return (root);
}

/*
 * vm_map_entry_{un,}link:
 *
 * Insert/remove entries from maps.
 */
static void
vm_map_entry_link(vm_map_t map,
		  vm_map_entry_t after_where,
		  vm_map_entry_t entry)
{

	CTR4(KTR_VM,
	    "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
	    map->nentries, entry, after_where);
	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(after_where->end <= entry->start,
	    ("vm_map_entry_link: prev end %jx new start %jx overlap",
	    (uintmax_t)after_where->end, (uintmax_t)entry->start));
	KASSERT(entry->end <= after_where->next->start,
	    ("vm_map_entry_link: new end %jx next start %jx overlap",
	    (uintmax_t)entry->end, (uintmax_t)after_where->next->start));

	map->nentries++;
	entry->prev = after_where;
	entry->next = after_where->next;
	entry->next->prev = entry;
	after_where->next = entry;

	if (after_where != &map->header) {
		if (after_where != map->root)
			vm_map_entry_splay(after_where->start, map->root);
		entry->right = after_where->right;
		entry->left = after_where;
		after_where->right = NULL;
		after_where->adj_free = entry->start - after_where->end;
		vm_map_entry_set_max_free(after_where);
	} else {
		entry->right = map->root;
		entry->left = NULL;
	}
	entry->adj_free = entry->next->start - entry->end;
	vm_map_entry_set_max_free(entry);
	map->root = entry;
}

static void
vm_map_entry_unlink(vm_map_t map,
		    vm_map_entry_t entry)
{
	vm_map_entry_t next, prev, root;

	VM_MAP_ASSERT_LOCKED(map);
	if (entry != map->root)
		vm_map_entry_splay(entry->start, map->root);
	if (entry->left == NULL)
		root = entry->right;
	else {
		root = vm_map_entry_splay(entry->start, entry->left);
		root->right = entry->right;
		root->adj_free = entry->next->start - root->end;
		vm_map_entry_set_max_free(root);
	}
	map->root = root;

	prev = entry->prev;
	next = entry->next;
	next->prev = prev;
	prev->next = next;
	map->nentries--;
	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
	    map->nentries, entry);
}

/*
 * vm_map_entry_resize_free:
 *
 * Recompute the amount of free space following a vm_map_entry
 * and propagate that value up the tree.  Call this function after
 * resizing a map entry in-place, that is, without a call to
 * vm_map_entry_link() or _unlink().
 *
 * The map must be locked, and leaves it so.
 */
static void
vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
{

	/*
	 * Using splay trees without parent pointers, propagating
	 * max_free up the tree is done by moving the entry to the
	 * root and making the change there.
	 */
	if (entry != map->root)
		map->root = vm_map_entry_splay(entry->start, map->root);

	entry->adj_free = entry->next->start - entry->end;
	vm_map_entry_set_max_free(entry);
}
/*
 * vm_map_lookup_entry:	[ internal use only ]
 *
 * Finds the map entry containing (or
 * immediately preceding) the specified address
 * in the given map; the entry is returned
 * in the "entry" parameter.  The boolean
 * result indicates whether the address is
 * actually contained in the map.
 */
boolean_t
vm_map_lookup_entry(
	vm_map_t map,
	vm_offset_t address,
	vm_map_entry_t *entry)	/* OUT */
{
	vm_map_entry_t cur;
	boolean_t locked;

	/*
	 * If the map is empty, then the map entry immediately preceding
	 * "address" is the map's header.
	 */
	cur = map->root;
	if (cur == NULL)
		*entry = &map->header;
	else if (address >= cur->start && cur->end > address) {
		*entry = cur;
		return (TRUE);
	} else if ((locked = vm_map_locked(map)) ||
	    sx_try_upgrade(&map->lock)) {
		/*
		 * Splay requires a write lock on the map.  However, it only
		 * restructures the binary search tree; it does not otherwise
		 * change the map.  Thus, the map's timestamp need not change
		 * on a temporary upgrade.
		 */
		map->root = cur = vm_map_entry_splay(address, cur);
		if (!locked)
			sx_downgrade(&map->lock);

		/*
		 * If "address" is contained within a map entry, the new root
		 * is that map entry.  Otherwise, the new root is a map entry
		 * immediately before or after "address".
		 */
		if (address >= cur->start) {
			*entry = cur;
			if (cur->end > address)
				return (TRUE);
		} else
			*entry = cur->prev;
	} else
		/*
		 * Since the map is only locked for read access, perform a
		 * standard binary search tree lookup for "address".
		 */
		for (;;) {
			if (address < cur->start) {
				if (cur->left == NULL) {
					*entry = cur->prev;
					break;
				}
				cur = cur->left;
			} else if (cur->end > address) {
				*entry = cur;
				return (TRUE);
			} else {
				if (cur->right == NULL) {
					*entry = cur;
					break;
				}
				cur = cur->right;
			}
		}
	return (FALSE);
}
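/*
 * Illustrative sketch (not part of the original file): what the two
 * return cases of vm_map_lookup_entry() mean to a caller.
 */
#if 0
	vm_map_entry_t entry;

	if (vm_map_lookup_entry(map, addr, &entry)) {
		/* addr is inside *entry: entry->start <= addr < entry->end. */
	} else {
		/* entry is the closest preceding entry (or &map->header). */
	}
#endif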
/*
 * vm_map_insert:
 *
 * Inserts the given whole VM object into the target
 * map at the specified address range.  The object's
 * size should match that of the address range.
 *
 * Requires that the map be locked, and leaves it so.
 *
 * If object is non-NULL, ref count must be bumped by caller
 * prior to making call to account for the new entry.
 */
int
vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_map_entry_t new_entry, prev_entry, temp_entry;
	struct ucred *cred;
	vm_eflags_t protoeflags;
	vm_inherit_t inheritance;

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT((object != kmem_object && object != kernel_object) ||
	    (cow & MAP_COPY_ON_WRITE) == 0,
	    ("vm_map_insert: kmem or kernel object and COW"));
	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
	    ("vm_map_insert: paradoxical MAP_NOFAULT request"));
	KASSERT((prot & ~max) == 0,
	    ("prot %#x is not subset of max_prot %#x", prot, max));

	/*
	 * Check that the start and end points are not bogus.
	 */
	if (start < map->min_offset || end > map->max_offset || start >= end)
		return (KERN_INVALID_ADDRESS);

	/*
	 * Find the entry prior to the proposed starting address; if it's part
	 * of an existing entry, this range is bogus.
	 */
	if (vm_map_lookup_entry(map, start, &temp_entry))
		return (KERN_NO_SPACE);

	prev_entry = temp_entry;

	/*
	 * Assert that the next entry doesn't overlap the end point.
	 */
	if (prev_entry->next->start < end)
		return (KERN_NO_SPACE);

	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
	    max != VM_PROT_NONE))
		return (KERN_INVALID_ARGUMENT);

	protoeflags = 0;
	if (cow & MAP_COPY_ON_WRITE)
		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
	if (cow & MAP_NOFAULT)
		protoeflags |= MAP_ENTRY_NOFAULT;
	if (cow & MAP_DISABLE_SYNCER)
		protoeflags |= MAP_ENTRY_NOSYNC;
	if (cow & MAP_DISABLE_COREDUMP)
		protoeflags |= MAP_ENTRY_NOCOREDUMP;
	if (cow & MAP_STACK_GROWS_DOWN)
		protoeflags |= MAP_ENTRY_GROWS_DOWN;
	if (cow & MAP_STACK_GROWS_UP)
		protoeflags |= MAP_ENTRY_GROWS_UP;
	if (cow & MAP_VN_WRITECOUNT)
		protoeflags |= MAP_ENTRY_VN_WRITECNT;
	if ((cow & MAP_CREATE_GUARD) != 0)
		protoeflags |= MAP_ENTRY_GUARD;
	if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
		protoeflags |= MAP_ENTRY_STACK_GAP_DN;
	if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
		protoeflags |= MAP_ENTRY_STACK_GAP_UP;
	if (cow & MAP_INHERIT_SHARE)
		inheritance = VM_INHERIT_SHARE;
	else
		inheritance = VM_INHERIT_DEFAULT;

	cred = NULL;
	if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
		goto charged;
	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
			return (KERN_RESOURCE_SHORTAGE);
		KASSERT(object == NULL ||
		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
		    object->cred == NULL,
		    ("overcommit: vm_map_insert o %p", object));
		cred = curthread->td_ucred;
	}

charged:
	/* Expand the kernel pmap, if necessary. */
	if (map == kernel_map && end > kernel_vm_end)
		pmap_growkernel(end);
	if (object != NULL) {
		/*
		 * OBJ_ONEMAPPING must be cleared unless this mapping
		 * is trivially proven to be the only mapping for any
		 * of the object's pages.  (Object granularity
		 * reference counting is insufficient to recognize
		 * aliases with precision.)
		 */
		VM_OBJECT_WLOCK(object);
		if (object->ref_count > 1 || object->shadow_count != 0)
			vm_object_clear_flag(object, OBJ_ONEMAPPING);
		VM_OBJECT_WUNLOCK(object);
	} else if (prev_entry != &map->header &&
	    prev_entry->eflags == protoeflags &&
	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 &&
	    prev_entry->end == start && prev_entry->wired_count == 0 &&
	    (prev_entry->cred == cred ||
	    (prev_entry->object.vm_object != NULL &&
	    prev_entry->object.vm_object->cred == cred)) &&
	    vm_object_coalesce(prev_entry->object.vm_object,
	    prev_entry->offset,
	    (vm_size_t)(prev_entry->end - prev_entry->start),
	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
		/*
		 * We were able to extend the object.  Determine if we
		 * can extend the previous map entry to include the
		 * new range as well.
		 */
		if (prev_entry->inheritance == inheritance &&
		    prev_entry->protection == prot &&
		    prev_entry->max_protection == max) {
			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
				map->size += end - prev_entry->end;
			prev_entry->end = end;
			vm_map_entry_resize_free(map, prev_entry);
			vm_map_simplify_entry(map, prev_entry);
			return (KERN_SUCCESS);
		}

		/*
		 * If we can extend the object but cannot extend the
		 * map entry, we have to create a new map entry.  We
		 * must bump the ref count on the extended object to
		 * account for it.  object may be NULL.
		 */
		object = prev_entry->object.vm_object;
		offset = prev_entry->offset +
		    (prev_entry->end - prev_entry->start);
		vm_object_reference(object);
		if (cred != NULL && object != NULL && object->cred != NULL &&
		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
			/* Object already accounts for this uid. */
			cred = NULL;
		}
	}
	if (cred != NULL)
		crhold(cred);

	/*
	 * Create a new entry
	 */
	new_entry = vm_map_entry_create(map);
	new_entry->start = start;
	new_entry->end = end;
	new_entry->cred = NULL;

	new_entry->eflags = protoeflags;
	new_entry->object.vm_object = object;
	new_entry->offset = offset;

	new_entry->inheritance = inheritance;
	new_entry->protection = prot;
	new_entry->max_protection = max;
	new_entry->wired_count = 0;
	new_entry->wiring_thread = NULL;
	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
	new_entry->next_read = start;

	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
	new_entry->cred = cred;

	/*
	 * Insert the new entry into the list
	 */
	vm_map_entry_link(map, prev_entry, new_entry);
	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
		map->size += new_entry->end - new_entry->start;

	/*
	 * Try to coalesce the new entry with both the previous and next
	 * entries in the list.  Previously, we only attempted to coalesce
	 * with the previous entry when object is NULL.  Here, we handle the
	 * other cases, which are less common.
	 */
	vm_map_simplify_entry(map, new_entry);

	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
		    end - start, cow & MAP_PREFAULT_PARTIAL);
	}

	return (KERN_SUCCESS);
}
/*
 * vm_map_findspace:
 *
 * Find the first fit (lowest VM address) for "length" free bytes
 * beginning at address >= start in the given map.
 *
 * In a vm_map_entry, "adj_free" is the amount of free space
 * adjacent (higher address) to this entry, and "max_free" is the
 * maximum amount of contiguous free space in its subtree.  This
 * allows finding a free region in one path down the tree, so
 * O(log n) amortized with splay trees.
 *
 * The map must be locked, and leaves it so.
 *
 * Returns: 0 on success, and starting address in *addr,
 *	    1 if insufficient space.
 */
int
vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
    vm_offset_t *addr)	/* OUT */
{
	vm_map_entry_t entry;
	vm_offset_t st;

	/*
	 * Request must fit within min/max VM address and must avoid
	 * address wrap.
	 */
	if (start < map->min_offset)
		start = map->min_offset;
	if (start + length > map->max_offset || start + length < start)
		return (1);

	/* Empty tree means wide open address space. */
	if (map->root == NULL) {
		*addr = start;
		return (0);
	}

	/*
	 * After splay, if start comes before root node, then there
	 * must be a gap from start to the root.
	 */
	map->root = vm_map_entry_splay(start, map->root);
	if (start + length <= map->root->start) {
		*addr = start;
		return (0);
	}

	/*
	 * Root is the last node that might begin its gap before
	 * start, and this is the last comparison where address
	 * wrap might be a problem.
	 */
	st = (start > map->root->end) ? start : map->root->end;
	if (length <= map->root->end + map->root->adj_free - st) {
		*addr = st;
		return (0);
	}

	/* With max_free, can immediately tell if no solution. */
	entry = map->root->right;
	if (entry == NULL || length > entry->max_free)
		return (1);

	/*
	 * Search the right subtree in the order: left subtree, root,
	 * right subtree (first fit).  The previous splay implies that
	 * all regions in the right subtree have addresses > start.
	 */
	while (entry != NULL) {
		if (entry->left != NULL && entry->left->max_free >= length)
			entry = entry->left;
		else if (entry->adj_free >= length) {
			*addr = entry->end;
			return (0);
		} else
			entry = entry->right;
	}

	/* Can't get here, so panic if we do. */
	panic("vm_map_findspace: max_free corrupt");
}
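/*
 * Illustrative sketch (not part of the original file): a minimal
 * first-fit allocation in a locked map, combining vm_map_findspace()
 * with vm_map_insert(), essentially what vm_map_find() below does for
 * the VMFS_ANY_SPACE case.
 */
#if 0
	vm_offset_t addr;
	int rv;

	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), length, &addr) == 0)
		rv = vm_map_insert(map, NULL, 0, addr, addr + length,
		    VM_PROT_RW, VM_PROT_RW, 0);
	else
		rv = KERN_NO_SPACE;
	vm_map_unlock(map);
#endif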
int
vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_size_t length, vm_prot_t prot,
    vm_prot_t max, int cow)
{
	vm_offset_t end;
	int result;

	end = start + length;
	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
	    object == NULL,
	    ("vm_map_fixed: non-NULL backing object for stack"));
	vm_map_lock(map);
	VM_MAP_RANGE_CHECK(map, start, end);
	if ((cow & MAP_CHECK_EXCL) == 0)
		vm_map_delete(map, start, end);
	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
		result = vm_map_stack_locked(map, start, length, sgrowsiz,
		    prot, max, cow);
	} else {
		result = vm_map_insert(map, object, offset, start, end,
		    prot, max, cow);
	}
	vm_map_unlock(map);
	return (result);
}

/*
 * vm_map_find finds an unallocated region in the target address
 * map with the given length.  The search is defined to be
 * first-fit from the specified address; the region found is
 * returned in the same parameter.
 *
 * If object is non-NULL, ref count must be bumped by caller
 * prior to making call to account for the new entry.
 */
int
vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr,	/* IN/OUT */
    vm_size_t length, vm_offset_t max_addr, int find_space,
    vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_offset_t alignment, initial_addr, start;
	int result;

	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
	    object == NULL,
	    ("vm_map_find: non-NULL backing object for stack"));
	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
	    (object->flags & OBJ_COLORED) == 0))
		find_space = VMFS_ANY_SPACE;
	if (find_space >> 8 != 0) {
		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
		alignment = (vm_offset_t)1 << (find_space >> 8);
	} else
		alignment = 0;
	initial_addr = *addr;
again:
	start = initial_addr;
	vm_map_lock(map);
	do {
		if (find_space != VMFS_NO_SPACE) {
			if (vm_map_findspace(map, start, length, addr) ||
			    (max_addr != 0 && *addr + length > max_addr)) {
				vm_map_unlock(map);
				if (find_space == VMFS_OPTIMAL_SPACE) {
					find_space = VMFS_ANY_SPACE;
					goto again;
				}
				return (KERN_NO_SPACE);
			}
			switch (find_space) {
			case VMFS_SUPER_SPACE:
			case VMFS_OPTIMAL_SPACE:
				pmap_align_superpage(object, offset, addr,
				    length);
				break;
			case VMFS_ANY_SPACE:
				break;
			default:
				if ((*addr & (alignment - 1)) != 0) {
					*addr &= ~(alignment - 1);
					*addr += alignment;
				}
				break;
			}

			start = *addr;
		}
		if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
			result = vm_map_stack_locked(map, start, length,
			    sgrowsiz, prot, max, cow);
		} else {
			result = vm_map_insert(map, object, offset, start,
			    start + length, prot, max, cow);
		}
	} while (result == KERN_NO_SPACE && find_space != VMFS_NO_SPACE &&
	    find_space != VMFS_ANY_SPACE);
	vm_map_unlock(map);
	return (result);
}

/*
 * vm_map_find_min() is a variant of vm_map_find() that takes an
 * additional parameter (min_addr) and treats the given address
 * (*addr) differently.  Specifically, it treats *addr as a hint
 * and not as the minimum address where the mapping is created.
 *
 * This function works in two phases.  First, it tries to
 * allocate above the hint.  If that fails and the hint is
 * greater than min_addr, it performs a second pass, replacing
 * the hint with min_addr as the minimum address for the
 * allocation.
 */
int
vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
    vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
    int cow)
{
	vm_offset_t hint;
	int rv;

	hint = *addr;
	for (;;) {
		rv = vm_map_find(map, object, offset, addr, length, max_addr,
		    find_space, prot, max, cow);
		if (rv == KERN_SUCCESS || min_addr >= hint)
			return (rv);
		*addr = hint = min_addr;
	}
}
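/*
 * Illustrative sketch (not part of the original file): with
 * vm_map_find_min(), the address passed in *addr is only a hint; min_addr
 * is the fallback floor for the second pass.  The variables hint, length,
 * and min_addr here are hypothetical caller state.
 */
#if 0
	vm_offset_t addr;
	int rv;

	addr = hint;	/* phase 1: try at or above the hint */
	rv = vm_map_find_min(map, NULL, 0, &addr, length, min_addr, 0,
	    VMFS_ANY_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
	/* On KERN_NO_SPACE above the hint, phase 2 retried from min_addr. */
#endif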
/*
 * vm_map_simplify_entry:
 *
 * Simplify the given map entry by merging with either neighbor.  This
 * routine also has the ability to merge with both neighbors.
 *
 * The map must be locked.
 *
 * This routine guarantees that the passed entry remains valid (though
 * possibly extended).  When merging, this routine may delete one or
 * both neighbors.
 */
void
vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
{
	vm_map_entry_t next, prev;
	vm_size_t prevsize, esize;

	if ((entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP |
	    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) != 0)
		return;

	prev = entry->prev;
	if (prev != &map->header) {
		prevsize = prev->end - prev->start;
		if ( (prev->end == entry->start) &&
		     (prev->object.vm_object == entry->object.vm_object) &&
		     (!prev->object.vm_object ||
			(prev->offset + prevsize == entry->offset)) &&
		     (prev->eflags == entry->eflags) &&
		     (prev->protection == entry->protection) &&
		     (prev->max_protection == entry->max_protection) &&
		     (prev->inheritance == entry->inheritance) &&
		     (prev->wired_count == entry->wired_count) &&
		     (prev->cred == entry->cred)) {
			vm_map_entry_unlink(map, prev);
			entry->start = prev->start;
			entry->offset = prev->offset;
			if (entry->prev != &map->header)
				vm_map_entry_resize_free(map, entry->prev);

			/*
			 * If the backing object is a vnode object,
			 * vm_object_deallocate() calls vrele().
			 * However, vrele() does not lock the vnode
			 * because the vnode has additional
			 * references.  Thus, the map lock can be kept
			 * without causing a lock-order reversal with
			 * the vnode lock.
			 *
			 * Since we count the number of virtual page
			 * mappings in object->un_pager.vnp.writemappings,
			 * the writemappings value should not be adjusted
			 * when the entry is disposed of.
			 */
			if (prev->object.vm_object)
				vm_object_deallocate(prev->object.vm_object);
			if (prev->cred != NULL)
				crfree(prev->cred);
			vm_map_entry_dispose(map, prev);
		}
	}

	next = entry->next;
	if (next != &map->header) {
		esize = entry->end - entry->start;
		if ((entry->end == next->start) &&
		    (next->object.vm_object == entry->object.vm_object) &&
		     (!entry->object.vm_object ||
			(entry->offset + esize == next->offset)) &&
		    (next->eflags == entry->eflags) &&
		    (next->protection == entry->protection) &&
		    (next->max_protection == entry->max_protection) &&
		    (next->inheritance == entry->inheritance) &&
		    (next->wired_count == entry->wired_count) &&
		    (next->cred == entry->cred)) {
			vm_map_entry_unlink(map, next);
			entry->end = next->end;
			vm_map_entry_resize_free(map, entry);

			/*
			 * See comment above.
			 */
			if (next->object.vm_object)
				vm_object_deallocate(next->object.vm_object);
			if (next->cred != NULL)
				crfree(next->cred);
			vm_map_entry_dispose(map, next);
		}
	}
}
/*
 * vm_map_clip_start:	[ internal use only ]
 *
 * Asserts that the given entry begins at or after
 * the specified address; if necessary,
 * it splits the entry into two.
 */
#define	vm_map_clip_start(map, entry, startaddr) \
{ \
	if (startaddr > entry->start) \
		_vm_map_clip_start(map, entry, startaddr); \
}

/*
 * This routine is called only when it is known that
 * the entry must be split.
 */
static void
_vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
{
	vm_map_entry_t new_entry;

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(entry->end > start && entry->start < start,
	    ("_vm_map_clip_start: invalid clip of entry %p", entry));

	/*
	 * Split off the front portion -- note that we must insert the new
	 * entry BEFORE this one, so that this entry has the specified
	 * starting address.
	 */
	vm_map_simplify_entry(map, entry);

	/*
	 * If there is no object backing this entry, we might as well create
	 * one now.  If we defer it, an object can get created after the map
	 * is clipped, and individual objects will be created for the split-up
	 * map.  This is a bit of a hack, but is also about the best place to
	 * put this improvement.
	 */
	if (entry->object.vm_object == NULL && !map->system_map &&
	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
		vm_object_t object;
		object = vm_object_allocate(OBJT_DEFAULT,
		    atop(entry->end - entry->start));
		entry->object.vm_object = object;
		entry->offset = 0;
		if (entry->cred != NULL) {
			object->cred = entry->cred;
			object->charge = entry->end - entry->start;
			entry->cred = NULL;
		}
	} else if (entry->object.vm_object != NULL &&
	    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
	    entry->cred != NULL) {
		VM_OBJECT_WLOCK(entry->object.vm_object);
		KASSERT(entry->object.vm_object->cred == NULL,
		    ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
		entry->object.vm_object->cred = entry->cred;
		entry->object.vm_object->charge = entry->end - entry->start;
		VM_OBJECT_WUNLOCK(entry->object.vm_object);
		entry->cred = NULL;
	}

	new_entry = vm_map_entry_create(map);
	*new_entry = *entry;

	new_entry->end = start;
	entry->offset += (start - entry->start);
	entry->start = start;
	if (new_entry->cred != NULL)
		crhold(entry->cred);

	vm_map_entry_link(map, entry->prev, new_entry);

	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
		vm_object_reference(new_entry->object.vm_object);
		/*
		 * The object->un_pager.vnp.writemappings for the
		 * object of MAP_ENTRY_VN_WRITECNT type entry shall be
		 * kept as is here.  The virtual pages are
		 * re-distributed among the clipped entries, so the sum is
		 * left the same.
		 */
	}
}

/*
 * vm_map_clip_end:	[ internal use only ]
 *
 * Asserts that the given entry ends at or before
 * the specified address; if necessary,
 * it splits the entry into two.
 */
#define	vm_map_clip_end(map, entry, endaddr) \
{ \
	if ((endaddr) < (entry->end)) \
		_vm_map_clip_end((map), (entry), (endaddr)); \
}

/*
 * This routine is called only when it is known that
 * the entry must be split.
 */
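/*
 * Illustrative sketch (not part of the original file): clipping both ends
 * of a range so entry boundaries coincide with [start, end), the setup
 * step used by vm_map_submap() and vm_map_protect() below.
 */
#if 0
	vm_map_entry_t current, entry;

	if (vm_map_lookup_entry(map, start, &entry))
		vm_map_clip_start(map, entry, start);
	else
		entry = entry->next;
	for (current = entry; current->start < end; current = current->next)
		vm_map_clip_end(map, current, end);
#endif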
static void
_vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
{
	vm_map_entry_t new_entry;

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(entry->start < end && entry->end > end,
	    ("_vm_map_clip_end: invalid clip of entry %p", entry));

	/*
	 * If there is no object backing this entry, we might as well create
	 * one now.  If we defer it, an object can get created after the map
	 * is clipped, and individual objects will be created for the split-up
	 * map.  This is a bit of a hack, but is also about the best place to
	 * put this improvement.
	 */
	if (entry->object.vm_object == NULL && !map->system_map &&
	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
		vm_object_t object;
		object = vm_object_allocate(OBJT_DEFAULT,
		    atop(entry->end - entry->start));
		entry->object.vm_object = object;
		entry->offset = 0;
		if (entry->cred != NULL) {
			object->cred = entry->cred;
			object->charge = entry->end - entry->start;
			entry->cred = NULL;
		}
	} else if (entry->object.vm_object != NULL &&
	    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
	    entry->cred != NULL) {
		VM_OBJECT_WLOCK(entry->object.vm_object);
		KASSERT(entry->object.vm_object->cred == NULL,
		    ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
		entry->object.vm_object->cred = entry->cred;
		entry->object.vm_object->charge = entry->end - entry->start;
		VM_OBJECT_WUNLOCK(entry->object.vm_object);
		entry->cred = NULL;
	}

	/*
	 * Create a new entry and insert it AFTER the specified entry
	 */
	new_entry = vm_map_entry_create(map);
	*new_entry = *entry;

	new_entry->start = entry->end = end;
	new_entry->offset += (end - entry->start);
	if (new_entry->cred != NULL)
		crhold(entry->cred);

	vm_map_entry_link(map, entry, new_entry);

	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
		vm_object_reference(new_entry->object.vm_object);
	}
}

/*
 * vm_map_submap:	[ kernel use only ]
 *
 * Mark the given range as handled by a subordinate map.
 *
 * This range must have been created with vm_map_find,
 * and no other operations may have been performed on this
 * range prior to calling vm_map_submap.
 *
 * Only a limited number of operations can be performed
 * within this range after calling vm_map_submap:
 *	vm_fault
 * [Don't try vm_map_copy!]
 *
 * To remove a submapping, one must first remove the
 * range from the superior map, and then destroy the
 * submap (if desired).  [Better yet, don't try it.]
 */
int
vm_map_submap(
	vm_map_t map,
	vm_offset_t start,
	vm_offset_t end,
	vm_map_t submap)
{
	vm_map_entry_t entry;
	int result = KERN_INVALID_ARGUMENT;

	vm_map_lock(map);

	VM_MAP_RANGE_CHECK(map, start, end);

	if (vm_map_lookup_entry(map, start, &entry)) {
		vm_map_clip_start(map, entry, start);
	} else
		entry = entry->next;

	vm_map_clip_end(map, entry, end);

	if ((entry->start == start) && (entry->end == end) &&
	    ((entry->eflags & MAP_ENTRY_COW) == 0) &&
	    (entry->object.vm_object == NULL)) {
		entry->object.sub_map = submap;
		entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
		result = KERN_SUCCESS;
	}
	vm_map_unlock(map);

	return (result);
}

/*
 * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
 */
#define	MAX_INIT_PT	96
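/*
 * Illustrative sketch (not part of the original file): installing a submap
 * over a range previously reserved with vm_map_find(), the only supported
 * call sequence per the comment above.  The variables submap and size are
 * hypothetical caller state.
 */
#if 0
	vm_offset_t start;

	start = vm_map_min(kernel_map);
	if (vm_map_find(kernel_map, NULL, 0, &start, size, 0, VMFS_ANY_SPACE,
	    VM_PROT_ALL, VM_PROT_ALL, 0) == KERN_SUCCESS)
		(void)vm_map_submap(kernel_map, start, start + size, submap);
#endif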
(For this purpose, a superpage mapping 1905 * counts as one page mapping.) Otherwise, all resident pages within 1906 * the specified address range are mapped. 1907 */ 1908static void 1909vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, 1910 vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) 1911{ 1912 vm_offset_t start; 1913 vm_page_t p, p_start; 1914 vm_pindex_t mask, psize, threshold, tmpidx; 1915 1916 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL) 1917 return; 1918 VM_OBJECT_RLOCK(object); 1919 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { 1920 VM_OBJECT_RUNLOCK(object); 1921 VM_OBJECT_WLOCK(object); 1922 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) { 1923 pmap_object_init_pt(map->pmap, addr, object, pindex, 1924 size); 1925 VM_OBJECT_WUNLOCK(object); 1926 return; 1927 } 1928 VM_OBJECT_LOCK_DOWNGRADE(object); 1929 } 1930 1931 psize = atop(size); 1932 if (psize + pindex > object->size) { 1933 if (object->size < pindex) { 1934 VM_OBJECT_RUNLOCK(object); 1935 return; 1936 } 1937 psize = object->size - pindex; 1938 } 1939 1940 start = 0; 1941 p_start = NULL; 1942 threshold = MAX_INIT_PT; 1943 1944 p = vm_page_find_least(object, pindex); 1945 /* 1946 * Assert: the variable p is either (1) the page with the 1947 * least pindex greater than or equal to the parameter pindex 1948 * or (2) NULL. 1949 */ 1950 for (; 1951 p != NULL && (tmpidx = p->pindex - pindex) < psize; 1952 p = TAILQ_NEXT(p, listq)) { 1953 /* 1954 * don't allow an madvise to blow away our really 1955 * free pages allocating pv entries. 1956 */ 1957 if (((flags & MAP_PREFAULT_MADVISE) != 0 && 1958 vm_cnt.v_free_count < vm_cnt.v_free_reserved) || 1959 ((flags & MAP_PREFAULT_PARTIAL) != 0 && 1960 tmpidx >= threshold)) { 1961 psize = tmpidx; 1962 break; 1963 } 1964 if (p->valid == VM_PAGE_BITS_ALL) { 1965 if (p_start == NULL) { 1966 start = addr + ptoa(tmpidx); 1967 p_start = p; 1968 } 1969 /* Jump ahead if a superpage mapping is possible. */ 1970 if (p->psind > 0 && ((addr + ptoa(tmpidx)) & 1971 (pagesizes[p->psind] - 1)) == 0) { 1972 mask = atop(pagesizes[p->psind]) - 1; 1973 if (tmpidx + mask < psize && 1974 vm_page_ps_test(p, PS_ALL_VALID, NULL)) { 1975 p += mask; 1976 threshold += mask; 1977 } 1978 } 1979 } else if (p_start != NULL) { 1980 pmap_enter_object(map->pmap, start, addr + 1981 ptoa(tmpidx), p_start, prot); 1982 p_start = NULL; 1983 } 1984 } 1985 if (p_start != NULL) 1986 pmap_enter_object(map->pmap, start, addr + ptoa(psize), 1987 p_start, prot); 1988 VM_OBJECT_RUNLOCK(object); 1989} 1990 1991/* 1992 * vm_map_protect: 1993 * 1994 * Sets the protection of the specified address 1995 * region in the target map. If "set_max" is 1996 * specified, the maximum protection is to be set; 1997 * otherwise, only the current protection is affected. 1998 */ 1999int 2000vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, 2001 vm_prot_t new_prot, boolean_t set_max) 2002{ 2003 vm_map_entry_t current, entry; 2004 vm_object_t obj; 2005 struct ucred *cred; 2006 vm_prot_t old_prot; 2007 2008 if (start == end) 2009 return (KERN_SUCCESS); 2010 2011 vm_map_lock(map); 2012 2013 /* 2014 * Ensure that we are not concurrently wiring pages. vm_map_wire() may 2015 * need to fault pages into the map and will drop the map lock while 2016 * doing so, and the VM object may end up in an inconsistent state if we 2017 * update the protection on the map entry in between faults. 
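 *
 * Concretely (a sketch of the interleaving this guards against, inferred
 * from vm_map_wire() later in this file): the wiring thread marks its
 * entries MAP_ENTRY_IN_TRANSITION, calls vm_map_busy(), drops the map
 * lock, and faults the pages in one by one; if the protection were
 * changed in that window, the remaining faults could wire pages under
 * the stale protection.  vm_map_wait_busy() below sleeps until any such
 * wiring pass has drained.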
2018 */ 2019 vm_map_wait_busy(map); 2020 2021 VM_MAP_RANGE_CHECK(map, start, end); 2022 2023 if (vm_map_lookup_entry(map, start, &entry)) { 2024 vm_map_clip_start(map, entry, start); 2025 } else { 2026 entry = entry->next; 2027 } 2028 2029 /* 2030 * Make a first pass to check for protection violations. 2031 */ 2032 for (current = entry; current->start < end; current = current->next) { 2033 if ((current->eflags & MAP_ENTRY_GUARD) != 0) 2034 continue; 2035 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { 2036 vm_map_unlock(map); 2037 return (KERN_INVALID_ARGUMENT); 2038 } 2039 if ((new_prot & current->max_protection) != new_prot) { 2040 vm_map_unlock(map); 2041 return (KERN_PROTECTION_FAILURE); 2042 } 2043 } 2044 2045 /* 2046 * Do an accounting pass for private read-only mappings that 2047 * now will do cow due to allowed write (e.g. debugger sets 2048 * breakpoint on text segment) 2049 */ 2050 for (current = entry; current->start < end; current = current->next) { 2051 2052 vm_map_clip_end(map, current, end); 2053 2054 if (set_max || 2055 ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 || 2056 ENTRY_CHARGED(current) || 2057 (current->eflags & MAP_ENTRY_GUARD) != 0) { 2058 continue; 2059 } 2060 2061 cred = curthread->td_ucred; 2062 obj = current->object.vm_object; 2063 2064 if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) { 2065 if (!swap_reserve(current->end - current->start)) { 2066 vm_map_unlock(map); 2067 return (KERN_RESOURCE_SHORTAGE); 2068 } 2069 crhold(cred); 2070 current->cred = cred; 2071 continue; 2072 } 2073 2074 VM_OBJECT_WLOCK(obj); 2075 if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) { 2076 VM_OBJECT_WUNLOCK(obj); 2077 continue; 2078 } 2079 2080 /* 2081 * Charge for the whole object allocation now, since 2082 * we cannot distinguish between non-charged and 2083 * charged clipped mapping of the same object later. 2084 */ 2085 KASSERT(obj->charge == 0, 2086 ("vm_map_protect: object %p overcharged (entry %p)", 2087 obj, current)); 2088 if (!swap_reserve(ptoa(obj->size))) { 2089 VM_OBJECT_WUNLOCK(obj); 2090 vm_map_unlock(map); 2091 return (KERN_RESOURCE_SHORTAGE); 2092 } 2093 2094 crhold(cred); 2095 obj->cred = cred; 2096 obj->charge = ptoa(obj->size); 2097 VM_OBJECT_WUNLOCK(obj); 2098 } 2099 2100 /* 2101 * Go back and fix up protections. [Note that clipping is not 2102 * necessary the second time.] 2103 */ 2104 for (current = entry; current->start < end; current = current->next) { 2105 if ((current->eflags & MAP_ENTRY_GUARD) != 0) 2106 continue; 2107 2108 old_prot = current->protection; 2109 2110 if (set_max) 2111 current->protection = 2112 (current->max_protection = new_prot) & 2113 old_prot; 2114 else 2115 current->protection = new_prot; 2116 2117 /* 2118 * For user wired map entries, the normal lazy evaluation of 2119 * write access upgrades through soft page faults is 2120 * undesirable. Instead, immediately copy any pages that are 2121 * copy-on-write and enable write access in the physical map. 2122 */ 2123 if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 && 2124 (current->protection & VM_PROT_WRITE) != 0 && 2125 (old_prot & VM_PROT_WRITE) == 0) 2126 vm_fault_copy_entry(map, map, current, current, NULL); 2127 2128 /* 2129 * When restricting access, update the physical map. Worry 2130 * about copy-on-write here. 2131 */ 2132 if ((old_prot & ~current->protection) != 0) { 2133#define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? 
~VM_PROT_WRITE : \ 2134					 VM_PROT_ALL) 2135			pmap_protect(map->pmap, current->start, 2136			    current->end, 2137			    current->protection & MASK(current)); 2138#undef	MASK 2139		} 2140		vm_map_simplify_entry(map, current); 2141	} 2142	vm_map_unlock(map); 2143	return (KERN_SUCCESS); 2144} 2145 2146/* 2147 *	vm_map_madvise: 2148 * 2149 *	This routine traverses a process's map, handling the madvise 2150 *	system call.  Advisories are classified as either those affecting 2151 *	the vm_map_entry structure, or those affecting the underlying 2152 *	objects. 2153 */ 2154int 2155vm_map_madvise( 2156	vm_map_t map, 2157	vm_offset_t start, 2158	vm_offset_t end, 2159	int behav) 2160{ 2161	vm_map_entry_t current, entry; 2162	int modify_map = 0; 2163 2164	/* 2165	 * Some madvise calls directly modify the vm_map_entry, in which case 2166	 * we need to use an exclusive lock on the map and we need to perform 2167	 * various clipping operations.  Otherwise we only need a read-lock 2168	 * on the map. 2169	 */ 2170	switch(behav) { 2171	case MADV_NORMAL: 2172	case MADV_SEQUENTIAL: 2173	case MADV_RANDOM: 2174	case MADV_NOSYNC: 2175	case MADV_AUTOSYNC: 2176	case MADV_NOCORE: 2177	case MADV_CORE: 2178		if (start == end) 2179			return (KERN_SUCCESS); 2180		modify_map = 1; 2181		vm_map_lock(map); 2182		break; 2183	case MADV_WILLNEED: 2184	case MADV_DONTNEED: 2185	case MADV_FREE: 2186		if (start == end) 2187			return (KERN_SUCCESS); 2188		vm_map_lock_read(map); 2189		break; 2190	default: 2191		return (KERN_INVALID_ARGUMENT); 2192	} 2193 2194	/* 2195	 * Locate starting entry and clip if necessary. 2196	 */ 2197	VM_MAP_RANGE_CHECK(map, start, end); 2198 2199	if (vm_map_lookup_entry(map, start, &entry)) { 2200		if (modify_map) 2201			vm_map_clip_start(map, entry, start); 2202	} else { 2203		entry = entry->next; 2204	} 2205 2206	if (modify_map) { 2207		/* 2208		 * madvise behaviors that are implemented in the vm_map_entry. 2209		 * 2210		 * We clip the vm_map_entry so that behavioral changes are 2211		 * limited to the specified address range. 2212		 */ 2213		for (current = entry; current->start < end; 2214		    current = current->next) { 2215			if (current->eflags & MAP_ENTRY_IS_SUB_MAP) 2216				continue; 2217 2218			vm_map_clip_end(map, current, end); 2219 2220			switch (behav) { 2221			case MADV_NORMAL: 2222				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); 2223				break; 2224			case MADV_SEQUENTIAL: 2225				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); 2226				break; 2227			case MADV_RANDOM: 2228				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); 2229				break; 2230			case MADV_NOSYNC: 2231				current->eflags |= MAP_ENTRY_NOSYNC; 2232				break; 2233			case MADV_AUTOSYNC: 2234				current->eflags &= ~MAP_ENTRY_NOSYNC; 2235				break; 2236			case MADV_NOCORE: 2237				current->eflags |= MAP_ENTRY_NOCOREDUMP; 2238				break; 2239			case MADV_CORE: 2240				current->eflags &= ~MAP_ENTRY_NOCOREDUMP; 2241				break; 2242			default: 2243				break; 2244			} 2245			vm_map_simplify_entry(map, current); 2246		} 2247		vm_map_unlock(map); 2248	} else { 2249		vm_pindex_t pstart, pend; 2250 2251		/* 2252		 * madvise behaviors that are implemented in the underlying 2253		 * vm_object. 2254		 * 2255		 * Since we don't clip the vm_map_entry, we have to clip 2256		 * the vm_object pindex and count.
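 *
 * A worked example with illustrative numbers (4KB pages): an entry
 * spanning [0x10000, 0x18000) with offset 0x2000 that is advised over
 * [0x12000, 0x16000) starts from pstart = 2, pend = 10 and is clipped
 * below to
 *
 *	pstart = OFF_TO_IDX(0x2000) + atop(0x12000 - 0x10000) = 4
 *	pend   = 10 - atop(0x18000 - 0x16000) = 8
 *
 * so the advice is applied to object pages [4, 8), matching the 0x4000
 * bytes of the request that fall within this entry.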
2257 */ 2258 for (current = entry; current->start < end; 2259 current = current->next) { 2260 vm_offset_t useEnd, useStart; 2261 2262 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) 2263 continue; 2264 2265 pstart = OFF_TO_IDX(current->offset); 2266 pend = pstart + atop(current->end - current->start); 2267 useStart = current->start; 2268 useEnd = current->end; 2269 2270 if (current->start < start) { 2271 pstart += atop(start - current->start); 2272 useStart = start; 2273 } 2274 if (current->end > end) { 2275 pend -= atop(current->end - end); 2276 useEnd = end; 2277 } 2278 2279 if (pstart >= pend) 2280 continue; 2281 2282 /* 2283 * Perform the pmap_advise() before clearing 2284 * PGA_REFERENCED in vm_page_advise(). Otherwise, a 2285 * concurrent pmap operation, such as pmap_remove(), 2286 * could clear a reference in the pmap and set 2287 * PGA_REFERENCED on the page before the pmap_advise() 2288 * had completed. Consequently, the page would appear 2289 * referenced based upon an old reference that 2290 * occurred before this pmap_advise() ran. 2291 */ 2292 if (behav == MADV_DONTNEED || behav == MADV_FREE) 2293 pmap_advise(map->pmap, useStart, useEnd, 2294 behav); 2295 2296 vm_object_madvise(current->object.vm_object, pstart, 2297 pend, behav); 2298 2299 /* 2300 * Pre-populate paging structures in the 2301 * WILLNEED case. For wired entries, the 2302 * paging structures are already populated. 2303 */ 2304 if (behav == MADV_WILLNEED && 2305 current->wired_count == 0) { 2306 vm_map_pmap_enter(map, 2307 useStart, 2308 current->protection, 2309 current->object.vm_object, 2310 pstart, 2311 ptoa(pend - pstart), 2312 MAP_PREFAULT_MADVISE 2313 ); 2314 } 2315 } 2316 vm_map_unlock_read(map); 2317 } 2318 return (0); 2319} 2320 2321 2322/* 2323 * vm_map_inherit: 2324 * 2325 * Sets the inheritance of the specified address 2326 * range in the target map. Inheritance 2327 * affects how the map will be shared with 2328 * child maps at the time of vmspace_fork. 2329 */ 2330int 2331vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, 2332 vm_inherit_t new_inheritance) 2333{ 2334 vm_map_entry_t entry; 2335 vm_map_entry_t temp_entry; 2336 2337 switch (new_inheritance) { 2338 case VM_INHERIT_NONE: 2339 case VM_INHERIT_COPY: 2340 case VM_INHERIT_SHARE: 2341 case VM_INHERIT_ZERO: 2342 break; 2343 default: 2344 return (KERN_INVALID_ARGUMENT); 2345 } 2346 if (start == end) 2347 return (KERN_SUCCESS); 2348 vm_map_lock(map); 2349 VM_MAP_RANGE_CHECK(map, start, end); 2350 if (vm_map_lookup_entry(map, start, &temp_entry)) { 2351 entry = temp_entry; 2352 vm_map_clip_start(map, entry, start); 2353 } else 2354 entry = temp_entry->next; 2355 while (entry->start < end) { 2356 vm_map_clip_end(map, entry, end); 2357 if ((entry->eflags & MAP_ENTRY_GUARD) == 0 || 2358 new_inheritance != VM_INHERIT_ZERO) 2359 entry->inheritance = new_inheritance; 2360 vm_map_simplify_entry(map, entry); 2361 entry = entry->next; 2362 } 2363 vm_map_unlock(map); 2364 return (KERN_SUCCESS); 2365} 2366 2367/* 2368 * vm_map_unwire: 2369 * 2370 * Implements both kernel and user unwiring. 2371 */ 2372int 2373vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, 2374 int flags) 2375{ 2376 vm_map_entry_t entry, first_entry, tmp_entry; 2377 vm_offset_t saved_start; 2378 unsigned int last_timestamp; 2379 int rv; 2380 boolean_t need_wakeup, result, user_unwire; 2381 2382 if (start == end) 2383 return (KERN_SUCCESS); 2384 user_unwire = (flags & VM_MAP_WIRE_USER) ? 
TRUE : FALSE; 2385	vm_map_lock(map); 2386	VM_MAP_RANGE_CHECK(map, start, end); 2387	if (!vm_map_lookup_entry(map, start, &first_entry)) { 2388		if (flags & VM_MAP_WIRE_HOLESOK) 2389			first_entry = first_entry->next; 2390		else { 2391			vm_map_unlock(map); 2392			return (KERN_INVALID_ADDRESS); 2393		} 2394	} 2395	last_timestamp = map->timestamp; 2396	entry = first_entry; 2397	while (entry->start < end) { 2398		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 2399			/* 2400			 * We have not yet clipped the entry. 2401			 */ 2402			saved_start = (start >= entry->start) ? start : 2403			    entry->start; 2404			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 2405			if (vm_map_unlock_and_wait(map, 0)) { 2406				/* 2407				 * Allow interruption of user unwiring? 2408				 */ 2409			} 2410			vm_map_lock(map); 2411			if (last_timestamp+1 != map->timestamp) { 2412				/* 2413				 * Look again for the entry because the map was 2414				 * modified while it was unlocked. 2415				 * Specifically, the entry may have been 2416				 * clipped, merged, or deleted. 2417				 */ 2418				if (!vm_map_lookup_entry(map, saved_start, 2419				    &tmp_entry)) { 2420					if (flags & VM_MAP_WIRE_HOLESOK) 2421						tmp_entry = tmp_entry->next; 2422					else { 2423						if (saved_start == start) { 2424							/* 2425							 * First_entry has been deleted. 2426							 */ 2427							vm_map_unlock(map); 2428							return (KERN_INVALID_ADDRESS); 2429						} 2430						end = saved_start; 2431						rv = KERN_INVALID_ADDRESS; 2432						goto done; 2433					} 2434				} 2435				if (entry == first_entry) 2436					first_entry = tmp_entry; 2437				else 2438					first_entry = NULL; 2439				entry = tmp_entry; 2440			} 2441			last_timestamp = map->timestamp; 2442			continue; 2443		} 2444		vm_map_clip_start(map, entry, start); 2445		vm_map_clip_end(map, entry, end); 2446		/* 2447		 * Mark the entry in case the map lock is released.  (See 2448		 * above.) 2449		 */ 2450		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && 2451		    entry->wiring_thread == NULL, 2452		    ("owned map entry %p", entry)); 2453		entry->eflags |= MAP_ENTRY_IN_TRANSITION; 2454		entry->wiring_thread = curthread; 2455		/* 2456		 * Check the map for holes in the specified region. 2457		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check. 2458		 */ 2459		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && 2460		    (entry->end < end && entry->next->start > entry->end)) { 2461			end = entry->end; 2462			rv = KERN_INVALID_ADDRESS; 2463			goto done; 2464		} 2465		/* 2466		 * If system unwiring, require that the entry is system wired. 2467		 */ 2468		if (!user_unwire && 2469		    vm_map_entry_system_wired_count(entry) == 0) { 2470			end = entry->end; 2471			rv = KERN_INVALID_ARGUMENT; 2472			goto done; 2473		} 2474		entry = entry->next; 2475	} 2476	rv = KERN_SUCCESS; 2477done: 2478	need_wakeup = FALSE; 2479	if (first_entry == NULL) { 2480		result = vm_map_lookup_entry(map, start, &first_entry); 2481		if (!result && (flags & VM_MAP_WIRE_HOLESOK)) 2482			first_entry = first_entry->next; 2483		else 2484			KASSERT(result, ("vm_map_unwire: lookup failed")); 2485	} 2486	for (entry = first_entry; entry->start < end; entry = entry->next) { 2487		/* 2488		 * If VM_MAP_WIRE_HOLESOK was specified, an empty 2489		 * space in the unwired region could have been mapped 2490		 * while the map lock was dropped for draining 2491		 * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread 2492		 * could be simultaneously wiring this new mapping 2493		 * entry.  Detect these cases and skip any entries 2494		 * not marked as in transition by us.
2495 */ 2496 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || 2497 entry->wiring_thread != curthread) { 2498 KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0, 2499 ("vm_map_unwire: !HOLESOK and new/changed entry")); 2500 continue; 2501 } 2502 2503 if (rv == KERN_SUCCESS && (!user_unwire || 2504 (entry->eflags & MAP_ENTRY_USER_WIRED))) { 2505 if (user_unwire) 2506 entry->eflags &= ~MAP_ENTRY_USER_WIRED; 2507 if (entry->wired_count == 1) 2508 vm_map_entry_unwire(map, entry); 2509 else 2510 entry->wired_count--; 2511 } 2512 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, 2513 ("vm_map_unwire: in-transition flag missing %p", entry)); 2514 KASSERT(entry->wiring_thread == curthread, 2515 ("vm_map_unwire: alien wire %p", entry)); 2516 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; 2517 entry->wiring_thread = NULL; 2518 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { 2519 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; 2520 need_wakeup = TRUE; 2521 } 2522 vm_map_simplify_entry(map, entry); 2523 } 2524 vm_map_unlock(map); 2525 if (need_wakeup) 2526 vm_map_wakeup(map); 2527 return (rv); 2528} 2529 2530/* 2531 * vm_map_wire_entry_failure: 2532 * 2533 * Handle a wiring failure on the given entry. 2534 * 2535 * The map should be locked. 2536 */ 2537static void 2538vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry, 2539 vm_offset_t failed_addr) 2540{ 2541 2542 VM_MAP_ASSERT_LOCKED(map); 2543 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 && 2544 entry->wired_count == 1, 2545 ("vm_map_wire_entry_failure: entry %p isn't being wired", entry)); 2546 KASSERT(failed_addr < entry->end, 2547 ("vm_map_wire_entry_failure: entry %p was fully wired", entry)); 2548 2549 /* 2550 * If any pages at the start of this entry were successfully wired, 2551 * then unwire them. 2552 */ 2553 if (failed_addr > entry->start) { 2554 pmap_unwire(map->pmap, entry->start, failed_addr); 2555 vm_object_unwire(entry->object.vm_object, entry->offset, 2556 failed_addr - entry->start, PQ_ACTIVE); 2557 } 2558 2559 /* 2560 * Assign an out-of-range value to represent the failure to wire this 2561 * entry. 2562 */ 2563 entry->wired_count = -1; 2564} 2565 2566/* 2567 * vm_map_wire: 2568 * 2569 * Implements both kernel and user wiring. 2570 */ 2571int 2572vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, 2573 int flags) 2574{ 2575 vm_map_entry_t entry, first_entry, tmp_entry; 2576 vm_offset_t faddr, saved_end, saved_start; 2577 unsigned int last_timestamp; 2578 int rv; 2579 boolean_t need_wakeup, result, user_wire; 2580 vm_prot_t prot; 2581 2582 if (start == end) 2583 return (KERN_SUCCESS); 2584 prot = 0; 2585 if (flags & VM_MAP_WIRE_WRITE) 2586 prot |= VM_PROT_WRITE; 2587 user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; 2588 vm_map_lock(map); 2589 VM_MAP_RANGE_CHECK(map, start, end); 2590 if (!vm_map_lookup_entry(map, start, &first_entry)) { 2591 if (flags & VM_MAP_WIRE_HOLESOK) 2592 first_entry = first_entry->next; 2593 else { 2594 vm_map_unlock(map); 2595 return (KERN_INVALID_ADDRESS); 2596 } 2597 } 2598 last_timestamp = map->timestamp; 2599 entry = first_entry; 2600 while (entry->start < end) { 2601 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 2602 /* 2603 * We have not yet clipped the entry. 2604 */ 2605 saved_start = (start >= entry->start) ? start : 2606 entry->start; 2607 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 2608 if (vm_map_unlock_and_wait(map, 0)) { 2609 /* 2610 * Allow interruption of user wiring? 
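 *
 * (As written, a nonzero return from vm_map_unlock_and_wait() is
 * ignored and the loop simply retries, so user wiring is not actually
 * interruptible at this point.)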
2611 */ 2612 } 2613 vm_map_lock(map); 2614 if (last_timestamp + 1 != map->timestamp) { 2615 /* 2616 * Look again for the entry because the map was 2617 * modified while it was unlocked. 2618 * Specifically, the entry may have been 2619 * clipped, merged, or deleted. 2620 */ 2621 if (!vm_map_lookup_entry(map, saved_start, 2622 &tmp_entry)) { 2623 if (flags & VM_MAP_WIRE_HOLESOK) 2624 tmp_entry = tmp_entry->next; 2625 else { 2626 if (saved_start == start) { 2627 /* 2628 * first_entry has been deleted. 2629 */ 2630 vm_map_unlock(map); 2631 return (KERN_INVALID_ADDRESS); 2632 } 2633 end = saved_start; 2634 rv = KERN_INVALID_ADDRESS; 2635 goto done; 2636 } 2637 } 2638 if (entry == first_entry) 2639 first_entry = tmp_entry; 2640 else 2641 first_entry = NULL; 2642 entry = tmp_entry; 2643 } 2644 last_timestamp = map->timestamp; 2645 continue; 2646 } 2647 vm_map_clip_start(map, entry, start); 2648 vm_map_clip_end(map, entry, end); 2649 /* 2650 * Mark the entry in case the map lock is released. (See 2651 * above.) 2652 */ 2653 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 && 2654 entry->wiring_thread == NULL, 2655 ("owned map entry %p", entry)); 2656 entry->eflags |= MAP_ENTRY_IN_TRANSITION; 2657 entry->wiring_thread = curthread; 2658 if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 2659 || (entry->protection & prot) != prot) { 2660 entry->eflags |= MAP_ENTRY_WIRE_SKIPPED; 2661 if ((flags & VM_MAP_WIRE_HOLESOK) == 0) { 2662 end = entry->end; 2663 rv = KERN_INVALID_ADDRESS; 2664 goto done; 2665 } 2666 goto next_entry; 2667 } 2668 if (entry->wired_count == 0) { 2669 entry->wired_count++; 2670 saved_start = entry->start; 2671 saved_end = entry->end; 2672 2673 /* 2674 * Release the map lock, relying on the in-transition 2675 * mark. Mark the map busy for fork. 2676 */ 2677 vm_map_busy(map); 2678 vm_map_unlock(map); 2679 2680 faddr = saved_start; 2681 do { 2682 /* 2683 * Simulate a fault to get the page and enter 2684 * it into the physical map. 2685 */ 2686 if ((rv = vm_fault(map, faddr, VM_PROT_NONE, 2687 VM_FAULT_WIRE)) != KERN_SUCCESS) 2688 break; 2689 } while ((faddr += PAGE_SIZE) < saved_end); 2690 vm_map_lock(map); 2691 vm_map_unbusy(map); 2692 if (last_timestamp + 1 != map->timestamp) { 2693 /* 2694 * Look again for the entry because the map was 2695 * modified while it was unlocked. The entry 2696 * may have been clipped, but NOT merged or 2697 * deleted. 2698 */ 2699 result = vm_map_lookup_entry(map, saved_start, 2700 &tmp_entry); 2701 KASSERT(result, ("vm_map_wire: lookup failed")); 2702 if (entry == first_entry) 2703 first_entry = tmp_entry; 2704 else 2705 first_entry = NULL; 2706 entry = tmp_entry; 2707 while (entry->end < saved_end) { 2708 /* 2709 * In case of failure, handle entries 2710 * that were not fully wired here; 2711 * fully wired entries are handled 2712 * later. 2713 */ 2714 if (rv != KERN_SUCCESS && 2715 faddr < entry->end) 2716 vm_map_wire_entry_failure(map, 2717 entry, faddr); 2718 entry = entry->next; 2719 } 2720 } 2721 last_timestamp = map->timestamp; 2722 if (rv != KERN_SUCCESS) { 2723 vm_map_wire_entry_failure(map, entry, faddr); 2724 end = entry->end; 2725 goto done; 2726 } 2727 } else if (!user_wire || 2728 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { 2729 entry->wired_count++; 2730 } 2731 /* 2732 * Check the map for holes in the specified region. 2733 * If VM_MAP_WIRE_HOLESOK was specified, skip this check. 
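 *
 * A hole here means the request extends past this entry
 * (entry->end < end) while the next entry begins above it
 * (entry->next->start > entry->end), leaving the range
 * [entry->end, entry->next->start) unmapped.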
2734 */ 2735 next_entry: 2736 if ((flags & VM_MAP_WIRE_HOLESOK) == 0 && 2737 entry->end < end && entry->next->start > entry->end) { 2738 end = entry->end; 2739 rv = KERN_INVALID_ADDRESS; 2740 goto done; 2741 } 2742 entry = entry->next; 2743 } 2744 rv = KERN_SUCCESS; 2745done: 2746 need_wakeup = FALSE; 2747 if (first_entry == NULL) { 2748 result = vm_map_lookup_entry(map, start, &first_entry); 2749 if (!result && (flags & VM_MAP_WIRE_HOLESOK)) 2750 first_entry = first_entry->next; 2751 else 2752 KASSERT(result, ("vm_map_wire: lookup failed")); 2753 } 2754 for (entry = first_entry; entry->start < end; entry = entry->next) { 2755 /* 2756 * If VM_MAP_WIRE_HOLESOK was specified, an empty 2757 * space in the unwired region could have been mapped 2758 * while the map lock was dropped for faulting in the 2759 * pages or draining MAP_ENTRY_IN_TRANSITION. 2760 * Moreover, another thread could be simultaneously 2761 * wiring this new mapping entry. Detect these cases 2762 * and skip any entries marked as in transition not by us. 2763 */ 2764 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 || 2765 entry->wiring_thread != curthread) { 2766 KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0, 2767 ("vm_map_wire: !HOLESOK and new/changed entry")); 2768 continue; 2769 } 2770 2771 if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) 2772 goto next_entry_done; 2773 2774 if (rv == KERN_SUCCESS) { 2775 if (user_wire) 2776 entry->eflags |= MAP_ENTRY_USER_WIRED; 2777 } else if (entry->wired_count == -1) { 2778 /* 2779 * Wiring failed on this entry. Thus, unwiring is 2780 * unnecessary. 2781 */ 2782 entry->wired_count = 0; 2783 } else if (!user_wire || 2784 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { 2785 /* 2786 * Undo the wiring. Wiring succeeded on this entry 2787 * but failed on a later entry. 2788 */ 2789 if (entry->wired_count == 1) 2790 vm_map_entry_unwire(map, entry); 2791 else 2792 entry->wired_count--; 2793 } 2794 next_entry_done: 2795 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0, 2796 ("vm_map_wire: in-transition flag missing %p", entry)); 2797 KASSERT(entry->wiring_thread == curthread, 2798 ("vm_map_wire: alien wire %p", entry)); 2799 entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION | 2800 MAP_ENTRY_WIRE_SKIPPED); 2801 entry->wiring_thread = NULL; 2802 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { 2803 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; 2804 need_wakeup = TRUE; 2805 } 2806 vm_map_simplify_entry(map, entry); 2807 } 2808 vm_map_unlock(map); 2809 if (need_wakeup) 2810 vm_map_wakeup(map); 2811 return (rv); 2812} 2813 2814/* 2815 * vm_map_sync 2816 * 2817 * Push any dirty cached pages in the address range to their pager. 2818 * If syncio is TRUE, dirty pages are written synchronously. 2819 * If invalidate is TRUE, any cached pages are freed as well. 2820 * 2821 * If the size of the region from start to end is zero, we are 2822 * supposed to flush all modified pages within the region containing 2823 * start. Unfortunately, a region can be split or coalesced with 2824 * neighboring regions, making it difficult to determine what the 2825 * original region was. Therefore, we approximate this requirement by 2826 * flushing the current region containing start. 2827 * 2828 * Returns an error if any part of the specified range is not mapped. 
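 *
 * A minimal caller sketch (hypothetical; the msync(2) path does its own
 * argument validation before reaching this function):
 *
 *	int rv;
 *
 *	rv = vm_map_sync(map, trunc_page(addr),
 *	    round_page(addr + len), TRUE, FALSE);
 *	if (rv != KERN_SUCCESS)
 *		return (vm_mmap_to_errno(rv));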
2829 */ 2830int 2831vm_map_sync( 2832	vm_map_t map, 2833	vm_offset_t start, 2834	vm_offset_t end, 2835	boolean_t syncio, 2836	boolean_t invalidate) 2837{ 2838	vm_map_entry_t current; 2839	vm_map_entry_t entry; 2840	vm_size_t size; 2841	vm_object_t object; 2842	vm_ooffset_t offset; 2843	unsigned int last_timestamp; 2844	boolean_t failed; 2845 2846	vm_map_lock_read(map); 2847	VM_MAP_RANGE_CHECK(map, start, end); 2848	if (!vm_map_lookup_entry(map, start, &entry)) { 2849		vm_map_unlock_read(map); 2850		return (KERN_INVALID_ADDRESS); 2851	} else if (start == end) { 2852		start = entry->start; 2853		end = entry->end; 2854	} 2855	/* 2856	 * Make a first pass to check for user-wired memory and holes. 2857	 */ 2858	for (current = entry; current->start < end; current = current->next) { 2859		if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) { 2860			vm_map_unlock_read(map); 2861			return (KERN_INVALID_ARGUMENT); 2862		} 2863		if (end > current->end && 2864		    current->end != current->next->start) { 2865			vm_map_unlock_read(map); 2866			return (KERN_INVALID_ADDRESS); 2867		} 2868	} 2869 2870	if (invalidate) 2871		pmap_remove(map->pmap, start, end); 2872	failed = FALSE; 2873 2874	/* 2875	 * Make a second pass, cleaning/uncaching pages from the indicated 2876	 * objects as we go. 2877	 */ 2878	for (current = entry; current->start < end;) { 2879		offset = current->offset + (start - current->start); 2880		size = (end <= current->end ? end : current->end) - start; 2881		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { 2882			vm_map_t smap; 2883			vm_map_entry_t tentry; 2884			vm_size_t tsize; 2885 2886			smap = current->object.sub_map; 2887			vm_map_lock_read(smap); 2888			(void) vm_map_lookup_entry(smap, offset, &tentry); 2889			tsize = tentry->end - offset; 2890			if (tsize < size) 2891				size = tsize; 2892			object = tentry->object.vm_object; 2893			offset = tentry->offset + (offset - tentry->start); 2894			vm_map_unlock_read(smap); 2895		} else { 2896			object = current->object.vm_object; 2897		} 2898		vm_object_reference(object); 2899		last_timestamp = map->timestamp; 2900		vm_map_unlock_read(map); 2901		if (!vm_object_sync(object, offset, size, syncio, invalidate)) 2902			failed = TRUE; 2903		start += size; 2904		vm_object_deallocate(object); 2905		vm_map_lock_read(map); 2906		if (last_timestamp == map->timestamp || 2907		    !vm_map_lookup_entry(map, start, &current)) 2908			current = current->next; 2909	} 2910 2911	vm_map_unlock_read(map); 2912	return (failed ? KERN_FAILURE : KERN_SUCCESS); 2913} 2914 2915/* 2916 *	vm_map_entry_unwire:	[ internal use only ] 2917 * 2918 *	Make the region specified by this entry pageable. 2919 * 2920 *	The map in question should be locked. 2921 *	[This is the reason for this routine's existence.] 2922 */ 2923static void 2924vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) 2925{ 2926 2927	VM_MAP_ASSERT_LOCKED(map); 2928	KASSERT(entry->wired_count > 0, 2929	    ("vm_map_entry_unwire: entry %p isn't wired", entry)); 2930	pmap_unwire(map->pmap, entry->start, entry->end); 2931	vm_object_unwire(entry->object.vm_object, entry->offset, entry->end - 2932	    entry->start, PQ_ACTIVE); 2933	entry->wired_count = 0; 2934} 2935 2936static void 2937vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map) 2938{ 2939 2940	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) 2941		vm_object_deallocate(entry->object.vm_object); 2942	uma_zfree(system_map ? kmapentzone : mapentzone, entry); 2943} 2944 2945/* 2946 *	vm_map_entry_delete:	[ internal use only ] 2947 * 2948 *	Deallocate the given entry from the target map.
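 *
 *	For user maps the entry is not freed here: it is chained onto
 *	curthread->td_map_def_user and disposed of by
 *	vm_map_process_deferred() once the map lock has been dropped;
 *	only system-map entries are deallocated immediately.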
2949 */ 2950static void 2951vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) 2952{ 2953 vm_object_t object; 2954 vm_pindex_t offidxstart, offidxend, count, size1; 2955 vm_size_t size; 2956 2957 vm_map_entry_unlink(map, entry); 2958 object = entry->object.vm_object; 2959 2960 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) { 2961 MPASS(entry->cred == NULL); 2962 MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0); 2963 MPASS(object == NULL); 2964 vm_map_entry_deallocate(entry, map->system_map); 2965 return; 2966 } 2967 2968 size = entry->end - entry->start; 2969 map->size -= size; 2970 2971 if (entry->cred != NULL) { 2972 swap_release_by_cred(size, entry->cred); 2973 crfree(entry->cred); 2974 } 2975 2976 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && 2977 (object != NULL)) { 2978 KASSERT(entry->cred == NULL || object->cred == NULL || 2979 (entry->eflags & MAP_ENTRY_NEEDS_COPY), 2980 ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry)); 2981 count = atop(size); 2982 offidxstart = OFF_TO_IDX(entry->offset); 2983 offidxend = offidxstart + count; 2984 VM_OBJECT_WLOCK(object); 2985 if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT | 2986 OBJ_ONEMAPPING)) == OBJ_ONEMAPPING || 2987 object == kernel_object || object == kmem_object)) { 2988 vm_object_collapse(object); 2989 2990 /* 2991 * The option OBJPR_NOTMAPPED can be passed here 2992 * because vm_map_delete() already performed 2993 * pmap_remove() on the only mapping to this range 2994 * of pages. 2995 */ 2996 vm_object_page_remove(object, offidxstart, offidxend, 2997 OBJPR_NOTMAPPED); 2998 if (object->type == OBJT_SWAP) 2999 swap_pager_freespace(object, offidxstart, 3000 count); 3001 if (offidxend >= object->size && 3002 offidxstart < object->size) { 3003 size1 = object->size; 3004 object->size = offidxstart; 3005 if (object->cred != NULL) { 3006 size1 -= object->size; 3007 KASSERT(object->charge >= ptoa(size1), 3008 ("object %p charge < 0", object)); 3009 swap_release_by_cred(ptoa(size1), 3010 object->cred); 3011 object->charge -= ptoa(size1); 3012 } 3013 } 3014 } 3015 VM_OBJECT_WUNLOCK(object); 3016 } else 3017 entry->object.vm_object = NULL; 3018 if (map->system_map) 3019 vm_map_entry_deallocate(entry, TRUE); 3020 else { 3021 entry->next = curthread->td_map_def_user; 3022 curthread->td_map_def_user = entry; 3023 } 3024} 3025 3026/* 3027 * vm_map_delete: [ internal use only ] 3028 * 3029 * Deallocates the given address range from the target 3030 * map. 3031 */ 3032int 3033vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) 3034{ 3035 vm_map_entry_t entry; 3036 vm_map_entry_t first_entry; 3037 3038 VM_MAP_ASSERT_LOCKED(map); 3039 if (start == end) 3040 return (KERN_SUCCESS); 3041 3042 /* 3043 * Find the start of the region, and clip it 3044 */ 3045 if (!vm_map_lookup_entry(map, start, &first_entry)) 3046 entry = first_entry->next; 3047 else { 3048 entry = first_entry; 3049 vm_map_clip_start(map, entry, start); 3050 } 3051 3052 /* 3053 * Step through all entries in this region 3054 */ 3055 while (entry->start < end) { 3056 vm_map_entry_t next; 3057 3058 /* 3059 * Wait for wiring or unwiring of an entry to complete. 3060 * Also wait for any system wirings to disappear on 3061 * user maps. 
3062		 */ 3063		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || 3064		    (vm_map_pmap(map) != kernel_pmap && 3065		    vm_map_entry_system_wired_count(entry) != 0)) { 3066			unsigned int last_timestamp; 3067			vm_offset_t saved_start; 3068			vm_map_entry_t tmp_entry; 3069 3070			saved_start = entry->start; 3071			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 3072			last_timestamp = map->timestamp; 3073			(void) vm_map_unlock_and_wait(map, 0); 3074			vm_map_lock(map); 3075			if (last_timestamp + 1 != map->timestamp) { 3076				/* 3077				 * Look again for the entry because the map was 3078				 * modified while it was unlocked. 3079				 * Specifically, the entry may have been 3080				 * clipped, merged, or deleted. 3081				 */ 3082				if (!vm_map_lookup_entry(map, saved_start, 3083				    &tmp_entry)) 3084					entry = tmp_entry->next; 3085				else { 3086					entry = tmp_entry; 3087					vm_map_clip_start(map, entry, 3088					    saved_start); 3089				} 3090			} 3091			continue; 3092		} 3093		vm_map_clip_end(map, entry, end); 3094 3095		next = entry->next; 3096 3097		/* 3098		 * Unwire before removing addresses from the pmap; otherwise, 3099		 * unwiring will put the entries back in the pmap. 3100		 */ 3101		if (entry->wired_count != 0) { 3102			vm_map_entry_unwire(map, entry); 3103		} 3104 3105		pmap_remove(map->pmap, entry->start, entry->end); 3106 3107		/* 3108		 * Delete the entry only after removing all pmap 3109		 * entries pointing to its pages.  (Otherwise, its 3110		 * page frames may be reallocated, and any modify bits 3111		 * will be set in the wrong object!) 3112		 */ 3113		vm_map_entry_delete(map, entry); 3114		entry = next; 3115	} 3116	return (KERN_SUCCESS); 3117} 3118 3119/* 3120 *	vm_map_remove: 3121 * 3122 *	Remove the given address range from the target map. 3123 *	This is the exported form of vm_map_delete. 3124 */ 3125int 3126vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) 3127{ 3128	int result; 3129 3130	vm_map_lock(map); 3131	VM_MAP_RANGE_CHECK(map, start, end); 3132	result = vm_map_delete(map, start, end); 3133	vm_map_unlock(map); 3134	return (result); 3135} 3136 3137/* 3138 *	vm_map_check_protection: 3139 * 3140 *	Assert that the target map allows the specified privilege on the 3141 *	entire address region given.  The entire region must be allocated. 3142 * 3143 *	WARNING!  This code does not and should not check whether the 3144 *	contents of the region are accessible.  For example, a smaller file 3145 *	might be mapped into a larger address space. 3146 * 3147 *	NOTE!  This code is also called by munmap(). 3148 * 3149 *	The map must be locked.  A read lock is sufficient. 3150 */ 3151boolean_t 3152vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, 3153			vm_prot_t protection) 3154{ 3155	vm_map_entry_t entry; 3156	vm_map_entry_t tmp_entry; 3157 3158	if (!vm_map_lookup_entry(map, start, &tmp_entry)) 3159		return (FALSE); 3160	entry = tmp_entry; 3161 3162	while (start < end) { 3163		/* 3164		 * No holes allowed! 3165		 */ 3166		if (start < entry->start) 3167			return (FALSE); 3168		/* 3169		 * Check protection associated with entry. 3170		 */ 3171		if ((entry->protection & protection) != protection) 3172			return (FALSE); 3173		/* go to next entry */ 3174		start = entry->end; 3175		entry = entry->next; 3176	} 3177	return (TRUE); 3178} 3179 3180/* 3181 *	vm_map_copy_entry: 3182 * 3183 *	Copies the contents of the source entry to the destination 3184 *	entry.  The entries *must* be aligned properly.
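 *
 *	Two cases are handled below: an unwired (or not writable) source
 *	entry shares its backing object, with both entries marked
 *	MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY and writable mappings
 *	write-protected in the source pmap; a wired, writable source
 *	entry is instead copied immediately via vm_fault_copy_entry() so
 *	that its pages never become copy-on-write.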
3185 */ 3186static void 3187vm_map_copy_entry( 3188 vm_map_t src_map, 3189 vm_map_t dst_map, 3190 vm_map_entry_t src_entry, 3191 vm_map_entry_t dst_entry, 3192 vm_ooffset_t *fork_charge) 3193{ 3194 vm_object_t src_object; 3195 vm_map_entry_t fake_entry; 3196 vm_offset_t size; 3197 struct ucred *cred; 3198 int charged; 3199 3200 VM_MAP_ASSERT_LOCKED(dst_map); 3201 3202 if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) 3203 return; 3204 3205 if (src_entry->wired_count == 0 || 3206 (src_entry->protection & VM_PROT_WRITE) == 0) { 3207 /* 3208 * If the source entry is marked needs_copy, it is already 3209 * write-protected. 3210 */ 3211 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 && 3212 (src_entry->protection & VM_PROT_WRITE) != 0) { 3213 pmap_protect(src_map->pmap, 3214 src_entry->start, 3215 src_entry->end, 3216 src_entry->protection & ~VM_PROT_WRITE); 3217 } 3218 3219 /* 3220 * Make a copy of the object. 3221 */ 3222 size = src_entry->end - src_entry->start; 3223 if ((src_object = src_entry->object.vm_object) != NULL) { 3224 VM_OBJECT_WLOCK(src_object); 3225 charged = ENTRY_CHARGED(src_entry); 3226 if (src_object->handle == NULL && 3227 (src_object->type == OBJT_DEFAULT || 3228 src_object->type == OBJT_SWAP)) { 3229 vm_object_collapse(src_object); 3230 if ((src_object->flags & (OBJ_NOSPLIT | 3231 OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) { 3232 vm_object_split(src_entry); 3233 src_object = 3234 src_entry->object.vm_object; 3235 } 3236 } 3237 vm_object_reference_locked(src_object); 3238 vm_object_clear_flag(src_object, OBJ_ONEMAPPING); 3239 if (src_entry->cred != NULL && 3240 !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) { 3241 KASSERT(src_object->cred == NULL, 3242 ("OVERCOMMIT: vm_map_copy_entry: cred %p", 3243 src_object)); 3244 src_object->cred = src_entry->cred; 3245 src_object->charge = size; 3246 } 3247 VM_OBJECT_WUNLOCK(src_object); 3248 dst_entry->object.vm_object = src_object; 3249 if (charged) { 3250 cred = curthread->td_ucred; 3251 crhold(cred); 3252 dst_entry->cred = cred; 3253 *fork_charge += size; 3254 if (!(src_entry->eflags & 3255 MAP_ENTRY_NEEDS_COPY)) { 3256 crhold(cred); 3257 src_entry->cred = cred; 3258 *fork_charge += size; 3259 } 3260 } 3261 src_entry->eflags |= MAP_ENTRY_COW | 3262 MAP_ENTRY_NEEDS_COPY; 3263 dst_entry->eflags |= MAP_ENTRY_COW | 3264 MAP_ENTRY_NEEDS_COPY; 3265 dst_entry->offset = src_entry->offset; 3266 if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) { 3267 /* 3268 * MAP_ENTRY_VN_WRITECNT cannot 3269 * indicate write reference from 3270 * src_entry, since the entry is 3271 * marked as needs copy. Allocate a 3272 * fake entry that is used to 3273 * decrement object->un_pager.vnp.writecount 3274 * at the appropriate time. Attach 3275 * fake_entry to the deferred list. 
3276 */ 3277 fake_entry = vm_map_entry_create(dst_map); 3278 fake_entry->eflags = MAP_ENTRY_VN_WRITECNT; 3279 src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT; 3280 vm_object_reference(src_object); 3281 fake_entry->object.vm_object = src_object; 3282 fake_entry->start = src_entry->start; 3283 fake_entry->end = src_entry->end; 3284 fake_entry->next = curthread->td_map_def_user; 3285 curthread->td_map_def_user = fake_entry; 3286 } 3287 3288 pmap_copy(dst_map->pmap, src_map->pmap, 3289 dst_entry->start, dst_entry->end - dst_entry->start, 3290 src_entry->start); 3291 } else { 3292 dst_entry->object.vm_object = NULL; 3293 dst_entry->offset = 0; 3294 if (src_entry->cred != NULL) { 3295 dst_entry->cred = curthread->td_ucred; 3296 crhold(dst_entry->cred); 3297 *fork_charge += size; 3298 } 3299 } 3300 } else { 3301 /* 3302 * We don't want to make writeable wired pages copy-on-write. 3303 * Immediately copy these pages into the new map by simulating 3304 * page faults. The new pages are pageable. 3305 */ 3306 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry, 3307 fork_charge); 3308 } 3309} 3310 3311/* 3312 * vmspace_map_entry_forked: 3313 * Update the newly-forked vmspace each time a map entry is inherited 3314 * or copied. The values for vm_dsize and vm_tsize are approximate 3315 * (and mostly-obsolete ideas in the face of mmap(2) et al.) 3316 */ 3317static void 3318vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, 3319 vm_map_entry_t entry) 3320{ 3321 vm_size_t entrysize; 3322 vm_offset_t newend; 3323 3324 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) 3325 return; 3326 entrysize = entry->end - entry->start; 3327 vm2->vm_map.size += entrysize; 3328 if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) { 3329 vm2->vm_ssize += btoc(entrysize); 3330 } else if (entry->start >= (vm_offset_t)vm1->vm_daddr && 3331 entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) { 3332 newend = MIN(entry->end, 3333 (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)); 3334 vm2->vm_dsize += btoc(newend - entry->start); 3335 } else if (entry->start >= (vm_offset_t)vm1->vm_taddr && 3336 entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) { 3337 newend = MIN(entry->end, 3338 (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)); 3339 vm2->vm_tsize += btoc(newend - entry->start); 3340 } 3341} 3342 3343/* 3344 * vmspace_fork: 3345 * Create a new process vmspace structure and vm_map 3346 * based on those of an existing process. The new map 3347 * is based on the old map, according to the inheritance 3348 * values on the regions in that map. 3349 * 3350 * XXX It might be worth coalescing the entries added to the new vmspace. 3351 * 3352 * The source map must not be locked. 3353 */ 3354struct vmspace * 3355vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) 3356{ 3357 struct vmspace *vm2; 3358 vm_map_t new_map, old_map; 3359 vm_map_entry_t new_entry, old_entry; 3360 vm_object_t object; 3361 int locked; 3362 vm_inherit_t inh; 3363 3364 old_map = &vm1->vm_map; 3365 /* Copy immutable fields of vm1 to vm2. 
 */ 3366	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL); 3367	if (vm2 == NULL) 3368		return (NULL); 3369	vm2->vm_taddr = vm1->vm_taddr; 3370	vm2->vm_daddr = vm1->vm_daddr; 3371	vm2->vm_maxsaddr = vm1->vm_maxsaddr; 3372	vm_map_lock(old_map); 3373	if (old_map->busy) 3374		vm_map_wait_busy(old_map); 3375	new_map = &vm2->vm_map; 3376	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */ 3377	KASSERT(locked, ("vmspace_fork: lock failed")); 3378 3379	old_entry = old_map->header.next; 3380 3381	while (old_entry != &old_map->header) { 3382		if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) 3383			panic("vm_map_fork: encountered a submap"); 3384 3385		inh = old_entry->inheritance; 3386		if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 && 3387		    inh != VM_INHERIT_NONE) 3388			inh = VM_INHERIT_COPY; 3389 3390		switch (inh) { 3391		case VM_INHERIT_NONE: 3392			break; 3393 3394		case VM_INHERIT_SHARE: 3395			/* 3396			 * Clone the entry, creating the shared object if necessary. 3397			 */ 3398			object = old_entry->object.vm_object; 3399			if (object == NULL) { 3400				object = vm_object_allocate(OBJT_DEFAULT, 3401					atop(old_entry->end - old_entry->start)); 3402				old_entry->object.vm_object = object; 3403				old_entry->offset = 0; 3404				if (old_entry->cred != NULL) { 3405					object->cred = old_entry->cred; 3406					object->charge = old_entry->end - 3407					    old_entry->start; 3408					old_entry->cred = NULL; 3409				} 3410			} 3411 3412			/* 3413			 * Add the reference before calling vm_object_shadow 3414			 * to ensure that a shadow object is created. 3415			 */ 3416			vm_object_reference(object); 3417			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { 3418				vm_object_shadow(&old_entry->object.vm_object, 3419				    &old_entry->offset, 3420				    old_entry->end - old_entry->start); 3421				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; 3422				/* Transfer the second reference too. */ 3423				vm_object_reference( 3424				    old_entry->object.vm_object); 3425 3426				/* 3427				 * As in vm_map_simplify_entry(), the 3428				 * vnode lock will not be acquired in 3429				 * this call to vm_object_deallocate(). 3430				 */ 3431				vm_object_deallocate(object); 3432				object = old_entry->object.vm_object; 3433			} 3434			VM_OBJECT_WLOCK(object); 3435			vm_object_clear_flag(object, OBJ_ONEMAPPING); 3436			if (old_entry->cred != NULL) { 3437				KASSERT(object->cred == NULL, ("vmspace_fork both cred")); 3438				object->cred = old_entry->cred; 3439				object->charge = old_entry->end - old_entry->start; 3440				old_entry->cred = NULL; 3441			} 3442 3443			/* 3444			 * Assert the correct state of the vnode 3445			 * v_writecount while the object is locked, so 3446			 * that it need not be relocked later just for 3447			 * the assertion. 3448			 */ 3449			if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT && 3450			    object->type == OBJT_VNODE) { 3451				KASSERT(((struct vnode *)object->handle)-> 3452				    v_writecount > 0, 3453				    ("vmspace_fork: v_writecount %p", object)); 3454				KASSERT(object->un_pager.vnp.writemappings > 0, 3455				    ("vmspace_fork: vnp.writecount %p", 3456				    object)); 3457			} 3458			VM_OBJECT_WUNLOCK(object); 3459 3460			/* 3461			 * Clone the entry, referencing the shared object. 3462			 */ 3463			new_entry = vm_map_entry_create(new_map); 3464			*new_entry = *old_entry; 3465			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | 3466			    MAP_ENTRY_IN_TRANSITION); 3467			new_entry->wiring_thread = NULL; 3468			new_entry->wired_count = 0; 3469			if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) { 3470				vnode_pager_update_writecount(object, 3471				    new_entry->start, new_entry->end); 3472			} 3473 3474			/* 3475			 * Insert the entry into the new map -- we know we're 3476			 * inserting at the end of the new map.
3477 */ 3478 vm_map_entry_link(new_map, new_map->header.prev, 3479 new_entry); 3480 vmspace_map_entry_forked(vm1, vm2, new_entry); 3481 3482 /* 3483 * Update the physical map 3484 */ 3485 pmap_copy(new_map->pmap, old_map->pmap, 3486 new_entry->start, 3487 (old_entry->end - old_entry->start), 3488 old_entry->start); 3489 break; 3490 3491 case VM_INHERIT_COPY: 3492 /* 3493 * Clone the entry and link into the map. 3494 */ 3495 new_entry = vm_map_entry_create(new_map); 3496 *new_entry = *old_entry; 3497 /* 3498 * Copied entry is COW over the old object. 3499 */ 3500 new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED | 3501 MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT); 3502 new_entry->wiring_thread = NULL; 3503 new_entry->wired_count = 0; 3504 new_entry->object.vm_object = NULL; 3505 new_entry->cred = NULL; 3506 vm_map_entry_link(new_map, new_map->header.prev, 3507 new_entry); 3508 vmspace_map_entry_forked(vm1, vm2, new_entry); 3509 vm_map_copy_entry(old_map, new_map, old_entry, 3510 new_entry, fork_charge); 3511 break; 3512 3513 case VM_INHERIT_ZERO: 3514 /* 3515 * Create a new anonymous mapping entry modelled from 3516 * the old one. 3517 */ 3518 new_entry = vm_map_entry_create(new_map); 3519 memset(new_entry, 0, sizeof(*new_entry)); 3520 3521 new_entry->start = old_entry->start; 3522 new_entry->end = old_entry->end; 3523 new_entry->eflags = old_entry->eflags & 3524 ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | 3525 MAP_ENTRY_VN_WRITECNT); 3526 new_entry->protection = old_entry->protection; 3527 new_entry->max_protection = old_entry->max_protection; 3528 new_entry->inheritance = VM_INHERIT_ZERO; 3529 3530 vm_map_entry_link(new_map, new_map->header.prev, 3531 new_entry); 3532 vmspace_map_entry_forked(vm1, vm2, new_entry); 3533 3534 new_entry->cred = curthread->td_ucred; 3535 crhold(new_entry->cred); 3536 *fork_charge += (new_entry->end - new_entry->start); 3537 3538 break; 3539 } 3540 old_entry = old_entry->next; 3541 } 3542 /* 3543 * Use inlined vm_map_unlock() to postpone handling the deferred 3544 * map entries, which cannot be done until both old_map and 3545 * new_map locks are released. 3546 */ 3547 sx_xunlock(&old_map->lock); 3548 sx_xunlock(&new_map->lock); 3549 vm_map_process_deferred(); 3550 3551 return (vm2); 3552} 3553 3554/* 3555 * Create a process's stack for exec_new_vmspace(). This function is never 3556 * asked to wire the newly created stack. 3557 */ 3558int 3559vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, 3560 vm_prot_t prot, vm_prot_t max, int cow) 3561{ 3562 vm_size_t growsize, init_ssize; 3563 rlim_t vmemlim; 3564 int rv; 3565 3566 MPASS((map->flags & MAP_WIREFUTURE) == 0); 3567 growsize = sgrowsiz; 3568 init_ssize = (max_ssize < growsize) ? 
max_ssize : growsize; 3569 vm_map_lock(map); 3570 vmemlim = lim_cur(curthread, RLIMIT_VMEM); 3571 /* If we would blow our VMEM resource limit, no go */ 3572 if (map->size + init_ssize > vmemlim) { 3573 rv = KERN_NO_SPACE; 3574 goto out; 3575 } 3576 rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot, 3577 max, cow); 3578out: 3579 vm_map_unlock(map); 3580 return (rv); 3581} 3582 3583static int stack_guard_page = 1; 3584SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN, 3585 &stack_guard_page, 0, 3586 "Specifies the number of guard pages for a stack that grows"); 3587 3588static int 3589vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, 3590 vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow) 3591{ 3592 vm_map_entry_t new_entry, prev_entry; 3593 vm_offset_t bot, gap_bot, gap_top, top; 3594 vm_size_t init_ssize, sgp; 3595 int orient, rv; 3596 3597 /* 3598 * The stack orientation is piggybacked with the cow argument. 3599 * Extract it into orient and mask the cow argument so that we 3600 * don't pass it around further. 3601 */ 3602 orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP); 3603 KASSERT(orient != 0, ("No stack grow direction")); 3604 KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP), 3605 ("bi-dir stack")); 3606 3607 if (addrbos < vm_map_min(map) || 3608 addrbos + max_ssize > vm_map_max(map) || 3609 addrbos + max_ssize <= addrbos) 3610 return (KERN_INVALID_ADDRESS); 3611 sgp = (vm_size_t)stack_guard_page * PAGE_SIZE; 3612 if (sgp >= max_ssize) 3613 return (KERN_INVALID_ARGUMENT); 3614 3615 init_ssize = growsize; 3616 if (max_ssize < init_ssize + sgp) 3617 init_ssize = max_ssize - sgp; 3618 3619 /* If addr is already mapped, no go */ 3620 if (vm_map_lookup_entry(map, addrbos, &prev_entry)) 3621 return (KERN_NO_SPACE); 3622 3623 /* 3624 * If we can't accommodate max_ssize in the current mapping, no go. 3625 */ 3626 if (prev_entry->next->start < addrbos + max_ssize) 3627 return (KERN_NO_SPACE); 3628 3629 /* 3630 * We initially map a stack of only init_ssize. We will grow as 3631 * needed later. Depending on the orientation of the stack (i.e. 3632 * the grow direction) we either map at the top of the range, the 3633 * bottom of the range or in the middle. 3634 * 3635 * Note: we would normally expect prot and max to be VM_PROT_ALL, 3636 * and cow to be 0. Possibly we should eliminate these as input 3637 * parameters, and just pass these values here in the insert call. 3638 */ 3639 if (orient == MAP_STACK_GROWS_DOWN) { 3640 bot = addrbos + max_ssize - init_ssize; 3641 top = bot + init_ssize; 3642 gap_bot = addrbos; 3643 gap_top = bot; 3644 } else /* if (orient == MAP_STACK_GROWS_UP) */ { 3645 bot = addrbos; 3646 top = bot + init_ssize; 3647 gap_bot = top; 3648 gap_top = addrbos + max_ssize; 3649 } 3650 rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); 3651 if (rv != KERN_SUCCESS) 3652 return (rv); 3653 new_entry = prev_entry->next; 3654 KASSERT(new_entry->end == top || new_entry->start == bot, 3655 ("Bad entry start/end for new stack entry")); 3656 KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 || 3657 (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0, 3658 ("new entry lacks MAP_ENTRY_GROWS_DOWN")); 3659 KASSERT((orient & MAP_STACK_GROWS_UP) == 0 || 3660 (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0, 3661 ("new entry lacks MAP_ENTRY_GROWS_UP")); 3662 rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE, 3663 VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ? 
3664 MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP)); 3665 if (rv != KERN_SUCCESS) 3666 (void)vm_map_delete(map, bot, top); 3667 return (rv); 3668} 3669 3670/* 3671 * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we 3672 * successfully grow the stack. 3673 */ 3674static int 3675vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry) 3676{ 3677 vm_map_entry_t stack_entry; 3678 struct proc *p; 3679 struct vmspace *vm; 3680 struct ucred *cred; 3681 vm_offset_t gap_end, gap_start, grow_start; 3682 size_t grow_amount, guard, max_grow; 3683 rlim_t lmemlim, stacklim, vmemlim; 3684 int rv, rv1; 3685 bool gap_deleted, grow_down, is_procstack; 3686#ifdef notyet 3687 uint64_t limit; 3688#endif 3689#ifdef RACCT 3690 int error; 3691#endif 3692 3693 p = curproc; 3694 vm = p->p_vmspace; 3695 3696 /* 3697 * Disallow stack growth when the access is performed by a 3698 * debugger or AIO daemon. The reason is that the wrong 3699 * resource limits are applied. 3700 */ 3701 if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL) 3702 return (KERN_FAILURE); 3703 3704 MPASS(!map->system_map); 3705 3706 guard = stack_guard_page * PAGE_SIZE; 3707 lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK); 3708 stacklim = lim_cur(curthread, RLIMIT_STACK); 3709 vmemlim = lim_cur(curthread, RLIMIT_VMEM); 3710retry: 3711 /* If addr is not in a hole for a stack grow area, no need to grow. */ 3712 if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry)) 3713 return (KERN_FAILURE); 3714 if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0) 3715 return (KERN_SUCCESS); 3716 if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) { 3717 stack_entry = gap_entry->next; 3718 if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 || 3719 stack_entry->start != gap_entry->end) 3720 return (KERN_FAILURE); 3721 grow_amount = round_page(stack_entry->start - addr); 3722 grow_down = true; 3723 } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) { 3724 stack_entry = gap_entry->prev; 3725 if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 || 3726 stack_entry->end != gap_entry->start) 3727 return (KERN_FAILURE); 3728 grow_amount = round_page(addr + 1 - stack_entry->end); 3729 grow_down = false; 3730 } else { 3731 return (KERN_FAILURE); 3732 } 3733 max_grow = gap_entry->end - gap_entry->start; 3734 if (guard > max_grow) 3735 return (KERN_NO_SPACE); 3736 max_grow -= guard; 3737 if (grow_amount > max_grow) 3738 return (KERN_NO_SPACE); 3739 3740 /* 3741 * If this is the main process stack, see if we're over the stack 3742 * limit. 
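 *
 * For example (illustrative numbers, 4KB pages): with RLIMIT_STACK at
 * 8MB and vm_ssize currently 1536 pages (6MB), a fault asking to grow
 * by 4MB fails here because 6MB + 4MB > 8MB, while a 1MB request
 * passes and is later rounded up to sgrowsiz and clamped so the
 * committed stack never exceeds the limit.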
3743 */ 3744 is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr && 3745 addr < (vm_offset_t)p->p_sysent->sv_usrstack; 3746 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) 3747 return (KERN_NO_SPACE); 3748 3749#ifdef RACCT 3750 if (racct_enable) { 3751 PROC_LOCK(p); 3752 if (is_procstack && racct_set(p, RACCT_STACK, 3753 ctob(vm->vm_ssize) + grow_amount)) { 3754 PROC_UNLOCK(p); 3755 return (KERN_NO_SPACE); 3756 } 3757 PROC_UNLOCK(p); 3758 } 3759#endif 3760 3761 grow_amount = roundup(grow_amount, sgrowsiz); 3762 if (grow_amount > max_grow) 3763 grow_amount = max_grow; 3764 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { 3765 grow_amount = trunc_page((vm_size_t)stacklim) - 3766 ctob(vm->vm_ssize); 3767 } 3768 3769#ifdef notyet 3770 PROC_LOCK(p); 3771 limit = racct_get_available(p, RACCT_STACK); 3772 PROC_UNLOCK(p); 3773 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit)) 3774 grow_amount = limit - ctob(vm->vm_ssize); 3775#endif 3776 3777 if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) { 3778 if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) { 3779 rv = KERN_NO_SPACE; 3780 goto out; 3781 } 3782#ifdef RACCT 3783 if (racct_enable) { 3784 PROC_LOCK(p); 3785 if (racct_set(p, RACCT_MEMLOCK, 3786 ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { 3787 PROC_UNLOCK(p); 3788 rv = KERN_NO_SPACE; 3789 goto out; 3790 } 3791 PROC_UNLOCK(p); 3792 } 3793#endif 3794 } 3795 3796 /* If we would blow our VMEM resource limit, no go */ 3797 if (map->size + grow_amount > vmemlim) { 3798 rv = KERN_NO_SPACE; 3799 goto out; 3800 } 3801#ifdef RACCT 3802 if (racct_enable) { 3803 PROC_LOCK(p); 3804 if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { 3805 PROC_UNLOCK(p); 3806 rv = KERN_NO_SPACE; 3807 goto out; 3808 } 3809 PROC_UNLOCK(p); 3810 } 3811#endif 3812 3813 if (vm_map_lock_upgrade(map)) { 3814 gap_entry = NULL; 3815 vm_map_lock_read(map); 3816 goto retry; 3817 } 3818 3819 if (grow_down) { 3820 grow_start = gap_entry->end - grow_amount; 3821 if (gap_entry->start + grow_amount == gap_entry->end) { 3822 gap_start = gap_entry->start; 3823 gap_end = gap_entry->end; 3824 vm_map_entry_delete(map, gap_entry); 3825 gap_deleted = true; 3826 } else { 3827 MPASS(gap_entry->start < gap_entry->end - grow_amount); 3828 gap_entry->end -= grow_amount; 3829 vm_map_entry_resize_free(map, gap_entry); 3830 gap_deleted = false; 3831 } 3832 rv = vm_map_insert(map, NULL, 0, grow_start, 3833 grow_start + grow_amount, 3834 stack_entry->protection, stack_entry->max_protection, 3835 MAP_STACK_GROWS_DOWN); 3836 if (rv != KERN_SUCCESS) { 3837 if (gap_deleted) { 3838 rv1 = vm_map_insert(map, NULL, 0, gap_start, 3839 gap_end, VM_PROT_NONE, VM_PROT_NONE, 3840 MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN); 3841 MPASS(rv1 == KERN_SUCCESS); 3842 } else { 3843 gap_entry->end += grow_amount; 3844 vm_map_entry_resize_free(map, gap_entry); 3845 } 3846 } 3847 } else { 3848 grow_start = stack_entry->end; 3849 cred = stack_entry->cred; 3850 if (cred == NULL && stack_entry->object.vm_object != NULL) 3851 cred = stack_entry->object.vm_object->cred; 3852 if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred)) 3853 rv = KERN_NO_SPACE; 3854 /* Grow the underlying object if applicable. 
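 *
 * Here vm_object_coalesce() attempts to extend the stack entry's
 * anonymous backing object in place so that the grown range
 * [stack_entry->end, stack_entry->end + grow_amount) is backed by the
 * same object; on success the gap entry above the stack is consumed
 * from its low end (or deleted outright) and stack_entry->end moves up
 * by grow_amount.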
		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
			rv = KERN_NO_SPACE;
		/* Grow the underlying object if applicable. */
		else if (stack_entry->object.vm_object == NULL ||
		    vm_object_coalesce(stack_entry->object.vm_object,
		    stack_entry->offset,
		    (vm_size_t)(stack_entry->end - stack_entry->start),
		    (vm_size_t)grow_amount, cred != NULL)) {
			if (gap_entry->start + grow_amount == gap_entry->end)
				vm_map_entry_delete(map, gap_entry);
			else
				gap_entry->start += grow_amount;
			stack_entry->end += grow_amount;
			map->size += grow_amount;
			vm_map_entry_resize_free(map, stack_entry);
			rv = KERN_SUCCESS;
		} else
			rv = KERN_FAILURE;
	}
	if (rv == KERN_SUCCESS && is_procstack)
		vm->vm_ssize += btoc(grow_amount);

	/*
	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
	 */
	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
		vm_map_unlock(map);
		vm_map_wire(map, grow_start, grow_start + grow_amount,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		vm_map_lock_read(map);
	} else
		vm_map_lock_downgrade(map);

out:
#ifdef RACCT
	if (racct_enable && rv != KERN_SUCCESS) {
		PROC_LOCK(p);
		error = racct_set(p, RACCT_VMEM, map->size);
		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
		if (!old_mlock) {
			error = racct_set(p, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)));
			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
		}
		error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
		PROC_UNLOCK(p);
	}
#endif

	return (rv);
}

/*
 * Unshare the specified VM space for exec.  If other processes are
 * mapped to it, then create a new one.  The new vmspace is empty (it
 * has no map entries yet).
 */
int
vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
{
	struct vmspace *oldvmspace = p->p_vmspace;
	struct vmspace *newvmspace;

	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
	    ("vmspace_exec recursed"));
	newvmspace = vmspace_alloc(minuser, maxuser, NULL);
	if (newvmspace == NULL)
		return (ENOMEM);
	newvmspace->vm_swrss = oldvmspace->vm_swrss;
	/*
	 * This code is written this way for prototype purposes.  The
	 * goal is to avoid running down the vmspace here, but to let the
	 * other processes that are still using the vmspace eventually
	 * run it down.  Even though there is little or no chance of blocking
	 * here, it is a good idea to keep this form for future mods.
	 */
	PROC_VMSPACE_LOCK(p);
	p->p_vmspace = newvmspace;
	PROC_VMSPACE_UNLOCK(p);
	if (p == curthread->td_proc)
		pmap_activate(curthread);
	curthread->td_pflags |= TDP_EXECVMSPC;
	return (0);
}
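
/*
 * Illustrative sketch (hypothetical helper, not part of this file): how
 * an exec path is expected to use vmspace_exec().  The real consumer is
 * exec_new_vmspace() in kern/kern_exec.c; the error handling here is a
 * simplified assumption.
 */
#if 0
static int
example_exec_switch_vmspace(struct proc *p, struct sysentvec *sv)
{
	int error;

	error = vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
	if (error != 0)
		return (error);
	/*
	 * The new, empty vmspace is now installed; the exec path builds
	 * the image in it and, once the exec commits, clears
	 * TDP_EXECVMSPC and drops the reference on the old vmspace.
	 */
	return (0);
}
#endif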

/*
 * Unshare the specified VM space for forcing COW.  This
 * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
 */
int
vmspace_unshare(struct proc *p)
{
	struct vmspace *oldvmspace = p->p_vmspace;
	struct vmspace *newvmspace;
	vm_ooffset_t fork_charge;

	if (oldvmspace->vm_refcnt == 1)
		return (0);
	fork_charge = 0;
	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
	if (newvmspace == NULL)
		return (ENOMEM);
	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
		vmspace_free(newvmspace);
		return (ENOMEM);
	}
	PROC_VMSPACE_LOCK(p);
	p->p_vmspace = newvmspace;
	PROC_VMSPACE_UNLOCK(p);
	if (p == curthread->td_proc)
		pmap_activate(curthread);
	vmspace_free(oldvmspace);
	return (0);
}

/*
 * vm_map_lookup:
 *
 *	Finds the VM object, offset, and
 *	protection for a given virtual address in the
 *	specified map, assuming a page fault of the
 *	type specified.
 *
 *	Leaves the map in question locked for read; return
 *	values are guaranteed until a vm_map_lookup_done
 *	call is performed.  Note that the map argument
 *	is in/out; the returned map must be used in
 *	the call to vm_map_lookup_done.
 *
 *	A handle (out_entry) is returned for use in
 *	vm_map_lookup_done, to make that fast.
 *
 *	If a lookup is requested with "write protection"
 *	specified, the map may be changed to perform virtual
 *	copying operations, although the data referenced will
 *	remain the same.
 */
int
vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
	      vm_offset_t vaddr,
	      vm_prot_t fault_typea,
	      vm_map_entry_t *out_entry,	/* OUT */
	      vm_object_t *object,		/* OUT */
	      vm_pindex_t *pindex,		/* OUT */
	      vm_prot_t *out_prot,		/* OUT */
	      boolean_t *wired)			/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type = fault_typea;
	vm_object_t eobject;
	vm_size_t size;
	struct ucred *cred;

RetryLookup:

	vm_map_lock_read(map);

RetryLookupLocked:
	/*
	 * Lookup the faulting address.
	 */
	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
		vm_map_unlock_read(map);
		return (KERN_INVALID_ADDRESS);
	}

	entry = *out_entry;

	/*
	 * Handle submaps.
	 */
	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
		vm_map_t old_map = map;

		*var_map = map = entry->object.sub_map;
		vm_map_unlock_read(old_map);
		goto RetryLookup;
	}
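
	/*
	 * If a submap was found above, "map" and *var_map now name that
	 * submap and the lookup restarted with it read-locked, so the
	 * caller's eventual vm_map_lookup_done() unlocks the map that is
	 * actually held.
	 */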

	/*
	 * Check whether this task is allowed to have this page.
	 */
	prot = entry->protection;
	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
		if (prot == VM_PROT_NONE && map != kernel_map &&
		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
		    (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
		    MAP_ENTRY_STACK_GAP_UP)) != 0 &&
		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
			goto RetryLookupLocked;
	}
	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}
	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
	    ("entry %p flags %x", entry, entry->eflags));
	if ((fault_typea & VM_PROT_COPY) != 0 &&
	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
	    (entry->eflags & MAP_ENTRY_COW) == 0) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}

	/*
	 * If this page is not pageable, we have to get it for all possible
	 * accesses.
	 */
	*wired = (entry->wired_count != 0);
	if (*wired)
		fault_type = entry->protection;
	size = entry->end - entry->start;
	/*
	 * If the entry was copy-on-write, we either ...
	 */
	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * If we want to write the page, we may as well handle that
		 * now since we've got the map locked.
		 *
		 * If we don't need to write the page, we just demote the
		 * permissions allowed.
		 */
		if ((fault_type & VM_PROT_WRITE) != 0 ||
		    (fault_typea & VM_PROT_COPY) != 0) {
			/*
			 * Make a new object, and place it in the object
			 * chain.  Note that no new references have appeared
			 * -- one just moved from the map to the new
			 * object.
			 */
			if (vm_map_lock_upgrade(map))
				goto RetryLookup;

			if (entry->cred == NULL) {
				/*
				 * The debugger owner is charged for
				 * the memory.
				 */
				cred = curthread->td_ucred;
				crhold(cred);
				if (!swap_reserve_by_cred(size, cred)) {
					crfree(cred);
					vm_map_unlock(map);
					return (KERN_RESOURCE_SHORTAGE);
				}
				entry->cred = cred;
			}
			vm_object_shadow(&entry->object.vm_object,
			    &entry->offset, size);
			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
			eobject = entry->object.vm_object;
			if (eobject->cred != NULL) {
				/*
				 * The object was not shadowed.
				 */
				swap_release_by_cred(size, entry->cred);
				crfree(entry->cred);
				entry->cred = NULL;
			} else if (entry->cred != NULL) {
				VM_OBJECT_WLOCK(eobject);
				eobject->cred = entry->cred;
				eobject->charge = size;
				VM_OBJECT_WUNLOCK(eobject);
				entry->cred = NULL;
			}

			vm_map_lock_downgrade(map);
		} else {
			/*
			 * We're attempting to read a copy-on-write page --
			 * don't allow writes.
			 */
			prot &= ~VM_PROT_WRITE;
		}
	}
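
	/*
	 * Note that a read fault on a NEEDS_COPY entry takes the cheaper
	 * path above: the entry keeps its flag and only loses
	 * VM_PROT_WRITE in the returned protection, so a later write
	 * fault will re-enter this function and perform the
	 * shadow-object copy.
	 */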

	/*
	 * Create an object if necessary.
	 */
	if (entry->object.vm_object == NULL &&
	    !map->system_map) {
		if (vm_map_lock_upgrade(map))
			goto RetryLookup;
		entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
		    atop(size));
		entry->offset = 0;
		if (entry->cred != NULL) {
			VM_OBJECT_WLOCK(entry->object.vm_object);
			entry->object.vm_object->cred = entry->cred;
			entry->object.vm_object->charge = size;
			VM_OBJECT_WUNLOCK(entry->object.vm_object);
			entry->cred = NULL;
		}
		vm_map_lock_downgrade(map);
	}

	/*
	 * Return the object/offset from this entry.  If the entry was
	 * copy-on-write or empty, it has been fixed up.
	 */
	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
	*object = entry->object.vm_object;

	*out_prot = prot;
	return (KERN_SUCCESS);
}

/*
 * vm_map_lookup_locked:
 *
 *	Lookup the faulting address.  A version of vm_map_lookup that returns
 *	KERN_FAILURE instead of blocking on map lock or memory allocation.
 */
int
vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
		     vm_offset_t vaddr,
		     vm_prot_t fault_typea,
		     vm_map_entry_t *out_entry,	/* OUT */
		     vm_object_t *object,	/* OUT */
		     vm_pindex_t *pindex,	/* OUT */
		     vm_prot_t *out_prot,	/* OUT */
		     boolean_t *wired)		/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type = fault_typea;

	/*
	 * Lookup the faulting address.
	 */
	if (!vm_map_lookup_entry(map, vaddr, out_entry))
		return (KERN_INVALID_ADDRESS);

	entry = *out_entry;

	/*
	 * Fail if the entry refers to a submap.
	 */
	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
		return (KERN_FAILURE);

	/*
	 * Check whether this task is allowed to have this page.
	 */
	prot = entry->protection;
	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
	if ((fault_type & prot) != fault_type)
		return (KERN_PROTECTION_FAILURE);

	/*
	 * If this page is not pageable, we have to get it for all possible
	 * accesses.
	 */
	*wired = (entry->wired_count != 0);
	if (*wired)
		fault_type = entry->protection;

	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * Fail if the entry was copy-on-write for a write fault.
		 */
		if (fault_type & VM_PROT_WRITE)
			return (KERN_FAILURE);
		/*
		 * We're attempting to read a copy-on-write page --
		 * don't allow writes.
		 */
		prot &= ~VM_PROT_WRITE;
	}

	/*
	 * Fail if an object should be created.
	 */
	if (entry->object.vm_object == NULL && !map->system_map)
		return (KERN_FAILURE);

	/*
	 * Return the object/offset from this entry.  If the entry was
	 * copy-on-write or empty, it has been fixed up.
	 */
	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
	*object = entry->object.vm_object;

	*out_prot = prot;
	return (KERN_SUCCESS);
}
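
/*
 * Illustrative sketch (hypothetical helper, not part of this file): the
 * expected pairing of vm_map_lookup() with vm_map_lookup_done(), loosely
 * modeled on the page-fault handler's usage.
 */
#if 0
static int
example_resolve_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type)
{
	vm_map_entry_t entry;
	vm_object_t object;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;
	int rv;

	rv = vm_map_lookup(&map, vaddr, fault_type, &entry, &object,
	    &pindex, &prot, &wired);
	if (rv != KERN_SUCCESS)
		return (rv);
	/*
	 * "map" may now name a submap and is read-locked; object,
	 * pindex, and prot remain valid until vm_map_lookup_done() is
	 * called on the same map/entry pair.
	 */
	vm_map_lookup_done(map, entry);
	return (KERN_SUCCESS);
}
#endif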

/*
 * vm_map_lookup_done:
 *
 *	Releases locks acquired by a vm_map_lookup
 *	(according to the handle returned by that lookup).
 */
void
vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
{
	/*
	 * Unlock the main-level map
	 */
	vm_map_unlock_read(map);
}

#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>

#include <ddb/ddb.h>

static void
vm_map_print(vm_map_t map)
{
	vm_map_entry_t entry;

	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
	    (void *)map,
	    (void *)map->pmap, map->nentries, map->timestamp);

	db_indent += 2;
	for (entry = map->header.next; entry != &map->header;
	    entry = entry->next) {
		db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x\n",
		    (void *)entry, (void *)entry->start, (void *)entry->end,
		    entry->eflags);
		{
			static char *inheritance_name[4] =
			    {"share", "copy", "none", "donate_copy"};

			db_iprintf(" prot=%x/%x/%s",
			    entry->protection,
			    entry->max_protection,
			    inheritance_name[(int)(unsigned char)entry->inheritance]);
			if (entry->wired_count != 0)
				db_printf(", wired");
		}
		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
			db_printf(", share=%p, offset=0x%jx\n",
			    (void *)entry->object.sub_map,
			    (uintmax_t)entry->offset);
			if ((entry->prev == &map->header) ||
			    (entry->prev->object.sub_map !=
				entry->object.sub_map)) {
				db_indent += 2;
				vm_map_print((vm_map_t)entry->object.sub_map);
				db_indent -= 2;
			}
		} else {
			if (entry->cred != NULL)
				db_printf(", ruid %d", entry->cred->cr_ruid);
			db_printf(", object=%p, offset=0x%jx",
			    (void *)entry->object.vm_object,
			    (uintmax_t)entry->offset);
			if (entry->object.vm_object && entry->object.vm_object->cred)
				db_printf(", obj ruid %d charge %jx",
				    entry->object.vm_object->cred->cr_ruid,
				    (uintmax_t)entry->object.vm_object->charge);
			if (entry->eflags & MAP_ENTRY_COW)
				db_printf(", copy (%s)",
				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
			db_printf("\n");

			if ((entry->prev == &map->header) ||
			    (entry->prev->object.vm_object !=
				entry->object.vm_object)) {
				db_indent += 2;
				vm_object_print((db_expr_t)(intptr_t)
				    entry->object.vm_object,
				    0, 0, (char *)0);
				db_indent -= 2;
			}
		}
	}
	db_indent -= 2;
}

DB_SHOW_COMMAND(map, map)
{

	if (!have_addr) {
		db_printf("usage: show map <addr>\n");
		return;
	}
	vm_map_print((vm_map_t)addr);
}

DB_SHOW_COMMAND(procvm, procvm)
{
	struct proc *p;

	if (have_addr) {
		p = db_lookup_proc(addr);
	} else {
		p = curproc;
	}

	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
	    (void *)vmspace_pmap(p->p_vmspace));

	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
}

#endif /* DDB */
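
/*
 * Example usage of the DDB commands defined above (illustrative; the
 * map address passed to "show map" would come from, e.g., the output
 * of a prior "show procvm"):
 *
 *	db> show procvm
 *	db> show map <map-address>
 */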