1/* 2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ 29/* 30 * Copyright (c) 1989, 1993 31 * The Regents of the University of California. All rights reserved. 32 * (c) UNIX System Laboratories, Inc. 33 * All or some portions of this file are derived from material licensed 34 * to the University of California by American Telephone and Telegraph 35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 36 * the permission of UNIX System Laboratories, Inc. 
37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. All advertising materials mentioning features or use of this software 47 * must display the following acknowledgement: 48 * This product includes software developed by the University of 49 * California, Berkeley and its contributors. 50 * 4. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 67 */ 68/* 69 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce 70 * support for mandatory and extensible security protections. 
This notice 71 * is included in support of clause 2.2 (b) of the Apple Public License, 72 * Version 2.0. 73 */ 74 75/* 76 * External virtual filesystem routines 77 */ 78 79 80#include <sys/param.h> 81#include <sys/systm.h> 82#include <sys/proc_internal.h> 83#include <sys/kauth.h> 84#include <sys/mount_internal.h> 85#include <sys/time.h> 86#include <sys/lock.h> 87#include <sys/vnode.h> 88#include <sys/vnode_internal.h> 89#include <sys/stat.h> 90#include <sys/namei.h> 91#include <sys/ucred.h> 92#include <sys/buf_internal.h> 93#include <sys/errno.h> 94#include <sys/malloc.h> 95#include <sys/uio_internal.h> 96#include <sys/uio.h> 97#include <sys/domain.h> 98#include <sys/mbuf.h> 99#include <sys/syslog.h> 100#include <sys/ubc_internal.h> 101#include <sys/vm.h> 102#include <sys/sysctl.h> 103#include <sys/filedesc.h> 104#include <sys/event.h> 105#include <sys/kdebug.h> 106#include <sys/kauth.h> 107#include <sys/user.h> 108#include <sys/systm.h> 109#include <sys/kern_memorystatus.h> 110#include <sys/lockf.h> 111#include <miscfs/fifofs/fifo.h> 112 113#include <string.h> 114#include <machine/spl.h> 115 116 117#include <kern/assert.h> 118#include <mach/kern_return.h> 119#include <kern/thread.h> 120#include <kern/sched_prim.h> 121 122#include <miscfs/specfs/specdev.h> 123 124#include <mach/mach_types.h> 125#include <mach/memory_object_types.h> 126#include <mach/memory_object_control.h> 127 128#include <kern/kalloc.h> /* kalloc()/kfree() */ 129#include <kern/clock.h> /* delay_for_interval() */ 130#include <libkern/OSAtomic.h> /* OSAddAtomic() */ 131 132 133#ifdef JOE_DEBUG 134#include <libkern/OSDebug.h> 135#endif 136 137#include <vm/vm_protos.h> /* vnode_pager_vrele() */ 138 139#if CONFIG_MACF 140#include <security/mac_framework.h> 141#endif 142 143extern lck_grp_t *vnode_lck_grp; 144extern lck_attr_t *vnode_lck_attr; 145 146#if CONFIG_TRIGGERS 147extern lck_grp_t *trigger_vnode_lck_grp; 148extern lck_attr_t *trigger_vnode_lck_attr; 149#endif 150 151extern lck_mtx_t * 
mnt_list_mtx_lock; 152 153enum vtype iftovt_tab[16] = { 154 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 155 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 156}; 157int vttoif_tab[9] = { 158 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 159 S_IFSOCK, S_IFIFO, S_IFMT, 160}; 161 162 163/* XXX These should be in a BSD accessible Mach header, but aren't. */ 164extern void memory_object_mark_used( 165 memory_object_control_t control); 166 167extern void memory_object_mark_unused( 168 memory_object_control_t control, 169 boolean_t rage); 170 171 172/* XXX next protptype should be from <nfs/nfs.h> */ 173extern int nfs_vinvalbuf(vnode_t, int, vfs_context_t, int); 174 175/* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */ 176__private_extern__ void qsort( 177 void * array, 178 size_t nmembers, 179 size_t member_size, 180 int (*)(const void *, const void *)); 181 182extern kern_return_t adjust_vm_object_cache(vm_size_t oval, vm_size_t nval); 183__private_extern__ void vntblinit(void); 184__private_extern__ kern_return_t reset_vmobjectcache(unsigned int val1, 185 unsigned int val2); 186__private_extern__ int unlink1(vfs_context_t, struct nameidata *, int); 187 188extern int system_inshutdown; 189 190static void vnode_list_add(vnode_t); 191static void vnode_async_list_add(vnode_t); 192static void vnode_list_remove(vnode_t); 193static void vnode_list_remove_locked(vnode_t); 194 195static void vnode_abort_advlocks(vnode_t); 196static errno_t vnode_drain(vnode_t); 197static void vgone(vnode_t, int flags); 198static void vclean(vnode_t vp, int flag); 199static void vnode_reclaim_internal(vnode_t, int, int, int); 200 201static void vnode_dropiocount (vnode_t); 202 203static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev); 204static int vnode_reload(vnode_t); 205static int vnode_isinuse_locked(vnode_t, int, int); 206 207static void insmntque(vnode_t vp, mount_t mp); 208static int mount_getvfscnt(void); 209static int mount_fillfsids(fsid_t *, int ); 
210static void vnode_iterate_setup(mount_t); 211int vnode_umount_preflight(mount_t, vnode_t, int); 212static int vnode_iterate_prepare(mount_t); 213static int vnode_iterate_reloadq(mount_t); 214static void vnode_iterate_clear(mount_t); 215static mount_t vfs_getvfs_locked(fsid_t *); 216static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, 217 struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx); 218static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx); 219 220errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *); 221 222#ifdef JOE_DEBUG 223static void record_vp(vnode_t vp, int count); 224#endif 225 226#if CONFIG_TRIGGERS 227static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external); 228static void vnode_resolver_detach(vnode_t); 229#endif 230 231TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ 232TAILQ_HEAD(deadlst, vnode) vnode_dead_list; /* vnode dead list */ 233TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list; 234 235 236TAILQ_HEAD(ragelst, vnode) vnode_rage_list; /* vnode rapid age list */ 237struct timeval rage_tv; 238int rage_limit = 0; 239int ragevnodes = 0; 240 241#define RAGE_LIMIT_MIN 100 242#define RAGE_TIME_LIMIT 5 243 244struct mntlist mountlist; /* mounted filesystem list */ 245static int nummounts = 0; 246 247#if DIAGNOSTIC 248#define VLISTCHECK(fun, vp, list) \ 249 if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb) \ 250 panic("%s: %s vnode not on %slist", (fun), (list), (list)); 251#else 252#define VLISTCHECK(fun, vp, list) 253#endif /* DIAGNOSTIC */ 254 255#define VLISTNONE(vp) \ 256 do { \ 257 (vp)->v_freelist.tqe_next = (struct vnode *)0; \ 258 (vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb; \ 259 } while(0) 260 261#define VONLIST(vp) \ 262 ((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb) 263 264/* remove 
a vnode from free vnode list */ 265#define VREMFREE(fun, vp) \ 266 do { \ 267 VLISTCHECK((fun), (vp), "free"); \ 268 TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist); \ 269 VLISTNONE((vp)); \ 270 freevnodes--; \ 271 } while(0) 272 273 274/* remove a vnode from dead vnode list */ 275#define VREMDEAD(fun, vp) \ 276 do { \ 277 VLISTCHECK((fun), (vp), "dead"); \ 278 TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist); \ 279 VLISTNONE((vp)); \ 280 vp->v_listflag &= ~VLIST_DEAD; \ 281 deadvnodes--; \ 282 } while(0) 283 284 285/* remove a vnode from async work vnode list */ 286#define VREMASYNC_WORK(fun, vp) \ 287 do { \ 288 VLISTCHECK((fun), (vp), "async_work"); \ 289 TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist); \ 290 VLISTNONE((vp)); \ 291 vp->v_listflag &= ~VLIST_ASYNC_WORK; \ 292 async_work_vnodes--; \ 293 } while(0) 294 295 296/* remove a vnode from rage vnode list */ 297#define VREMRAGE(fun, vp) \ 298 do { \ 299 if ( !(vp->v_listflag & VLIST_RAGE)) \ 300 panic("VREMRAGE: vp not on rage list"); \ 301 VLISTCHECK((fun), (vp), "rage"); \ 302 TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist); \ 303 VLISTNONE((vp)); \ 304 vp->v_listflag &= ~VLIST_RAGE; \ 305 ragevnodes--; \ 306 } while(0) 307 308 309/* 310 * vnodetarget hasn't been used in a long time, but 311 * it was exported for some reason... I'm leaving in 312 * place for now... it should be deprecated out of the 313 * exports and removed eventually. 314 */ 315u_int32_t vnodetarget; /* target for vnreclaim() */ 316#define VNODE_FREE_TARGET 20 /* Default value for vnodetarget */ 317 318/* 319 * We need quite a few vnodes on the free list to sustain the 320 * rapid stat() the compilation process does, and still benefit from the name 321 * cache. Having too few vnodes on the free list causes serious disk 322 * thrashing as we cycle through them. 
323 */ 324#define VNODE_FREE_MIN CONFIG_VNODE_FREE_MIN /* freelist should have at least this many */ 325 326 327static void async_work_continue(void); 328 329/* 330 * Initialize the vnode management data structures. 331 */ 332__private_extern__ void 333vntblinit(void) 334{ 335 thread_t thread = THREAD_NULL; 336 337 TAILQ_INIT(&vnode_free_list); 338 TAILQ_INIT(&vnode_rage_list); 339 TAILQ_INIT(&vnode_dead_list); 340 TAILQ_INIT(&vnode_async_work_list); 341 TAILQ_INIT(&mountlist); 342 343 if (!vnodetarget) 344 vnodetarget = VNODE_FREE_TARGET; 345 346 microuptime(&rage_tv); 347 rage_limit = desiredvnodes / 100; 348 349 if (rage_limit < RAGE_LIMIT_MIN) 350 rage_limit = RAGE_LIMIT_MIN; 351 352 /* 353 * Scale the vm_object_cache to accomodate the vnodes 354 * we want to cache 355 */ 356 (void) adjust_vm_object_cache(0, desiredvnodes - VNODE_FREE_MIN); 357 358 /* 359 * create worker threads 360 */ 361 kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread); 362 thread_deallocate(thread); 363} 364 365/* Reset the VM Object Cache with the values passed in */ 366__private_extern__ kern_return_t 367reset_vmobjectcache(unsigned int val1, unsigned int val2) 368{ 369 vm_size_t oval = val1 - VNODE_FREE_MIN; 370 vm_size_t nval; 371 372 if (val1 == val2) { 373 return KERN_SUCCESS; 374 } 375 376 if(val2 < VNODE_FREE_MIN) 377 nval = 0; 378 else 379 nval = val2 - VNODE_FREE_MIN; 380 381 return(adjust_vm_object_cache(oval, nval)); 382} 383 384 385/* the timeout is in 10 msecs */ 386int 387vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg) { 388 int error = 0; 389 struct timespec ts; 390 391 KERNEL_DEBUG(0x3010280 | DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, 0, 0); 392 393 if (vp->v_numoutput > output_target) { 394 395 slpflag |= PDROP; 396 397 vnode_lock_spin(vp); 398 399 while ((vp->v_numoutput > output_target) && error == 0) { 400 if (output_target) 401 vp->v_flag |= VTHROTTLED; 402 else 403 vp->v_flag |= 
VBWAIT; 404 405 ts.tv_sec = (slptimeout/100); 406 ts.tv_nsec = (slptimeout % 1000) * 10 * NSEC_PER_USEC * 1000 ; 407 error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag | (PRIBIO + 1)), msg, &ts); 408 409 vnode_lock_spin(vp); 410 } 411 vnode_unlock(vp); 412 } 413 KERNEL_DEBUG(0x3010280 | DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, 0); 414 415 return error; 416} 417 418 419void 420vnode_startwrite(vnode_t vp) { 421 422 OSAddAtomic(1, &vp->v_numoutput); 423} 424 425 426void 427vnode_writedone(vnode_t vp) 428{ 429 if (vp) { 430 int need_wakeup = 0; 431 432 OSAddAtomic(-1, &vp->v_numoutput); 433 434 vnode_lock_spin(vp); 435 436 if (vp->v_numoutput < 0) 437 panic("vnode_writedone: numoutput < 0"); 438 439 if ((vp->v_flag & VTHROTTLED)) { 440 vp->v_flag &= ~VTHROTTLED; 441 need_wakeup = 1; 442 } 443 if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) { 444 vp->v_flag &= ~VBWAIT; 445 need_wakeup = 1; 446 } 447 vnode_unlock(vp); 448 449 if (need_wakeup) 450 wakeup((caddr_t)&vp->v_numoutput); 451 } 452} 453 454 455 456int 457vnode_hasdirtyblks(vnode_t vp) 458{ 459 struct cl_writebehind *wbp; 460 461 /* 462 * Not taking the buf_mtxp as there is little 463 * point doing it. Even if the lock is taken the 464 * state can change right after that. If their 465 * needs to be a synchronization, it must be driven 466 * by the caller 467 */ 468 if (vp->v_dirtyblkhd.lh_first) 469 return (1); 470 471 if (!UBCINFOEXISTS(vp)) 472 return (0); 473 474 wbp = vp->v_ubcinfo->cl_wbehind; 475 476 if (wbp && (wbp->cl_number || wbp->cl_scmap)) 477 return (1); 478 479 return (0); 480} 481 482int 483vnode_hascleanblks(vnode_t vp) 484{ 485 /* 486 * Not taking the buf_mtxp as there is little 487 * point doing it. Even if the lock is taken the 488 * state can change right after that. 
If their 489 * needs to be a synchronization, it must be driven 490 * by the caller 491 */ 492 if (vp->v_cleanblkhd.lh_first) 493 return (1); 494 return (0); 495} 496 497void 498vnode_iterate_setup(mount_t mp) 499{ 500 while (mp->mnt_lflag & MNT_LITER) { 501 mp->mnt_lflag |= MNT_LITERWAIT; 502 msleep((caddr_t)mp, &mp->mnt_mlock, PVFS, "vnode_iterate_setup", NULL); 503 } 504 505 mp->mnt_lflag |= MNT_LITER; 506 507} 508 509int 510vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags) 511{ 512 vnode_t vp; 513 514 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 515 /* disable preflight only for udf, a hack to be removed after 4073176 is fixed */ 516 if (vp->v_tag == VT_UDF) 517 return 0; 518 if (vp->v_type == VDIR) 519 continue; 520 if (vp == skipvp) 521 continue; 522 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || 523 (vp->v_flag & VNOFLUSH))) 524 continue; 525 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) 526 continue; 527 if ((flags & WRITECLOSE) && 528 (vp->v_writecount == 0 || vp->v_type != VREG)) 529 continue; 530 /* Look for busy vnode */ 531 if (((vp->v_usecount != 0) && 532 ((vp->v_usecount - vp->v_kusecount) != 0))) 533 return(1); 534 } 535 536 return(0); 537} 538 539/* 540 * This routine prepares iteration by moving all the vnodes to worker queue 541 * called with mount lock held 542 */ 543int 544vnode_iterate_prepare(mount_t mp) 545{ 546 vnode_t vp; 547 548 if (TAILQ_EMPTY(&mp->mnt_vnodelist)) { 549 /* nothing to do */ 550 return (0); 551 } 552 553 vp = TAILQ_FIRST(&mp->mnt_vnodelist); 554 vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first); 555 mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first; 556 mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last; 557 558 TAILQ_INIT(&mp->mnt_vnodelist); 559 if (mp->mnt_newvnodes.tqh_first != NULL) 560 panic("vnode_iterate_prepare: newvnode when entering vnode"); 561 TAILQ_INIT(&mp->mnt_newvnodes); 562 563 return (1); 564} 565 566 567/* called with mount lock held */ 568int 
569vnode_iterate_reloadq(mount_t mp) 570{ 571 int moved = 0; 572 573 /* add the remaining entries in workerq to the end of mount vnode list */ 574 if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { 575 struct vnode * mvp; 576 mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst); 577 578 /* Joining the workerque entities to mount vnode list */ 579 if (mvp) 580 mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first; 581 else 582 mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first; 583 mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last; 584 mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last; 585 TAILQ_INIT(&mp->mnt_workerqueue); 586 } 587 588 /* add the newvnodes to the head of mount vnode list */ 589 if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) { 590 struct vnode * nlvp; 591 nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst); 592 593 mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first; 594 nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first; 595 if(mp->mnt_vnodelist.tqh_first) 596 mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next; 597 else 598 mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last; 599 mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first; 600 TAILQ_INIT(&mp->mnt_newvnodes); 601 moved = 1; 602 } 603 604 return(moved); 605} 606 607 608void 609vnode_iterate_clear(mount_t mp) 610{ 611 mp->mnt_lflag &= ~MNT_LITER; 612 if (mp->mnt_lflag & MNT_LITERWAIT) { 613 mp->mnt_lflag &= ~MNT_LITERWAIT; 614 wakeup(mp); 615 } 616} 617 618 619int 620vnode_iterate(mount_t mp, int flags, int (*callout)(struct vnode *, void *), 621 void *arg) 622{ 623 struct vnode *vp; 624 int vid, retval; 625 int ret = 0; 626 627 mount_lock(mp); 628 629 vnode_iterate_setup(mp); 630 631 /* it is returns 0 then there is nothing to do */ 632 retval = vnode_iterate_prepare(mp); 633 634 if (retval == 0) { 635 vnode_iterate_clear(mp); 636 mount_unlock(mp); 637 return(ret); 638 } 639 640 /* iterate over 
all the vnodes */ 641 while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { 642 vp = TAILQ_FIRST(&mp->mnt_workerqueue); 643 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes); 644 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 645 vid = vp->v_id; 646 if ((vp->v_data == NULL) || (vp->v_type == VNON) || (vp->v_mount != mp)) { 647 continue; 648 } 649 mount_unlock(mp); 650 651 if ( vget_internal(vp, vid, (flags | VNODE_NODEAD| VNODE_WITHID | VNODE_NOSUSPEND))) { 652 mount_lock(mp); 653 continue; 654 } 655 if (flags & VNODE_RELOAD) { 656 /* 657 * we're reloading the filesystem 658 * cast out any inactive vnodes... 659 */ 660 if (vnode_reload(vp)) { 661 /* vnode will be recycled on the refcount drop */ 662 vnode_put(vp); 663 mount_lock(mp); 664 continue; 665 } 666 } 667 668 retval = callout(vp, arg); 669 670 switch (retval) { 671 case VNODE_RETURNED: 672 case VNODE_RETURNED_DONE: 673 vnode_put(vp); 674 if (retval == VNODE_RETURNED_DONE) { 675 mount_lock(mp); 676 ret = 0; 677 goto out; 678 } 679 break; 680 681 case VNODE_CLAIMED_DONE: 682 mount_lock(mp); 683 ret = 0; 684 goto out; 685 case VNODE_CLAIMED: 686 default: 687 break; 688 } 689 mount_lock(mp); 690 } 691 692out: 693 (void)vnode_iterate_reloadq(mp); 694 vnode_iterate_clear(mp); 695 mount_unlock(mp); 696 return (ret); 697} 698 699void 700mount_lock_renames(mount_t mp) 701{ 702 lck_mtx_lock(&mp->mnt_renamelock); 703} 704 705void 706mount_unlock_renames(mount_t mp) 707{ 708 lck_mtx_unlock(&mp->mnt_renamelock); 709} 710 711void 712mount_lock(mount_t mp) 713{ 714 lck_mtx_lock(&mp->mnt_mlock); 715} 716 717void 718mount_lock_spin(mount_t mp) 719{ 720 lck_mtx_lock_spin(&mp->mnt_mlock); 721} 722 723void 724mount_unlock(mount_t mp) 725{ 726 lck_mtx_unlock(&mp->mnt_mlock); 727} 728 729 730void 731mount_ref(mount_t mp, int locked) 732{ 733 if ( !locked) 734 mount_lock_spin(mp); 735 736 mp->mnt_count++; 737 738 if ( !locked) 739 mount_unlock(mp); 740} 741 742 743void 744mount_drop(mount_t mp, int locked) 745{ 746 if ( 
!locked) 747 mount_lock_spin(mp); 748 749 mp->mnt_count--; 750 751 if (mp->mnt_count == 0 && (mp->mnt_lflag & MNT_LDRAIN)) 752 wakeup(&mp->mnt_lflag); 753 754 if ( !locked) 755 mount_unlock(mp); 756} 757 758 759int 760mount_iterref(mount_t mp, int locked) 761{ 762 int retval = 0; 763 764 if (!locked) 765 mount_list_lock(); 766 if (mp->mnt_iterref < 0) { 767 retval = 1; 768 } else { 769 mp->mnt_iterref++; 770 } 771 if (!locked) 772 mount_list_unlock(); 773 return(retval); 774} 775 776int 777mount_isdrained(mount_t mp, int locked) 778{ 779 int retval; 780 781 if (!locked) 782 mount_list_lock(); 783 if (mp->mnt_iterref < 0) 784 retval = 1; 785 else 786 retval = 0; 787 if (!locked) 788 mount_list_unlock(); 789 return(retval); 790} 791 792void 793mount_iterdrop(mount_t mp) 794{ 795 mount_list_lock(); 796 mp->mnt_iterref--; 797 wakeup(&mp->mnt_iterref); 798 mount_list_unlock(); 799} 800 801void 802mount_iterdrain(mount_t mp) 803{ 804 mount_list_lock(); 805 while (mp->mnt_iterref) 806 msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL); 807 /* mount iterations drained */ 808 mp->mnt_iterref = -1; 809 mount_list_unlock(); 810} 811void 812mount_iterreset(mount_t mp) 813{ 814 mount_list_lock(); 815 if (mp->mnt_iterref == -1) 816 mp->mnt_iterref = 0; 817 mount_list_unlock(); 818} 819 820/* always called with mount lock held */ 821int 822mount_refdrain(mount_t mp) 823{ 824 if (mp->mnt_lflag & MNT_LDRAIN) 825 panic("already in drain"); 826 mp->mnt_lflag |= MNT_LDRAIN; 827 828 while (mp->mnt_count) 829 msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL); 830 831 if (mp->mnt_vnodelist.tqh_first != NULL) 832 panic("mount_refdrain: dangling vnode"); 833 834 mp->mnt_lflag &= ~MNT_LDRAIN; 835 836 return(0); 837} 838 839/* Tags the mount point as not supportine extended readdir for NFS exports */ 840void 841mount_set_noreaddirext(mount_t mp) { 842 mount_lock (mp); 843 mp->mnt_kern_flag |= MNTK_DENY_READDIREXT; 844 mount_unlock 
(mp); 845} 846 847/* 848 * Mark a mount point as busy. Used to synchronize access and to delay 849 * unmounting. 850 */ 851int 852vfs_busy(mount_t mp, int flags) 853{ 854 855restart: 856 if (mp->mnt_lflag & MNT_LDEAD) 857 return(ENOENT); 858 859 if (mp->mnt_lflag & MNT_LUNMOUNT) { 860 if (flags & LK_NOWAIT) 861 return (ENOENT); 862 863 mount_lock(mp); 864 865 if (mp->mnt_lflag & MNT_LDEAD) { 866 mount_unlock(mp); 867 return(ENOENT); 868 } 869 if (mp->mnt_lflag & MNT_LUNMOUNT) { 870 mp->mnt_lflag |= MNT_LWAIT; 871 /* 872 * Since all busy locks are shared except the exclusive 873 * lock granted when unmounting, the only place that a 874 * wakeup needs to be done is at the release of the 875 * exclusive lock at the end of dounmount. 876 */ 877 msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS | PDROP), "vfsbusy", NULL); 878 return (ENOENT); 879 } 880 mount_unlock(mp); 881 } 882 883 lck_rw_lock_shared(&mp->mnt_rwlock); 884 885 /* 886 * until we are granted the rwlock, it's possible for the mount point to 887 * change state, so reevaluate before granting the vfs_busy 888 */ 889 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) { 890 lck_rw_done(&mp->mnt_rwlock); 891 goto restart; 892 } 893 return (0); 894} 895 896/* 897 * Free a busy filesystem. 898 */ 899 900void 901vfs_unbusy(mount_t mp) 902{ 903 lck_rw_done(&mp->mnt_rwlock); 904} 905 906 907 908static void 909vfs_rootmountfailed(mount_t mp) { 910 911 mount_list_lock(); 912 mp->mnt_vtable->vfc_refcount--; 913 mount_list_unlock(); 914 915 vfs_unbusy(mp); 916 917 mount_lock_destroy(mp); 918 919#if CONFIG_MACF 920 mac_mount_label_destroy(mp); 921#endif 922 923 FREE_ZONE(mp, sizeof(struct mount), M_MOUNT); 924} 925 926/* 927 * Lookup a filesystem type, and if found allocate and initialize 928 * a mount structure for it. 929 * 930 * Devname is usually updated by mount(8) after booting. 
931 */ 932static mount_t 933vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname) 934{ 935 mount_t mp; 936 937 mp = _MALLOC_ZONE(sizeof(struct mount), M_MOUNT, M_WAITOK); 938 bzero((char *)mp, sizeof(struct mount)); 939 940 /* Initialize the default IO constraints */ 941 mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS; 942 mp->mnt_segreadcnt = mp->mnt_segwritecnt = 32; 943 mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt; 944 mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt; 945 mp->mnt_devblocksize = DEV_BSIZE; 946 mp->mnt_alignmentmask = PAGE_MASK; 947 mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH; 948 mp->mnt_ioscale = 1; 949 mp->mnt_ioflags = 0; 950 mp->mnt_realrootvp = NULLVP; 951 mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL; 952 mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1; 953 mp->mnt_devbsdunit = 0; 954 955 mount_lock_init(mp); 956 (void)vfs_busy(mp, LK_NOWAIT); 957 958 TAILQ_INIT(&mp->mnt_vnodelist); 959 TAILQ_INIT(&mp->mnt_workerqueue); 960 TAILQ_INIT(&mp->mnt_newvnodes); 961 962 mp->mnt_vtable = vfsp; 963 mp->mnt_op = vfsp->vfc_vfsops; 964 mp->mnt_flag = MNT_RDONLY | MNT_ROOTFS; 965 mp->mnt_vnodecovered = NULLVP; 966 //mp->mnt_stat.f_type = vfsp->vfc_typenum; 967 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; 968 969 mount_list_lock(); 970 vfsp->vfc_refcount++; 971 mount_list_unlock(); 972 973 strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN); 974 mp->mnt_vfsstat.f_mntonname[0] = '/'; 975 /* XXX const poisoning layering violation */ 976 (void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, NULL); 977 978#if CONFIG_MACF 979 mac_mount_label_init(mp); 980 mac_mount_label_associate(vfs_context_kernel(), mp); 981#endif 982 return (mp); 983} 984 985errno_t 986vfs_rootmountalloc(const char *fstypename, const char *devname, mount_t *mpp) 987{ 988 struct vfstable *vfsp; 989 990 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 991 if (!strncmp(vfsp->vfc_name, fstypename, 992 
sizeof(vfsp->vfc_name))) 993 break; 994 if (vfsp == NULL) 995 return (ENODEV); 996 997 *mpp = vfs_rootmountalloc_internal(vfsp, devname); 998 999 if (*mpp) 1000 return (0); 1001 1002 return (ENOMEM); 1003} 1004 1005 1006/* 1007 * Find an appropriate filesystem to use for the root. If a filesystem 1008 * has not been preselected, walk through the list of known filesystems 1009 * trying those that have mountroot routines, and try them until one 1010 * works or we have tried them all. 1011 */ 1012extern int (*mountroot)(void); 1013 1014int 1015vfs_mountroot(void) 1016{ 1017#if CONFIG_MACF 1018 struct vnode *vp; 1019#endif 1020 struct vfstable *vfsp; 1021 vfs_context_t ctx = vfs_context_kernel(); 1022 struct vfs_attr vfsattr; 1023 int error; 1024 mount_t mp; 1025 vnode_t bdevvp_rootvp; 1026 1027 if (mountroot != NULL) { 1028 /* 1029 * used for netboot which follows a different set of rules 1030 */ 1031 error = (*mountroot)(); 1032 return (error); 1033 } 1034 if ((error = bdevvp(rootdev, &rootvp))) { 1035 printf("vfs_mountroot: can't setup bdevvp\n"); 1036 return (error); 1037 } 1038 /* 1039 * 4951998 - code we call in vfc_mountroot may replace rootvp 1040 * so keep a local copy for some house keeping. 1041 */ 1042 bdevvp_rootvp = rootvp; 1043 1044 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 1045 if (vfsp->vfc_mountroot == NULL) 1046 continue; 1047 1048 mp = vfs_rootmountalloc_internal(vfsp, "root_device"); 1049 mp->mnt_devvp = rootvp; 1050 1051 if ((error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx)) == 0) { 1052 if ( bdevvp_rootvp != rootvp ) { 1053 /* 1054 * rootvp changed... 1055 * bump the iocount and fix up mnt_devvp for the 1056 * new rootvp (it will already have a usecount taken)... 1057 * drop the iocount and the usecount on the orignal 1058 * since we are no longer going to use it... 
1059 */ 1060 vnode_getwithref(rootvp); 1061 mp->mnt_devvp = rootvp; 1062 1063 vnode_rele(bdevvp_rootvp); 1064 vnode_put(bdevvp_rootvp); 1065 } 1066 mp->mnt_devvp->v_specflags |= SI_MOUNTEDON; 1067 1068 vfs_unbusy(mp); 1069 1070 mount_list_add(mp); 1071 1072 /* 1073 * cache the IO attributes for the underlying physical media... 1074 * an error return indicates the underlying driver doesn't 1075 * support all the queries necessary... however, reasonable 1076 * defaults will have been set, so no reason to bail or care 1077 */ 1078 vfs_init_io_attributes(rootvp, mp); 1079 1080 /* 1081 * Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS. 1082 */ 1083 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) { 1084 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS; 1085 } 1086 if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) { 1087 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT; 1088 } 1089 1090 /* 1091 * Probe root file system for additional features. 1092 */ 1093 (void)VFS_START(mp, 0, ctx); 1094 1095 VFSATTR_INIT(&vfsattr); 1096 VFSATTR_WANTED(&vfsattr, f_capabilities); 1097 if (vfs_getattr(mp, &vfsattr, ctx) == 0 && 1098 VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) { 1099 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) && 1100 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) { 1101 mp->mnt_kern_flag |= MNTK_EXTENDED_ATTRS; 1102 } 1103#if NAMEDSTREAMS 1104 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) && 1105 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) { 1106 mp->mnt_kern_flag |= MNTK_NAMED_STREAMS; 1107 } 1108#endif 1109 if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) && 1110 (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) { 1111 mp->mnt_kern_flag |= MNTK_PATH_FROM_ID; 1112 } 1113 } 1114 1115 /* 1116 * 
get rid of iocount reference returned 1117 * by bdevvp (or picked up by us on the substitued 1118 * rootvp)... it (or we) will have also taken 1119 * a usecount reference which we want to keep 1120 */ 1121 vnode_put(rootvp); 1122 1123#if CONFIG_MACF 1124 if ((vfs_flags(mp) & MNT_MULTILABEL) == 0) 1125 return (0); 1126 1127 error = VFS_ROOT(mp, &vp, ctx); 1128 if (error) { 1129 printf("%s() VFS_ROOT() returned %d\n", 1130 __func__, error); 1131 dounmount(mp, MNT_FORCE, 0, ctx); 1132 goto fail; 1133 } 1134 error = vnode_label(mp, NULL, vp, NULL, 0, ctx); 1135 /* 1136 * get rid of reference provided by VFS_ROOT 1137 */ 1138 vnode_put(vp); 1139 1140 if (error) { 1141 printf("%s() vnode_label() returned %d\n", 1142 __func__, error); 1143 dounmount(mp, MNT_FORCE, 0, ctx); 1144 goto fail; 1145 } 1146#endif 1147 return (0); 1148 } 1149#if CONFIG_MACF 1150fail: 1151#endif 1152 vfs_rootmountfailed(mp); 1153 1154 if (error != EINVAL) 1155 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); 1156 } 1157 return (ENODEV); 1158} 1159 1160/* 1161 * Lookup a mount point by filesystem identifier. 
1162 */ 1163 1164struct mount * 1165vfs_getvfs(fsid_t *fsid) 1166{ 1167 return (mount_list_lookupby_fsid(fsid, 0, 0)); 1168} 1169 1170static struct mount * 1171vfs_getvfs_locked(fsid_t *fsid) 1172{ 1173 return(mount_list_lookupby_fsid(fsid, 1, 0)); 1174} 1175 1176struct mount * 1177vfs_getvfs_by_mntonname(char *path) 1178{ 1179 mount_t retmp = (mount_t)0; 1180 mount_t mp; 1181 1182 mount_list_lock(); 1183 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 1184 if (!strncmp(mp->mnt_vfsstat.f_mntonname, path, 1185 sizeof(mp->mnt_vfsstat.f_mntonname))) { 1186 retmp = mp; 1187 if (mount_iterref(retmp, 1)) 1188 retmp = NULL; 1189 goto out; 1190 } 1191 } 1192out: 1193 mount_list_unlock(); 1194 return (retmp); 1195} 1196 1197/* generation number for creation of new fsids */ 1198u_short mntid_gen = 0; 1199/* 1200 * Get a new unique fsid 1201 */ 1202void 1203vfs_getnewfsid(struct mount *mp) 1204{ 1205 1206 fsid_t tfsid; 1207 int mtype; 1208 mount_t nmp; 1209 1210 mount_list_lock(); 1211 1212 /* generate a new fsid */ 1213 mtype = mp->mnt_vtable->vfc_typenum; 1214 if (++mntid_gen == 0) 1215 mntid_gen++; 1216 tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); 1217 tfsid.val[1] = mtype; 1218 1219 TAILQ_FOREACH(nmp, &mountlist, mnt_list) { 1220 while (vfs_getvfs_locked(&tfsid)) { 1221 if (++mntid_gen == 0) 1222 mntid_gen++; 1223 tfsid.val[0] = makedev(nblkdev + mtype, mntid_gen); 1224 } 1225 } 1226 mp->mnt_vfsstat.f_fsid.val[0] = tfsid.val[0]; 1227 mp->mnt_vfsstat.f_fsid.val[1] = tfsid.val[1]; 1228 mount_list_unlock(); 1229} 1230 1231/* 1232 * Routines having to do with the management of the vnode table. 1233 */ 1234extern int (**dead_vnodeop_p)(void *); 1235long numvnodes, freevnodes, deadvnodes, async_work_vnodes; 1236 1237 1238int async_work_timed_out = 0; 1239int async_work_handled = 0; 1240int dead_vnode_wanted = 0; 1241int dead_vnode_waited = 0; 1242 1243/* 1244 * Move a vnode from one mount queue to another. 
1245 */ 1246static void 1247insmntque(vnode_t vp, mount_t mp) 1248{ 1249 mount_t lmp; 1250 /* 1251 * Delete from old mount point vnode list, if on one. 1252 */ 1253 if ( (lmp = vp->v_mount) != NULL && lmp != dead_mountp) { 1254 if ((vp->v_lflag & VNAMED_MOUNT) == 0) 1255 panic("insmntque: vp not in mount vnode list"); 1256 vp->v_lflag &= ~VNAMED_MOUNT; 1257 1258 mount_lock_spin(lmp); 1259 1260 mount_drop(lmp, 1); 1261 1262 if (vp->v_mntvnodes.tqe_next == NULL) { 1263 if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp) 1264 TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes); 1265 else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp) 1266 TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes); 1267 else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp) 1268 TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes); 1269 } else { 1270 vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev; 1271 *vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next; 1272 } 1273 vp->v_mntvnodes.tqe_next = NULL; 1274 vp->v_mntvnodes.tqe_prev = NULL; 1275 mount_unlock(lmp); 1276 return; 1277 } 1278 1279 /* 1280 * Insert into list of vnodes for the new mount point, if available. 1281 */ 1282 if ((vp->v_mount = mp) != NULL) { 1283 mount_lock_spin(mp); 1284 if ((vp->v_mntvnodes.tqe_next != 0) && (vp->v_mntvnodes.tqe_prev != 0)) 1285 panic("vp already in mount list"); 1286 if (mp->mnt_lflag & MNT_LITER) 1287 TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes); 1288 else 1289 TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); 1290 if (vp->v_lflag & VNAMED_MOUNT) 1291 panic("insmntque: vp already in mount vnode list"); 1292 vp->v_lflag |= VNAMED_MOUNT; 1293 mount_ref(mp, 1); 1294 mount_unlock(mp); 1295 } 1296} 1297 1298 1299/* 1300 * Create a vnode for a block device. 1301 * Used for root filesystem, argdev, and swap areas. 1302 * Also used for memory file system special devices. 
1303 */ 1304int 1305bdevvp(dev_t dev, vnode_t *vpp) 1306{ 1307 vnode_t nvp; 1308 int error; 1309 struct vnode_fsparam vfsp; 1310 struct vfs_context context; 1311 1312 if (dev == NODEV) { 1313 *vpp = NULLVP; 1314 return (ENODEV); 1315 } 1316 1317 context.vc_thread = current_thread(); 1318 context.vc_ucred = FSCRED; 1319 1320 vfsp.vnfs_mp = (struct mount *)0; 1321 vfsp.vnfs_vtype = VBLK; 1322 vfsp.vnfs_str = "bdevvp"; 1323 vfsp.vnfs_dvp = NULL; 1324 vfsp.vnfs_fsnode = NULL; 1325 vfsp.vnfs_cnp = NULL; 1326 vfsp.vnfs_vops = spec_vnodeop_p; 1327 vfsp.vnfs_rdev = dev; 1328 vfsp.vnfs_filesize = 0; 1329 1330 vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; 1331 1332 vfsp.vnfs_marksystem = 0; 1333 vfsp.vnfs_markroot = 0; 1334 1335 if ( (error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp)) ) { 1336 *vpp = NULLVP; 1337 return (error); 1338 } 1339 vnode_lock_spin(nvp); 1340 nvp->v_flag |= VBDEVVP; 1341 nvp->v_tag = VT_NON; /* set this to VT_NON so during aliasing it can be replaced */ 1342 vnode_unlock(nvp); 1343 if ( (error = vnode_ref(nvp)) ) { 1344 panic("bdevvp failed: vnode_ref"); 1345 return (error); 1346 } 1347 if ( (error = VNOP_FSYNC(nvp, MNT_WAIT, &context)) ) { 1348 panic("bdevvp failed: fsync"); 1349 return (error); 1350 } 1351 if ( (error = buf_invalidateblks(nvp, BUF_WRITE_DATA, 0, 0)) ) { 1352 panic("bdevvp failed: invalidateblks"); 1353 return (error); 1354 } 1355 1356#if CONFIG_MACF 1357 /* 1358 * XXXMAC: We can't put a MAC check here, the system will 1359 * panic without this vnode. 1360 */ 1361#endif /* MAC */ 1362 1363 if ( (error = VNOP_OPEN(nvp, FREAD, &context)) ) { 1364 panic("bdevvp failed: open"); 1365 return (error); 1366 } 1367 *vpp = nvp; 1368 1369 return (0); 1370} 1371 1372/* 1373 * Check to see if the new vnode represents a special device 1374 * for which we already have a vnode (either because of 1375 * bdevvp() or because of a different vnode representing 1376 * the same block device). 
If such an alias exists, deallocate 1377 * the existing contents and return the aliased vnode. The 1378 * caller is responsible for filling it with its new contents. 1379 */ 1380static vnode_t 1381checkalias(struct vnode *nvp, dev_t nvp_rdev) 1382{ 1383 struct vnode *vp; 1384 struct vnode **vpp; 1385 struct specinfo *sin = NULL; 1386 int vid = 0; 1387 1388 vpp = &speclisth[SPECHASH(nvp_rdev)]; 1389loop: 1390 SPECHASH_LOCK(); 1391 1392 for (vp = *vpp; vp; vp = vp->v_specnext) { 1393 if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) { 1394 vid = vp->v_id; 1395 break; 1396 } 1397 } 1398 SPECHASH_UNLOCK(); 1399 1400 if (vp) { 1401found_alias: 1402 if (vnode_getwithvid(vp,vid)) { 1403 goto loop; 1404 } 1405 /* 1406 * Termination state is checked in vnode_getwithvid 1407 */ 1408 vnode_lock(vp); 1409 1410 /* 1411 * Alias, but not in use, so flush it out. 1412 */ 1413 if ((vp->v_iocount == 1) && (vp->v_usecount == 0)) { 1414 vnode_reclaim_internal(vp, 1, 1, 0); 1415 vnode_put_locked(vp); 1416 vnode_unlock(vp); 1417 goto loop; 1418 } 1419 1420 } 1421 if (vp == NULL || vp->v_tag != VT_NON) { 1422 if (sin == NULL) { 1423 MALLOC_ZONE(sin, struct specinfo *, sizeof(struct specinfo), 1424 M_SPECINFO, M_WAITOK); 1425 } 1426 1427 nvp->v_specinfo = sin; 1428 bzero(nvp->v_specinfo, sizeof(struct specinfo)); 1429 nvp->v_rdev = nvp_rdev; 1430 nvp->v_specflags = 0; 1431 nvp->v_speclastr = -1; 1432 nvp->v_specinfo->si_opencount = 0; 1433 nvp->v_specinfo->si_initted = 0; 1434 nvp->v_specinfo->si_throttleable = 0; 1435 1436 SPECHASH_LOCK(); 1437 1438 /* We dropped the lock, someone could have added */ 1439 if (vp == NULLVP) { 1440 for (vp = *vpp; vp; vp = vp->v_specnext) { 1441 if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) { 1442 vid = vp->v_id; 1443 SPECHASH_UNLOCK(); 1444 goto found_alias; 1445 } 1446 } 1447 } 1448 1449 nvp->v_hashchain = vpp; 1450 nvp->v_specnext = *vpp; 1451 *vpp = nvp; 1452 1453 if (vp != NULLVP) { 1454 nvp->v_specflags |= SI_ALIASED; 1455 
vp->v_specflags |= SI_ALIASED; 1456 SPECHASH_UNLOCK(); 1457 vnode_put_locked(vp); 1458 vnode_unlock(vp); 1459 } else { 1460 SPECHASH_UNLOCK(); 1461 } 1462 1463 return (NULLVP); 1464 } 1465 1466 if (sin) { 1467 FREE_ZONE(sin, sizeof(struct specinfo), M_SPECINFO); 1468 } 1469 1470 if ((vp->v_flag & (VBDEVVP | VDEVFLUSH)) != 0) 1471 return(vp); 1472 1473 panic("checkalias with VT_NON vp that shouldn't: %p", vp); 1474 1475 return (vp); 1476} 1477 1478 1479/* 1480 * Get a reference on a particular vnode and lock it if requested. 1481 * If the vnode was on the inactive list, remove it from the list. 1482 * If the vnode was on the free list, remove it from the list and 1483 * move it to inactive list as needed. 1484 * The vnode lock bit is set if the vnode is being eliminated in 1485 * vgone. The process is awakened when the transition is completed, 1486 * and an error returned to indicate that the vnode is no longer 1487 * usable (possibly having been changed to a new file system type). 1488 */ 1489int 1490vget_internal(vnode_t vp, int vid, int vflags) 1491{ 1492 int error = 0; 1493 1494 vnode_lock_spin(vp); 1495 1496 if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) 1497 /* 1498 * vnode to be returned only if it has writers opened 1499 */ 1500 error = EINVAL; 1501 else 1502 error = vnode_getiocount(vp, vid, vflags); 1503 1504 vnode_unlock(vp); 1505 1506 return (error); 1507} 1508 1509/* 1510 * Returns: 0 Success 1511 * ENOENT No such file or directory [terminating] 1512 */ 1513int 1514vnode_ref(vnode_t vp) 1515{ 1516 1517 return (vnode_ref_ext(vp, 0, 0)); 1518} 1519 1520/* 1521 * Returns: 0 Success 1522 * ENOENT No such file or directory [terminating] 1523 */ 1524int 1525vnode_ref_ext(vnode_t vp, int fmode, int flags) 1526{ 1527 int error = 0; 1528 1529 vnode_lock_spin(vp); 1530 1531 /* 1532 * once all the current call sites have been fixed to insure they have 1533 * taken an iocount, we can toughen this assert up and insist that the 1534 * iocount is 
non-zero... a non-zero usecount doesn't insure correctness 1535 */ 1536 if (vp->v_iocount <= 0 && vp->v_usecount <= 0) 1537 panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount); 1538 1539 /* 1540 * if you are the owner of drain/termination, can acquire usecount 1541 */ 1542 if ((flags & VNODE_REF_FORCE) == 0) { 1543 if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) { 1544 if (vp->v_owner != current_thread()) { 1545 error = ENOENT; 1546 goto out; 1547 } 1548 } 1549 } 1550 vp->v_usecount++; 1551 1552 if (fmode & FWRITE) { 1553 if (++vp->v_writecount <= 0) 1554 panic("vnode_ref_ext: v_writecount"); 1555 } 1556 if (fmode & O_EVTONLY) { 1557 if (++vp->v_kusecount <= 0) 1558 panic("vnode_ref_ext: v_kusecount"); 1559 } 1560 if (vp->v_flag & VRAGE) { 1561 struct uthread *ut; 1562 1563 ut = get_bsdthread_info(current_thread()); 1564 1565 if ( !(current_proc()->p_lflag & P_LRAGE_VNODES) && 1566 !(ut->uu_flag & UT_RAGE_VNODES)) { 1567 /* 1568 * a 'normal' process accessed this vnode 1569 * so make sure its no longer marked 1570 * for rapid aging... also, make sure 1571 * it gets removed from the rage list... 
1572 * when v_usecount drops back to 0, it 1573 * will be put back on the real free list 1574 */ 1575 vp->v_flag &= ~VRAGE; 1576 vp->v_references = 0; 1577 vnode_list_remove(vp); 1578 } 1579 } 1580 if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { 1581 1582 if (vp->v_ubcinfo) { 1583 vnode_lock_convert(vp); 1584 memory_object_mark_used(vp->v_ubcinfo->ui_control); 1585 } 1586 } 1587out: 1588 vnode_unlock(vp); 1589 1590 return (error); 1591} 1592 1593 1594static boolean_t 1595vnode_on_reliable_media(vnode_t vp) 1596{ 1597 if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) && (vp->v_mount->mnt_flag & MNT_LOCAL) ) 1598 return (TRUE); 1599 return (FALSE); 1600} 1601 1602static void 1603vnode_async_list_add(vnode_t vp) 1604{ 1605 vnode_list_lock(); 1606 1607 if (VONLIST(vp) || (vp->v_lflag & (VL_TERMINATE|VL_DEAD))) 1608 panic("vnode_async_list_add: %p is in wrong state", vp); 1609 1610 TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist); 1611 vp->v_listflag |= VLIST_ASYNC_WORK; 1612 1613 async_work_vnodes++; 1614 1615 vnode_list_unlock(); 1616 1617 wakeup(&vnode_async_work_list); 1618 1619} 1620 1621 1622/* 1623 * put the vnode on appropriate free list. 
1624 * called with vnode LOCKED 1625 */ 1626static void 1627vnode_list_add(vnode_t vp) 1628{ 1629 boolean_t need_dead_wakeup = FALSE; 1630 1631#if DIAGNOSTIC 1632 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); 1633#endif 1634 /* 1635 * if it is already on a list or non zero references return 1636 */ 1637 if (VONLIST(vp) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || (vp->v_lflag & VL_TERMINATE)) 1638 return; 1639 1640 vnode_list_lock(); 1641 1642 if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) { 1643 /* 1644 * add the new guy to the appropriate end of the RAGE list 1645 */ 1646 if ((vp->v_flag & VAGE)) 1647 TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist); 1648 else 1649 TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist); 1650 1651 vp->v_listflag |= VLIST_RAGE; 1652 ragevnodes++; 1653 1654 /* 1655 * reset the timestamp for the last inserted vp on the RAGE 1656 * queue to let new_vnode know that its not ok to start stealing 1657 * from this list... as long as we're actively adding to this list 1658 * we'll push out the vnodes we want to donate to the real free list 1659 * once we stop pushing, we'll let some time elapse before we start 1660 * stealing them in the new_vnode routine 1661 */ 1662 microuptime(&rage_tv); 1663 } else { 1664 /* 1665 * if VL_DEAD, insert it at head of the dead list 1666 * else insert at tail of LRU list or at head if VAGE is set 1667 */ 1668 if ( (vp->v_lflag & VL_DEAD)) { 1669 TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist); 1670 vp->v_listflag |= VLIST_DEAD; 1671 deadvnodes++; 1672 1673 if (dead_vnode_wanted) { 1674 dead_vnode_wanted--; 1675 need_dead_wakeup = TRUE; 1676 } 1677 1678 } else if ( (vp->v_flag & VAGE) ) { 1679 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 1680 vp->v_flag &= ~VAGE; 1681 freevnodes++; 1682 } else { 1683 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 1684 freevnodes++; 1685 } 1686 } 1687 vnode_list_unlock(); 1688 1689 if (need_dead_wakeup == TRUE) 1690 
wakeup_one((caddr_t)&dead_vnode_wanted); 1691} 1692 1693 1694/* 1695 * remove the vnode from appropriate free list. 1696 * called with vnode LOCKED and 1697 * the list lock held 1698 */ 1699static void 1700vnode_list_remove_locked(vnode_t vp) 1701{ 1702 if (VONLIST(vp)) { 1703 /* 1704 * the v_listflag field is 1705 * protected by the vnode_list_lock 1706 */ 1707 if (vp->v_listflag & VLIST_RAGE) 1708 VREMRAGE("vnode_list_remove", vp); 1709 else if (vp->v_listflag & VLIST_DEAD) 1710 VREMDEAD("vnode_list_remove", vp); 1711 else if (vp->v_listflag & VLIST_ASYNC_WORK) 1712 VREMASYNC_WORK("vnode_list_remove", vp); 1713 else 1714 VREMFREE("vnode_list_remove", vp); 1715 } 1716} 1717 1718 1719/* 1720 * remove the vnode from appropriate free list. 1721 * called with vnode LOCKED 1722 */ 1723static void 1724vnode_list_remove(vnode_t vp) 1725{ 1726#if DIAGNOSTIC 1727 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); 1728#endif 1729 /* 1730 * we want to avoid taking the list lock 1731 * in the case where we're not on the free 1732 * list... this will be true for most 1733 * directories and any currently in use files 1734 * 1735 * we're guaranteed that we can't go from 1736 * the not-on-list state to the on-list 1737 * state since we hold the vnode lock... 1738 * all calls to vnode_list_add are done 1739 * under the vnode lock... so we can 1740 * check for that condition (the prevelant one) 1741 * without taking the list lock 1742 */ 1743 if (VONLIST(vp)) { 1744 vnode_list_lock(); 1745 /* 1746 * however, we're not guaranteed that 1747 * we won't go from the on-list state 1748 * to the not-on-list state until we 1749 * hold the vnode_list_lock... this 1750 * is due to "new_vnode" removing vnodes 1751 * from the free list uder the list_lock 1752 * w/o the vnode lock... 
so we need to 1753 * check again whether we're currently 1754 * on the free list 1755 */ 1756 vnode_list_remove_locked(vp); 1757 1758 vnode_list_unlock(); 1759 } 1760} 1761 1762 1763void 1764vnode_rele(vnode_t vp) 1765{ 1766 vnode_rele_internal(vp, 0, 0, 0); 1767} 1768 1769 1770void 1771vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter) 1772{ 1773 vnode_rele_internal(vp, fmode, dont_reenter, 0); 1774} 1775 1776 1777void 1778vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) 1779{ 1780 1781 if ( !locked) 1782 vnode_lock_spin(vp); 1783#if DIAGNOSTIC 1784 else 1785 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); 1786#endif 1787 if (--vp->v_usecount < 0) 1788 panic("vnode_rele_ext: vp %p usecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag); 1789 1790 if (fmode & FWRITE) { 1791 if (--vp->v_writecount < 0) 1792 panic("vnode_rele_ext: vp %p writecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag); 1793 } 1794 if (fmode & O_EVTONLY) { 1795 if (--vp->v_kusecount < 0) 1796 panic("vnode_rele_ext: vp %p kusecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag); 1797 } 1798 if (vp->v_kusecount > vp->v_usecount) 1799 panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d). v_tag = %d, v_type = %d, v_flag = %x.",vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag); 1800 1801 if ((vp->v_iocount > 0) || (vp->v_usecount > 0)) { 1802 /* 1803 * vnode is still busy... 
if we're the last 1804 * usecount, mark for a future call to VNOP_INACTIVE 1805 * when the iocount finally drops to 0 1806 */ 1807 if (vp->v_usecount == 0) { 1808 vp->v_lflag |= VL_NEEDINACTIVE; 1809 vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); 1810 } 1811 goto done; 1812 } 1813 vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); 1814 1815 if ( (vp->v_lflag & (VL_TERMINATE | VL_DEAD)) || dont_reenter) { 1816 /* 1817 * vnode is being cleaned, or 1818 * we've requested that we don't reenter 1819 * the filesystem on this release... in 1820 * this case, we'll mark the vnode aged 1821 * if it's been marked for termination 1822 */ 1823 if (dont_reenter) { 1824 if ( !(vp->v_lflag & (VL_TERMINATE | VL_DEAD | VL_MARKTERM)) ) { 1825 vp->v_lflag |= VL_NEEDINACTIVE; 1826 1827 if (vnode_on_reliable_media(vp) == FALSE) { 1828 vnode_async_list_add(vp); 1829 goto done; 1830 } 1831 } 1832 vp->v_flag |= VAGE; 1833 } 1834 vnode_list_add(vp); 1835 1836 goto done; 1837 } 1838 /* 1839 * at this point both the iocount and usecount 1840 * are zero 1841 * pick up an iocount so that we can call 1842 * VNOP_INACTIVE with the vnode lock unheld 1843 */ 1844 vp->v_iocount++; 1845#ifdef JOE_DEBUG 1846 record_vp(vp, 1); 1847#endif 1848 vp->v_lflag &= ~VL_NEEDINACTIVE; 1849 vnode_unlock(vp); 1850 1851 VNOP_INACTIVE(vp, vfs_context_current()); 1852 1853 vnode_lock_spin(vp); 1854 /* 1855 * because we dropped the vnode lock to call VNOP_INACTIVE 1856 * the state of the vnode may have changed... we may have 1857 * picked up an iocount, usecount or the MARKTERM may have 1858 * been set... we need to reevaluate the reference counts 1859 * to determine if we can call vnode_reclaim_internal at 1860 * this point... 
if the reference counts are up, we'll pick 1861 * up the MARKTERM state when they get subsequently dropped 1862 */ 1863 if ( (vp->v_iocount == 1) && (vp->v_usecount == 0) && 1864 ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM)) { 1865 struct uthread *ut; 1866 1867 ut = get_bsdthread_info(current_thread()); 1868 1869 if (ut->uu_defer_reclaims) { 1870 vp->v_defer_reclaimlist = ut->uu_vreclaims; 1871 ut->uu_vreclaims = vp; 1872 goto done; 1873 } 1874 vnode_lock_convert(vp); 1875 vnode_reclaim_internal(vp, 1, 1, 0); 1876 } 1877 vnode_dropiocount(vp); 1878 vnode_list_add(vp); 1879done: 1880 if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { 1881 1882 if (vp->v_ubcinfo) { 1883 vnode_lock_convert(vp); 1884 memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE); 1885 } 1886 } 1887 if ( !locked) 1888 vnode_unlock(vp); 1889 return; 1890} 1891 1892/* 1893 * Remove any vnodes in the vnode table belonging to mount point mp. 1894 * 1895 * If MNT_NOFORCE is specified, there should not be any active ones, 1896 * return error if any are found (nb: this is a user error, not a 1897 * system error). If MNT_FORCE is specified, detach any active vnodes 1898 * that are found. 1899 */ 1900#if DIAGNOSTIC 1901int busyprt = 0; /* print out busy vnodes */ 1902#if 0 1903struct ctldebug debug1 = { "busyprt", &busyprt }; 1904#endif /* 0 */ 1905#endif 1906 1907int 1908vflush(struct mount *mp, struct vnode *skipvp, int flags) 1909{ 1910 struct vnode *vp; 1911 int busy = 0; 1912 int reclaimed = 0; 1913 int retval; 1914 unsigned int vid; 1915 1916 mount_lock(mp); 1917 vnode_iterate_setup(mp); 1918 /* 1919 * On regular unmounts(not forced) do a 1920 * quick check for vnodes to be in use. This 1921 * preserves the caching of vnodes. automounter 1922 * tries unmounting every so often to see whether 1923 * it is still busy or not. 
1924 */ 1925 if (((flags & FORCECLOSE)==0) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != 0)) { 1926 if (vnode_umount_preflight(mp, skipvp, flags)) { 1927 vnode_iterate_clear(mp); 1928 mount_unlock(mp); 1929 return(EBUSY); 1930 } 1931 } 1932loop: 1933 /* it is returns 0 then there is nothing to do */ 1934 retval = vnode_iterate_prepare(mp); 1935 1936 if (retval == 0) { 1937 vnode_iterate_clear(mp); 1938 mount_unlock(mp); 1939 return(retval); 1940 } 1941 1942 /* iterate over all the vnodes */ 1943 while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) { 1944 1945 vp = TAILQ_FIRST(&mp->mnt_workerqueue); 1946 TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes); 1947 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes); 1948 1949 if ( (vp->v_mount != mp) || (vp == skipvp)) { 1950 continue; 1951 } 1952 vid = vp->v_id; 1953 mount_unlock(mp); 1954 1955 vnode_lock_spin(vp); 1956 1957 if ((vp->v_id != vid) || ((vp->v_lflag & (VL_DEAD | VL_TERMINATE)))) { 1958 vnode_unlock(vp); 1959 mount_lock(mp); 1960 continue; 1961 } 1962 1963 /* 1964 * If requested, skip over vnodes marked VSYSTEM. 1965 * Skip over all vnodes marked VNOFLUSH. 1966 */ 1967 if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) || 1968 (vp->v_flag & VNOFLUSH))) { 1969 vnode_unlock(vp); 1970 mount_lock(mp); 1971 continue; 1972 } 1973 /* 1974 * If requested, skip over vnodes marked VSWAP. 1975 */ 1976 if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) { 1977 vnode_unlock(vp); 1978 mount_lock(mp); 1979 continue; 1980 } 1981 /* 1982 * If requested, skip over vnodes marked VROOT. 1983 */ 1984 if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) { 1985 vnode_unlock(vp); 1986 mount_lock(mp); 1987 continue; 1988 } 1989 /* 1990 * If WRITECLOSE is set, only flush out regular file 1991 * vnodes open for writing. 
1992 */ 1993 if ((flags & WRITECLOSE) && 1994 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1995 vnode_unlock(vp); 1996 mount_lock(mp); 1997 continue; 1998 } 1999 /* 2000 * If the real usecount is 0, all we need to do is clear 2001 * out the vnode data structures and we are done. 2002 */ 2003 if (((vp->v_usecount == 0) || 2004 ((vp->v_usecount - vp->v_kusecount) == 0))) { 2005 2006 vnode_lock_convert(vp); 2007 vp->v_iocount++; /* so that drain waits for * other iocounts */ 2008#ifdef JOE_DEBUG 2009 record_vp(vp, 1); 2010#endif 2011 vnode_reclaim_internal(vp, 1, 1, 0); 2012 vnode_dropiocount(vp); 2013 vnode_list_add(vp); 2014 vnode_unlock(vp); 2015 2016 reclaimed++; 2017 mount_lock(mp); 2018 continue; 2019 } 2020 /* 2021 * If FORCECLOSE is set, forcibly close the vnode. 2022 * For block or character devices, revert to an 2023 * anonymous device. For all other files, just kill them. 2024 */ 2025 if (flags & FORCECLOSE) { 2026 vnode_lock_convert(vp); 2027 2028 if (vp->v_type != VBLK && vp->v_type != VCHR) { 2029 vp->v_iocount++; /* so that drain waits * for other iocounts */ 2030#ifdef JOE_DEBUG 2031 record_vp(vp, 1); 2032#endif 2033 vnode_abort_advlocks(vp); 2034 vnode_reclaim_internal(vp, 1, 1, 0); 2035 vnode_dropiocount(vp); 2036 vnode_list_add(vp); 2037 vnode_unlock(vp); 2038 } else { 2039 vclean(vp, 0); 2040 vp->v_lflag &= ~VL_DEAD; 2041 vp->v_op = spec_vnodeop_p; 2042 vp->v_flag |= VDEVFLUSH; 2043 vnode_unlock(vp); 2044 } 2045 mount_lock(mp); 2046 continue; 2047 } 2048#if DIAGNOSTIC 2049 if (busyprt) 2050 vprint("vflush: busy vnode", vp); 2051#endif 2052 vnode_unlock(vp); 2053 mount_lock(mp); 2054 busy++; 2055 } 2056 2057 /* At this point the worker queue is completed */ 2058 if (busy && ((flags & FORCECLOSE)==0) && reclaimed) { 2059 busy = 0; 2060 reclaimed = 0; 2061 (void)vnode_iterate_reloadq(mp); 2062 /* returned with mount lock held */ 2063 goto loop; 2064 } 2065 2066 /* if new vnodes were created in between retry the reclaim */ 2067 if ( 
vnode_iterate_reloadq(mp) != 0) { 2068 if (!(busy && ((flags & FORCECLOSE)==0))) 2069 goto loop; 2070 } 2071 vnode_iterate_clear(mp); 2072 mount_unlock(mp); 2073 2074 if (busy && ((flags & FORCECLOSE)==0)) 2075 return (EBUSY); 2076 return (0); 2077} 2078 2079long num_recycledvnodes = 0; 2080/* 2081 * Disassociate the underlying file system from a vnode. 2082 * The vnode lock is held on entry. 2083 */ 2084static void 2085vclean(vnode_t vp, int flags) 2086{ 2087 vfs_context_t ctx = vfs_context_current(); 2088 int active; 2089 int need_inactive; 2090 int already_terminating; 2091 int clflags = 0; 2092#if NAMEDSTREAMS 2093 int is_namedstream; 2094#endif 2095 2096 /* 2097 * Check to see if the vnode is in use. 2098 * If so we have to reference it before we clean it out 2099 * so that its count cannot fall to zero and generate a 2100 * race against ourselves to recycle it. 2101 */ 2102 active = vp->v_usecount; 2103 2104 /* 2105 * just in case we missed sending a needed 2106 * VNOP_INACTIVE, we'll do it now 2107 */ 2108 need_inactive = (vp->v_lflag & VL_NEEDINACTIVE); 2109 2110 vp->v_lflag &= ~VL_NEEDINACTIVE; 2111 2112 /* 2113 * Prevent the vnode from being recycled or 2114 * brought into use while we clean it out. 2115 */ 2116 already_terminating = (vp->v_lflag & VL_TERMINATE); 2117 2118 vp->v_lflag |= VL_TERMINATE; 2119 2120 /* 2121 * remove the vnode from any mount list 2122 * it might be on... 2123 */ 2124 insmntque(vp, (struct mount *)0); 2125 2126#if NAMEDSTREAMS 2127 is_namedstream = vnode_isnamedstream(vp); 2128#endif 2129 2130 vnode_unlock(vp); 2131 2132 OSAddAtomicLong(1, &num_recycledvnodes); 2133 2134 if (flags & DOCLOSE) 2135 clflags |= IO_NDELAY; 2136 if (flags & REVOKEALL) 2137 clflags |= IO_REVOKE; 2138 2139 if (active && (flags & DOCLOSE)) 2140 VNOP_CLOSE(vp, clflags, ctx); 2141 2142 /* 2143 * Clean out any buffers associated with the vnode. 
2144 */ 2145 if (flags & DOCLOSE) { 2146#if NFSCLIENT 2147 if (vp->v_tag == VT_NFS) 2148 nfs_vinvalbuf(vp, V_SAVE, ctx, 0); 2149 else 2150#endif 2151 { 2152 VNOP_FSYNC(vp, MNT_WAIT, ctx); 2153 buf_invalidateblks(vp, BUF_WRITE_DATA | BUF_INVALIDATE_LOCKED, 0, 0); 2154 } 2155 if (UBCINFOEXISTS(vp)) 2156 /* 2157 * Clean the pages in VM. 2158 */ 2159 (void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC); 2160 } 2161 if (active || need_inactive) 2162 VNOP_INACTIVE(vp, ctx); 2163 2164#if NAMEDSTREAMS 2165 if ((is_namedstream != 0) && (vp->v_parent != NULLVP)) { 2166 vnode_t pvp = vp->v_parent; 2167 2168 /* Delete the shadow stream file before we reclaim its vnode */ 2169 if (vnode_isshadow(vp)) { 2170 vnode_relenamedstream(pvp, vp, ctx); 2171 } 2172 2173 /* 2174 * No more streams associated with the parent. We 2175 * have a ref on it, so its identity is stable. 2176 * If the parent is on an opaque volume, then we need to know 2177 * whether it has associated named streams. 2178 */ 2179 if (vfs_authopaque(pvp->v_mount)) { 2180 vnode_lock_spin(pvp); 2181 pvp->v_lflag &= ~VL_HASSTREAMS; 2182 vnode_unlock(pvp); 2183 } 2184 } 2185#endif 2186 2187 /* 2188 * Destroy ubc named reference 2189 * cluster_release is done on this path 2190 * along with dropping the reference on the ucred 2191 */ 2192 ubc_destroy_named(vp); 2193 2194#if CONFIG_TRIGGERS 2195 /* 2196 * cleanup trigger info from vnode (if any) 2197 */ 2198 if (vp->v_resolve) 2199 vnode_resolver_detach(vp); 2200#endif 2201 2202 /* 2203 * Reclaim the vnode. 2204 */ 2205 if (VNOP_RECLAIM(vp, ctx)) 2206 panic("vclean: cannot reclaim"); 2207 2208 // make sure the name & parent ptrs get cleaned out! 
2209 vnode_update_identity(vp, NULLVP, NULL, 0, 0, VNODE_UPDATE_PARENT | VNODE_UPDATE_NAME | VNODE_UPDATE_PURGE); 2210 2211 vnode_lock(vp); 2212 2213 vp->v_mount = dead_mountp; 2214 vp->v_op = dead_vnodeop_p; 2215 vp->v_tag = VT_NON; 2216 vp->v_data = NULL; 2217 2218 vp->v_lflag |= VL_DEAD; 2219 2220 if (already_terminating == 0) { 2221 vp->v_lflag &= ~VL_TERMINATE; 2222 /* 2223 * Done with purge, notify sleepers of the grim news. 2224 */ 2225 if (vp->v_lflag & VL_TERMWANT) { 2226 vp->v_lflag &= ~VL_TERMWANT; 2227 wakeup(&vp->v_lflag); 2228 } 2229 } 2230} 2231 2232/* 2233 * Eliminate all activity associated with the requested vnode 2234 * and with all vnodes aliased to the requested vnode. 2235 */ 2236int 2237#if DIAGNOSTIC 2238vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context) 2239#else 2240vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context) 2241#endif 2242{ 2243 struct vnode *vq; 2244 int vid; 2245 2246#if DIAGNOSTIC 2247 if ((flags & REVOKEALL) == 0) 2248 panic("vnop_revoke"); 2249#endif 2250 2251 if (vnode_isaliased(vp)) { 2252 /* 2253 * If a vgone (or vclean) is already in progress, 2254 * return an immediate error 2255 */ 2256 if (vp->v_lflag & VL_TERMINATE) 2257 return(ENOENT); 2258 2259 /* 2260 * Ensure that vp will not be vgone'd while we 2261 * are eliminating its aliases. 2262 */ 2263 SPECHASH_LOCK(); 2264 while ((vp->v_specflags & SI_ALIASED)) { 2265 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2266 if (vq->v_rdev != vp->v_rdev || 2267 vq->v_type != vp->v_type || vp == vq) 2268 continue; 2269 vid = vq->v_id; 2270 SPECHASH_UNLOCK(); 2271 if (vnode_getwithvid(vq,vid)){ 2272 SPECHASH_LOCK(); 2273 break; 2274 } 2275 vnode_reclaim_internal(vq, 0, 1, 0); 2276 vnode_put(vq); 2277 SPECHASH_LOCK(); 2278 break; 2279 } 2280 } 2281 SPECHASH_UNLOCK(); 2282 } 2283 vnode_reclaim_internal(vp, 0, 0, REVOKEALL); 2284 2285 return (0); 2286} 2287 2288/* 2289 * Recycle an unused vnode to the front of the free list. 
2290 * Release the passed interlock if the vnode will be recycled. 2291 */ 2292int 2293vnode_recycle(struct vnode *vp) 2294{ 2295 vnode_lock_spin(vp); 2296 2297 if (vp->v_iocount || vp->v_usecount) { 2298 vp->v_lflag |= VL_MARKTERM; 2299 vnode_unlock(vp); 2300 return(0); 2301 } 2302 vnode_lock_convert(vp); 2303 vnode_reclaim_internal(vp, 1, 0, 0); 2304 2305 vnode_unlock(vp); 2306 2307 return (1); 2308} 2309 2310static int 2311vnode_reload(vnode_t vp) 2312{ 2313 vnode_lock_spin(vp); 2314 2315 if ((vp->v_iocount > 1) || vp->v_usecount) { 2316 vnode_unlock(vp); 2317 return(0); 2318 } 2319 if (vp->v_iocount <= 0) 2320 panic("vnode_reload with no iocount %d", vp->v_iocount); 2321 2322 /* mark for release when iocount is dopped */ 2323 vp->v_lflag |= VL_MARKTERM; 2324 vnode_unlock(vp); 2325 2326 return (1); 2327} 2328 2329 2330static void 2331vgone(vnode_t vp, int flags) 2332{ 2333 struct vnode *vq; 2334 struct vnode *vx; 2335 2336 /* 2337 * Clean out the filesystem specific data. 2338 * vclean also takes care of removing the 2339 * vnode from any mount list it might be on 2340 */ 2341 vclean(vp, flags | DOCLOSE); 2342 2343 /* 2344 * If special device, remove it from special device alias list 2345 * if it is on one. 
2346 */ 2347 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { 2348 SPECHASH_LOCK(); 2349 if (*vp->v_hashchain == vp) { 2350 *vp->v_hashchain = vp->v_specnext; 2351 } else { 2352 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2353 if (vq->v_specnext != vp) 2354 continue; 2355 vq->v_specnext = vp->v_specnext; 2356 break; 2357 } 2358 if (vq == NULL) 2359 panic("missing bdev"); 2360 } 2361 if (vp->v_specflags & SI_ALIASED) { 2362 vx = NULL; 2363 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2364 if (vq->v_rdev != vp->v_rdev || 2365 vq->v_type != vp->v_type) 2366 continue; 2367 if (vx) 2368 break; 2369 vx = vq; 2370 } 2371 if (vx == NULL) 2372 panic("missing alias"); 2373 if (vq == NULL) 2374 vx->v_specflags &= ~SI_ALIASED; 2375 vp->v_specflags &= ~SI_ALIASED; 2376 } 2377 SPECHASH_UNLOCK(); 2378 { 2379 struct specinfo *tmp = vp->v_specinfo; 2380 vp->v_specinfo = NULL; 2381 FREE_ZONE((void *)tmp, sizeof(struct specinfo), M_SPECINFO); 2382 } 2383 } 2384} 2385 2386/* 2387 * Lookup a vnode by device number. 2388 */ 2389int 2390check_mountedon(dev_t dev, enum vtype type, int *errorp) 2391{ 2392 vnode_t vp; 2393 int rc = 0; 2394 int vid; 2395 2396loop: 2397 SPECHASH_LOCK(); 2398 for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { 2399 if (dev != vp->v_rdev || type != vp->v_type) 2400 continue; 2401 vid = vp->v_id; 2402 SPECHASH_UNLOCK(); 2403 if (vnode_getwithvid(vp,vid)) 2404 goto loop; 2405 vnode_lock_spin(vp); 2406 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) { 2407 vnode_unlock(vp); 2408 if ((*errorp = vfs_mountedon(vp)) != 0) 2409 rc = 1; 2410 } else 2411 vnode_unlock(vp); 2412 vnode_put(vp); 2413 return(rc); 2414 } 2415 SPECHASH_UNLOCK(); 2416 return (0); 2417} 2418 2419/* 2420 * Calculate the total number of references to a special device. 
2421 */ 2422int 2423vcount(vnode_t vp) 2424{ 2425 vnode_t vq, vnext; 2426 int count; 2427 int vid; 2428 2429loop: 2430 if (!vnode_isaliased(vp)) 2431 return (vp->v_specinfo->si_opencount); 2432 count = 0; 2433 2434 SPECHASH_LOCK(); 2435 /* 2436 * Grab first vnode and its vid. 2437 */ 2438 vq = *vp->v_hashchain; 2439 vid = vq ? vq->v_id : 0; 2440 2441 SPECHASH_UNLOCK(); 2442 2443 while (vq) { 2444 /* 2445 * Attempt to get the vnode outside the SPECHASH lock. 2446 */ 2447 if (vnode_getwithvid(vq, vid)) { 2448 goto loop; 2449 } 2450 vnode_lock(vq); 2451 2452 if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) { 2453 if ((vq->v_usecount == 0) && (vq->v_iocount == 1) && vq != vp) { 2454 /* 2455 * Alias, but not in use, so flush it out. 2456 */ 2457 vnode_reclaim_internal(vq, 1, 1, 0); 2458 vnode_put_locked(vq); 2459 vnode_unlock(vq); 2460 goto loop; 2461 } 2462 count += vq->v_specinfo->si_opencount; 2463 } 2464 vnode_unlock(vq); 2465 2466 SPECHASH_LOCK(); 2467 /* 2468 * must do this with the reference still held on 'vq' 2469 * so that it can't be destroyed while we're poking 2470 * through v_specnext 2471 */ 2472 vnext = vq->v_specnext; 2473 vid = vnext ? vnext->v_id : 0; 2474 2475 SPECHASH_UNLOCK(); 2476 2477 vnode_put(vq); 2478 2479 vq = vnext; 2480 } 2481 2482 return (count); 2483} 2484 2485int prtactive = 0; /* 1 => print out reclaim of active vnodes */ 2486 2487/* 2488 * Print out a description of a vnode. 
2489 */ 2490static const char *typename[] = 2491 { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" }; 2492 2493void 2494vprint(const char *label, struct vnode *vp) 2495{ 2496 char sbuf[64]; 2497 2498 if (label != NULL) 2499 printf("%s: ", label); 2500 printf("type %s, usecount %d, writecount %d", 2501 typename[vp->v_type], vp->v_usecount, vp->v_writecount); 2502 sbuf[0] = '\0'; 2503 if (vp->v_flag & VROOT) 2504 strlcat(sbuf, "|VROOT", sizeof(sbuf)); 2505 if (vp->v_flag & VTEXT) 2506 strlcat(sbuf, "|VTEXT", sizeof(sbuf)); 2507 if (vp->v_flag & VSYSTEM) 2508 strlcat(sbuf, "|VSYSTEM", sizeof(sbuf)); 2509 if (vp->v_flag & VNOFLUSH) 2510 strlcat(sbuf, "|VNOFLUSH", sizeof(sbuf)); 2511 if (vp->v_flag & VBWAIT) 2512 strlcat(sbuf, "|VBWAIT", sizeof(sbuf)); 2513 if (vnode_isaliased(vp)) 2514 strlcat(sbuf, "|VALIASED", sizeof(sbuf)); 2515 if (sbuf[0] != '\0') 2516 printf(" flags (%s)", &sbuf[1]); 2517} 2518 2519 2520int 2521vn_getpath(struct vnode *vp, char *pathbuf, int *len) 2522{ 2523 return build_path(vp, pathbuf, *len, len, BUILDPATH_NO_FS_ENTER, vfs_context_current()); 2524} 2525 2526int 2527vn_getpath_fsenter(struct vnode *vp, char *pathbuf, int *len) 2528{ 2529 return build_path(vp, pathbuf, *len, len, 0, vfs_context_current()); 2530} 2531 2532int 2533vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash) 2534{ 2535 return ubc_cs_getcdhash(vp, offset, cdhash); 2536} 2537 2538 2539static char *extension_table=NULL; 2540static int nexts; 2541static int max_ext_width; 2542 2543static int 2544extension_cmp(const void *a, const void *b) 2545{ 2546 return (strlen((const char *)a) - strlen((const char *)b)); 2547} 2548 2549 2550// 2551// This is the api LaunchServices uses to inform the kernel 2552// the list of package extensions to ignore. 2553// 2554// Internally we keep the list sorted by the length of the 2555// the extension (from longest to shortest). 
We sort the 2556// list of extensions so that we can speed up our searches 2557// when comparing file names -- we only compare extensions 2558// that could possibly fit into the file name, not all of 2559// them (i.e. a short 8 character name can't have an 8 2560// character extension). 2561// 2562extern lck_mtx_t *pkg_extensions_lck; 2563 2564__private_extern__ int 2565set_package_extensions_table(user_addr_t data, int nentries, int maxwidth) 2566{ 2567 char *new_exts, *old_exts; 2568 int error; 2569 2570 if (nentries <= 0 || nentries > 1024 || maxwidth <= 0 || maxwidth > 255) { 2571 return EINVAL; 2572 } 2573 2574 2575 // allocate one byte extra so we can guarantee null termination 2576 MALLOC(new_exts, char *, (nentries * maxwidth) + 1, M_TEMP, M_WAITOK); 2577 if (new_exts == NULL) { 2578 return ENOMEM; 2579 } 2580 2581 error = copyin(data, new_exts, nentries * maxwidth); 2582 if (error) { 2583 FREE(new_exts, M_TEMP); 2584 return error; 2585 } 2586 2587 new_exts[(nentries * maxwidth)] = '\0'; // guarantee null termination of the block 2588 2589 qsort(new_exts, nentries, maxwidth, extension_cmp); 2590 2591 lck_mtx_lock(pkg_extensions_lck); 2592 2593 old_exts = extension_table; 2594 extension_table = new_exts; 2595 nexts = nentries; 2596 max_ext_width = maxwidth; 2597 2598 lck_mtx_unlock(pkg_extensions_lck); 2599 2600 if (old_exts) { 2601 FREE(old_exts, M_TEMP); 2602 } 2603 2604 return 0; 2605} 2606 2607 2608__private_extern__ int 2609is_package_name(const char *name, int len) 2610{ 2611 int i, extlen; 2612 const char *ptr, *name_ext; 2613 2614 if (len <= 3) { 2615 return 0; 2616 } 2617 2618 name_ext = NULL; 2619 for(ptr=name; *ptr != '\0'; ptr++) { 2620 if (*ptr == '.') { 2621 name_ext = ptr; 2622 } 2623 } 2624 2625 // if there is no "." extension, it can't match 2626 if (name_ext == NULL) { 2627 return 0; 2628 } 2629 2630 // advance over the "." 
2631 name_ext++; 2632 2633 lck_mtx_lock(pkg_extensions_lck); 2634 2635 // now iterate over all the extensions to see if any match 2636 ptr = &extension_table[0]; 2637 for(i=0; i < nexts; i++, ptr+=max_ext_width) { 2638 extlen = strlen(ptr); 2639 if (strncasecmp(name_ext, ptr, extlen) == 0 && name_ext[extlen] == '\0') { 2640 // aha, a match! 2641 lck_mtx_unlock(pkg_extensions_lck); 2642 return 1; 2643 } 2644 } 2645 2646 lck_mtx_unlock(pkg_extensions_lck); 2647 2648 // if we get here, no extension matched 2649 return 0; 2650} 2651 2652int 2653vn_path_package_check(__unused vnode_t vp, char *path, int pathlen, int *component) 2654{ 2655 char *ptr, *end; 2656 int comp=0; 2657 2658 *component = -1; 2659 if (*path != '/') { 2660 return EINVAL; 2661 } 2662 2663 end = path + 1; 2664 while(end < path + pathlen && *end != '\0') { 2665 while(end < path + pathlen && *end == '/' && *end != '\0') { 2666 end++; 2667 } 2668 2669 ptr = end; 2670 2671 while(end < path + pathlen && *end != '/' && *end != '\0') { 2672 end++; 2673 } 2674 2675 if (end > path + pathlen) { 2676 // hmm, string wasn't null terminated 2677 return EINVAL; 2678 } 2679 2680 *end = '\0'; 2681 if (is_package_name(ptr, end - ptr)) { 2682 *component = comp; 2683 break; 2684 } 2685 2686 end++; 2687 comp++; 2688 } 2689 2690 return 0; 2691} 2692 2693/* 2694 * Determine if a name is inappropriate for a searchfs query. 2695 * This list consists of /System currently. 2696 */ 2697 2698int vn_searchfs_inappropriate_name(const char *name, int len) { 2699 const char *bad_names[] = { "System" }; 2700 int bad_len[] = { 6 }; 2701 int i; 2702 2703 for(i=0; i < (int) (sizeof(bad_names) / sizeof(bad_names[0])); i++) { 2704 if (len == bad_len[i] && strncmp(name, bad_names[i], strlen(bad_names[i]) + 1) == 0) { 2705 return 1; 2706 } 2707 } 2708 2709 // if we get here, no name matched 2710 return 0; 2711} 2712 2713/* 2714 * Top level filesystem related information gathering. 
2715 */ 2716extern unsigned int vfs_nummntops; 2717 2718int 2719vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, 2720 user_addr_t newp, size_t newlen, proc_t p) 2721{ 2722 struct vfstable *vfsp; 2723 int *username; 2724 u_int usernamelen; 2725 int error; 2726 struct vfsconf vfsc; 2727 2728 if (namelen > CTL_MAXNAME) { 2729 return (EINVAL); 2730 } 2731 2732 /* All non VFS_GENERIC and in VFS_GENERIC, 2733 * VFS_MAXTYPENUM, VFS_CONF, VFS_SET_PACKAGE_EXTS 2734 * needs to have root priv to have modifiers. 2735 * For rest the userland_sysctl(CTLFLAG_ANYBODY) would cover. 2736 */ 2737 if ((newp != USER_ADDR_NULL) && ((name[0] != VFS_GENERIC) || 2738 ((name[1] == VFS_MAXTYPENUM) || 2739 (name[1] == VFS_CONF) || 2740 (name[1] == VFS_SET_PACKAGE_EXTS))) 2741 && (error = suser(kauth_cred_get(), &p->p_acflag))) { 2742 return(error); 2743 } 2744 /* 2745 * The VFS_NUMMNTOPS shouldn't be at name[0] since 2746 * is a VFS generic variable. So now we must check 2747 * namelen so we don't end up covering any UFS 2748 * variables (sinc UFS vfc_typenum is 1). 2749 * 2750 * It should have been: 2751 * name[0]: VFS_GENERIC 2752 * name[1]: VFS_NUMMNTOPS 2753 */ 2754 if (namelen == 1 && name[0] == VFS_NUMMNTOPS) { 2755 return (sysctl_rdint(oldp, oldlenp, newp, vfs_nummntops)); 2756 } 2757 2758 /* all sysctl names at this level are at least name and field */ 2759 if (namelen < 2) 2760 return (EISDIR); /* overloaded */ 2761 if (name[0] != VFS_GENERIC) { 2762 2763 mount_list_lock(); 2764 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2765 if (vfsp->vfc_typenum == name[0]) { 2766 vfsp->vfc_refcount++; 2767 break; 2768 } 2769 mount_list_unlock(); 2770 2771 if (vfsp == NULL) 2772 return (ENOTSUP); 2773 2774 /* XXX current context proxy for proc p? 
*/ 2775 error = ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2776 oldp, oldlenp, newp, newlen, 2777 vfs_context_current())); 2778 2779 mount_list_lock(); 2780 vfsp->vfc_refcount--; 2781 mount_list_unlock(); 2782 return error; 2783 } 2784 switch (name[1]) { 2785 case VFS_MAXTYPENUM: 2786 return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf)); 2787 case VFS_CONF: 2788 if (namelen < 3) 2789 return (ENOTDIR); /* overloaded */ 2790 2791 mount_list_lock(); 2792 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2793 if (vfsp->vfc_typenum == name[2]) 2794 break; 2795 2796 if (vfsp == NULL) { 2797 mount_list_unlock(); 2798 return (ENOTSUP); 2799 } 2800 2801 vfsc.vfc_reserved1 = 0; 2802 bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name)); 2803 vfsc.vfc_typenum = vfsp->vfc_typenum; 2804 vfsc.vfc_refcount = vfsp->vfc_refcount; 2805 vfsc.vfc_flags = vfsp->vfc_flags; 2806 vfsc.vfc_reserved2 = 0; 2807 vfsc.vfc_reserved3 = 0; 2808 2809 mount_list_unlock(); 2810 return (sysctl_rdstruct(oldp, oldlenp, newp, &vfsc, 2811 sizeof(struct vfsconf))); 2812 2813 case VFS_SET_PACKAGE_EXTS: 2814 return set_package_extensions_table((user_addr_t)((unsigned)name[1]), name[2], name[3]); 2815 } 2816 /* 2817 * We need to get back into the general MIB, so we need to re-prepend 2818 * CTL_VFS to our name and try userland_sysctl(). 
2819 */ 2820 2821 usernamelen = namelen + 1; 2822 MALLOC(username, int *, usernamelen * sizeof(*username), 2823 M_TEMP, M_WAITOK); 2824 bcopy(name, username + 1, namelen * sizeof(*name)); 2825 username[0] = CTL_VFS; 2826 error = userland_sysctl(p, username, usernamelen, oldp, 2827 oldlenp, newp, newlen, oldlenp); 2828 FREE(username, M_TEMP); 2829 return (error); 2830} 2831 2832/* 2833 * Dump vnode list (via sysctl) - defunct 2834 * use "pstat" instead 2835 */ 2836/* ARGSUSED */ 2837int 2838sysctl_vnode 2839(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, __unused struct sysctl_req *req) 2840{ 2841 return(EINVAL); 2842} 2843 2844SYSCTL_PROC(_kern, KERN_VNODE, vnode, 2845 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 2846 0, 0, sysctl_vnode, "S,", ""); 2847 2848 2849/* 2850 * Check to see if a filesystem is mounted on a block device. 2851 */ 2852int 2853vfs_mountedon(struct vnode *vp) 2854{ 2855 struct vnode *vq; 2856 int error = 0; 2857 2858 SPECHASH_LOCK(); 2859 if (vp->v_specflags & SI_MOUNTEDON) { 2860 error = EBUSY; 2861 goto out; 2862 } 2863 if (vp->v_specflags & SI_ALIASED) { 2864 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2865 if (vq->v_rdev != vp->v_rdev || 2866 vq->v_type != vp->v_type) 2867 continue; 2868 if (vq->v_specflags & SI_MOUNTEDON) { 2869 error = EBUSY; 2870 break; 2871 } 2872 } 2873 } 2874out: 2875 SPECHASH_UNLOCK(); 2876 return (error); 2877} 2878 2879/* 2880 * Unmount all filesystems. The list is traversed in reverse order 2881 * of mounting to avoid dependencies. 2882 */ 2883__private_extern__ void 2884vfs_unmountall(void) 2885{ 2886 struct mount *mp; 2887 int error; 2888 2889 /* 2890 * Since this only runs when rebooting, it is not interlocked. 
2891 */ 2892 mount_list_lock(); 2893 while(!TAILQ_EMPTY(&mountlist)) { 2894 mp = TAILQ_LAST(&mountlist, mntlist); 2895 mount_list_unlock(); 2896 error = dounmount(mp, MNT_FORCE, 0, vfs_context_current()); 2897 if ((error != 0) && (error != EBUSY)) { 2898 printf("unmount of %s failed (", mp->mnt_vfsstat.f_mntonname); 2899 printf("%d)\n", error); 2900 mount_list_lock(); 2901 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2902 continue; 2903 } else if (error == EBUSY) { 2904 /* If EBUSY is returned, the unmount was already in progress */ 2905 printf("unmount of %p failed (", mp); 2906 printf("BUSY)\n"); 2907 } 2908 mount_list_lock(); 2909 } 2910 mount_list_unlock(); 2911} 2912 2913 2914/* 2915 * This routine is called from vnode_pager_deallocate out of the VM 2916 * The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named 2917 * on a vnode that has a UBCINFO 2918 */ 2919__private_extern__ void 2920vnode_pager_vrele(vnode_t vp) 2921{ 2922 struct ubc_info *uip; 2923 2924 vnode_lock_spin(vp); 2925 2926 vp->v_lflag &= ~VNAMED_UBC; 2927 2928 uip = vp->v_ubcinfo; 2929 vp->v_ubcinfo = UBC_INFO_NULL; 2930 2931 vnode_unlock(vp); 2932 2933 ubc_info_deallocate(uip); 2934} 2935 2936 2937#include <sys/disk.h> 2938 2939u_int32_t rootunit = (u_int32_t)-1; 2940 2941errno_t 2942vfs_init_io_attributes(vnode_t devvp, mount_t mp) 2943{ 2944 int error; 2945 off_t readblockcnt = 0; 2946 off_t writeblockcnt = 0; 2947 off_t readmaxcnt = 0; 2948 off_t writemaxcnt = 0; 2949 off_t readsegcnt = 0; 2950 off_t writesegcnt = 0; 2951 off_t readsegsize = 0; 2952 off_t writesegsize = 0; 2953 off_t alignment = 0; 2954 off_t ioqueue_depth = 0; 2955 u_int32_t blksize; 2956 u_int64_t temp; 2957 u_int32_t features; 2958 vfs_context_t ctx = vfs_context_current(); 2959 int isssd = 0; 2960 int isvirtual = 0; 2961 2962 2963 VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL); 2964 /* 2965 * as a reasonable approximation, only use the lowest bit of the mask 2966 * to 
generate a disk unit number 2967 */ 2968 mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask); 2969 2970 if (devvp == rootvp) 2971 rootunit = mp->mnt_devbsdunit; 2972 2973 if (mp->mnt_devbsdunit == rootunit) { 2974 /* 2975 * this mount point exists on the same device as the root 2976 * partition, so it comes under the hard throttle control... 2977 * this is true even for the root mount point itself 2978 */ 2979 mp->mnt_kern_flag |= MNTK_ROOTDEV; 2980 } 2981 /* 2982 * force the spec device to re-cache 2983 * the underlying block size in case 2984 * the filesystem overrode the initial value 2985 */ 2986 set_fsblocksize(devvp); 2987 2988 2989 if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, 2990 (caddr_t)&blksize, 0, ctx))) 2991 return (error); 2992 2993 mp->mnt_devblocksize = blksize; 2994 2995 /* 2996 * set the maximum possible I/O size 2997 * this may get clipped to a smaller value 2998 * based on which constraints are being advertised 2999 * and if those advertised constraints result in a smaller 3000 * limit for a given I/O 3001 */ 3002 mp->mnt_maxreadcnt = MAX_UPL_SIZE * PAGE_SIZE; 3003 mp->mnt_maxwritecnt = MAX_UPL_SIZE * PAGE_SIZE; 3004 3005 if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, ctx) == 0) { 3006 if (isvirtual) 3007 mp->mnt_kern_flag |= MNTK_VIRTUALDEV; 3008 } 3009 if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ctx) == 0) { 3010 if (isssd) 3011 mp->mnt_kern_flag |= MNTK_SSD; 3012 } 3013 if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES, 3014 (caddr_t)&features, 0, ctx))) 3015 return (error); 3016 3017 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, 3018 (caddr_t)&readblockcnt, 0, ctx))) 3019 return (error); 3020 3021 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, 3022 (caddr_t)&writeblockcnt, 0, ctx))) 3023 return (error); 3024 3025 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, 3026 (caddr_t)&readmaxcnt, 0, ctx))) 3027 return (error); 3028 3029 if ((error = VNOP_IOCTL(devvp, 
DKIOCGETMAXBYTECOUNTWRITE, 3030 (caddr_t)&writemaxcnt, 0, ctx))) 3031 return (error); 3032 3033 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, 3034 (caddr_t)&readsegcnt, 0, ctx))) 3035 return (error); 3036 3037 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, 3038 (caddr_t)&writesegcnt, 0, ctx))) 3039 return (error); 3040 3041 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD, 3042 (caddr_t)&readsegsize, 0, ctx))) 3043 return (error); 3044 3045 if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE, 3046 (caddr_t)&writesegsize, 0, ctx))) 3047 return (error); 3048 3049 if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT, 3050 (caddr_t)&alignment, 0, ctx))) 3051 return (error); 3052 3053 if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE, 3054 (caddr_t)&ioqueue_depth, 0, ctx))) 3055 return (error); 3056 3057 if (readmaxcnt) 3058 mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt; 3059 3060 if (readblockcnt) { 3061 temp = readblockcnt * blksize; 3062 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp; 3063 3064 if (temp < mp->mnt_maxreadcnt) 3065 mp->mnt_maxreadcnt = (u_int32_t)temp; 3066 } 3067 3068 if (writemaxcnt) 3069 mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt; 3070 3071 if (writeblockcnt) { 3072 temp = writeblockcnt * blksize; 3073 temp = (temp > UINT32_MAX) ? UINT32_MAX : temp; 3074 3075 if (temp < mp->mnt_maxwritecnt) 3076 mp->mnt_maxwritecnt = (u_int32_t)temp; 3077 } 3078 3079 if (readsegcnt) { 3080 temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt; 3081 } else { 3082 temp = mp->mnt_maxreadcnt / PAGE_SIZE; 3083 3084 if (temp > UINT16_MAX) 3085 temp = UINT16_MAX; 3086 } 3087 mp->mnt_segreadcnt = (u_int16_t)temp; 3088 3089 if (writesegcnt) { 3090 temp = (writesegcnt > UINT16_MAX) ? 
UINT16_MAX : writesegcnt; 3091 } else { 3092 temp = mp->mnt_maxwritecnt / PAGE_SIZE; 3093 3094 if (temp > UINT16_MAX) 3095 temp = UINT16_MAX; 3096 } 3097 mp->mnt_segwritecnt = (u_int16_t)temp; 3098 3099 if (readsegsize) 3100 temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize; 3101 else 3102 temp = mp->mnt_maxreadcnt; 3103 mp->mnt_maxsegreadsize = (u_int32_t)temp; 3104 3105 if (writesegsize) 3106 temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize; 3107 else 3108 temp = mp->mnt_maxwritecnt; 3109 mp->mnt_maxsegwritesize = (u_int32_t)temp; 3110 3111 if (alignment) 3112 temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - 1; 3113 else 3114 temp = 0; 3115 mp->mnt_alignmentmask = temp; 3116 3117 3118 if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH) 3119 temp = ioqueue_depth; 3120 else 3121 temp = MNT_DEFAULT_IOQUEUE_DEPTH; 3122 3123 mp->mnt_ioqueue_depth = temp; 3124 mp->mnt_ioscale = (mp->mnt_ioqueue_depth + (MNT_DEFAULT_IOQUEUE_DEPTH - 1)) / MNT_DEFAULT_IOQUEUE_DEPTH; 3125 3126 if (mp->mnt_ioscale > 1) 3127 printf("ioqueue_depth = %d, ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale); 3128 3129 if (features & DK_FEATURE_FORCE_UNIT_ACCESS) 3130 mp->mnt_ioflags |= MNT_IOFLAGS_FUA_SUPPORTED; 3131 3132 if (features & DK_FEATURE_UNMAP) 3133 mp->mnt_ioflags |= MNT_IOFLAGS_UNMAP_SUPPORTED; 3134 3135 return (error); 3136} 3137 3138static struct klist fs_klist; 3139lck_grp_t *fs_klist_lck_grp; 3140lck_mtx_t *fs_klist_lock; 3141 3142void 3143vfs_event_init(void) 3144{ 3145 3146 klist_init(&fs_klist); 3147 fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL); 3148 fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL); 3149} 3150 3151void 3152vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data) 3153{ 3154 if (event == VQ_DEAD || event == VQ_NOTRESP) { 3155 struct mount *mp = vfs_getvfs(fsid); 3156 if (mp) { 3157 mount_lock_spin(mp); 3158 if (data) 3159 mp->mnt_kern_flag &= ~MNT_LNOTRESP; // Now responding 3160 else 3161 
mp->mnt_kern_flag |= MNT_LNOTRESP; // Not responding 3162 mount_unlock(mp); 3163 } 3164 } 3165 3166 lck_mtx_lock(fs_klist_lock); 3167 KNOTE(&fs_klist, event); 3168 lck_mtx_unlock(fs_klist_lock); 3169} 3170 3171/* 3172 * return the number of mounted filesystems. 3173 */ 3174static int 3175sysctl_vfs_getvfscnt(void) 3176{ 3177 return(mount_getvfscnt()); 3178} 3179 3180 3181static int 3182mount_getvfscnt(void) 3183{ 3184 int ret; 3185 3186 mount_list_lock(); 3187 ret = nummounts; 3188 mount_list_unlock(); 3189 return (ret); 3190 3191} 3192 3193 3194 3195static int 3196mount_fillfsids(fsid_t *fsidlst, int count) 3197{ 3198 struct mount *mp; 3199 int actual=0; 3200 3201 actual = 0; 3202 mount_list_lock(); 3203 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3204 if (actual <= count) { 3205 fsidlst[actual] = mp->mnt_vfsstat.f_fsid; 3206 actual++; 3207 } 3208 } 3209 mount_list_unlock(); 3210 return (actual); 3211 3212} 3213 3214/* 3215 * fill in the array of fsid_t's up to a max of 'count', the actual 3216 * number filled in will be set in '*actual'. If there are more fsid_t's 3217 * than room in fsidlst then ENOMEM will be returned and '*actual' will 3218 * have the actual count. 3219 * having *actual filled out even in the error case is depended upon. 3220 */ 3221static int 3222sysctl_vfs_getvfslist(fsid_t *fsidlst, int count, int *actual) 3223{ 3224 struct mount *mp; 3225 3226 *actual = 0; 3227 mount_list_lock(); 3228 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3229 (*actual)++; 3230 if (*actual <= count) 3231 fsidlst[(*actual) - 1] = mp->mnt_vfsstat.f_fsid; 3232 } 3233 mount_list_unlock(); 3234 return (*actual <= count ? 0 : ENOMEM); 3235} 3236 3237static int 3238sysctl_vfs_vfslist(__unused struct sysctl_oid *oidp, __unused void *arg1, 3239 __unused int arg2, struct sysctl_req *req) 3240{ 3241 int actual, error; 3242 size_t space; 3243 fsid_t *fsidlst; 3244 3245 /* This is a readonly node. 
*/ 3246 if (req->newptr != USER_ADDR_NULL) 3247 return (EPERM); 3248 3249 /* they are querying us so just return the space required. */ 3250 if (req->oldptr == USER_ADDR_NULL) { 3251 req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t); 3252 return 0; 3253 } 3254again: 3255 /* 3256 * Retrieve an accurate count of the amount of space required to copy 3257 * out all the fsids in the system. 3258 */ 3259 space = req->oldlen; 3260 req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t); 3261 3262 /* they didn't give us enough space. */ 3263 if (space < req->oldlen) 3264 return (ENOMEM); 3265 3266 MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK); 3267 if (fsidlst == NULL) { 3268 return (ENOMEM); 3269 } 3270 3271 error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t), 3272 &actual); 3273 /* 3274 * If we get back ENOMEM, then another mount has been added while we 3275 * slept in malloc above. If this is the case then try again. 3276 */ 3277 if (error == ENOMEM) { 3278 FREE(fsidlst, M_TEMP); 3279 req->oldlen = space; 3280 goto again; 3281 } 3282 if (error == 0) { 3283 error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t)); 3284 } 3285 FREE(fsidlst, M_TEMP); 3286 return (error); 3287} 3288 3289/* 3290 * Do a sysctl by fsid. 3291 */ 3292static int 3293sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, 3294 struct sysctl_req *req) 3295{ 3296 union union_vfsidctl vc; 3297 struct mount *mp; 3298 struct vfsstatfs *sp; 3299 int *name, flags, namelen; 3300 int error=0, gotref=0; 3301 vfs_context_t ctx = vfs_context_current(); 3302 proc_t p = req->p; /* XXX req->p != current_proc()? */ 3303 boolean_t is_64_bit; 3304 3305 name = arg1; 3306 namelen = arg2; 3307 is_64_bit = proc_is64bit(p); 3308 3309 error = SYSCTL_IN(req, &vc, is_64_bit? 
sizeof(vc.vc64):sizeof(vc.vc32)); 3310 if (error) 3311 goto out; 3312 if (vc.vc32.vc_vers != VFS_CTL_VERS1) { /* works for 32 and 64 */ 3313 error = EINVAL; 3314 goto out; 3315 } 3316 mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, 0, 1); /* works for 32 and 64 */ 3317 if (mp == NULL) { 3318 error = ENOENT; 3319 goto out; 3320 } 3321 gotref = 1; 3322 /* reset so that the fs specific code can fetch it. */ 3323 req->newidx = 0; 3324 /* 3325 * Note if this is a VFS_CTL then we pass the actual sysctl req 3326 * in for "oldp" so that the lower layer can DTRT and use the 3327 * SYSCTL_IN/OUT routines. 3328 */ 3329 if (mp->mnt_op->vfs_sysctl != NULL) { 3330 if (is_64_bit) { 3331 if (vfs_64bitready(mp)) { 3332 error = mp->mnt_op->vfs_sysctl(name, namelen, 3333 CAST_USER_ADDR_T(req), 3334 NULL, USER_ADDR_NULL, 0, 3335 ctx); 3336 } 3337 else { 3338 error = ENOTSUP; 3339 } 3340 } 3341 else { 3342 error = mp->mnt_op->vfs_sysctl(name, namelen, 3343 CAST_USER_ADDR_T(req), 3344 NULL, USER_ADDR_NULL, 0, 3345 ctx); 3346 } 3347 if (error != ENOTSUP) { 3348 goto out; 3349 } 3350 } 3351 switch (name[0]) { 3352 case VFS_CTL_UMOUNT: 3353 req->newidx = 0; 3354 if (is_64_bit) { 3355 req->newptr = vc.vc64.vc_ptr; 3356 req->newlen = (size_t)vc.vc64.vc_len; 3357 } 3358 else { 3359 req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr); 3360 req->newlen = vc.vc32.vc_len; 3361 } 3362 error = SYSCTL_IN(req, &flags, sizeof(flags)); 3363 if (error) 3364 break; 3365 3366 mount_ref(mp, 0); 3367 mount_iterdrop(mp); 3368 gotref = 0; 3369 /* safedounmount consumes a ref */ 3370 error = safedounmount(mp, flags, ctx); 3371 break; 3372 case VFS_CTL_STATFS: 3373 req->newidx = 0; 3374 if (is_64_bit) { 3375 req->newptr = vc.vc64.vc_ptr; 3376 req->newlen = (size_t)vc.vc64.vc_len; 3377 } 3378 else { 3379 req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr); 3380 req->newlen = vc.vc32.vc_len; 3381 } 3382 error = SYSCTL_IN(req, &flags, sizeof(flags)); 3383 if (error) 3384 break; 3385 sp = &mp->mnt_vfsstat; 3386 if (((flags & 
MNT_NOWAIT) == 0 || (flags & (MNT_WAIT | MNT_DWAIT))) && 3387 (error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT))) 3388 goto out; 3389 if (is_64_bit) { 3390 struct user64_statfs sfs; 3391 bzero(&sfs, sizeof(sfs)); 3392 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK; 3393 sfs.f_type = mp->mnt_vtable->vfc_typenum; 3394 sfs.f_bsize = (user64_long_t)sp->f_bsize; 3395 sfs.f_iosize = (user64_long_t)sp->f_iosize; 3396 sfs.f_blocks = (user64_long_t)sp->f_blocks; 3397 sfs.f_bfree = (user64_long_t)sp->f_bfree; 3398 sfs.f_bavail = (user64_long_t)sp->f_bavail; 3399 sfs.f_files = (user64_long_t)sp->f_files; 3400 sfs.f_ffree = (user64_long_t)sp->f_ffree; 3401 sfs.f_fsid = sp->f_fsid; 3402 sfs.f_owner = sp->f_owner; 3403 3404 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { 3405 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); 3406 } else { 3407 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); 3408 } 3409 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); 3410 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN); 3411 3412 error = SYSCTL_OUT(req, &sfs, sizeof(sfs)); 3413 } 3414 else { 3415 struct user32_statfs sfs; 3416 bzero(&sfs, sizeof(sfs)); 3417 sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK; 3418 sfs.f_type = mp->mnt_vtable->vfc_typenum; 3419 3420 /* 3421 * It's possible for there to be more than 2^^31 blocks in the filesystem, so we 3422 * have to fudge the numbers here in that case. We inflate the blocksize in order 3423 * to reflect the filesystem size as best we can. 3424 */ 3425 if (sp->f_blocks > INT_MAX) { 3426 int shift; 3427 3428 /* 3429 * Work out how far we have to shift the block count down to make it fit. 3430 * Note that it's possible to have to shift so far that the resulting 3431 * blocksize would be unreportably large. At that point, we will clip 3432 * any values that don't fit. 3433 * 3434 * For safety's sake, we also ensure that f_iosize is never reported as 3435 * being smaller than f_bsize. 
3436 */ 3437 for (shift = 0; shift < 32; shift++) { 3438 if ((sp->f_blocks >> shift) <= INT_MAX) 3439 break; 3440 if ((((long long)sp->f_bsize) << (shift + 1)) > INT_MAX) 3441 break; 3442 } 3443#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s))) 3444 sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift); 3445 sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift); 3446 sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift); 3447#undef __SHIFT_OR_CLIP 3448 sfs.f_bsize = (user32_long_t)(sp->f_bsize << shift); 3449 sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize); 3450 } else { 3451 sfs.f_bsize = (user32_long_t)sp->f_bsize; 3452 sfs.f_iosize = (user32_long_t)sp->f_iosize; 3453 sfs.f_blocks = (user32_long_t)sp->f_blocks; 3454 sfs.f_bfree = (user32_long_t)sp->f_bfree; 3455 sfs.f_bavail = (user32_long_t)sp->f_bavail; 3456 } 3457 sfs.f_files = (user32_long_t)sp->f_files; 3458 sfs.f_ffree = (user32_long_t)sp->f_ffree; 3459 sfs.f_fsid = sp->f_fsid; 3460 sfs.f_owner = sp->f_owner; 3461 3462 if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { 3463 strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); 3464 } else { 3465 strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); 3466 } 3467 strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); 3468 strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN); 3469 3470 error = SYSCTL_OUT(req, &sfs, sizeof(sfs)); 3471 } 3472 break; 3473 default: 3474 error = ENOTSUP; 3475 goto out; 3476 } 3477out: 3478 if(gotref != 0) 3479 mount_iterdrop(mp); 3480 return (error); 3481} 3482 3483static int filt_fsattach(struct knote *kn); 3484static void filt_fsdetach(struct knote *kn); 3485static int filt_fsevent(struct knote *kn, long hint); 3486struct filterops fs_filtops = { 3487 .f_attach = filt_fsattach, 3488 .f_detach = filt_fsdetach, 3489 .f_event = filt_fsevent, 3490}; 3491 3492static int 3493filt_fsattach(struct knote *kn) 3494{ 3495 3496 
lck_mtx_lock(fs_klist_lock); 3497 kn->kn_flags |= EV_CLEAR; 3498 KNOTE_ATTACH(&fs_klist, kn); 3499 lck_mtx_unlock(fs_klist_lock); 3500 return (0); 3501} 3502 3503static void 3504filt_fsdetach(struct knote *kn) 3505{ 3506 lck_mtx_lock(fs_klist_lock); 3507 KNOTE_DETACH(&fs_klist, kn); 3508 lck_mtx_unlock(fs_klist_lock); 3509} 3510 3511static int 3512filt_fsevent(struct knote *kn, long hint) 3513{ 3514 /* 3515 * Backwards compatibility: 3516 * Other filters would do nothing if kn->kn_sfflags == 0 3517 */ 3518 3519 if ((kn->kn_sfflags == 0) || (kn->kn_sfflags & hint)) { 3520 kn->kn_fflags |= hint; 3521 } 3522 3523 return (kn->kn_fflags != 0); 3524} 3525 3526static int 3527sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp, 3528 __unused void *arg1, __unused int arg2, struct sysctl_req *req) 3529{ 3530 int out, error; 3531 pid_t pid; 3532 proc_t p; 3533 3534 /* We need a pid. */ 3535 if (req->newptr == USER_ADDR_NULL) 3536 return (EINVAL); 3537 3538 error = SYSCTL_IN(req, &pid, sizeof(pid)); 3539 if (error) 3540 return (error); 3541 3542 p = proc_find(pid < 0 ? -pid : pid); 3543 if (p == NULL) 3544 return (ESRCH); 3545 3546 /* 3547 * Fetching the value is ok, but we only fetch if the old 3548 * pointer is given. 3549 */ 3550 if (req->oldptr != USER_ADDR_NULL) { 3551 out = !((p->p_flag & P_NOREMOTEHANG) == 0); 3552 proc_rele(p); 3553 error = SYSCTL_OUT(req, &out, sizeof(out)); 3554 return (error); 3555 } 3556 3557 /* cansignal offers us enough security. */ 3558 if (p != req->p && proc_suser(req->p) != 0) { 3559 proc_rele(p); 3560 return (EPERM); 3561 } 3562 3563 if (pid < 0) 3564 OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag); 3565 else 3566 OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag); 3567 proc_rele(p); 3568 3569 return (0); 3570} 3571 3572/* the vfs.generic. branch. 
*/
SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge");
/* retrieve a list of mounted filesystem fsid_t */
SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD | CTLFLAG_LOCKED,
    NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
/* perform operations on filesystem via fsid_t */
SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED,
    sysctl_vfs_ctlbyfsid, "ctlbyfsid");
SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY,
    NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang");
	
	
long num_reusedvnodes = 0;


/*
 * Try to turn the candidate 'vp' into a reusable vnode.
 *
 * Entered with the vnode list lock held; the vnode is taken off its
 * list and the list lock is dropped before the vnode lock is taken.
 *
 * Returns vp, still locked, when want_vp is set and the vnode was
 * reclaimed (or was already VBAD).  Returns NULLVP when the vnode was
 * lost to a race, only needed its inactive processing driven, was
 * handed to the async work list (*deferred is set to 1), or when
 * want_vp is 0 and it was reclaimed.
 */
static vnode_t
process_vp(vnode_t vp, int want_vp, int *deferred)
{
	unsigned int  vpid;

	*deferred = 0;

	/* remember the identity so a recycle during the lock hand-off is noticed */
	vpid = vp->v_id;

	vnode_list_remove_locked(vp);

	vnode_list_unlock();

	vnode_lock_spin(vp);

	/* 
	 * We could wait for the vnode_lock after removing the vp from the freelist
	 * and the vid is bumped only at the very end of reclaim. So it is  possible
	 * that we are looking at a vnode that is being terminated. If so skip it.
	 */ 
	if ((vpid != vp->v_id) || (vp->v_usecount != 0) || (vp->v_iocount != 0) || 
	    VONLIST(vp) || (vp->v_lflag & VL_TERMINATE)) {
		/*
		 * we lost the race between dropping the list lock
		 * and picking up the vnode_lock... someone else
		 * used this vnode and it is now in a new state
		 */
		vnode_unlock(vp);
		
		return (NULLVP);
	}
	if ( (vp->v_lflag & (VL_NEEDINACTIVE | VL_MARKTERM)) == VL_NEEDINACTIVE ) {
	        /*
		 * we did a vnode_rele_ext that asked for
		 * us not to reenter the filesystem during
		 * the release even though VL_NEEDINACTIVE was
		 * set... we'll do it here by doing a
		 * vnode_get/vnode_put
		 *
		 * pick up an iocount so that we can call
		 * vnode_put and drive the VNOP_INACTIVE...
		 * vnode_put will either leave us off 
		 * the freelist if a new ref comes in,
		 * or put us back on the end of the freelist
		 * or recycle us if we were marked for termination...
		 * so we'll just go grab a new candidate
		 */
	        vp->v_iocount++;
#ifdef JOE_DEBUG
		record_vp(vp, 1);
#endif
		vnode_put_locked(vp);
		vnode_unlock(vp);

		return (NULLVP);
	}
	/*
	 * Checks for anyone racing us for recycle
	 */ 
	if (vp->v_type != VBAD) {
		/* vnodes on unreliable media are handed to the async worker instead */
		if (want_vp && vnode_on_reliable_media(vp) == FALSE) {
			vnode_async_list_add(vp);
			vnode_unlock(vp);
			
			*deferred = 1;

			return (NULLVP);
		}
		if (vp->v_lflag & VL_DEAD)
			panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp);

		vnode_lock_convert(vp);
		(void)vnode_reclaim_internal(vp, 1, want_vp, 0);

		if (want_vp) {
			/* sanity checks: a reclaimed vnode must be completely unhooked */
			if ((VONLIST(vp)))
				panic("new_vnode(%p): vp on list", vp);
			if (vp->v_usecount || vp->v_iocount || vp->v_kusecount ||
			    (vp->v_lflag & (VNAMED_UBC | VNAMED_MOUNT | VNAMED_FSHASH)))
				panic("new_vnode(%p): free vnode still referenced", vp);
			if ((vp->v_mntvnodes.tqe_prev != 0) && (vp->v_mntvnodes.tqe_next != 0))
				panic("new_vnode(%p): vnode seems to be on mount list", vp);
			if ( !LIST_EMPTY(&vp->v_nclinks) || !LIST_EMPTY(&vp->v_ncchildren))
				panic("new_vnode(%p): vnode still hooked into the name cache", vp);
		} else {
			vnode_unlock(vp);
			vp = NULLVP;
		}
	}
	return (vp);
}



/*
 * Worker loop for vnode_async_work_list: blocks while the list is
 * empty (re-entering through itself as the thread continuation) and
 * otherwise feeds the first queued vnode to process_vp() with
 * want_vp == 0, so no vnode should ever come back.
 */
static void
async_work_continue(void)
{
	struct async_work_lst *q;
	int	deferred;
	vnode_t	vp;

	q = &vnode_async_work_list;

	for (;;) {

		vnode_list_lock();

		if ( TAILQ_EMPTY(q) ) {
			/* wait on the queue; the list lock is dropped before blocking */
			assert_wait(q, (THREAD_UNINT));
	
			vnode_list_unlock();
			
			thread_block((thread_continue_t)async_work_continue);

			continue;
		}
		async_work_handled++;

		vp = TAILQ_FIRST(q);

		/* process_vp drops the vnode list lock taken above */
		vp = process_vp(vp, 0, &deferred);

		if (vp != NULLVP)
			panic("found VBAD vp (%p) on async queue", vp);
	}
}


static int
new_vnode(vnode_t *vpp)
{
	vnode_t	vp;
	uint32_t retries = 0, max_retries = 100;		/* retry incase of tablefull */
	int force_alloc = 0, walk_count = 0;
	boolean_t need_reliable_vp = FALSE;
	int deferred;
        struct timeval initial_tv;
        struct timeval current_tv;
#if CONFIG_VFS_FUNNEL
        struct unsafe_fsnode *l_unsafefs = 0;
#endif /* CONFIG_VFS_FUNNEL */
	proc_t  curproc = current_proc();

	initial_tv.tv_sec = 0;
retry:
	vp = NULLVP;

	vnode_list_lock();

	if (need_reliable_vp == TRUE)
		async_work_timed_out++;

	if ((numvnodes - deadvnodes) < desiredvnodes || force_alloc) {
		struct timespec ts;

		if ( !TAILQ_EMPTY(&vnode_dead_list)) {
			/*
			 * Can always reuse a dead one
			 */
			vp = TAILQ_FIRST(&vnode_dead_list);
			goto steal_this_vp;
		}
		/*
		 * no dead vnodes available...
if we're under 3751 * the limit, we'll create a new vnode 3752 */ 3753 numvnodes++; 3754 vnode_list_unlock(); 3755 3756 MALLOC_ZONE(vp, struct vnode *, sizeof(*vp), M_VNODE, M_WAITOK); 3757 bzero((char *)vp, sizeof(*vp)); 3758 VLISTNONE(vp); /* avoid double queue removal */ 3759 lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr); 3760 3761 klist_init(&vp->v_knotes); 3762 nanouptime(&ts); 3763 vp->v_id = ts.tv_nsec; 3764 vp->v_flag = VSTANDARD; 3765 3766#if CONFIG_MACF 3767 if (mac_vnode_label_init_needed(vp)) 3768 mac_vnode_label_init(vp); 3769#endif /* MAC */ 3770 3771 vp->v_iocount = 1; 3772 goto done; 3773 } 3774 microuptime(¤t_tv); 3775 3776#define MAX_WALK_COUNT 1000 3777 3778 if ( !TAILQ_EMPTY(&vnode_rage_list) && 3779 (ragevnodes >= rage_limit || 3780 (current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) { 3781 3782 TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) { 3783 if ( !(vp->v_listflag & VLIST_RAGE)) 3784 panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp); 3785 3786 // if we're a dependency-capable process, skip vnodes that can 3787 // cause recycling deadlocks. (i.e. this process is diskimages 3788 // helper and the vnode is in a disk image). Querying the 3789 // mnt_kern_flag for the mount's virtual device status 3790 // is safer than checking the mnt_dependent_process, which 3791 // may not be updated if there are multiple devnode layers 3792 // in between the disk image and the final consumer. 3793 3794 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || 3795 (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { 3796 /* 3797 * if need_reliable_vp == TRUE, then we've already sent one or more 3798 * non-reliable vnodes to the async thread for processing and timed 3799 * out waiting for a dead vnode to show up. 
Use the MAX_WALK_COUNT 3800 * mechanism to first scan for a reliable vnode before forcing 3801 * a new vnode to be created 3802 */ 3803 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) 3804 break; 3805 } 3806 3807 // don't iterate more than MAX_WALK_COUNT vnodes to 3808 // avoid keeping the vnode list lock held for too long. 3809 3810 if (walk_count++ > MAX_WALK_COUNT) { 3811 vp = NULL; 3812 break; 3813 } 3814 } 3815 } 3816 3817 if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) { 3818 /* 3819 * Pick the first vp for possible reuse 3820 */ 3821 walk_count = 0; 3822 TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) { 3823 3824 // if we're a dependency-capable process, skip vnodes that can 3825 // cause recycling deadlocks. (i.e. this process is diskimages 3826 // helper and the vnode is in a disk image). Querying the 3827 // mnt_kern_flag for the mount's virtual device status 3828 // is safer than checking the mnt_dependent_process, which 3829 // may not be updated if there are multiple devnode layers 3830 // in between the disk image and the final consumer. 3831 3832 if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || 3833 (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { 3834 /* 3835 * if need_reliable_vp == TRUE, then we've already sent one or more 3836 * non-reliable vnodes to the async thread for processing and timed 3837 * out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT 3838 * mechanism to first scan for a reliable vnode before forcing 3839 * a new vnode to be created 3840 */ 3841 if (need_reliable_vp == FALSE || vnode_on_reliable_media(vp) == TRUE) 3842 break; 3843 } 3844 3845 // don't iterate more than MAX_WALK_COUNT vnodes to 3846 // avoid keeping the vnode list lock held for too long. 
3847 3848 if (walk_count++ > MAX_WALK_COUNT) { 3849 vp = NULL; 3850 break; 3851 } 3852 } 3853 } 3854 3855 // 3856 // if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT 3857 // then we're trying to create a vnode on behalf of a 3858 // process like diskimages-helper that has file systems 3859 // mounted on top of itself (and thus we can't reclaim 3860 // vnodes in the file systems on top of us). if we can't 3861 // find a vnode to reclaim then we'll just have to force 3862 // the allocation. 3863 // 3864 if (vp == NULL && walk_count >= MAX_WALK_COUNT) { 3865 force_alloc = 1; 3866 vnode_list_unlock(); 3867 goto retry; 3868 } 3869 3870 if (vp == NULL) { 3871 /* 3872 * we've reached the system imposed maximum number of vnodes 3873 * but there isn't a single one available 3874 * wait a bit and then retry... if we can't get a vnode 3875 * after our target number of retries, than log a complaint 3876 */ 3877 if (++retries <= max_retries) { 3878 vnode_list_unlock(); 3879 delay_for_interval(1, 1000 * 1000); 3880 goto retry; 3881 } 3882 3883 vnode_list_unlock(); 3884 tablefull("vnode"); 3885 log(LOG_EMERG, "%d desired, %d numvnodes, " 3886 "%d free, %d dead, %d rage\n", 3887 desiredvnodes, numvnodes, freevnodes, deadvnodes, ragevnodes); 3888#if CONFIG_JETSAM 3889 /* 3890 * Running out of vnodes tends to make a system unusable. Start killing 3891 * processes that jetsam knows are killable. 3892 */ 3893 if (memorystatus_kill_top_proc(TRUE, kMemorystatusFlagsKilledVnodes) < 0) { 3894 /* 3895 * If jetsam can't find any more processes to kill and there 3896 * still aren't any free vnodes, panic. Hopefully we'll get a 3897 * panic log to tell us why we ran out. 3898 */ 3899 panic("vnode table is full\n"); 3900 } 3901 3902 /* 3903 * Now that we've killed someone, wait a bit and continue looking 3904 * (with fewer retries before trying another kill). 
3905 */ 3906 delay_for_interval(3, 1000 * 1000); 3907 retries = 0; 3908 max_retries = 10; 3909 goto retry; 3910#endif 3911 3912 *vpp = NULL; 3913 return (ENFILE); 3914 } 3915steal_this_vp: 3916 if ((vp = process_vp(vp, 1, &deferred)) == NULLVP) { 3917 if (deferred) { 3918 int elapsed_msecs; 3919 struct timeval elapsed_tv; 3920 3921 if (initial_tv.tv_sec == 0) 3922 microuptime(&initial_tv); 3923 3924 vnode_list_lock(); 3925 3926 dead_vnode_waited++; 3927 dead_vnode_wanted++; 3928 3929 /* 3930 * note that we're only going to explicitly wait 10ms 3931 * for a dead vnode to become available, since even if one 3932 * isn't available, a reliable vnode might now be available 3933 * at the head of the VRAGE or free lists... if so, we 3934 * can satisfy the new_vnode request with less latency then waiting 3935 * for the full 100ms duration we're ultimately willing to tolerate 3936 */ 3937 assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), 10000, NSEC_PER_USEC); 3938 3939 vnode_list_unlock(); 3940 3941 thread_block(THREAD_CONTINUE_NULL); 3942 3943 microuptime(&elapsed_tv); 3944 3945 timevalsub(&elapsed_tv, &initial_tv); 3946 elapsed_msecs = elapsed_tv.tv_sec * 1000 + elapsed_tv.tv_usec / 1000; 3947 3948 if (elapsed_msecs >= 100) { 3949 /* 3950 * we've waited long enough... 100ms is 3951 * somewhat arbitrary for this case, but the 3952 * normal worst case latency used for UI 3953 * interaction is 100ms, so I've chosen to 3954 * go with that. 
3955 * 3956 * setting need_reliable_vp to TRUE 3957 * forces us to find a reliable vnode 3958 * that we can process synchronously, or 3959 * to create a new one if the scan for 3960 * a reliable one hits the scan limit 3961 */ 3962 need_reliable_vp = TRUE; 3963 } 3964 } 3965 goto retry; 3966 } 3967 OSAddAtomicLong(1, &num_reusedvnodes); 3968 3969 3970#if CONFIG_VFS_FUNNEL 3971 if (vp->v_unsafefs) { 3972 l_unsafefs = vp->v_unsafefs; 3973 vp->v_unsafefs = (struct unsafe_fsnode *)NULL; 3974 } 3975#endif /* CONFIG_VFS_FUNNEL */ 3976 3977#if CONFIG_MACF 3978 /* 3979 * We should never see VL_LABELWAIT or VL_LABEL here. 3980 * as those operations hold a reference. 3981 */ 3982 assert ((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT); 3983 assert ((vp->v_lflag & VL_LABEL) != VL_LABEL); 3984 if (vp->v_lflag & VL_LABELED) { 3985 vnode_lock_convert(vp); 3986 mac_vnode_label_recycle(vp); 3987 } else if (mac_vnode_label_init_needed(vp)) { 3988 vnode_lock_convert(vp); 3989 mac_vnode_label_init(vp); 3990 } 3991 3992#endif /* MAC */ 3993 3994 vp->v_iocount = 1; 3995 vp->v_lflag = 0; 3996 vp->v_writecount = 0; 3997 vp->v_references = 0; 3998 vp->v_iterblkflags = 0; 3999 vp->v_flag = VSTANDARD; 4000 /* vbad vnodes can point to dead_mountp */ 4001 vp->v_mount = NULL; 4002 vp->v_defer_reclaimlist = (vnode_t)0; 4003 4004 vnode_unlock(vp); 4005 4006#if CONFIG_VFS_FUNNEL 4007 if (l_unsafefs) { 4008 lck_mtx_destroy(&l_unsafefs->fsnodelock, vnode_lck_grp); 4009 FREE_ZONE((void *)l_unsafefs, sizeof(struct unsafe_fsnode), M_UNSAFEFS); 4010 } 4011#endif /* CONFIG_VFS_FUNNEL */ 4012 4013done: 4014 *vpp = vp; 4015 4016 return (0); 4017} 4018 4019void 4020vnode_lock(vnode_t vp) 4021{ 4022 lck_mtx_lock(&vp->v_lock); 4023} 4024 4025void 4026vnode_lock_spin(vnode_t vp) 4027{ 4028 lck_mtx_lock_spin(&vp->v_lock); 4029} 4030 4031void 4032vnode_unlock(vnode_t vp) 4033{ 4034 lck_mtx_unlock(&vp->v_lock); 4035} 4036 4037 4038 4039int 4040vnode_get(struct vnode *vp) 4041{ 4042 int retval; 4043 4044 
vnode_lock_spin(vp); 4045 retval = vnode_get_locked(vp); 4046 vnode_unlock(vp); 4047 4048 return(retval); 4049} 4050 4051int 4052vnode_get_locked(struct vnode *vp) 4053{ 4054#if DIAGNOSTIC 4055 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); 4056#endif 4057 if ((vp->v_iocount == 0) && (vp->v_lflag & (VL_TERMINATE | VL_DEAD))) { 4058 return(ENOENT); 4059 } 4060 vp->v_iocount++; 4061#ifdef JOE_DEBUG 4062 record_vp(vp, 1); 4063#endif 4064 return (0); 4065} 4066 4067/* 4068 * vnode_getwithvid() cuts in line in front of a vnode drain (that is, 4069 * while the vnode is draining, but at no point after that) to prevent 4070 * deadlocks when getting vnodes from filesystem hashes while holding 4071 * resources that may prevent other iocounts from being released. 4072 */ 4073int 4074vnode_getwithvid(vnode_t vp, uint32_t vid) 4075{ 4076 return(vget_internal(vp, vid, ( VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO ))); 4077} 4078 4079/* 4080 * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode 4081 * drain; it exists for use in the VFS name cache, where we really do want to block behind 4082 * vnode drain to prevent holding off an unmount. 
4083 */ 4084int 4085vnode_getwithvid_drainok(vnode_t vp, uint32_t vid) 4086{ 4087 return(vget_internal(vp, vid, ( VNODE_NODEAD | VNODE_WITHID ))); 4088} 4089 4090int 4091vnode_getwithref(vnode_t vp) 4092{ 4093 return(vget_internal(vp, 0, 0)); 4094} 4095 4096 4097__private_extern__ int 4098vnode_getalways(vnode_t vp) 4099{ 4100 return(vget_internal(vp, 0, VNODE_ALWAYS)); 4101} 4102 4103int 4104vnode_put(vnode_t vp) 4105{ 4106 int retval; 4107 4108 vnode_lock_spin(vp); 4109 retval = vnode_put_locked(vp); 4110 vnode_unlock(vp); 4111 4112 return(retval); 4113} 4114 4115int 4116vnode_put_locked(vnode_t vp) 4117{ 4118 vfs_context_t ctx = vfs_context_current(); /* hoist outside loop */ 4119 4120#if DIAGNOSTIC 4121 lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); 4122#endif 4123retry: 4124 if (vp->v_iocount < 1) 4125 panic("vnode_put(%p): iocount < 1", vp); 4126 4127 if ((vp->v_usecount > 0) || (vp->v_iocount > 1)) { 4128 vnode_dropiocount(vp); 4129 return(0); 4130 } 4131 if ((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) { 4132 4133 vp->v_lflag &= ~VL_NEEDINACTIVE; 4134 vnode_unlock(vp); 4135 4136 VNOP_INACTIVE(vp, ctx); 4137 4138 vnode_lock_spin(vp); 4139 /* 4140 * because we had to drop the vnode lock before calling 4141 * VNOP_INACTIVE, the state of this vnode may have changed... 4142 * we may pick up both VL_MARTERM and either 4143 * an iocount or a usecount while in the VNOP_INACTIVE call 4144 * we don't want to call vnode_reclaim_internal on a vnode 4145 * that has active references on it... so loop back around 4146 * and reevaluate the state 4147 */ 4148 goto retry; 4149 } 4150 vp->v_lflag &= ~VL_NEEDINACTIVE; 4151 4152 if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD)) == VL_MARKTERM) { 4153 vnode_lock_convert(vp); 4154 vnode_reclaim_internal(vp, 1, 1, 0); 4155 } 4156 vnode_dropiocount(vp); 4157 vnode_list_add(vp); 4158 4159 return(0); 4160} 4161 4162/* is vnode_t in use by others? 
*/ 4163int 4164vnode_isinuse(vnode_t vp, int refcnt) 4165{ 4166 return(vnode_isinuse_locked(vp, refcnt, 0)); 4167} 4168 4169 4170static int 4171vnode_isinuse_locked(vnode_t vp, int refcnt, int locked) 4172{ 4173 int retval = 0; 4174 4175 if (!locked) 4176 vnode_lock_spin(vp); 4177 if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) { 4178 retval = 1; 4179 goto out; 4180 } 4181 if (vp->v_type == VREG) { 4182 retval = ubc_isinuse_locked(vp, refcnt, 1); 4183 } 4184 4185out: 4186 if (!locked) 4187 vnode_unlock(vp); 4188 return(retval); 4189} 4190 4191 4192/* resume vnode_t */ 4193errno_t 4194vnode_resume(vnode_t vp) 4195{ 4196 if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) { 4197 4198 vnode_lock_spin(vp); 4199 vp->v_lflag &= ~VL_SUSPENDED; 4200 vp->v_owner = NULL; 4201 vnode_unlock(vp); 4202 4203 wakeup(&vp->v_iocount); 4204 } 4205 return(0); 4206} 4207 4208/* suspend vnode_t 4209 * Please do not use on more than one vnode at a time as it may 4210 * cause deadlocks. 4211 * xxx should we explicity prevent this from happening? 4212 */ 4213 4214errno_t 4215vnode_suspend(vnode_t vp) 4216{ 4217 if (vp->v_lflag & VL_SUSPENDED) { 4218 return(EBUSY); 4219 } 4220 4221 vnode_lock_spin(vp); 4222 4223 /* 4224 * xxx is this sufficient to check if a vnode_drain is 4225 * progress? 4226 */ 4227 4228 if (vp->v_owner == NULL) { 4229 vp->v_lflag |= VL_SUSPENDED; 4230 vp->v_owner = current_thread(); 4231 } 4232 vnode_unlock(vp); 4233 4234 return(0); 4235} 4236 4237/* 4238 * Release any blocked locking requests on the vnode. 4239 * Used for forced-unmounts. 4240 * 4241 * XXX What about network filesystems? 
4242 */ 4243static void 4244vnode_abort_advlocks(vnode_t vp) 4245{ 4246 if (vp->v_flag & VLOCKLOCAL) 4247 lf_abort_advlocks(vp); 4248} 4249 4250 4251static errno_t 4252vnode_drain(vnode_t vp) 4253{ 4254 4255 if (vp->v_lflag & VL_DRAIN) { 4256 panic("vnode_drain: recursive drain"); 4257 return(ENOENT); 4258 } 4259 vp->v_lflag |= VL_DRAIN; 4260 vp->v_owner = current_thread(); 4261 4262 while (vp->v_iocount > 1) 4263 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL); 4264 4265 vp->v_lflag &= ~VL_DRAIN; 4266 4267 return(0); 4268} 4269 4270 4271/* 4272 * if the number of recent references via vnode_getwithvid or vnode_getwithref 4273 * exceeds this threshold, than 'UN-AGE' the vnode by removing it from 4274 * the LRU list if it's currently on it... once the iocount and usecount both drop 4275 * to 0, it will get put back on the end of the list, effectively making it younger 4276 * this allows us to keep actively referenced vnodes in the list without having 4277 * to constantly remove and add to the list each time a vnode w/o a usecount is 4278 * referenced which costs us taking and dropping a global lock twice. 
4279 */ 4280#define UNAGE_THRESHHOLD 25 4281 4282errno_t 4283vnode_getiocount(vnode_t vp, unsigned int vid, int vflags) 4284{ 4285 int nodead = vflags & VNODE_NODEAD; 4286 int nosusp = vflags & VNODE_NOSUSPEND; 4287 int always = vflags & VNODE_ALWAYS; 4288 int beatdrain = vflags & VNODE_DRAINO; 4289 4290 for (;;) { 4291 /* 4292 * if it is a dead vnode with deadfs 4293 */ 4294 if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) || (vp->v_data == 0))) { 4295 return(ENOENT); 4296 } 4297 /* 4298 * will return VL_DEAD ones 4299 */ 4300 if ((vp->v_lflag & (VL_SUSPENDED | VL_DRAIN | VL_TERMINATE)) == 0 ) { 4301 break; 4302 } 4303 /* 4304 * if suspended vnodes are to be failed 4305 */ 4306 if (nosusp && (vp->v_lflag & VL_SUSPENDED)) { 4307 return(ENOENT); 4308 } 4309 /* 4310 * if you are the owner of drain/suspend/termination , can acquire iocount 4311 * check for VL_TERMINATE; it does not set owner 4312 */ 4313 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED | VL_TERMINATE)) && 4314 (vp->v_owner == current_thread())) { 4315 break; 4316 } 4317 4318 if (always != 0) 4319 break; 4320 4321 /* 4322 * In some situations, we want to get an iocount 4323 * even if the vnode is draining to prevent deadlock, 4324 * e.g. if we're in the filesystem, potentially holding 4325 * resources that could prevent other iocounts from 4326 * being released. 
4327 */ 4328 if (beatdrain && (vp->v_lflag & VL_DRAIN)) { 4329 break; 4330 } 4331 4332 vnode_lock_convert(vp); 4333 4334 if (vp->v_lflag & VL_TERMINATE) { 4335 vp->v_lflag |= VL_TERMWANT; 4336 4337 msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vnode getiocount", NULL); 4338 } else 4339 msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL); 4340 } 4341 if (((vflags & VNODE_WITHID) != 0) && vid != vp->v_id) { 4342 return(ENOENT); 4343 } 4344 if (++vp->v_references >= UNAGE_THRESHHOLD) { 4345 vp->v_references = 0; 4346 vnode_list_remove(vp); 4347 } 4348 vp->v_iocount++; 4349#ifdef JOE_DEBUG 4350 record_vp(vp, 1); 4351#endif 4352 return(0); 4353} 4354 4355static void 4356vnode_dropiocount (vnode_t vp) 4357{ 4358 if (vp->v_iocount < 1) 4359 panic("vnode_dropiocount(%p): v_iocount < 1", vp); 4360 4361 vp->v_iocount--; 4362#ifdef JOE_DEBUG 4363 record_vp(vp, -1); 4364#endif 4365 if ((vp->v_lflag & (VL_DRAIN | VL_SUSPENDED)) && (vp->v_iocount <= 1)) 4366 wakeup(&vp->v_iocount); 4367} 4368 4369 4370void 4371vnode_reclaim(struct vnode * vp) 4372{ 4373 vnode_reclaim_internal(vp, 0, 0, 0); 4374} 4375 4376__private_extern__ 4377void 4378vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags) 4379{ 4380 int isfifo = 0; 4381 4382 if (!locked) 4383 vnode_lock(vp); 4384 4385 if (vp->v_lflag & VL_TERMINATE) { 4386 panic("vnode reclaim in progress"); 4387 } 4388 vp->v_lflag |= VL_TERMINATE; 4389 4390 vn_clearunionwait(vp, 1); 4391 4392 vnode_drain(vp); 4393 4394 isfifo = (vp->v_type == VFIFO); 4395 4396 if (vp->v_type != VBAD) 4397 vgone(vp, flags); /* clean and reclaim the vnode */ 4398 4399 /* 4400 * give the vnode a new identity so that vnode_getwithvid will fail 4401 * on any stale cache accesses... 4402 * grab the list_lock so that if we're in "new_vnode" 4403 * behind the list_lock trying to steal this vnode, the v_id is stable... 4404 * once new_vnode drops the list_lock, it will block trying to take 4405 * the vnode lock until we release it... 
at that point it will evaluate 4406 * whether the v_vid has changed 4407 * also need to make sure that the vnode isn't on a list where "new_vnode" 4408 * can find it after the v_id has been bumped until we are completely done 4409 * with the vnode (i.e. putting it back on a list has to be the very last 4410 * thing we do to this vnode... many of the callers of vnode_reclaim_internal 4411 * are holding an io_count on the vnode... they need to drop the io_count 4412 * BEFORE doing a vnode_list_add or make sure to hold the vnode lock until 4413 * they are completely done with the vnode 4414 */ 4415 vnode_list_lock(); 4416 4417 vnode_list_remove_locked(vp); 4418 vp->v_id++; 4419 4420 vnode_list_unlock(); 4421 4422 if (isfifo) { 4423 struct fifoinfo * fip; 4424 4425 fip = vp->v_fifoinfo; 4426 vp->v_fifoinfo = NULL; 4427 FREE(fip, M_TEMP); 4428 } 4429 vp->v_type = VBAD; 4430 4431 if (vp->v_data) 4432 panic("vnode_reclaim_internal: cleaned vnode isn't"); 4433 if (vp->v_numoutput) 4434 panic("vnode_reclaim_internal: clean vnode has pending I/O's"); 4435 if (UBCINFOEXISTS(vp)) 4436 panic("vnode_reclaim_internal: ubcinfo not cleaned"); 4437 if (vp->v_parent) 4438 panic("vnode_reclaim_internal: vparent not removed"); 4439 if (vp->v_name) 4440 panic("vnode_reclaim_internal: vname not removed"); 4441 4442 vp->v_socket = NULL; 4443 4444 vp->v_lflag &= ~VL_TERMINATE; 4445 vp->v_owner = NULL; 4446 4447 KNOTE(&vp->v_knotes, NOTE_REVOKE); 4448 4449 /* Make sure that when we reuse the vnode, no knotes left over */ 4450 klist_init(&vp->v_knotes); 4451 4452 if (vp->v_lflag & VL_TERMWANT) { 4453 vp->v_lflag &= ~VL_TERMWANT; 4454 wakeup(&vp->v_lflag); 4455 } 4456 if (!reuse) { 4457 /* 4458 * make sure we get on the 4459 * dead list if appropriate 4460 */ 4461 vnode_list_add(vp); 4462 } 4463 if (!locked) 4464 vnode_unlock(vp); 4465} 4466 4467/* USAGE: 4468 * The following api creates a vnode and associates all the parameter specified in vnode_fsparam 4469 * structure and returns a vnode 
handle with a reference. device aliasing is handled here so checkalias 4470 * is obsoleted by this. 4471 */ 4472int 4473vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp) 4474{ 4475 int error; 4476 int insert = 1; 4477 vnode_t vp; 4478 vnode_t nvp; 4479 vnode_t dvp; 4480 struct uthread *ut; 4481 struct componentname *cnp; 4482 struct vnode_fsparam *param = (struct vnode_fsparam *)data; 4483#if CONFIG_TRIGGERS 4484 struct vnode_trigger_param *tinfo = NULL; 4485#endif 4486 if (param == NULL) 4487 return (EINVAL); 4488 4489 /* Do quick sanity check on the parameters */ 4490 if (param->vnfs_vtype == VBAD) { 4491 return (EINVAL); 4492 } 4493 4494#if CONFIG_TRIGGERS 4495 if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) { 4496 tinfo = (struct vnode_trigger_param *)data; 4497 4498 /* Validate trigger vnode input */ 4499 if ((param->vnfs_vtype != VDIR) || 4500 (tinfo->vnt_resolve_func == NULL) || 4501 (tinfo->vnt_flags & ~VNT_VALID_MASK)) { 4502 return (EINVAL); 4503 } 4504 /* Fall through a normal create (params will be the same) */ 4505 flavor = VNCREATE_FLAVOR; 4506 size = VCREATESIZE; 4507 } 4508#endif 4509 if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE)) 4510 return (EINVAL); 4511 4512 if ( (error = new_vnode(&vp)) ) 4513 return(error); 4514 4515 dvp = param->vnfs_dvp; 4516 cnp = param->vnfs_cnp; 4517 4518 vp->v_op = param->vnfs_vops; 4519 vp->v_type = param->vnfs_vtype; 4520 vp->v_data = param->vnfs_fsnode; 4521 4522 if (param->vnfs_markroot) 4523 vp->v_flag |= VROOT; 4524 if (param->vnfs_marksystem) 4525 vp->v_flag |= VSYSTEM; 4526 if (vp->v_type == VREG) { 4527 error = ubc_info_init_withsize(vp, param->vnfs_filesize); 4528 if (error) { 4529#ifdef JOE_DEBUG 4530 record_vp(vp, 1); 4531#endif 4532 vp->v_mount = NULL; 4533 vp->v_op = dead_vnodeop_p; 4534 vp->v_tag = VT_NON; 4535 vp->v_data = NULL; 4536 vp->v_type = VBAD; 4537 vp->v_lflag |= VL_DEAD; 4538 4539 vnode_put(vp); 4540 return(error); 4541 } 4542 } 4543#ifdef 
JOE_DEBUG 4544 record_vp(vp, 1); 4545#endif 4546 4547#if CONFIG_TRIGGERS 4548 /* 4549 * For trigger vnodes, attach trigger info to vnode 4550 */ 4551 if ((vp->v_type == VDIR) && (tinfo != NULL)) { 4552 /* 4553 * Note: has a side effect of incrementing trigger count on the 4554 * mount if successful, which we would need to undo on a 4555 * subsequent failure. 4556 */ 4557#ifdef JOE_DEBUG 4558 record_vp(vp, -1); 4559#endif 4560 error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE); 4561 if (error) { 4562 printf("vnode_create: vnode_resolver_create() err %d\n", error); 4563 vp->v_mount = NULL; 4564 vp->v_op = dead_vnodeop_p; 4565 vp->v_tag = VT_NON; 4566 vp->v_data = NULL; 4567 vp->v_type = VBAD; 4568 vp->v_lflag |= VL_DEAD; 4569#ifdef JOE_DEBUG 4570 record_vp(vp, 1); 4571#endif 4572 vnode_put(vp); 4573 return (error); 4574 } 4575 } 4576#endif 4577 if (vp->v_type == VCHR || vp->v_type == VBLK) { 4578 4579 vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */ 4580 4581 if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) { 4582 /* 4583 * if checkalias returns a vnode, it will be locked 4584 * 4585 * first get rid of the unneeded vnode we acquired 4586 */ 4587 vp->v_data = NULL; 4588 vp->v_op = spec_vnodeop_p; 4589 vp->v_type = VBAD; 4590 vp->v_lflag = VL_DEAD; 4591 vp->v_data = NULL; 4592 vp->v_tag = VT_NON; 4593 vnode_put(vp); 4594 4595 /* 4596 * switch to aliased vnode and finish 4597 * preparing it 4598 */ 4599 vp = nvp; 4600 4601 vclean(vp, 0); 4602 vp->v_op = param->vnfs_vops; 4603 vp->v_type = param->vnfs_vtype; 4604 vp->v_data = param->vnfs_fsnode; 4605 vp->v_lflag = 0; 4606 vp->v_mount = NULL; 4607 insmntque(vp, param->vnfs_mp); 4608 insert = 0; 4609 vnode_unlock(vp); 4610 } 4611 4612 if (VCHR == vp->v_type) { 4613 u_int maj = major(vp->v_rdev); 4614 4615 if (maj < (u_int)nchrdev && 4616 (D_TYPEMASK & cdevsw[maj].d_type) == D_TTY) 4617 vp->v_flag |= VISTTY; 4618 } 4619 } 4620 4621 if (vp->v_type == VFIFO) { 4622 struct fifoinfo *fip; 4623 4624 
MALLOC(fip, struct fifoinfo *, 4625 sizeof(*fip), M_TEMP, M_WAITOK); 4626 bzero(fip, sizeof(struct fifoinfo )); 4627 vp->v_fifoinfo = fip; 4628 } 4629 /* The file systems must pass the address of the location where 4630 * they store the vnode pointer. When we add the vnode into the mount 4631 * list and name cache they become discoverable. So the file system node 4632 * must have the connection to vnode setup by then 4633 */ 4634 *vpp = vp; 4635 4636 /* Add fs named reference. */ 4637 if (param->vnfs_flags & VNFS_ADDFSREF) { 4638 vp->v_lflag |= VNAMED_FSHASH; 4639 } 4640 if (param->vnfs_mp) { 4641 if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) 4642 vp->v_flag |= VLOCKLOCAL; 4643 if (insert) { 4644 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) 4645 panic("insmntque: vp on the free list\n"); 4646 4647 /* 4648 * enter in mount vnode list 4649 */ 4650 insmntque(vp, param->vnfs_mp); 4651 } 4652#if CONFIG_VFS_FUNNEL 4653 if ((param->vnfs_mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE) == 0) { 4654 MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *, 4655 sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK); 4656 vp->v_unsafefs->fsnode_count = 0; 4657 vp->v_unsafefs->fsnodeowner = (void *)NULL; 4658 lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr); 4659 } 4660#endif /* CONFIG_VFS_FUNNEL */ 4661 } 4662 if (dvp && vnode_ref(dvp) == 0) { 4663 vp->v_parent = dvp; 4664 } 4665 if (cnp) { 4666 if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) { 4667 /* 4668 * enter into name cache 4669 * we've got the info to enter it into the name cache now 4670 * cache_enter_create will pick up an extra reference on 4671 * the name entered into the string cache 4672 */ 4673 vp->v_name = cache_enter_create(dvp, vp, cnp); 4674 } else 4675 vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); 4676 4677 if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) 4678 vp->v_flag |= VISUNION; 4679 } 4680 if 
((param->vnfs_flags & VNFS_CANTCACHE) == 0) { 4681 /* 4682 * this vnode is being created as cacheable in the name cache 4683 * this allows us to re-enter it in the cache 4684 */ 4685 vp->v_flag |= VNCACHEABLE; 4686 } 4687 ut = get_bsdthread_info(current_thread()); 4688 4689 if ((current_proc()->p_lflag & P_LRAGE_VNODES) || 4690 (ut->uu_flag & UT_RAGE_VNODES)) { 4691 /* 4692 * process has indicated that it wants any 4693 * vnodes created on its behalf to be rapidly 4694 * aged to reduce the impact on the cached set 4695 * of vnodes 4696 */ 4697 vp->v_flag |= VRAGE; 4698 } 4699 return (0); 4700} 4701 4702int 4703vnode_addfsref(vnode_t vp) 4704{ 4705 vnode_lock_spin(vp); 4706 if (vp->v_lflag & VNAMED_FSHASH) 4707 panic("add_fsref: vp already has named reference"); 4708 if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) 4709 panic("addfsref: vp on the free list\n"); 4710 vp->v_lflag |= VNAMED_FSHASH; 4711 vnode_unlock(vp); 4712 return(0); 4713 4714} 4715int 4716vnode_removefsref(vnode_t vp) 4717{ 4718 vnode_lock_spin(vp); 4719 if ((vp->v_lflag & VNAMED_FSHASH) == 0) 4720 panic("remove_fsref: no named reference"); 4721 vp->v_lflag &= ~VNAMED_FSHASH; 4722 vnode_unlock(vp); 4723 return(0); 4724 4725} 4726 4727 4728int 4729vfs_iterate(int flags, int (*callout)(mount_t, void *), void *arg) 4730{ 4731 mount_t mp; 4732 int ret = 0; 4733 fsid_t * fsid_list; 4734 int count, actualcount, i; 4735 void * allocmem; 4736 int indx_start, indx_stop, indx_incr; 4737 4738 count = mount_getvfscnt(); 4739 count += 10; 4740 4741 fsid_list = (fsid_t *)kalloc(count * sizeof(fsid_t)); 4742 allocmem = (void *)fsid_list; 4743 4744 actualcount = mount_fillfsids(fsid_list, count); 4745 4746 /* 4747 * Establish the iteration direction 4748 * VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first) 4749 */ 4750 if (flags & VFS_ITERATE_TAIL_FIRST) { 4751 indx_start = actualcount - 1; 4752 indx_stop = -1; 4753 indx_incr = -1; 4754 } else /* Head first by default */ { 4755 
indx_start = 0; 4756 indx_stop = actualcount; 4757 indx_incr = 1; 4758 } 4759 4760 for (i=indx_start; i != indx_stop; i += indx_incr) { 4761 4762 /* obtain the mount point with iteration reference */ 4763 mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1); 4764 4765 if(mp == (struct mount *)0) 4766 continue; 4767 mount_lock(mp); 4768 if (mp->mnt_lflag & (MNT_LDEAD | MNT_LUNMOUNT)) { 4769 mount_unlock(mp); 4770 mount_iterdrop(mp); 4771 continue; 4772 4773 } 4774 mount_unlock(mp); 4775 4776 /* iterate over all the vnodes */ 4777 ret = callout(mp, arg); 4778 4779 mount_iterdrop(mp); 4780 4781 switch (ret) { 4782 case VFS_RETURNED: 4783 case VFS_RETURNED_DONE: 4784 if (ret == VFS_RETURNED_DONE) { 4785 ret = 0; 4786 goto out; 4787 } 4788 break; 4789 4790 case VFS_CLAIMED_DONE: 4791 ret = 0; 4792 goto out; 4793 case VFS_CLAIMED: 4794 default: 4795 break; 4796 } 4797 ret = 0; 4798 } 4799 4800out: 4801 kfree(allocmem, (count * sizeof(fsid_t))); 4802 return (ret); 4803} 4804 4805/* 4806 * Update the vfsstatfs structure in the mountpoint. 4807 * MAC: Parameter eventtype added, indicating whether the event that 4808 * triggered this update came from user space, via a system call 4809 * (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT). 4810 */ 4811int 4812vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype) 4813{ 4814 struct vfs_attr va; 4815 int error; 4816 4817 /* 4818 * Request the attributes we want to propagate into 4819 * the per-mount vfsstat structure. 
4820 */ 4821 VFSATTR_INIT(&va); 4822 VFSATTR_WANTED(&va, f_iosize); 4823 VFSATTR_WANTED(&va, f_blocks); 4824 VFSATTR_WANTED(&va, f_bfree); 4825 VFSATTR_WANTED(&va, f_bavail); 4826 VFSATTR_WANTED(&va, f_bused); 4827 VFSATTR_WANTED(&va, f_files); 4828 VFSATTR_WANTED(&va, f_ffree); 4829 VFSATTR_WANTED(&va, f_bsize); 4830 VFSATTR_WANTED(&va, f_fssubtype); 4831#if CONFIG_MACF 4832 if (eventtype == VFS_USER_EVENT) { 4833 error = mac_mount_check_getattr(ctx, mp, &va); 4834 if (error != 0) 4835 return (error); 4836 } 4837#endif 4838 4839 if ((error = vfs_getattr(mp, &va, ctx)) != 0) { 4840 KAUTH_DEBUG("STAT - filesystem returned error %d", error); 4841 return(error); 4842 } 4843 4844 /* 4845 * Unpack into the per-mount structure. 4846 * 4847 * We only overwrite these fields, which are likely to change: 4848 * f_blocks 4849 * f_bfree 4850 * f_bavail 4851 * f_bused 4852 * f_files 4853 * f_ffree 4854 * 4855 * And these which are not, but which the FS has no other way 4856 * of providing to us: 4857 * f_bsize 4858 * f_iosize 4859 * f_fssubtype 4860 * 4861 */ 4862 if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) { 4863 /* 4822056 - protect against malformed server mount */ 4864 mp->mnt_vfsstat.f_bsize = (va.f_bsize > 0 ? 
va.f_bsize : 512); 4865 } else { 4866 mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; /* default from the device block size */ 4867 } 4868 if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) { 4869 mp->mnt_vfsstat.f_iosize = va.f_iosize; 4870 } else { 4871 mp->mnt_vfsstat.f_iosize = 1024 * 1024; /* 1MB sensible I/O size */ 4872 } 4873 if (VFSATTR_IS_SUPPORTED(&va, f_blocks)) 4874 mp->mnt_vfsstat.f_blocks = va.f_blocks; 4875 if (VFSATTR_IS_SUPPORTED(&va, f_bfree)) 4876 mp->mnt_vfsstat.f_bfree = va.f_bfree; 4877 if (VFSATTR_IS_SUPPORTED(&va, f_bavail)) 4878 mp->mnt_vfsstat.f_bavail = va.f_bavail; 4879 if (VFSATTR_IS_SUPPORTED(&va, f_bused)) 4880 mp->mnt_vfsstat.f_bused = va.f_bused; 4881 if (VFSATTR_IS_SUPPORTED(&va, f_files)) 4882 mp->mnt_vfsstat.f_files = va.f_files; 4883 if (VFSATTR_IS_SUPPORTED(&va, f_ffree)) 4884 mp->mnt_vfsstat.f_ffree = va.f_ffree; 4885 4886 /* this is unlikely to change, but has to be queried for */ 4887 if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype)) 4888 mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype; 4889 4890 return(0); 4891} 4892 4893int 4894mount_list_add(mount_t mp) 4895{ 4896 int res; 4897 4898 mount_list_lock(); 4899 if (system_inshutdown != 0) { 4900 res = -1; 4901 } else { 4902 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); 4903 nummounts++; 4904 res = 0; 4905 } 4906 mount_list_unlock(); 4907 4908 return res; 4909} 4910 4911void 4912mount_list_remove(mount_t mp) 4913{ 4914 mount_list_lock(); 4915 TAILQ_REMOVE(&mountlist, mp, mnt_list); 4916 nummounts--; 4917 mp->mnt_list.tqe_next = NULL; 4918 mp->mnt_list.tqe_prev = NULL; 4919 mount_list_unlock(); 4920} 4921 4922mount_t 4923mount_lookupby_volfsid(int volfs_id, int withref) 4924{ 4925 mount_t cur_mount = (mount_t)0; 4926 mount_t mp; 4927 4928 mount_list_lock(); 4929 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4930 if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) && 4931 (mp->mnt_kern_flag & MNTK_PATH_FROM_ID) && 4932 (mp->mnt_vfsstat.f_fsid.val[0] == volfs_id)) { 4933 cur_mount = mp; 4934 if (withref) { 4935 if 
(mount_iterref(cur_mount, 1)) { 4936 cur_mount = (mount_t)0; 4937 mount_list_unlock(); 4938 goto out; 4939 } 4940 } 4941 break; 4942 } 4943 } 4944 mount_list_unlock(); 4945 if (withref && (cur_mount != (mount_t)0)) { 4946 mp = cur_mount; 4947 if (vfs_busy(mp, LK_NOWAIT) != 0) { 4948 cur_mount = (mount_t)0; 4949 } 4950 mount_iterdrop(mp); 4951 } 4952out: 4953 return(cur_mount); 4954} 4955 4956mount_t 4957mount_list_lookupby_fsid(fsid_t *fsid, int locked, int withref) 4958{ 4959 mount_t retmp = (mount_t)0; 4960 mount_t mp; 4961 4962 if (!locked) 4963 mount_list_lock(); 4964 TAILQ_FOREACH(mp, &mountlist, mnt_list) 4965 if (mp->mnt_vfsstat.f_fsid.val[0] == fsid->val[0] && 4966 mp->mnt_vfsstat.f_fsid.val[1] == fsid->val[1]) { 4967 retmp = mp; 4968 if (withref) { 4969 if (mount_iterref(retmp, 1)) 4970 retmp = (mount_t)0; 4971 } 4972 goto out; 4973 } 4974out: 4975 if (!locked) 4976 mount_list_unlock(); 4977 return (retmp); 4978} 4979 4980errno_t 4981vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx) 4982{ 4983 struct nameidata nd; 4984 int error; 4985 u_int32_t ndflags = 0; 4986 4987 if (ctx == NULL) { /* XXX technically an error */ 4988 ctx = vfs_context_current(); 4989 } 4990 4991 if (flags & VNODE_LOOKUP_NOFOLLOW) 4992 ndflags = NOFOLLOW; 4993 else 4994 ndflags = FOLLOW; 4995 4996 if (flags & VNODE_LOOKUP_NOCROSSMOUNT) 4997 ndflags |= NOCROSSMOUNT; 4998 if (flags & VNODE_LOOKUP_DOWHITEOUT) 4999 ndflags |= DOWHITEOUT; 5000 5001 /* XXX AUDITVNPATH1 needed ? 
*/ 5002 NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE, 5003 CAST_USER_ADDR_T(path), ctx); 5004 5005 if ((error = namei(&nd))) 5006 return (error); 5007 *vpp = nd.ni_vp; 5008 nameidone(&nd); 5009 5010 return (0); 5011} 5012 5013errno_t 5014vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx) 5015{ 5016 struct nameidata nd; 5017 int error; 5018 u_int32_t ndflags = 0; 5019 int lflags = flags; 5020 5021 if (ctx == NULL) { /* XXX technically an error */ 5022 ctx = vfs_context_current(); 5023 } 5024 5025 if (fmode & O_NOFOLLOW) 5026 lflags |= VNODE_LOOKUP_NOFOLLOW; 5027 5028 if (lflags & VNODE_LOOKUP_NOFOLLOW) 5029 ndflags = NOFOLLOW; 5030 else 5031 ndflags = FOLLOW; 5032 5033 if (lflags & VNODE_LOOKUP_NOCROSSMOUNT) 5034 ndflags |= NOCROSSMOUNT; 5035 if (lflags & VNODE_LOOKUP_DOWHITEOUT) 5036 ndflags |= DOWHITEOUT; 5037 5038 /* XXX AUDITVNPATH1 needed ? */ 5039 NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE, 5040 CAST_USER_ADDR_T(path), ctx); 5041 5042 if ((error = vn_open(&nd, fmode, cmode))) 5043 *vpp = NULL; 5044 else 5045 *vpp = nd.ni_vp; 5046 5047 return (error); 5048} 5049 5050errno_t 5051vnode_close(vnode_t vp, int flags, vfs_context_t ctx) 5052{ 5053 int error; 5054 5055 if (ctx == NULL) { 5056 ctx = vfs_context_current(); 5057 } 5058 5059 error = vn_close(vp, flags, ctx); 5060 vnode_put(vp); 5061 return (error); 5062} 5063 5064/* 5065 * Returns: 0 Success 5066 * vnode_getattr:??? 
5067 */ 5068errno_t 5069vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx) 5070{ 5071 struct vnode_attr va; 5072 int error; 5073 5074 VATTR_INIT(&va); 5075 VATTR_WANTED(&va, va_data_size); 5076 error = vnode_getattr(vp, &va, ctx); 5077 if (!error) 5078 *sizep = va.va_data_size; 5079 return(error); 5080} 5081 5082errno_t 5083vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx) 5084{ 5085 struct vnode_attr va; 5086 5087 VATTR_INIT(&va); 5088 VATTR_SET(&va, va_data_size, size); 5089 va.va_vaflags = ioflag & 0xffff; 5090 return(vnode_setattr(vp, &va, ctx)); 5091} 5092 5093static int 5094vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) 5095{ 5096 /* Only use compound VNOP for compound operation */ 5097 if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) { 5098 *vpp = NULLVP; 5099 return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, VNOP_COMPOUND_OPEN_DO_CREATE, fmode, statusp, vap, ctx); 5100 } else { 5101 return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx); 5102 } 5103} 5104 5105/* 5106 * Create a filesystem object of arbitrary type with arbitrary attributes in 5107 * the spevied directory with the specified name. 5108 * 5109 * Parameters: dvp Pointer to the vnode of the directory 5110 * in which to create the object. 5111 * vpp Pointer to the area into which to 5112 * return the vnode of the created object. 5113 * cnp Component name pointer from the namei 5114 * data structure, containing the name to 5115 * use for the create object. 5116 * vap Pointer to the vnode_attr structure 5117 * describing the object to be created, 5118 * including the type of object. 5119 * flags VN_* flags controlling ACL inheritance 5120 * and whether or not authorization is to 5121 * be required for the operation. 
5122 * 5123 * Returns: 0 Success 5124 * !0 errno value 5125 * 5126 * Implicit: *vpp Contains the vnode of the object that 5127 * was created, if successful. 5128 * *cnp May be modified by the underlying VFS. 5129 * *vap May be modified by the underlying VFS. 5130 * modified by either ACL inheritance or 5131 * 5132 * 5133 * be modified, even if the operation is 5134 * 5135 * 5136 * Notes: The kauth_filesec_t in 'vap', if any, is in host byte order. 5137 * 5138 * Modification of '*cnp' and '*vap' by the underlying VFS is 5139 * strongly discouraged. 5140 * 5141 * XXX: This function is a 'vn_*' function; it belongs in vfs_vnops.c 5142 * 5143 * XXX: We should enummerate the possible errno values here, and where 5144 * in the code they originated. 5145 */ 5146errno_t 5147vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) 5148{ 5149 errno_t error, old_error; 5150 vnode_t vp = (vnode_t)0; 5151 boolean_t batched; 5152 struct componentname *cnp; 5153 uint32_t defaulted; 5154 5155 cnp = &ndp->ni_cnd; 5156 error = 0; 5157 batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE; 5158 5159 KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr); 5160 5161 if (flags & VN_CREATE_NOINHERIT) 5162 vap->va_vaflags |= VA_NOINHERIT; 5163 if (flags & VN_CREATE_NOAUTH) 5164 vap->va_vaflags |= VA_NOAUTH; 5165 /* 5166 * Handle ACL inheritance, initialize vap. 5167 */ 5168 error = vn_attribute_prepare(dvp, vap, &defaulted, ctx); 5169 if (error) { 5170 return error; 5171 } 5172 5173 if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) { 5174 panic("Open parameters, but not a regular file."); 5175 } 5176 if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) { 5177 panic("Mode for open, but not trying to open..."); 5178 } 5179 5180 /* 5181 * Create the requested node. 
5182 */ 5183 switch(vap->va_type) { 5184 case VREG: 5185 error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx); 5186 break; 5187 case VDIR: 5188 error = vn_mkdir(dvp, vpp, ndp, vap, ctx); 5189 break; 5190 case VSOCK: 5191 case VFIFO: 5192 case VBLK: 5193 case VCHR: 5194 error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx); 5195 break; 5196 default: 5197 panic("vnode_create: unknown vtype %d", vap->va_type); 5198 } 5199 if (error != 0) { 5200 KAUTH_DEBUG("%p CREATE - error %d returned by filesystem", dvp, error); 5201 goto out; 5202 } 5203 5204 vp = *vpp; 5205 old_error = error; 5206 5207#if CONFIG_MACF 5208 if (!(flags & VN_CREATE_NOLABEL)) { 5209 error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx); 5210 if (error) 5211 goto error; 5212 } 5213#endif 5214 5215 /* 5216 * If some of the requested attributes weren't handled by the VNOP, 5217 * use our fallback code. 5218 */ 5219 if (!VATTR_ALL_SUPPORTED(vap) && *vpp) { 5220 KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl); 5221 error = vnode_setattr_fallback(*vpp, vap, ctx); 5222 } 5223#if CONFIG_MACF 5224error: 5225#endif 5226 if ((error != 0) && (vp != (vnode_t)0)) { 5227 5228 /* If we've done a compound open, close */ 5229 if (batched && (old_error == 0) && (vap->va_type == VREG)) { 5230 VNOP_CLOSE(vp, fmode, ctx); 5231 } 5232 5233 /* Need to provide notifications if a create succeeded */ 5234 if (!batched) { 5235 *vpp = (vnode_t) 0; 5236 vnode_put(vp); 5237 } 5238 } 5239 5240out: 5241 vn_attribute_cleanup(vap, defaulted); 5242 5243 return(error); 5244} 5245 5246static kauth_scope_t vnode_scope; 5247static int vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action, 5248 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3); 5249static int vnode_authorize_callback_int(__unused kauth_cred_t credential, __unused void *idata, kauth_action_t action, 5250 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3); 5251 5252typedef 
struct _vnode_authorize_context { 5253 vnode_t vp; 5254 struct vnode_attr *vap; 5255 vnode_t dvp; 5256 struct vnode_attr *dvap; 5257 vfs_context_t ctx; 5258 int flags; 5259 int flags_valid; 5260#define _VAC_IS_OWNER (1<<0) 5261#define _VAC_IN_GROUP (1<<1) 5262#define _VAC_IS_DIR_OWNER (1<<2) 5263#define _VAC_IN_DIR_GROUP (1<<3) 5264} *vauth_ctx; 5265 5266void 5267vnode_authorize_init(void) 5268{ 5269 vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL); 5270} 5271 5272#define VATTR_PREPARE_DEFAULTED_UID 0x1 5273#define VATTR_PREPARE_DEFAULTED_GID 0x2 5274#define VATTR_PREPARE_DEFAULTED_MODE 0x4 5275 5276int 5277vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx) 5278{ 5279 kauth_acl_t nacl = NULL, oacl = NULL; 5280 int error; 5281 5282 /* 5283 * Handle ACL inheritance. 5284 */ 5285 if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) { 5286 /* save the original filesec */ 5287 if (VATTR_IS_ACTIVE(vap, va_acl)) { 5288 oacl = vap->va_acl; 5289 } 5290 5291 vap->va_acl = NULL; 5292 if ((error = kauth_acl_inherit(dvp, 5293 oacl, 5294 &nacl, 5295 vap->va_type == VDIR, 5296 ctx)) != 0) { 5297 KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error); 5298 return(error); 5299 } 5300 5301 /* 5302 * If the generated ACL is NULL, then we can save ourselves some effort 5303 * by clearing the active bit. 
5304 */ 5305 if (nacl == NULL) { 5306 VATTR_CLEAR_ACTIVE(vap, va_acl); 5307 } else { 5308 vap->va_base_acl = oacl; 5309 VATTR_SET(vap, va_acl, nacl); 5310 } 5311 } 5312 5313 error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx); 5314 if (error) { 5315 vn_attribute_cleanup(vap, *defaulted_fieldsp); 5316 } 5317 5318 return error; 5319} 5320 5321void 5322vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields) 5323{ 5324 /* 5325 * If the caller supplied a filesec in vap, it has been replaced 5326 * now by the post-inheritance copy. We need to put the original back 5327 * and free the inherited product. 5328 */ 5329 kauth_acl_t nacl, oacl; 5330 5331 if (VATTR_IS_ACTIVE(vap, va_acl)) { 5332 nacl = vap->va_acl; 5333 oacl = vap->va_base_acl; 5334 5335 if (oacl) { 5336 VATTR_SET(vap, va_acl, oacl); 5337 vap->va_base_acl = NULL; 5338 } else { 5339 VATTR_CLEAR_ACTIVE(vap, va_acl); 5340 } 5341 5342 if (nacl != NULL) { 5343 kauth_acl_free(nacl); 5344 } 5345 } 5346 5347 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) { 5348 VATTR_CLEAR_ACTIVE(vap, va_mode); 5349 } 5350 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) { 5351 VATTR_CLEAR_ACTIVE(vap, va_gid); 5352 } 5353 if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) { 5354 VATTR_CLEAR_ACTIVE(vap, va_uid); 5355 } 5356 5357 return; 5358} 5359 5360int 5361vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved) 5362{ 5363 int error = 0; 5364 5365 /* 5366 * Normally, unlinking of directories is not supported. 5367 * However, some file systems may have limited support. 
5368 */ 5369 if ((vp->v_type == VDIR) && 5370 !(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) { 5371 return (EPERM); /* POSIX */ 5372 } 5373 5374 /* authorize the delete operation */ 5375#if CONFIG_MACF 5376 if (!error) 5377 error = mac_vnode_check_unlink(ctx, dvp, vp, cnp); 5378#endif /* MAC */ 5379 if (!error) 5380 error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); 5381 5382 return error; 5383} 5384 5385int 5386vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved) 5387{ 5388 /* Open of existing case */ 5389 kauth_action_t action; 5390 int error = 0; 5391 5392 if (cnp->cn_ndp == NULL) { 5393 panic("NULL ndp"); 5394 } 5395 if (reserved != NULL) { 5396 panic("reserved not NULL."); 5397 } 5398 5399#if CONFIG_MACF 5400 /* XXX may do duplicate work here, but ignore that for now (idempotent) */ 5401 if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) { 5402 error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx); 5403 if (error) 5404 return (error); 5405 } 5406#endif 5407 5408 if ( (fmode & O_DIRECTORY) && vp->v_type != VDIR ) { 5409 return (ENOTDIR); 5410 } 5411 5412 if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) { 5413 return (EOPNOTSUPP); /* Operation not supported on socket */ 5414 } 5415 5416 if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) { 5417 return (ELOOP); /* O_NOFOLLOW was specified and the target is a symbolic link */ 5418 } 5419 5420 /* disallow write operations on directories */ 5421 if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) { 5422 return (EISDIR); 5423 } 5424 5425 if ((cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH)) { 5426 if (vp->v_type != VDIR) { 5427 return (ENOTDIR); 5428 } 5429 } 5430 5431#if CONFIG_MACF 5432 /* If a file being opened is a shadow file containing 5433 * namedstream data, ignore the macf checks because it 5434 * is a kernel internal file and access should always 5435 * be allowed. 
5436 */ 5437 if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) { 5438 error = mac_vnode_check_open(ctx, vp, fmode); 5439 if (error) { 5440 return (error); 5441 } 5442 } 5443#endif 5444 5445 /* compute action to be authorized */ 5446 action = 0; 5447 if (fmode & FREAD) { 5448 action |= KAUTH_VNODE_READ_DATA; 5449 } 5450 if (fmode & (FWRITE | O_TRUNC)) { 5451 /* 5452 * If we are writing, appending, and not truncating, 5453 * indicate that we are appending so that if the 5454 * UF_APPEND or SF_APPEND bits are set, we do not deny 5455 * the open. 5456 */ 5457 if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { 5458 action |= KAUTH_VNODE_APPEND_DATA; 5459 } else { 5460 action |= KAUTH_VNODE_WRITE_DATA; 5461 } 5462 } 5463 error = vnode_authorize(vp, NULL, action, ctx); 5464 5465#if NAMEDSTREAMS 5466 if (error == EACCES) { 5467 /* 5468 * Shadow files may exist on-disk with a different UID/GID 5469 * than that of the current context. Verify that this file 5470 * is really a shadow file. If it was created successfully 5471 * then it should be authorized. 
5472 */ 5473 if (vnode_isshadow(vp) && vnode_isnamedstream (vp)) { 5474 error = vnode_verifynamedstream(vp, ctx); 5475 } 5476 } 5477#endif 5478 5479 return error; 5480} 5481 5482int 5483vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) 5484{ 5485 /* Creation case */ 5486 int error; 5487 5488 if (cnp->cn_ndp == NULL) { 5489 panic("NULL cn_ndp"); 5490 } 5491 if (reserved != NULL) { 5492 panic("reserved not NULL."); 5493 } 5494 5495 /* Only validate path for creation if we didn't do a complete lookup */ 5496 if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) { 5497 error = lookup_validate_creation_path(cnp->cn_ndp); 5498 if (error) 5499 return (error); 5500 } 5501 5502#if CONFIG_MACF 5503 error = mac_vnode_check_create(ctx, dvp, cnp, vap); 5504 if (error) 5505 return (error); 5506#endif /* CONFIG_MACF */ 5507 5508 return (vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)); 5509} 5510 5511int 5512vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, 5513 struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, 5514 vfs_context_t ctx, void *reserved) 5515{ 5516 int error = 0; 5517 int moving = 0; 5518 5519 if (reserved != NULL) { 5520 panic("Passed something other than NULL as reserved field!"); 5521 } 5522 5523 /* 5524 * Avoid renaming "." and "..". 5525 * 5526 * XXX No need to check for this in the FS. We should always have the leaves 5527 * in VFS in this case. 
5528 */ 5529 if (fvp->v_type == VDIR && 5530 ((fdvp == fvp) || 5531 (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || 5532 ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT)) ) { 5533 error = EINVAL; 5534 goto out; 5535 } 5536 5537 if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) { 5538 error = lookup_validate_creation_path(tcnp->cn_ndp); 5539 if (error) 5540 goto out; 5541 } 5542 5543 /***** <MACF> *****/ 5544#if CONFIG_MACF 5545 error = mac_vnode_check_rename_from(ctx, fdvp, fvp, fcnp); 5546 if (error) 5547 goto out; 5548#endif 5549 5550#if CONFIG_MACF 5551 error = mac_vnode_check_rename_to(ctx, 5552 tdvp, tvp, fdvp == tdvp, tcnp); 5553 if (error) 5554 goto out; 5555#endif 5556 /***** </MACF> *****/ 5557 5558 /***** <MiscChecks> *****/ 5559 if (tvp != NULL) { 5560 if (fvp->v_type == VDIR && tvp->v_type != VDIR) { 5561 error = ENOTDIR; 5562 goto out; 5563 } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { 5564 error = EISDIR; 5565 goto out; 5566 } 5567 } 5568 5569 if (fvp == tdvp) { 5570 error = EINVAL; 5571 goto out; 5572 } 5573 5574 /* 5575 * The following edge case is caught here: 5576 * (to cannot be a descendent of from) 5577 * 5578 * o fdvp 5579 * / 5580 * / 5581 * o fvp 5582 * \ 5583 * \ 5584 * o tdvp 5585 * / 5586 * / 5587 * o tvp 5588 */ 5589 if (tdvp->v_parent == fvp) { 5590 error = EINVAL; 5591 goto out; 5592 } 5593 /***** </MiscChecks> *****/ 5594 5595 /***** <Kauth> *****/ 5596 5597 error = 0; 5598 if ((tvp != NULL) && vnode_isdir(tvp)) { 5599 if (tvp != fdvp) 5600 moving = 1; 5601 } else if (tdvp != fdvp) { 5602 moving = 1; 5603 } 5604 5605 5606 /* 5607 * must have delete rights to remove the old name even in 5608 * the simple case of fdvp == tdvp. 5609 * 5610 * If fvp is a directory, and we are changing it's parent, 5611 * then we also need rights to rewrite its ".." entry as well. 
5612 */ 5613 if (vnode_isdir(fvp)) { 5614 if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) 5615 goto out; 5616 } else { 5617 if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) 5618 goto out; 5619 } 5620 if (moving) { 5621 /* moving into tdvp or tvp, must have rights to add */ 5622 if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp, 5623 NULL, 5624 vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, 5625 ctx)) != 0) { 5626 goto out; 5627 } 5628 } else { 5629 /* node staying in same directory, must be allowed to add new name */ 5630 if ((error = vnode_authorize(fdvp, NULL, 5631 vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) 5632 goto out; 5633 } 5634 /* overwriting tvp */ 5635 if ((tvp != NULL) && !vnode_isdir(tvp) && 5636 ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) { 5637 goto out; 5638 } 5639 5640 /***** </Kauth> *****/ 5641 5642 /* XXX more checks? 
*/ 5643out: 5644 return error; 5645} 5646 5647int 5648vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) 5649{ 5650 int error; 5651 5652 if (reserved != NULL) { 5653 panic("reserved not NULL in vn_authorize_mkdir()"); 5654 } 5655 5656 /* XXX A hack for now, to make shadow files work */ 5657 if (cnp->cn_ndp == NULL) { 5658 return 0; 5659 } 5660 5661 if (vnode_compound_mkdir_available(dvp)) { 5662 error = lookup_validate_creation_path(cnp->cn_ndp); 5663 if (error) 5664 goto out; 5665 } 5666 5667#if CONFIG_MACF 5668 error = mac_vnode_check_create(ctx, 5669 dvp, cnp, vap); 5670 if (error) 5671 goto out; 5672#endif 5673 5674 /* authorize addition of a directory to the parent */ 5675 if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) 5676 goto out; 5677 5678out: 5679 return error; 5680} 5681 5682int 5683vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved) 5684{ 5685 int error; 5686 5687 if (reserved != NULL) { 5688 panic("Non-NULL reserved argument to vn_authorize_rmdir()"); 5689 } 5690 5691 if (vp->v_type != VDIR) { 5692 /* 5693 * rmdir only deals with directories 5694 */ 5695 return ENOTDIR; 5696 } 5697 5698 if (dvp == vp) { 5699 /* 5700 * No rmdir "." please. 5701 */ 5702 return EINVAL; 5703 } 5704 5705#if CONFIG_MACF 5706 error = mac_vnode_check_unlink(ctx, dvp, 5707 vp, cnp); 5708 if (error) 5709 return error; 5710#endif 5711 5712 return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); 5713} 5714 5715/* 5716 * Authorize an operation on a vnode. 5717 * 5718 * This is KPI, but here because it needs vnode_scope. 5719 * 5720 * Returns: 0 Success 5721 * kauth_authorize_action:EPERM ... 
*			xlate => EACCES		Permission denied
 *	kauth_authorize_action:0	Success
 *	kauth_authorize_action:		Depends on callback return; this is
 *					usually only vnode_authorize_callback(),
 *					but may include other listeners, if any
 *					exist.
 *		EROFS
 *		EACCES
 *		EPERM
 *		???
 */
int
vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
{
	int	detail;		/* optional, more specific error from the callback */
	int	verdict;	/* value returned by the kauth scope */

	/*
	 * A dead vnode cannot be authorized against.  Let the request
	 * through so that the operation itself reports the right error.
	 */
	if (vp->v_type == VBAD)
		return(0);

	detail = 0;
	verdict = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
	    (uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&detail);

	/* granted: any value left in 'detail' is ignored */
	if (verdict == 0)
		return(0);

	/* traditional behaviour: a denial surfaces as EACCES, not EPERM */
	if (verdict == EPERM)
		verdict = EACCES;

	/* prefer the lower layers' error when they supplied one */
	return((detail != 0) ? detail : verdict);
}

/*
 * Test for vnode immutability.
 *
 * The 'append' flag is set when the authorization request is constrained
 * to operations which only request the right to append to a file.
 *
 * The 'ignore' flag is set when an operation modifying the immutability flags
 * is being authorized.  We check the system securelevel to determine which
 * immutability flags we can ignore.
5765 */ 5766static int 5767vnode_immutable(struct vnode_attr *vap, int append, int ignore) 5768{ 5769 int mask; 5770 5771 /* start with all bits precluding the operation */ 5772 mask = IMMUTABLE | APPEND; 5773 5774 /* if appending only, remove the append-only bits */ 5775 if (append) 5776 mask &= ~APPEND; 5777 5778 /* ignore only set when authorizing flags changes */ 5779 if (ignore) { 5780 if (securelevel <= 0) { 5781 /* in insecure state, flags do not inhibit changes */ 5782 mask = 0; 5783 } else { 5784 /* in secure state, user flags don't inhibit */ 5785 mask &= ~(UF_IMMUTABLE | UF_APPEND); 5786 } 5787 } 5788 KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore); 5789 if ((vap->va_flags & mask) != 0) 5790 return(EPERM); 5791 return(0); 5792} 5793 5794static int 5795vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred) 5796{ 5797 int result; 5798 5799 /* default assumption is not-owner */ 5800 result = 0; 5801 5802 /* 5803 * If the filesystem has given us a UID, we treat this as authoritative. 5804 */ 5805 if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) { 5806 result = (vap->va_uid == kauth_cred_getuid(cred)) ? 1 : 0; 5807 } 5808 /* we could test the owner UUID here if we had a policy for it */ 5809 5810 return(result); 5811} 5812 5813/* 5814 * vauth_node_group 5815 * 5816 * Description: Ask if a cred is a member of the group owning the vnode object 5817 * 5818 * Parameters: vap vnode attribute 5819 * vap->va_gid group owner of vnode object 5820 * cred credential to check 5821 * ismember pointer to where to put the answer 5822 * idontknow Return this if we can't get an answer 5823 * 5824 * Returns: 0 Success 5825 * idontknow Can't get information 5826 * kauth_cred_ismember_gid:? Error from kauth subsystem 5827 * kauth_cred_ismember_gid:? 
Error from kauth subsystem 5828 */ 5829static int 5830vauth_node_group(struct vnode_attr *vap, kauth_cred_t cred, int *ismember, int idontknow) 5831{ 5832 int error; 5833 int result; 5834 5835 error = 0; 5836 result = 0; 5837 5838 /* 5839 * The caller is expected to have asked the filesystem for a group 5840 * at some point prior to calling this function. The answer may 5841 * have been that there is no group ownership supported for the 5842 * vnode object, in which case we return 5843 */ 5844 if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) { 5845 error = kauth_cred_ismember_gid(cred, vap->va_gid, &result); 5846 /* 5847 * Credentials which are opted into external group membership 5848 * resolution which are not known to the external resolver 5849 * will result in an ENOENT error. We translate this into 5850 * the appropriate 'idontknow' response for our caller. 5851 * 5852 * XXX We do not make a distinction here between an ENOENT 5853 * XXX arising from a response from the external resolver, 5854 * XXX and an ENOENT which is internally generated. This is 5855 * XXX a deficiency of the published kauth_cred_ismember_gid() 5856 * XXX KPI which can not be overcome without new KPI. For 5857 * XXX all currently known cases, however, this wil result 5858 * XXX in correct behaviour. 5859 */ 5860 if (error == ENOENT) 5861 error = idontknow; 5862 } 5863 /* 5864 * XXX We could test the group UUID here if we had a policy for it, 5865 * XXX but this is problematic from the perspective of synchronizing 5866 * XXX group UUID and POSIX GID ownership of a file and keeping the 5867 * XXX values coherent over time. The problem is that the local 5868 * XXX system will vend transient group UUIDs for unknown POSIX GID 5869 * XXX values, and these are not persistent, whereas storage of values 5870 * XXX is persistent. 
One potential solution to this is a local 5871 * XXX (persistent) replica of remote directory entries and vended 5872 * XXX local ids in a local directory server (think in terms of a 5873 * XXX caching DNS server). 5874 */ 5875 5876 if (!error) 5877 *ismember = result; 5878 return(error); 5879} 5880 5881static int 5882vauth_file_owner(vauth_ctx vcp) 5883{ 5884 int result; 5885 5886 if (vcp->flags_valid & _VAC_IS_OWNER) { 5887 result = (vcp->flags & _VAC_IS_OWNER) ? 1 : 0; 5888 } else { 5889 result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred); 5890 5891 /* cache our result */ 5892 vcp->flags_valid |= _VAC_IS_OWNER; 5893 if (result) { 5894 vcp->flags |= _VAC_IS_OWNER; 5895 } else { 5896 vcp->flags &= ~_VAC_IS_OWNER; 5897 } 5898 } 5899 return(result); 5900} 5901 5902 5903/* 5904 * vauth_file_ingroup 5905 * 5906 * Description: Ask if a user is a member of the group owning the directory 5907 * 5908 * Parameters: vcp The vnode authorization context that 5909 * contains the user and directory info 5910 * vcp->flags_valid Valid flags 5911 * vcp->flags Flags values 5912 * vcp->vap File vnode attributes 5913 * vcp->ctx VFS Context (for user) 5914 * ismember pointer to where to put the answer 5915 * idontknow Return this if we can't get an answer 5916 * 5917 * Returns: 0 Success 5918 * vauth_node_group:? Error from vauth_node_group() 5919 * 5920 * Implicit returns: *ismember 0 The user is not a group member 5921 * 1 The user is a group member 5922 */ 5923static int 5924vauth_file_ingroup(vauth_ctx vcp, int *ismember, int idontknow) 5925{ 5926 int error; 5927 5928 /* Check for a cached answer first, to avoid the check if possible */ 5929 if (vcp->flags_valid & _VAC_IN_GROUP) { 5930 *ismember = (vcp->flags & _VAC_IN_GROUP) ? 
1 : 0; 5931 error = 0; 5932 } else { 5933 /* Otherwise, go look for it */ 5934 error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow); 5935 5936 if (!error) { 5937 /* cache our result */ 5938 vcp->flags_valid |= _VAC_IN_GROUP; 5939 if (*ismember) { 5940 vcp->flags |= _VAC_IN_GROUP; 5941 } else { 5942 vcp->flags &= ~_VAC_IN_GROUP; 5943 } 5944 } 5945 5946 } 5947 return(error); 5948} 5949 5950static int 5951vauth_dir_owner(vauth_ctx vcp) 5952{ 5953 int result; 5954 5955 if (vcp->flags_valid & _VAC_IS_DIR_OWNER) { 5956 result = (vcp->flags & _VAC_IS_DIR_OWNER) ? 1 : 0; 5957 } else { 5958 result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred); 5959 5960 /* cache our result */ 5961 vcp->flags_valid |= _VAC_IS_DIR_OWNER; 5962 if (result) { 5963 vcp->flags |= _VAC_IS_DIR_OWNER; 5964 } else { 5965 vcp->flags &= ~_VAC_IS_DIR_OWNER; 5966 } 5967 } 5968 return(result); 5969} 5970 5971/* 5972 * vauth_dir_ingroup 5973 * 5974 * Description: Ask if a user is a member of the group owning the directory 5975 * 5976 * Parameters: vcp The vnode authorization context that 5977 * contains the user and directory info 5978 * vcp->flags_valid Valid flags 5979 * vcp->flags Flags values 5980 * vcp->dvap Dir vnode attributes 5981 * vcp->ctx VFS Context (for user) 5982 * ismember pointer to where to put the answer 5983 * idontknow Return this if we can't get an answer 5984 * 5985 * Returns: 0 Success 5986 * vauth_node_group:? Error from vauth_node_group() 5987 * 5988 * Implicit returns: *ismember 0 The user is not a group member 5989 * 1 The user is a group member 5990 */ 5991static int 5992vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow) 5993{ 5994 int error; 5995 5996 /* Check for a cached answer first, to avoid the check if possible */ 5997 if (vcp->flags_valid & _VAC_IN_DIR_GROUP) { 5998 *ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? 
1 : 0; 5999 error = 0; 6000 } else { 6001 /* Otherwise, go look for it */ 6002 error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow); 6003 6004 if (!error) { 6005 /* cache our result */ 6006 vcp->flags_valid |= _VAC_IN_DIR_GROUP; 6007 if (*ismember) { 6008 vcp->flags |= _VAC_IN_DIR_GROUP; 6009 } else { 6010 vcp->flags &= ~_VAC_IN_DIR_GROUP; 6011 } 6012 } 6013 } 6014 return(error); 6015} 6016 6017/* 6018 * Test the posix permissions in (vap) to determine whether (credential) 6019 * may perform (action) 6020 */ 6021static int 6022vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir) 6023{ 6024 struct vnode_attr *vap; 6025 int needed, error, owner_ok, group_ok, world_ok, ismember; 6026#ifdef KAUTH_DEBUG_ENABLE 6027 const char *where = "uninitialized"; 6028# define _SETWHERE(c) where = c; 6029#else 6030# define _SETWHERE(c) 6031#endif 6032 6033 /* checking file or directory? */ 6034 if (on_dir) { 6035 vap = vcp->dvap; 6036 } else { 6037 vap = vcp->vap; 6038 } 6039 6040 error = 0; 6041 6042 /* 6043 * We want to do as little work here as possible. So first we check 6044 * which sets of permissions grant us the access we need, and avoid checking 6045 * whether specific permissions grant access when more generic ones would. 
6046 */ 6047 6048 /* owner permissions */ 6049 needed = 0; 6050 if (action & VREAD) 6051 needed |= S_IRUSR; 6052 if (action & VWRITE) 6053 needed |= S_IWUSR; 6054 if (action & VEXEC) 6055 needed |= S_IXUSR; 6056 owner_ok = (needed & vap->va_mode) == needed; 6057 6058 /* group permissions */ 6059 needed = 0; 6060 if (action & VREAD) 6061 needed |= S_IRGRP; 6062 if (action & VWRITE) 6063 needed |= S_IWGRP; 6064 if (action & VEXEC) 6065 needed |= S_IXGRP; 6066 group_ok = (needed & vap->va_mode) == needed; 6067 6068 /* world permissions */ 6069 needed = 0; 6070 if (action & VREAD) 6071 needed |= S_IROTH; 6072 if (action & VWRITE) 6073 needed |= S_IWOTH; 6074 if (action & VEXEC) 6075 needed |= S_IXOTH; 6076 world_ok = (needed & vap->va_mode) == needed; 6077 6078 /* If granted/denied by all three, we're done */ 6079 if (owner_ok && group_ok && world_ok) { 6080 _SETWHERE("all"); 6081 goto out; 6082 } 6083 if (!owner_ok && !group_ok && !world_ok) { 6084 _SETWHERE("all"); 6085 error = EACCES; 6086 goto out; 6087 } 6088 6089 /* Check ownership (relatively cheap) */ 6090 if ((on_dir && vauth_dir_owner(vcp)) || 6091 (!on_dir && vauth_file_owner(vcp))) { 6092 _SETWHERE("user"); 6093 if (!owner_ok) 6094 error = EACCES; 6095 goto out; 6096 } 6097 6098 /* Not owner; if group and world both grant it we're done */ 6099 if (group_ok && world_ok) { 6100 _SETWHERE("group/world"); 6101 goto out; 6102 } 6103 if (!group_ok && !world_ok) { 6104 _SETWHERE("group/world"); 6105 error = EACCES; 6106 goto out; 6107 } 6108 6109 /* Check group membership (most expensive) */ 6110 ismember = 0; /* Default to allow, if the target has no group owner */ 6111 6112 /* 6113 * In the case we can't get an answer about the user from the call to 6114 * vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on 6115 * the side of caution, rather than simply granting access, or we will 6116 * fail to correctly implement exclusion groups, so we set the third 6117 * parameter on the basis of the state of 
'group_ok'. 6118 */ 6119 if (on_dir) { 6120 error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0)); 6121 } else { 6122 error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0)); 6123 } 6124 if (error) { 6125 if (!group_ok) 6126 ismember = 1; 6127 error = 0; 6128 } 6129 if (ismember) { 6130 _SETWHERE("group"); 6131 if (!group_ok) 6132 error = EACCES; 6133 goto out; 6134 } 6135 6136 /* Not owner, not in group, use world result */ 6137 _SETWHERE("world"); 6138 if (!world_ok) 6139 error = EACCES; 6140 6141 /* FALLTHROUGH */ 6142 6143out: 6144 KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d", 6145 vcp->vp, (error == 0) ? "ALLOWED" : "DENIED", where, 6146 (action & VREAD) ? "r" : "-", 6147 (action & VWRITE) ? "w" : "-", 6148 (action & VEXEC) ? "x" : "-", 6149 needed, 6150 (vap->va_mode & S_IRUSR) ? "r" : "-", 6151 (vap->va_mode & S_IWUSR) ? "w" : "-", 6152 (vap->va_mode & S_IXUSR) ? "x" : "-", 6153 (vap->va_mode & S_IRGRP) ? "r" : "-", 6154 (vap->va_mode & S_IWGRP) ? "w" : "-", 6155 (vap->va_mode & S_IXGRP) ? "x" : "-", 6156 (vap->va_mode & S_IROTH) ? "r" : "-", 6157 (vap->va_mode & S_IWOTH) ? "w" : "-", 6158 (vap->va_mode & S_IXOTH) ? "x" : "-", 6159 kauth_cred_getuid(vcp->ctx->vc_ucred), 6160 on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid, 6161 on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid); 6162 return(error); 6163} 6164 6165/* 6166 * Authorize the deletion of the node vp from the directory dvp. 6167 * 6168 * We assume that: 6169 * - Neither the node nor the directory are immutable. 6170 * - The user is not the superuser. 6171 * 6172 * Deletion is not permitted if the directory is sticky and the caller is 6173 * not owner of the node or directory. 6174 * 6175 * If either the node grants DELETE, or the directory grants DELETE_CHILD, 6176 * the node may be deleted. 
If neither denies the permission, and the 6177 * caller has Posix write access to the directory, then the node may be 6178 * deleted. 6179 * 6180 * As an optimization, we cache whether or not delete child is permitted 6181 * on directories without the sticky bit set. 6182 */ 6183int 6184vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child); 6185/*static*/ int 6186vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) 6187{ 6188 struct vnode_attr *vap = vcp->vap; 6189 struct vnode_attr *dvap = vcp->dvap; 6190 kauth_cred_t cred = vcp->ctx->vc_ucred; 6191 struct kauth_acl_eval eval; 6192 int error, delete_denied, delete_child_denied, ismember; 6193 6194 /* check the ACL on the directory */ 6195 delete_child_denied = 0; 6196 if (!cached_delete_child && VATTR_IS_NOT(dvap, va_acl, NULL)) { 6197 eval.ae_requested = KAUTH_VNODE_DELETE_CHILD; 6198 eval.ae_acl = &dvap->va_acl->acl_ace[0]; 6199 eval.ae_count = dvap->va_acl->acl_entrycount; 6200 eval.ae_options = 0; 6201 if (vauth_dir_owner(vcp)) 6202 eval.ae_options |= KAUTH_AEVAL_IS_OWNER; 6203 /* 6204 * We use ENOENT as a marker to indicate we could not get 6205 * information in order to delay evaluation until after we 6206 * have the ACL evaluation answer. Previously, we would 6207 * always deny the operation at this point. 6208 */ 6209 if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) 6210 return(error); 6211 if (error == ENOENT) 6212 eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; 6213 else if (ismember) 6214 eval.ae_options |= KAUTH_AEVAL_IN_GROUP; 6215 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; 6216 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; 6217 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; 6218 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; 6219 6220 /* 6221 * If there is no entry, we are going to defer to other 6222 * authorization mechanisms. 
6223 */ 6224 error = kauth_acl_evaluate(cred, &eval); 6225 6226 if (error != 0) { 6227 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error); 6228 return(error); 6229 } 6230 switch(eval.ae_result) { 6231 case KAUTH_RESULT_DENY: 6232 delete_child_denied = 1; 6233 break; 6234 /* FALLSTHROUGH */ 6235 case KAUTH_RESULT_ALLOW: 6236 KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp); 6237 return(0); 6238 case KAUTH_RESULT_DEFER: 6239 default: 6240 /* Effectively the same as !delete_child_denied */ 6241 KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp); 6242 break; 6243 } 6244 } 6245 6246 /* check the ACL on the node */ 6247 delete_denied = 0; 6248 if (VATTR_IS_NOT(vap, va_acl, NULL)) { 6249 eval.ae_requested = KAUTH_VNODE_DELETE; 6250 eval.ae_acl = &vap->va_acl->acl_ace[0]; 6251 eval.ae_count = vap->va_acl->acl_entrycount; 6252 eval.ae_options = 0; 6253 if (vauth_file_owner(vcp)) 6254 eval.ae_options |= KAUTH_AEVAL_IS_OWNER; 6255 /* 6256 * We use ENOENT as a marker to indicate we could not get 6257 * information in order to delay evaluation until after we 6258 * have the ACL evaluation answer. Previously, we would 6259 * always deny the operation at this point. 
6260 */ 6261 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) 6262 return(error); 6263 if (error == ENOENT) 6264 eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; 6265 else if (ismember) 6266 eval.ae_options |= KAUTH_AEVAL_IN_GROUP; 6267 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; 6268 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; 6269 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; 6270 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; 6271 6272 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) { 6273 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error); 6274 return(error); 6275 } 6276 6277 switch(eval.ae_result) { 6278 case KAUTH_RESULT_DENY: 6279 delete_denied = 1; 6280 break; 6281 case KAUTH_RESULT_ALLOW: 6282 KAUTH_DEBUG("%p ALLOWED - granted by file ACL", vcp->vp); 6283 return(0); 6284 case KAUTH_RESULT_DEFER: 6285 default: 6286 /* Effectively the same as !delete_child_denied */ 6287 KAUTH_DEBUG("%p DEFERRED%s - by file ACL", vcp->vp, delete_denied ? "(DENY)" : ""); 6288 break; 6289 } 6290 } 6291 6292 /* if denied by ACL on directory or node, return denial */ 6293 if (delete_denied || delete_child_denied) { 6294 KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp); 6295 return(EACCES); 6296 } 6297 6298 /* enforce sticky bit behaviour */ 6299 if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { 6300 KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)", 6301 vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid); 6302 return(EACCES); 6303 } 6304 6305 /* check the directory */ 6306 if (!cached_delete_child && (error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) { 6307 KAUTH_DEBUG("%p DENIED - denied by posix permisssions", vcp->vp); 6308 return(error); 6309 } 6310 6311 /* not denied, must be OK */ 6312 return(0); 6313} 6314 6315 6316/* 6317 * Authorize an operation based on the node's attributes. 
6318 */ 6319static int 6320vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny) 6321{ 6322 struct vnode_attr *vap = vcp->vap; 6323 kauth_cred_t cred = vcp->ctx->vc_ucred; 6324 struct kauth_acl_eval eval; 6325 int error, ismember; 6326 mode_t posix_action; 6327 6328 /* 6329 * If we are the file owner, we automatically have some rights. 6330 * 6331 * Do we need to expand this to support group ownership? 6332 */ 6333 if (vauth_file_owner(vcp)) 6334 acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY); 6335 6336 /* 6337 * If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can 6338 * mask the latter. If TAKE_OWNERSHIP is requested the caller is about to 6339 * change ownership to themselves, and WRITE_SECURITY is implicitly 6340 * granted to the owner. We need to do this because at this point 6341 * WRITE_SECURITY may not be granted as the caller is not currently 6342 * the owner. 6343 */ 6344 if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) && 6345 (acl_rights & KAUTH_VNODE_WRITE_SECURITY)) 6346 acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY; 6347 6348 if (acl_rights == 0) { 6349 KAUTH_DEBUG("%p ALLOWED - implicit or no rights required", vcp->vp); 6350 return(0); 6351 } 6352 6353 /* if we have an ACL, evaluate it */ 6354 if (VATTR_IS_NOT(vap, va_acl, NULL)) { 6355 eval.ae_requested = acl_rights; 6356 eval.ae_acl = &vap->va_acl->acl_ace[0]; 6357 eval.ae_count = vap->va_acl->acl_entrycount; 6358 eval.ae_options = 0; 6359 if (vauth_file_owner(vcp)) 6360 eval.ae_options |= KAUTH_AEVAL_IS_OWNER; 6361 /* 6362 * We use ENOENT as a marker to indicate we could not get 6363 * information in order to delay evaluation until after we 6364 * have the ACL evaluation answer. Previously, we would 6365 * always deny the operation at this point. 
6366 */ 6367 if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) 6368 return(error); 6369 if (error == ENOENT) 6370 eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; 6371 else if (ismember) 6372 eval.ae_options |= KAUTH_AEVAL_IN_GROUP; 6373 eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; 6374 eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; 6375 eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; 6376 eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; 6377 6378 if ((error = kauth_acl_evaluate(cred, &eval)) != 0) { 6379 KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error); 6380 return(error); 6381 } 6382 6383 switch(eval.ae_result) { 6384 case KAUTH_RESULT_DENY: 6385 KAUTH_DEBUG("%p DENIED - by ACL", vcp->vp); 6386 return(EACCES); /* deny, deny, counter-allege */ 6387 case KAUTH_RESULT_ALLOW: 6388 KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp); 6389 return(0); 6390 case KAUTH_RESULT_DEFER: 6391 default: 6392 /* Effectively the same as !delete_child_denied */ 6393 KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp); 6394 break; 6395 } 6396 6397 *found_deny = eval.ae_found_deny; 6398 6399 /* fall through and evaluate residual rights */ 6400 } else { 6401 /* no ACL, everything is residual */ 6402 eval.ae_residual = acl_rights; 6403 } 6404 6405 /* 6406 * Grant residual rights that have been pre-authorized. 6407 */ 6408 eval.ae_residual &= ~preauth_rights; 6409 6410 /* 6411 * We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied. 6412 */ 6413 if (vauth_file_owner(vcp)) 6414 eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES; 6415 6416 if (eval.ae_residual == 0) { 6417 KAUTH_DEBUG("%p ALLOWED - rights already authorized", vcp->vp); 6418 return(0); 6419 } 6420 6421 /* 6422 * Bail if we have residual rights that can't be granted by posix permissions, 6423 * or aren't presumed granted at this point. 
6424 * 6425 * XXX these can be collapsed for performance 6426 */ 6427 if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) { 6428 KAUTH_DEBUG("%p DENIED - CHANGE_OWNER not permitted", vcp->vp); 6429 return(EACCES); 6430 } 6431 if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) { 6432 KAUTH_DEBUG("%p DENIED - WRITE_SECURITY not permitted", vcp->vp); 6433 return(EACCES); 6434 } 6435 6436#if DIAGNOSTIC 6437 if (eval.ae_residual & KAUTH_VNODE_DELETE) 6438 panic("vnode_authorize: can't be checking delete permission here"); 6439#endif 6440 6441 /* 6442 * Compute the fallback posix permissions that will satisfy the remaining 6443 * rights. 6444 */ 6445 posix_action = 0; 6446 if (eval.ae_residual & (KAUTH_VNODE_READ_DATA | 6447 KAUTH_VNODE_LIST_DIRECTORY | 6448 KAUTH_VNODE_READ_EXTATTRIBUTES)) 6449 posix_action |= VREAD; 6450 if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA | 6451 KAUTH_VNODE_ADD_FILE | 6452 KAUTH_VNODE_ADD_SUBDIRECTORY | 6453 KAUTH_VNODE_DELETE_CHILD | 6454 KAUTH_VNODE_WRITE_ATTRIBUTES | 6455 KAUTH_VNODE_WRITE_EXTATTRIBUTES)) 6456 posix_action |= VWRITE; 6457 if (eval.ae_residual & (KAUTH_VNODE_EXECUTE | 6458 KAUTH_VNODE_SEARCH)) 6459 posix_action |= VEXEC; 6460 6461 if (posix_action != 0) { 6462 return(vnode_authorize_posix(vcp, posix_action, 0 /* !on_dir */)); 6463 } else { 6464 KAUTH_DEBUG("%p ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping", 6465 vcp->vp, 6466 (eval.ae_residual & KAUTH_VNODE_READ_DATA) 6467 ? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "", 6468 (eval.ae_residual & KAUTH_VNODE_WRITE_DATA) 6469 ? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "", 6470 (eval.ae_residual & KAUTH_VNODE_EXECUTE) 6471 ? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "", 6472 (eval.ae_residual & KAUTH_VNODE_DELETE) 6473 ? " DELETE" : "", 6474 (eval.ae_residual & KAUTH_VNODE_APPEND_DATA) 6475 ? vnode_isdir(vcp->vp) ? 
" ADD_SUBDIRECTORY" : " APPEND_DATA" : "", 6476 (eval.ae_residual & KAUTH_VNODE_DELETE_CHILD) 6477 ? " DELETE_CHILD" : "", 6478 (eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES) 6479 ? " READ_ATTRIBUTES" : "", 6480 (eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES) 6481 ? " WRITE_ATTRIBUTES" : "", 6482 (eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES) 6483 ? " READ_EXTATTRIBUTES" : "", 6484 (eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES) 6485 ? " WRITE_EXTATTRIBUTES" : "", 6486 (eval.ae_residual & KAUTH_VNODE_READ_SECURITY) 6487 ? " READ_SECURITY" : "", 6488 (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) 6489 ? " WRITE_SECURITY" : "", 6490 (eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE) 6491 ? " CHECKIMMUTABLE" : "", 6492 (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) 6493 ? " CHANGE_OWNER" : ""); 6494 } 6495 6496 /* 6497 * Lack of required Posix permissions implies no reason to deny access. 6498 */ 6499 return(0); 6500} 6501 6502/* 6503 * Check for file immutability. 6504 */ 6505static int 6506vnode_authorize_checkimmutable(vnode_t vp, struct vnode_attr *vap, int rights, int ignore) 6507{ 6508 mount_t mp; 6509 int error; 6510 int append; 6511 6512 /* 6513 * Perform immutability checks for operations that change data. 6514 * 6515 * Sockets, fifos and devices require special handling. 6516 */ 6517 switch(vp->v_type) { 6518 case VSOCK: 6519 case VFIFO: 6520 case VBLK: 6521 case VCHR: 6522 /* 6523 * Writing to these nodes does not change the filesystem data, 6524 * so forget that it's being tried. 
6525 */ 6526 rights &= ~KAUTH_VNODE_WRITE_DATA; 6527 break; 6528 default: 6529 break; 6530 } 6531 6532 error = 0; 6533 if (rights & KAUTH_VNODE_WRITE_RIGHTS) { 6534 6535 /* check per-filesystem options if possible */ 6536 mp = vp->v_mount; 6537 if (mp != NULL) { 6538 6539 /* check for no-EA filesystems */ 6540 if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) && 6541 (vfs_flags(mp) & MNT_NOUSERXATTR)) { 6542 KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes", vp); 6543 error = EACCES; /* User attributes disabled */ 6544 goto out; 6545 } 6546 } 6547 6548 /* 6549 * check for file immutability. first, check if the requested rights are 6550 * allowable for a UF_APPEND file. 6551 */ 6552 append = 0; 6553 if (vp->v_type == VDIR) { 6554 if ((rights & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) 6555 append = 1; 6556 } else { 6557 if ((rights & (KAUTH_VNODE_APPEND_DATA | KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights) 6558 append = 1; 6559 } 6560 if ((error = vnode_immutable(vap, append, ignore)) != 0) { 6561 KAUTH_DEBUG("%p DENIED - file is immutable", vp); 6562 goto out; 6563 } 6564 } 6565out: 6566 return(error); 6567} 6568 6569/* 6570 * Handle authorization actions for filesystems that advertise that the 6571 * server will be enforcing. 6572 * 6573 * Returns: 0 Authorization should be handled locally 6574 * 1 Authorization was handled by the FS 6575 * 6576 * Note: Imputed returns will only occur if the authorization request 6577 * was handled by the FS. 6578 * 6579 * Imputed: *resultp, modified Return code from FS when the request is 6580 * handled by the FS. 6581 * VNOP_ACCESS:??? 6582 * VNOP_OPEN:??? 6583 */ 6584static int 6585vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx) 6586{ 6587 int error; 6588 6589 /* 6590 * If the vp is a device node, socket or FIFO it actually represents a local 6591 * endpoint, so we need to handle it locally. 
6592 */ 6593 switch(vp->v_type) { 6594 case VBLK: 6595 case VCHR: 6596 case VSOCK: 6597 case VFIFO: 6598 return(0); 6599 default: 6600 break; 6601 } 6602 6603 /* 6604 * In the advisory request case, if the filesystem doesn't think it's reliable 6605 * we will attempt to formulate a result ourselves based on VNOP_GETATTR data. 6606 */ 6607 if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount)) 6608 return(0); 6609 6610 /* 6611 * Let the filesystem have a say in the matter. It's OK for it to not implemnent 6612 * VNOP_ACCESS, as most will authorise inline with the actual request. 6613 */ 6614 if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) { 6615 *resultp = error; 6616 KAUTH_DEBUG("%p DENIED - opaque filesystem VNOP_ACCESS denied access", vp); 6617 return(1); 6618 } 6619 6620 /* 6621 * Typically opaque filesystems do authorisation in-line, but exec is a special case. In 6622 * order to be reasonably sure that exec will be permitted, we try a bit harder here. 6623 */ 6624 if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) { 6625 /* try a VNOP_OPEN for readonly access */ 6626 if ((error = VNOP_OPEN(vp, FREAD, ctx)) != 0) { 6627 *resultp = error; 6628 KAUTH_DEBUG("%p DENIED - EXECUTE denied because file could not be opened readonly", vp); 6629 return(1); 6630 } 6631 VNOP_CLOSE(vp, FREAD, ctx); 6632 } 6633 6634 /* 6635 * We don't have any reason to believe that the request has to be denied at this point, 6636 * so go ahead and allow it. 6637 */ 6638 *resultp = 0; 6639 KAUTH_DEBUG("%p ALLOWED - bypassing access check for non-local filesystem", vp); 6640 return(1); 6641} 6642 6643 6644 6645 6646/* 6647 * Returns: KAUTH_RESULT_ALLOW 6648 * KAUTH_RESULT_DENY 6649 * 6650 * Imputed: *arg3, modified Error code in the deny case 6651 * EROFS Read-only file system 6652 * EACCES Permission denied 6653 * EPERM Operation not permitted [no execute] 6654 * vnode_getattr:ENOMEM Not enough space [only if has filesec] 6655 * vnode_getattr:??? 
6656 * vnode_authorize_opaque:*arg2 ??? 6657 * vnode_authorize_checkimmutable:??? 6658 * vnode_authorize_delete:??? 6659 * vnode_authorize_simple:??? 6660 */ 6661 6662 6663static int 6664vnode_authorize_callback(kauth_cred_t cred, void *idata, kauth_action_t action, 6665 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3) 6666{ 6667 vfs_context_t ctx; 6668 vnode_t cvp = NULLVP; 6669 vnode_t vp, dvp; 6670 int result = KAUTH_RESULT_DENY; 6671 int parent_iocount = 0; 6672 int parent_action; /* In case we need to use namedstream's data fork for cached rights*/ 6673 6674 ctx = (vfs_context_t)arg0; 6675 vp = (vnode_t)arg1; 6676 dvp = (vnode_t)arg2; 6677 6678 /* 6679 * if there are 2 vnodes passed in, we don't know at 6680 * this point which rights to look at based on the 6681 * combined action being passed in... defer until later... 6682 * otherwise check the kauth 'rights' cache hung 6683 * off of the vnode we're interested in... if we've already 6684 * been granted the right we're currently interested in, 6685 * we can just return success... otherwise we'll go through 6686 * the process of authorizing the requested right(s)... if that 6687 * succeeds, we'll add the right(s) to the cache. 6688 * VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache 6689 */ 6690 if (dvp && vp) 6691 goto defer; 6692 if (dvp) { 6693 cvp = dvp; 6694 } else { 6695 /* 6696 * For named streams on local-authorization volumes, rights are cached on the parent; 6697 * authorization is determined by looking at the parent's properties anyway, so storing 6698 * on the parent means that we don't recompute for the named stream and that if 6699 * we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the 6700 * stream to flush its cache separately. If we miss in the cache, then we authorize 6701 * as if there were no cached rights (passing the named stream vnode and desired rights to 6702 * vnode_authorize_callback_int()). 
6703 * 6704 * On an opaquely authorized volume, we don't know the relationship between the 6705 * data fork's properties and the rights granted on a stream. Thus, named stream vnodes 6706 * on such a volume are authorized directly (rather than using the parent) and have their 6707 * own caches. When a named stream vnode is created, we mark the parent as having a named 6708 * stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we 6709 * find the stream and flush its cache. 6710 */ 6711 if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) { 6712 cvp = vnode_getparent(vp); 6713 if (cvp != NULLVP) { 6714 parent_iocount = 1; 6715 } else { 6716 cvp = NULL; 6717 goto defer; /* If we can't use the parent, take the slow path */ 6718 } 6719 6720 /* Have to translate some actions */ 6721 parent_action = action; 6722 if (parent_action & KAUTH_VNODE_READ_DATA) { 6723 parent_action &= ~KAUTH_VNODE_READ_DATA; 6724 parent_action |= KAUTH_VNODE_READ_EXTATTRIBUTES; 6725 } 6726 if (parent_action & KAUTH_VNODE_WRITE_DATA) { 6727 parent_action &= ~KAUTH_VNODE_WRITE_DATA; 6728 parent_action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; 6729 } 6730 6731 } else { 6732 cvp = vp; 6733 } 6734 } 6735 6736 if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? 
parent_action : action) == TRUE) { 6737 result = KAUTH_RESULT_ALLOW; 6738 goto out; 6739 } 6740defer: 6741 result = vnode_authorize_callback_int(cred, idata, action, arg0, arg1, arg2, arg3); 6742 6743 if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) { 6744 KAUTH_DEBUG("%p - caching action = %x", cvp, action); 6745 vnode_cache_authorized_action(cvp, ctx, action); 6746 } 6747 6748out: 6749 if (parent_iocount) { 6750 vnode_put(cvp); 6751 } 6752 6753 return result; 6754} 6755 6756 6757static int 6758vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *idata, kauth_action_t action, 6759 uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3) 6760{ 6761 struct _vnode_authorize_context auth_context; 6762 vauth_ctx vcp; 6763 vfs_context_t ctx; 6764 vnode_t vp, dvp; 6765 kauth_cred_t cred; 6766 kauth_ace_rights_t rights; 6767 struct vnode_attr va, dva; 6768 int result; 6769 int *errorp; 6770 int noimmutable; 6771 boolean_t parent_authorized_for_delete_child = FALSE; 6772 boolean_t found_deny = FALSE; 6773 boolean_t parent_ref= FALSE; 6774 6775 vcp = &auth_context; 6776 ctx = vcp->ctx = (vfs_context_t)arg0; 6777 vp = vcp->vp = (vnode_t)arg1; 6778 dvp = vcp->dvp = (vnode_t)arg2; 6779 errorp = (int *)arg3; 6780 /* 6781 * Note that we authorize against the context, not the passed cred 6782 * (the same thing anyway) 6783 */ 6784 cred = ctx->vc_ucred; 6785 6786 VATTR_INIT(&va); 6787 vcp->vap = &va; 6788 VATTR_INIT(&dva); 6789 vcp->dvap = &dva; 6790 6791 vcp->flags = vcp->flags_valid = 0; 6792 6793#if DIAGNOSTIC 6794 if ((ctx == NULL) || (vp == NULL) || (cred == NULL)) 6795 panic("vnode_authorize: bad arguments (context %p vp %p cred %p)", ctx, vp, cred); 6796#endif 6797 6798 KAUTH_DEBUG("%p AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)", 6799 vp, vfs_context_proc(ctx)->p_comm, 6800 (action & KAUTH_VNODE_ACCESS) ? "access" : "auth", 6801 (action & KAUTH_VNODE_READ_DATA) ? vnode_isdir(vp) ? 
" LIST_DIRECTORY" : " READ_DATA" : "", 6802 (action & KAUTH_VNODE_WRITE_DATA) ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "", 6803 (action & KAUTH_VNODE_EXECUTE) ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "", 6804 (action & KAUTH_VNODE_DELETE) ? " DELETE" : "", 6805 (action & KAUTH_VNODE_APPEND_DATA) ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "", 6806 (action & KAUTH_VNODE_DELETE_CHILD) ? " DELETE_CHILD" : "", 6807 (action & KAUTH_VNODE_READ_ATTRIBUTES) ? " READ_ATTRIBUTES" : "", 6808 (action & KAUTH_VNODE_WRITE_ATTRIBUTES) ? " WRITE_ATTRIBUTES" : "", 6809 (action & KAUTH_VNODE_READ_EXTATTRIBUTES) ? " READ_EXTATTRIBUTES" : "", 6810 (action & KAUTH_VNODE_WRITE_EXTATTRIBUTES) ? " WRITE_EXTATTRIBUTES" : "", 6811 (action & KAUTH_VNODE_READ_SECURITY) ? " READ_SECURITY" : "", 6812 (action & KAUTH_VNODE_WRITE_SECURITY) ? " WRITE_SECURITY" : "", 6813 (action & KAUTH_VNODE_CHANGE_OWNER) ? " CHANGE_OWNER" : "", 6814 (action & KAUTH_VNODE_NOIMMUTABLE) ? " (noimmutable)" : "", 6815 vnode_isdir(vp) ? "directory" : "file", 6816 vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp); 6817 6818 /* 6819 * Extract the control bits from the action, everything else is 6820 * requested rights. 6821 */ 6822 noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? 1 : 0; 6823 rights = action & ~(KAUTH_VNODE_ACCESS | KAUTH_VNODE_NOIMMUTABLE); 6824 6825 if (rights & KAUTH_VNODE_DELETE) { 6826#if DIAGNOSTIC 6827 if (dvp == NULL) 6828 panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory"); 6829#endif 6830 /* 6831 * check to see if we've already authorized the parent 6832 * directory for deletion of its children... if so, we 6833 * can skip a whole bunch of work... 
we will still have to 6834 * authorize that this specific child can be removed 6835 */ 6836 if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE) 6837 parent_authorized_for_delete_child = TRUE; 6838 } else { 6839 dvp = NULL; 6840 } 6841 6842 /* 6843 * Check for read-only filesystems. 6844 */ 6845 if ((rights & KAUTH_VNODE_WRITE_RIGHTS) && 6846 (vp->v_mount->mnt_flag & MNT_RDONLY) && 6847 ((vp->v_type == VREG) || (vp->v_type == VDIR) || 6848 (vp->v_type == VLNK) || (vp->v_type == VCPLX) || 6849 (rights & KAUTH_VNODE_DELETE) || (rights & KAUTH_VNODE_DELETE_CHILD))) { 6850 result = EROFS; 6851 goto out; 6852 } 6853 6854 /* 6855 * Check for noexec filesystems. 6856 */ 6857 if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) { 6858 result = EACCES; 6859 goto out; 6860 } 6861 6862 /* 6863 * Handle cases related to filesystems with non-local enforcement. 6864 * This call can return 0, in which case we will fall through to perform a 6865 * check based on VNOP_GETATTR data. Otherwise it returns 1 and sets 6866 * an appropriate result, at which point we can return immediately. 6867 */ 6868 if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx)) 6869 goto out; 6870 6871 /* 6872 * Get vnode attributes and extended security information for the vnode 6873 * and directory if required. 
6874 */ 6875 VATTR_WANTED(&va, va_mode); 6876 VATTR_WANTED(&va, va_uid); 6877 VATTR_WANTED(&va, va_gid); 6878 VATTR_WANTED(&va, va_flags); 6879 VATTR_WANTED(&va, va_acl); 6880 if ((result = vnode_getattr(vp, &va, ctx)) != 0) { 6881 KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result); 6882 goto out; 6883 } 6884 if (dvp) { 6885 VATTR_WANTED(&dva, va_mode); 6886 VATTR_WANTED(&dva, va_uid); 6887 VATTR_WANTED(&dva, va_gid); 6888 VATTR_WANTED(&dva, va_flags); 6889 VATTR_WANTED(&dva, va_acl); 6890 if ((result = vnode_getattr(dvp, &dva, ctx)) != 0) { 6891 KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result); 6892 goto out; 6893 } 6894 } 6895 6896 /* 6897 * If the vnode is an extended attribute data vnode (eg. a resource fork), *_DATA becomes 6898 * *_EXTATTRIBUTES. 6899 */ 6900 if (vnode_isnamedstream(vp)) { 6901 if (rights & KAUTH_VNODE_READ_DATA) { 6902 rights &= ~KAUTH_VNODE_READ_DATA; 6903 rights |= KAUTH_VNODE_READ_EXTATTRIBUTES; 6904 } 6905 if (rights & KAUTH_VNODE_WRITE_DATA) { 6906 rights &= ~KAUTH_VNODE_WRITE_DATA; 6907 rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; 6908 } 6909 } 6910 6911 /* 6912 * Point 'vp' to the resource fork's parent for ACL checking 6913 */ 6914 if (vnode_isnamedstream(vp) && 6915 (vp->v_parent != NULL) && 6916 (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) { 6917 parent_ref = TRUE; 6918 vcp->vp = vp = vp->v_parent; 6919 if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) 6920 kauth_acl_free(va.va_acl); 6921 VATTR_INIT(&va); 6922 VATTR_WANTED(&va, va_mode); 6923 VATTR_WANTED(&va, va_uid); 6924 VATTR_WANTED(&va, va_gid); 6925 VATTR_WANTED(&va, va_flags); 6926 VATTR_WANTED(&va, va_acl); 6927 if ((result = vnode_getattr(vp, &va, ctx)) != 0) 6928 goto out; 6929 } 6930 6931 /* 6932 * Check for immutability. 6933 * 6934 * In the deletion case, parent directory immutability vetoes specific 6935 * file rights. 
6936 */ 6937 if ((result = vnode_authorize_checkimmutable(vp, &va, rights, noimmutable)) != 0) 6938 goto out; 6939 if ((rights & KAUTH_VNODE_DELETE) && 6940 parent_authorized_for_delete_child == FALSE && 6941 ((result = vnode_authorize_checkimmutable(dvp, &dva, KAUTH_VNODE_DELETE_CHILD, 0)) != 0)) 6942 goto out; 6943 6944 /* 6945 * Clear rights that have been authorized by reaching this point, bail if nothing left to 6946 * check. 6947 */ 6948 rights &= ~(KAUTH_VNODE_LINKTARGET | KAUTH_VNODE_CHECKIMMUTABLE); 6949 if (rights == 0) 6950 goto out; 6951 6952 /* 6953 * If we're not the superuser, authorize based on file properties; 6954 * note that even if parent_authorized_for_delete_child is TRUE, we 6955 * need to check on the node itself. 6956 */ 6957 if (!vfs_context_issuser(ctx)) { 6958 /* process delete rights */ 6959 if ((rights & KAUTH_VNODE_DELETE) && 6960 ((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != 0)) 6961 goto out; 6962 6963 /* process remaining rights */ 6964 if ((rights & ~KAUTH_VNODE_DELETE) && 6965 (result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, &found_deny)) != 0) 6966 goto out; 6967 } else { 6968 6969 /* 6970 * Execute is only granted to root if one of the x bits is set. This check only 6971 * makes sense if the posix mode bits are actually supported. 
6972 */ 6973 if ((rights & KAUTH_VNODE_EXECUTE) && 6974 (vp->v_type == VREG) && 6975 VATTR_IS_SUPPORTED(&va, va_mode) && 6976 !(va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) { 6977 result = EPERM; 6978 KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode); 6979 goto out; 6980 } 6981 6982 KAUTH_DEBUG("%p ALLOWED - caller is superuser", vp); 6983 } 6984out: 6985 if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) 6986 kauth_acl_free(va.va_acl); 6987 if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL)) 6988 kauth_acl_free(dva.va_acl); 6989 6990 if (result) { 6991 if (parent_ref) 6992 vnode_put(vp); 6993 *errorp = result; 6994 KAUTH_DEBUG("%p DENIED - auth denied", vp); 6995 return(KAUTH_RESULT_DENY); 6996 } 6997 if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) { 6998 /* 6999 * if we were successfully granted the right to search this directory 7000 * and there were NO ACL DENYs for search and the posix permissions also don't 7001 * deny execute, we can synthesize a global right that allows anyone to 7002 * traverse this directory during a pathname lookup without having to 7003 * match the credential associated with this cache of rights. 7004 */ 7005 if (!VATTR_IS_SUPPORTED(&va, va_mode) || 7006 ((va.va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 7007 (S_IXUSR | S_IXGRP | S_IXOTH))) { 7008 vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE); 7009 } 7010 } 7011 if ((rights & KAUTH_VNODE_DELETE) && parent_authorized_for_delete_child == FALSE) { 7012 /* 7013 * parent was successfully and newly authorized for content deletions 7014 * add it to the cache, but only if it doesn't have the sticky 7015 * bit set on it. This same check is done earlier guarding 7016 * fetching of dva, and if we jumped to out without having done 7017 * this, we will have returned already because of a non-zero 7018 * 'result' value. 
7019 */ 7020 if (VATTR_IS_SUPPORTED(&dva, va_mode) && 7021 !(dva.va_mode & (S_ISVTX))) { 7022 /* OK to cache delete rights */ 7023 KAUTH_DEBUG("%p - caching DELETE_CHILD rights", dvp); 7024 vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE_CHILD); 7025 } 7026 } 7027 if (parent_ref) 7028 vnode_put(vp); 7029 /* 7030 * Note that this implies that we will allow requests for no rights, as well as 7031 * for rights that we do not recognise. There should be none of these. 7032 */ 7033 KAUTH_DEBUG("%p ALLOWED - auth granted", vp); 7034 return(KAUTH_RESULT_ALLOW); 7035} 7036 7037int 7038vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx) 7039{ 7040 return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx); 7041} 7042 7043/* 7044 * Check that the attribute information in vattr can be legally applied to 7045 * a new file by the context. 7046 */ 7047static int 7048vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx) 7049{ 7050 int error; 7051 int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode; 7052 kauth_cred_t cred; 7053 guid_t changer; 7054 mount_t dmp; 7055 7056 error = 0; 7057 7058 if (defaulted_fieldsp) { 7059 *defaulted_fieldsp = 0; 7060 } 7061 7062 defaulted_owner = defaulted_group = defaulted_mode = 0; 7063 7064 /* 7065 * Require that the filesystem support extended security to apply any. 7066 */ 7067 if (!vfs_extendedsecurity(dvp->v_mount) && 7068 (VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid))) { 7069 error = EINVAL; 7070 goto out; 7071 } 7072 7073 /* 7074 * Default some fields. 7075 */ 7076 dmp = dvp->v_mount; 7077 7078 /* 7079 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that 7080 * owner takes ownership of all new files. 
7081 */ 7082 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) { 7083 VATTR_SET(vap, va_uid, dmp->mnt_fsowner); 7084 defaulted_owner = 1; 7085 } else { 7086 if (!VATTR_IS_ACTIVE(vap, va_uid)) { 7087 /* default owner is current user */ 7088 VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx))); 7089 defaulted_owner = 1; 7090 } 7091 } 7092 7093 /* 7094 * If the filesystem is mounted IGNORE_OWNERSHIP and an explicit grouo is set, that 7095 * group takes ownership of all new files. 7096 */ 7097 if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) { 7098 VATTR_SET(vap, va_gid, dmp->mnt_fsgroup); 7099 defaulted_group = 1; 7100 } else { 7101 if (!VATTR_IS_ACTIVE(vap, va_gid)) { 7102 /* default group comes from parent object, fallback to current user */ 7103 struct vnode_attr dva; 7104 VATTR_INIT(&dva); 7105 VATTR_WANTED(&dva, va_gid); 7106 if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) 7107 goto out; 7108 if (VATTR_IS_SUPPORTED(&dva, va_gid)) { 7109 VATTR_SET(vap, va_gid, dva.va_gid); 7110 } else { 7111 VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx))); 7112 } 7113 defaulted_group = 1; 7114 } 7115 } 7116 7117 if (!VATTR_IS_ACTIVE(vap, va_flags)) 7118 VATTR_SET(vap, va_flags, 0); 7119 7120 /* default mode is everything, masked with current umask */ 7121 if (!VATTR_IS_ACTIVE(vap, va_mode)) { 7122 VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask); 7123 KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask); 7124 defaulted_mode = 1; 7125 } 7126 /* set timestamps to now */ 7127 if (!VATTR_IS_ACTIVE(vap, va_create_time)) { 7128 nanotime(&vap->va_create_time); 7129 VATTR_SET_ACTIVE(vap, va_create_time); 7130 } 7131 7132 /* 7133 * Check for attempts to set nonsensical fields. 
7134 */ 7135 if (vap->va_active & ~VNODE_ATTR_NEWOBJ) { 7136 error = EINVAL; 7137 KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx", 7138 vap->va_active & ~VNODE_ATTR_NEWOBJ); 7139 goto out; 7140 } 7141 7142 /* 7143 * Quickly check for the applicability of any enforcement here. 7144 * Tests below maintain the integrity of the local security model. 7145 */ 7146 if (vfs_authopaque(dvp->v_mount)) 7147 goto out; 7148 7149 /* 7150 * We need to know if the caller is the superuser, or if the work is 7151 * otherwise already authorised. 7152 */ 7153 cred = vfs_context_ucred(ctx); 7154 if (noauth) { 7155 /* doing work for the kernel */ 7156 has_priv_suser = 1; 7157 } else { 7158 has_priv_suser = vfs_context_issuser(ctx); 7159 } 7160 7161 7162 if (VATTR_IS_ACTIVE(vap, va_flags)) { 7163 if (has_priv_suser) { 7164 if ((vap->va_flags & (UF_SETTABLE | SF_SETTABLE)) != vap->va_flags) { 7165 error = EPERM; 7166 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)"); 7167 goto out; 7168 } 7169 } else { 7170 if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) { 7171 error = EPERM; 7172 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)"); 7173 goto out; 7174 } 7175 } 7176 } 7177 7178 /* if not superuser, validate legality of new-item attributes */ 7179 if (!has_priv_suser) { 7180 if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) { 7181 /* setgid? */ 7182 if (vap->va_mode & S_ISGID) { 7183 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) { 7184 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid); 7185 goto out; 7186 } 7187 if (!ismember) { 7188 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", vap->va_gid); 7189 error = EPERM; 7190 goto out; 7191 } 7192 } 7193 7194 /* setuid? 
*/ 7195 if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) { 7196 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit"); 7197 error = EPERM; 7198 goto out; 7199 } 7200 } 7201 if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) { 7202 KAUTH_DEBUG(" DENIED - cannot create new item owned by %d", vap->va_uid); 7203 error = EPERM; 7204 goto out; 7205 } 7206 if (!defaulted_group) { 7207 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) { 7208 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid); 7209 goto out; 7210 } 7211 if (!ismember) { 7212 KAUTH_DEBUG(" DENIED - cannot create new item with group %d - not a member", vap->va_gid); 7213 error = EPERM; 7214 goto out; 7215 } 7216 } 7217 7218 /* initialising owner/group UUID */ 7219 if (VATTR_IS_ACTIVE(vap, va_uuuid)) { 7220 if ((error = kauth_cred_getguid(cred, &changer)) != 0) { 7221 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error); 7222 /* XXX ENOENT here - no GUID - should perhaps become EPERM */ 7223 goto out; 7224 } 7225 if (!kauth_guid_equal(&vap->va_uuuid, &changer)) { 7226 KAUTH_DEBUG(" ERROR - cannot create item with supplied owner UUID - not us"); 7227 error = EPERM; 7228 goto out; 7229 } 7230 } 7231 if (VATTR_IS_ACTIVE(vap, va_guuid)) { 7232 if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) { 7233 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error); 7234 goto out; 7235 } 7236 if (!ismember) { 7237 KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member"); 7238 error = EPERM; 7239 goto out; 7240 } 7241 } 7242 } 7243out: 7244 if (defaulted_fieldsp) { 7245 if (defaulted_mode) { 7246 *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE; 7247 } 7248 if (defaulted_group) { 7249 *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID; 7250 } 7251 if (defaulted_owner) { 7252 *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID; 7253 } 7254 } 
7255 return(error); 7256} 7257 7258/* 7259 * Check that the attribute information in vap can be legally written by the 7260 * context. 7261 * 7262 * Call this when you're not sure about the vnode_attr; either its contents 7263 * have come from an unknown source, or when they are variable. 7264 * 7265 * Returns errno, or zero and sets *actionp to the KAUTH_VNODE_* actions that 7266 * must be authorized to be permitted to write the vattr. 7267 */ 7268int 7269vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_context_t ctx) 7270{ 7271 struct vnode_attr ova; 7272 kauth_action_t required_action; 7273 int error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid; 7274 guid_t changer; 7275 gid_t group; 7276 uid_t owner; 7277 mode_t newmode; 7278 kauth_cred_t cred; 7279 uint32_t fdelta; 7280 7281 VATTR_INIT(&ova); 7282 required_action = 0; 7283 error = 0; 7284 7285 /* 7286 * Quickly check for enforcement applicability. 7287 */ 7288 if (vfs_authopaque(vp->v_mount)) 7289 goto out; 7290 7291 /* 7292 * Check for attempts to set nonsensical fields. 7293 */ 7294 if (vap->va_active & VNODE_ATTR_RDONLY) { 7295 KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)"); 7296 error = EINVAL; 7297 goto out; 7298 } 7299 7300 /* 7301 * We need to know if the caller is the superuser. 
7302 */ 7303 cred = vfs_context_ucred(ctx); 7304 has_priv_suser = kauth_cred_issuser(cred); 7305 7306 /* 7307 * If any of the following are changing, we need information from the old file: 7308 * va_uid 7309 * va_gid 7310 * va_mode 7311 * va_uuuid 7312 * va_guuid 7313 */ 7314 if (VATTR_IS_ACTIVE(vap, va_uid) || 7315 VATTR_IS_ACTIVE(vap, va_gid) || 7316 VATTR_IS_ACTIVE(vap, va_mode) || 7317 VATTR_IS_ACTIVE(vap, va_uuuid) || 7318 VATTR_IS_ACTIVE(vap, va_guuid)) { 7319 VATTR_WANTED(&ova, va_mode); 7320 VATTR_WANTED(&ova, va_uid); 7321 VATTR_WANTED(&ova, va_gid); 7322 VATTR_WANTED(&ova, va_uuuid); 7323 VATTR_WANTED(&ova, va_guuid); 7324 KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes"); 7325 } 7326 7327 /* 7328 * If timestamps are being changed, we need to know who the file is owned 7329 * by. 7330 */ 7331 if (VATTR_IS_ACTIVE(vap, va_create_time) || 7332 VATTR_IS_ACTIVE(vap, va_change_time) || 7333 VATTR_IS_ACTIVE(vap, va_modify_time) || 7334 VATTR_IS_ACTIVE(vap, va_access_time) || 7335 VATTR_IS_ACTIVE(vap, va_backup_time)) { 7336 7337 VATTR_WANTED(&ova, va_uid); 7338#if 0 /* enable this when we support UUIDs as official owners */ 7339 VATTR_WANTED(&ova, va_uuuid); 7340#endif 7341 KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID"); 7342 } 7343 7344 /* 7345 * If flags are being changed, we need the old flags. 7346 */ 7347 if (VATTR_IS_ACTIVE(vap, va_flags)) { 7348 KAUTH_DEBUG("ATTR - flags changing, fetching old flags"); 7349 VATTR_WANTED(&ova, va_flags); 7350 } 7351 7352 /* 7353 * If ACLs are being changed, we need the old ACLs. 7354 */ 7355 if (VATTR_IS_ACTIVE(vap, va_acl)) { 7356 KAUTH_DEBUG("ATTR - acl changing, fetching old flags"); 7357 VATTR_WANTED(&ova, va_acl); 7358 } 7359 7360 /* 7361 * If the size is being set, make sure it's not a directory. 
7362 */ 7363 if (VATTR_IS_ACTIVE(vap, va_data_size)) { 7364 /* size is meaningless on a directory, don't permit this */ 7365 if (vnode_isdir(vp)) { 7366 KAUTH_DEBUG("ATTR - ERROR: size change requested on a directory"); 7367 error = EISDIR; 7368 goto out; 7369 } 7370 } 7371 7372 /* 7373 * Get old data. 7374 */ 7375 KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active); 7376 if ((error = vnode_getattr(vp, &ova, ctx)) != 0) { 7377 KAUTH_DEBUG(" ERROR - got %d trying to get attributes", error); 7378 goto out; 7379 } 7380 7381 /* 7382 * Size changes require write access to the file data. 7383 */ 7384 if (VATTR_IS_ACTIVE(vap, va_data_size)) { 7385 /* if we can't get the size, or it's different, we need write access */ 7386 KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA"); 7387 required_action |= KAUTH_VNODE_WRITE_DATA; 7388 } 7389 7390 /* 7391 * Changing timestamps? 7392 * 7393 * Note that we are only called to authorize user-requested time changes; 7394 * side-effect time changes are not authorized. Authorisation is only 7395 * required for existing files. 7396 * 7397 * Non-owners are not permitted to change the time on an existing 7398 * file to anything other than the current time. 7399 */ 7400 if (VATTR_IS_ACTIVE(vap, va_create_time) || 7401 VATTR_IS_ACTIVE(vap, va_change_time) || 7402 VATTR_IS_ACTIVE(vap, va_modify_time) || 7403 VATTR_IS_ACTIVE(vap, va_access_time) || 7404 VATTR_IS_ACTIVE(vap, va_backup_time)) { 7405 /* 7406 * The owner and root may set any timestamps they like, 7407 * provided that the file is not immutable. The owner still needs 7408 * WRITE_ATTRIBUTES (implied by ownership but still deniable). 7409 */ 7410 if (has_priv_suser || vauth_node_owner(&ova, cred)) { 7411 KAUTH_DEBUG("ATTR - root or owner changing timestamps"); 7412 required_action |= KAUTH_VNODE_CHECKIMMUTABLE | KAUTH_VNODE_WRITE_ATTRIBUTES; 7413 } else { 7414 /* just setting the current time? 
*/ 7415 if (vap->va_vaflags & VA_UTIMES_NULL) { 7416 KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES"); 7417 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES; 7418 } else { 7419 KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted"); 7420 error = EACCES; 7421 goto out; 7422 } 7423 } 7424 } 7425 7426 /* 7427 * Changing file mode? 7428 */ 7429 if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) { 7430 KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode); 7431 7432 /* 7433 * Mode changes always have the same basic auth requirements. 7434 */ 7435 if (has_priv_suser) { 7436 KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check"); 7437 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; 7438 } else { 7439 /* need WRITE_SECURITY */ 7440 KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY"); 7441 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7442 } 7443 7444 /* 7445 * Can't set the setgid bit if you're not in the group and not root. Have to have 7446 * existing group information in the case we're not setting it right now. 7447 */ 7448 if (vap->va_mode & S_ISGID) { 7449 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */ 7450 if (!has_priv_suser) { 7451 if (VATTR_IS_ACTIVE(vap, va_gid)) { 7452 group = vap->va_gid; 7453 } else if (VATTR_IS_SUPPORTED(&ova, va_gid)) { 7454 group = ova.va_gid; 7455 } else { 7456 KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available"); 7457 error = EINVAL; 7458 goto out; 7459 } 7460 /* 7461 * This might be too restrictive; WRITE_SECURITY might be implied by 7462 * membership in this case, rather than being an additional requirement. 
7463 */ 7464 if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != 0) { 7465 KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid); 7466 goto out; 7467 } 7468 if (!ismember) { 7469 KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", group); 7470 error = EPERM; 7471 goto out; 7472 } 7473 } 7474 } 7475 7476 /* 7477 * Can't set the setuid bit unless you're root or the file's owner. 7478 */ 7479 if (vap->va_mode & S_ISUID) { 7480 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; /* always required */ 7481 if (!has_priv_suser) { 7482 if (VATTR_IS_ACTIVE(vap, va_uid)) { 7483 owner = vap->va_uid; 7484 } else if (VATTR_IS_SUPPORTED(&ova, va_uid)) { 7485 owner = ova.va_uid; 7486 } else { 7487 KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available"); 7488 error = EINVAL; 7489 goto out; 7490 } 7491 if (owner != kauth_cred_getuid(cred)) { 7492 /* 7493 * We could allow this if WRITE_SECURITY is permitted, perhaps. 7494 */ 7495 KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit"); 7496 error = EPERM; 7497 goto out; 7498 } 7499 } 7500 } 7501 } 7502 7503 /* 7504 * Validate/mask flags changes. This checks that only the flags in 7505 * the UF_SETTABLE mask are being set, and preserves the flags in 7506 * the SF_SETTABLE case. 7507 * 7508 * Since flags changes may be made in conjunction with other changes, 7509 * we will ask the auth code to ignore immutability in the case that 7510 * the SF_* flags are not set and we are only manipulating the file flags. 
7511 * 7512 */ 7513 if (VATTR_IS_ACTIVE(vap, va_flags)) { 7514 /* compute changing flags bits */ 7515 if (VATTR_IS_SUPPORTED(&ova, va_flags)) { 7516 fdelta = vap->va_flags ^ ova.va_flags; 7517 } else { 7518 fdelta = vap->va_flags; 7519 } 7520 7521 if (fdelta != 0) { 7522 KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY"); 7523 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7524 7525 /* check that changing bits are legal */ 7526 if (has_priv_suser) { 7527 /* 7528 * The immutability check will prevent us from clearing the SF_* 7529 * flags unless the system securelevel permits it, so just check 7530 * for legal flags here. 7531 */ 7532 if (fdelta & ~(UF_SETTABLE | SF_SETTABLE)) { 7533 error = EPERM; 7534 KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)"); 7535 goto out; 7536 } 7537 } else { 7538 if (fdelta & ~UF_SETTABLE) { 7539 error = EPERM; 7540 KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)"); 7541 goto out; 7542 } 7543 } 7544 /* 7545 * If the caller has the ability to manipulate file flags, 7546 * security is not reduced by ignoring them for this operation. 7547 * 7548 * A more complete test here would consider the 'after' states of the flags 7549 * to determine whether it would permit the operation, but this becomes 7550 * very complex. 7551 * 7552 * Ignoring immutability is conditional on securelevel; this does not bypass 7553 * the SF_* flags if securelevel > 0. 7554 */ 7555 required_action |= KAUTH_VNODE_NOIMMUTABLE; 7556 } 7557 } 7558 7559 /* 7560 * Validate ownership information. 7561 */ 7562 chowner = 0; 7563 chgroup = 0; 7564 clear_suid = 0; 7565 clear_sgid = 0; 7566 7567 /* 7568 * uid changing 7569 * Note that if the filesystem didn't give us a UID, we expect that it doesn't 7570 * support them in general, and will ignore it if/when we try to set it. 7571 * We might want to clear the uid out of vap completely here. 
7572 */ 7573 if (VATTR_IS_ACTIVE(vap, va_uid)) { 7574 if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) { 7575 if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) { 7576 KAUTH_DEBUG(" DENIED - non-superuser cannot change ownershipt to a third party"); 7577 error = EPERM; 7578 goto out; 7579 } 7580 chowner = 1; 7581 } 7582 clear_suid = 1; 7583 } 7584 7585 /* 7586 * gid changing 7587 * Note that if the filesystem didn't give us a GID, we expect that it doesn't 7588 * support them in general, and will ignore it if/when we try to set it. 7589 * We might want to clear the gid out of vap completely here. 7590 */ 7591 if (VATTR_IS_ACTIVE(vap, va_gid)) { 7592 if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) { 7593 if (!has_priv_suser) { 7594 if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != 0) { 7595 KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid); 7596 goto out; 7597 } 7598 if (!ismember) { 7599 KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group", 7600 ova.va_gid, vap->va_gid); 7601 error = EPERM; 7602 goto out; 7603 } 7604 } 7605 chgroup = 1; 7606 } 7607 clear_sgid = 1; 7608 } 7609 7610 /* 7611 * Owner UUID being set or changed. 7612 */ 7613 if (VATTR_IS_ACTIVE(vap, va_uuuid)) { 7614 /* if the owner UUID is not actually changing ... */ 7615 if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) { 7616 if (kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid)) 7617 goto no_uuuid_change; 7618 7619 /* 7620 * If the current owner UUID is a null GUID, check 7621 * it against the UUID corresponding to the owner UID. 
7622 */ 7623 if (kauth_guid_equal(&ova.va_uuuid, &kauth_null_guid) && 7624 VATTR_IS_SUPPORTED(&ova, va_uid)) { 7625 guid_t uid_guid; 7626 7627 if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == 0 && 7628 kauth_guid_equal(&vap->va_uuuid, &uid_guid)) 7629 goto no_uuuid_change; 7630 } 7631 } 7632 7633 /* 7634 * The owner UUID cannot be set by a non-superuser to anything other than 7635 * their own or a null GUID (to "unset" the owner UUID). 7636 * Note that file systems must be prepared to handle the 7637 * null UUID case in a manner appropriate for that file 7638 * system. 7639 */ 7640 if (!has_priv_suser) { 7641 if ((error = kauth_cred_getguid(cred, &changer)) != 0) { 7642 KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error); 7643 /* XXX ENOENT here - no UUID - should perhaps become EPERM */ 7644 goto out; 7645 } 7646 if (!kauth_guid_equal(&vap->va_uuuid, &changer) && 7647 !kauth_guid_equal(&vap->va_uuuid, &kauth_null_guid)) { 7648 KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us / null"); 7649 error = EPERM; 7650 goto out; 7651 } 7652 } 7653 chowner = 1; 7654 clear_suid = 1; 7655 } 7656no_uuuid_change: 7657 /* 7658 * Group UUID being set or changed. 7659 */ 7660 if (VATTR_IS_ACTIVE(vap, va_guuid)) { 7661 /* if the group UUID is not actually changing ... */ 7662 if (VATTR_IS_SUPPORTED(&ova, va_guuid)) { 7663 if (kauth_guid_equal(&vap->va_guuid, &ova.va_guuid)) 7664 goto no_guuid_change; 7665 7666 /* 7667 * If the current group UUID is a null UUID, check 7668 * it against the UUID corresponding to the group GID. 
7669 */ 7670 if (kauth_guid_equal(&ova.va_guuid, &kauth_null_guid) && 7671 VATTR_IS_SUPPORTED(&ova, va_gid)) { 7672 guid_t gid_guid; 7673 7674 if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == 0 && 7675 kauth_guid_equal(&vap->va_guuid, &gid_guid)) 7676 goto no_guuid_change; 7677 } 7678 } 7679 7680 /* 7681 * The group UUID cannot be set by a non-superuser to anything other than 7682 * one of which they are a member or a null GUID (to "unset" 7683 * the group UUID). 7684 * Note that file systems must be prepared to handle the 7685 * null UUID case in a manner appropriate for that file 7686 * system. 7687 */ 7688 if (!has_priv_suser) { 7689 if (kauth_guid_equal(&vap->va_guuid, &kauth_null_guid)) 7690 ismember = 1; 7691 else if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != 0) { 7692 KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error); 7693 goto out; 7694 } 7695 if (!ismember) { 7696 KAUTH_DEBUG(" ERROR - cannot set supplied group UUID - not a member / null"); 7697 error = EPERM; 7698 goto out; 7699 } 7700 } 7701 chgroup = 1; 7702 } 7703no_guuid_change: 7704 7705 /* 7706 * Compute authorisation for group/ownership changes. 
7707 */ 7708 if (chowner || chgroup || clear_suid || clear_sgid) { 7709 if (has_priv_suser) { 7710 KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check"); 7711 required_action |= KAUTH_VNODE_CHECKIMMUTABLE; 7712 } else { 7713 if (chowner) { 7714 KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP"); 7715 required_action |= KAUTH_VNODE_TAKE_OWNERSHIP; 7716 } 7717 if (chgroup && !chowner) { 7718 KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY"); 7719 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7720 } 7721 7722 /* clear set-uid and set-gid bits as required by Posix */ 7723 if (VATTR_IS_ACTIVE(vap, va_mode)) { 7724 newmode = vap->va_mode; 7725 } else if (VATTR_IS_SUPPORTED(&ova, va_mode)) { 7726 newmode = ova.va_mode; 7727 } else { 7728 KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits"); 7729 newmode = 0; 7730 } 7731 if (newmode & (S_ISUID | S_ISGID)) { 7732 VATTR_SET(vap, va_mode, newmode & ~(S_ISUID | S_ISGID)); 7733 KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o", newmode, vap->va_mode); 7734 } 7735 } 7736 } 7737 7738 /* 7739 * Authorise changes in the ACL. 
7740 */ 7741 if (VATTR_IS_ACTIVE(vap, va_acl)) { 7742 7743 /* no existing ACL */ 7744 if (!VATTR_IS_ACTIVE(&ova, va_acl) || (ova.va_acl == NULL)) { 7745 7746 /* adding an ACL */ 7747 if (vap->va_acl != NULL) { 7748 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7749 KAUTH_DEBUG("CHMOD - adding ACL"); 7750 } 7751 7752 /* removing an existing ACL */ 7753 } else if (vap->va_acl == NULL) { 7754 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7755 KAUTH_DEBUG("CHMOD - removing ACL"); 7756 7757 /* updating an existing ACL */ 7758 } else { 7759 if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) { 7760 /* entry count changed, must be different */ 7761 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7762 KAUTH_DEBUG("CHMOD - adding/removing ACL entries"); 7763 } else if (vap->va_acl->acl_entrycount > 0) { 7764 /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */ 7765 if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0], 7766 sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) { 7767 required_action |= KAUTH_VNODE_WRITE_SECURITY; 7768 KAUTH_DEBUG("CHMOD - changing ACL entries"); 7769 } 7770 } 7771 } 7772 } 7773 7774 /* 7775 * Other attributes that require authorisation. 
7776 */ 7777 if (VATTR_IS_ACTIVE(vap, va_encoding)) 7778 required_action |= KAUTH_VNODE_WRITE_ATTRIBUTES; 7779 7780out: 7781 if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL)) 7782 kauth_acl_free(ova.va_acl); 7783 if (error == 0) 7784 *actionp = required_action; 7785 return(error); 7786} 7787 7788static int 7789setlocklocal_callback(struct vnode *vp, __unused void *cargs) 7790{ 7791 vnode_lock_spin(vp); 7792 vp->v_flag |= VLOCKLOCAL; 7793 vnode_unlock(vp); 7794 7795 return (VNODE_RETURNED); 7796} 7797 7798void 7799vfs_setlocklocal(mount_t mp) 7800{ 7801 mount_lock_spin(mp); 7802 mp->mnt_kern_flag |= MNTK_LOCK_LOCAL; 7803 mount_unlock(mp); 7804 7805 /* 7806 * The number of active vnodes is expected to be 7807 * very small when vfs_setlocklocal is invoked. 7808 */ 7809 vnode_iterate(mp, 0, setlocklocal_callback, NULL); 7810} 7811 7812void 7813vfs_setunmountpreflight(mount_t mp) 7814{ 7815 mount_lock_spin(mp); 7816 mp->mnt_kern_flag |= MNTK_UNMOUNT_PREFLIGHT; 7817 mount_unlock(mp); 7818} 7819 7820void 7821vfs_setcompoundopen(mount_t mp) 7822{ 7823 mount_lock_spin(mp); 7824 mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN; 7825 mount_unlock(mp); 7826} 7827 7828void 7829vn_setunionwait(vnode_t vp) 7830{ 7831 vnode_lock_spin(vp); 7832 vp->v_flag |= VISUNION; 7833 vnode_unlock(vp); 7834} 7835 7836 7837void 7838vn_checkunionwait(vnode_t vp) 7839{ 7840 vnode_lock_spin(vp); 7841 while ((vp->v_flag & VISUNION) == VISUNION) 7842 msleep((caddr_t)&vp->v_flag, &vp->v_lock, 0, 0, 0); 7843 vnode_unlock(vp); 7844} 7845 7846void 7847vn_clearunionwait(vnode_t vp, int locked) 7848{ 7849 if (!locked) 7850 vnode_lock_spin(vp); 7851 if((vp->v_flag & VISUNION) == VISUNION) { 7852 vp->v_flag &= ~VISUNION; 7853 wakeup((caddr_t)&vp->v_flag); 7854 } 7855 if (!locked) 7856 vnode_unlock(vp); 7857} 7858 7859/* 7860 * XXX - get "don't trigger mounts" flag for thread; used by autofs. 
7861 */ 7862extern int thread_notrigger(void); 7863 7864int 7865thread_notrigger(void) 7866{ 7867 struct uthread *uth = (struct uthread *)get_bsdthread_info(current_thread()); 7868 return (uth->uu_notrigger); 7869} 7870 7871/* 7872 * Removes orphaned apple double files during a rmdir 7873 * Works by: 7874 * 1. vnode_suspend(). 7875 * 2. Call VNOP_READDIR() till the end of directory is reached. 7876 * 3. Check if the directory entries returned are regular files with name starting with "._". If not, return ENOTEMPTY. 7877 * 4. Continue (2) and (3) till end of directory is reached. 7878 * 5. If all the entries in the directory were files with "._" name, delete all the files. 7879 * 6. vnode_resume() 7880 * 7. If deletion of all files succeeded, call VNOP_RMDIR() again. 7881 */ 7882 7883errno_t rmdir_remove_orphaned_appleDouble(vnode_t vp , vfs_context_t ctx, int * restart_flag) 7884{ 7885 7886#define UIO_BUFF_SIZE 2048 7887 uio_t auio = NULL; 7888 int eofflag, siz = UIO_BUFF_SIZE, nentries = 0; 7889 int open_flag = 0, full_erase_flag = 0; 7890 char uio_buf[ UIO_SIZEOF(1) ]; 7891 char *rbuf = NULL, *cpos, *cend; 7892 struct nameidata nd_temp; 7893 struct dirent *dp; 7894 errno_t error; 7895 7896 error = vnode_suspend(vp); 7897 7898 /* 7899 * restart_flag is set so that the calling rmdir sleeps and resets 7900 */ 7901 if (error == EBUSY) 7902 *restart_flag = 1; 7903 if (error != 0) 7904 goto outsc; 7905 7906 /* 7907 * set up UIO 7908 */ 7909 MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); 7910 if (rbuf) 7911 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, 7912 &uio_buf[0], sizeof(uio_buf)); 7913 if (!rbuf || !auio) { 7914 error = ENOMEM; 7915 goto outsc; 7916 } 7917 7918 uio_setoffset(auio,0); 7919 7920 eofflag = 0; 7921 7922 if ((error = VNOP_OPEN(vp, FREAD, ctx))) 7923 goto outsc; 7924 else 7925 open_flag = 1; 7926 7927 /* 7928 * First pass checks if all files are appleDouble files. 
7929 */ 7930 7931 do { 7932 siz = UIO_BUFF_SIZE; 7933 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ); 7934 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE); 7935 7936 if((error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx))) 7937 goto outsc; 7938 7939 if (uio_resid(auio) != 0) 7940 siz -= uio_resid(auio); 7941 7942 /* 7943 * Iterate through directory 7944 */ 7945 cpos = rbuf; 7946 cend = rbuf + siz; 7947 dp = (struct dirent*) cpos; 7948 7949 if (cpos == cend) 7950 eofflag = 1; 7951 7952 while ((cpos < cend)) { 7953 /* 7954 * Check for . and .. as well as directories 7955 */ 7956 if (dp->d_ino != 0 && 7957 !((dp->d_namlen == 1 && dp->d_name[0] == '.') || 7958 (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))) { 7959 /* 7960 * Check for irregular files and ._ files 7961 * If there is a ._._ file abort the op 7962 */ 7963 if ( dp->d_namlen < 2 || 7964 strncmp(dp->d_name,"._",2) || 7965 (dp->d_namlen >= 4 && !strncmp(&(dp->d_name[2]), "._",2))) { 7966 error = ENOTEMPTY; 7967 goto outsc; 7968 } 7969 } 7970 cpos += dp->d_reclen; 7971 dp = (struct dirent*)cpos; 7972 } 7973 7974 /* 7975 * workaround for HFS/NFS setting eofflag before end of file 7976 */ 7977 if (vp->v_tag == VT_HFS && nentries > 2) 7978 eofflag=0; 7979 7980 if (vp->v_tag == VT_NFS) { 7981 if (eofflag && !full_erase_flag) { 7982 full_erase_flag = 1; 7983 eofflag = 0; 7984 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ); 7985 } 7986 else if (!eofflag && full_erase_flag) 7987 full_erase_flag = 0; 7988 } 7989 7990 } while (!eofflag); 7991 /* 7992 * If we've made it here all the files in the dir are ._ files. 7993 * We can delete the files even though the node is suspended 7994 * because we are the owner of the file. 
7995 */ 7996 7997 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ); 7998 eofflag = 0; 7999 full_erase_flag = 0; 8000 8001 do { 8002 siz = UIO_BUFF_SIZE; 8003 uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ); 8004 uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE); 8005 8006 error = VNOP_READDIR(vp, auio, 0, &eofflag, &nentries, ctx); 8007 8008 if (error != 0) 8009 goto outsc; 8010 8011 if (uio_resid(auio) != 0) 8012 siz -= uio_resid(auio); 8013 8014 /* 8015 * Iterate through directory 8016 */ 8017 cpos = rbuf; 8018 cend = rbuf + siz; 8019 dp = (struct dirent*) cpos; 8020 8021 if (cpos == cend) 8022 eofflag = 1; 8023 8024 while ((cpos < cend)) { 8025 /* 8026 * Check for . and .. as well as directories 8027 */ 8028 if (dp->d_ino != 0 && 8029 !((dp->d_namlen == 1 && dp->d_name[0] == '.') || 8030 (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.')) 8031 ) { 8032 8033 NDINIT(&nd_temp, DELETE, OP_UNLINK, USEDVP, 8034 UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), 8035 ctx); 8036 nd_temp.ni_dvp = vp; 8037 error = unlink1(ctx, &nd_temp, VNODE_REMOVE_SKIP_NAMESPACE_EVENT); 8038 8039 if (error && error != ENOENT) { 8040 goto outsc; 8041 } 8042 8043 } 8044 cpos += dp->d_reclen; 8045 dp = (struct dirent*)cpos; 8046 } 8047 8048 /* 8049 * workaround for HFS/NFS setting eofflag before end of file 8050 */ 8051 if (vp->v_tag == VT_HFS && nentries > 2) 8052 eofflag=0; 8053 8054 if (vp->v_tag == VT_NFS) { 8055 if (eofflag && !full_erase_flag) { 8056 full_erase_flag = 1; 8057 eofflag = 0; 8058 uio_reset(auio, 0, UIO_SYSSPACE, UIO_READ); 8059 } 8060 else if (!eofflag && full_erase_flag) 8061 full_erase_flag = 0; 8062 } 8063 8064 } while (!eofflag); 8065 8066 8067 error = 0; 8068 8069outsc: 8070 if (open_flag) 8071 VNOP_CLOSE(vp, FREAD, ctx); 8072 8073 uio_free(auio); 8074 FREE(rbuf, M_TEMP); 8075 8076 vnode_resume(vp); 8077 8078 8079 return(error); 8080 8081} 8082 8083 8084void 8085lock_vnode_and_post(vnode_t vp, int kevent_num) 8086{ 8087 /* Only take the lock if 
there's something there! */ 8088 if (vp->v_knotes.slh_first != NULL) { 8089 vnode_lock(vp); 8090 KNOTE(&vp->v_knotes, kevent_num); 8091 vnode_unlock(vp); 8092 } 8093} 8094 8095#ifdef JOE_DEBUG 8096static void record_vp(vnode_t vp, int count) { 8097 struct uthread *ut; 8098 8099#if CONFIG_TRIGGERS 8100 if (vp->v_resolve) 8101 return; 8102#endif 8103 if ((vp->v_flag & VSYSTEM)) 8104 return; 8105 8106 ut = get_bsdthread_info(current_thread()); 8107 ut->uu_iocount += count; 8108 8109 if (count == 1) { 8110 if (ut->uu_vpindex < 32) { 8111 OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10); 8112 8113 ut->uu_vps[ut->uu_vpindex] = vp; 8114 ut->uu_vpindex++; 8115 } 8116 } 8117} 8118#endif 8119 8120 8121#if CONFIG_TRIGGERS 8122 8123#define TRIG_DEBUG 0 8124 8125#if TRIG_DEBUG 8126#define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0) 8127#else 8128#define TRIG_LOG(...) 8129#endif 8130 8131/* 8132 * Resolver result functions 8133 */ 8134 8135resolver_result_t 8136vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux) 8137{ 8138 /* 8139 * |<--- 32 --->|<--- 28 --->|<- 4 ->| 8140 * sequence auxiliary status 8141 */ 8142 return (((uint64_t)seq) << 32) | 8143 (((uint64_t)(aux & 0x0fffffff)) << 4) | 8144 (uint64_t)(stat & 0x0000000F); 8145} 8146 8147enum resolver_status 8148vfs_resolver_status(resolver_result_t result) 8149{ 8150 /* lower 4 bits is status */ 8151 return (result & 0x0000000F); 8152} 8153 8154uint32_t 8155vfs_resolver_sequence(resolver_result_t result) 8156{ 8157 /* upper 32 bits is sequence */ 8158 return (uint32_t)(result >> 32); 8159} 8160 8161int 8162vfs_resolver_auxiliary(resolver_result_t result) 8163{ 8164 /* 28 bits of auxiliary */ 8165 return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4); 8166} 8167 8168/* 8169 * SPI 8170 * Call in for resolvers to update vnode trigger state 8171 */ 8172int 8173vnode_trigger_update(vnode_t vp, resolver_result_t result) 8174{ 8175 vnode_resolve_t rp; 8176 uint32_t seq; 
8177 enum resolver_status stat; 8178 8179 if (vp->v_resolve == NULL) { 8180 return (EINVAL); 8181 } 8182 8183 stat = vfs_resolver_status(result); 8184 seq = vfs_resolver_sequence(result); 8185 8186 if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) { 8187 return (EINVAL); 8188 } 8189 8190 rp = vp->v_resolve; 8191 lck_mtx_lock(&rp->vr_lock); 8192 8193 if (seq > rp->vr_lastseq) { 8194 if (stat == RESOLVER_RESOLVED) 8195 rp->vr_flags |= VNT_RESOLVED; 8196 else 8197 rp->vr_flags &= ~VNT_RESOLVED; 8198 8199 rp->vr_lastseq = seq; 8200 } 8201 8202 lck_mtx_unlock(&rp->vr_lock); 8203 8204 return (0); 8205} 8206 8207static int 8208vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref) 8209{ 8210 int error; 8211 8212 vnode_lock_spin(vp); 8213 if (vp->v_resolve != NULL) { 8214 vnode_unlock(vp); 8215 return EINVAL; 8216 } else { 8217 vp->v_resolve = rp; 8218 } 8219 vnode_unlock(vp); 8220 8221 if (ref) { 8222 error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE); 8223 if (error != 0) { 8224 panic("VNODE_REF_FORCE didn't help..."); 8225 } 8226 } 8227 8228 return 0; 8229} 8230 8231/* 8232 * VFS internal interfaces for vnode triggers 8233 * 8234 * vnode must already have an io count on entry 8235 * v_resolve is stable when io count is non-zero 8236 */ 8237static int 8238vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external) 8239{ 8240 vnode_resolve_t rp; 8241 int result; 8242 char byte; 8243 8244#if 1 8245 /* minimum pointer test (debugging) */ 8246 if (tinfo->vnt_data) 8247 byte = *((char *)tinfo->vnt_data); 8248#endif 8249 MALLOC(rp, vnode_resolve_t, sizeof(*rp), M_TEMP, M_WAITOK); 8250 if (rp == NULL) 8251 return (ENOMEM); 8252 8253 lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr); 8254 8255 rp->vr_resolve_func = tinfo->vnt_resolve_func; 8256 rp->vr_unresolve_func = tinfo->vnt_unresolve_func; 8257 rp->vr_rearm_func = tinfo->vnt_rearm_func; 8258 rp->vr_reclaim_func = 
tinfo->vnt_reclaim_func; 8259 rp->vr_data = tinfo->vnt_data; 8260 rp->vr_lastseq = 0; 8261 rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK; 8262 if (external) { 8263 rp->vr_flags |= VNT_EXTERNAL; 8264 } 8265 8266 result = vnode_resolver_attach(vp, rp, external); 8267 if (result != 0) { 8268 goto out; 8269 } 8270 8271 if (mp) { 8272 OSAddAtomic(1, &mp->mnt_numtriggers); 8273 } 8274 8275 return (result); 8276 8277out: 8278 FREE(rp, M_TEMP); 8279 return result; 8280} 8281 8282static void 8283vnode_resolver_release(vnode_resolve_t rp) 8284{ 8285 /* 8286 * Give them a chance to free any private data 8287 */ 8288 if (rp->vr_data && rp->vr_reclaim_func) { 8289 rp->vr_reclaim_func(NULLVP, rp->vr_data); 8290 } 8291 8292 lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp); 8293 FREE(rp, M_TEMP); 8294 8295} 8296 8297/* Called after the vnode has been drained */ 8298static void 8299vnode_resolver_detach(vnode_t vp) 8300{ 8301 vnode_resolve_t rp; 8302 mount_t mp; 8303 8304 mp = vnode_mount(vp); 8305 8306 vnode_lock(vp); 8307 rp = vp->v_resolve; 8308 vp->v_resolve = NULL; 8309 vnode_unlock(vp); 8310 8311 if ((rp->vr_flags & VNT_EXTERNAL) != 0) { 8312 vnode_rele_ext(vp, O_EVTONLY, 1); 8313 } 8314 8315 vnode_resolver_release(rp); 8316 8317 /* Keep count of active trigger vnodes per mount */ 8318 OSAddAtomic(-1, &mp->mnt_numtriggers); 8319} 8320 8321/* 8322 * Pathname operations that don't trigger a mount for trigger vnodes 8323 */ 8324static const u_int64_t ignorable_pathops_mask = 8325 1LL << OP_MOUNT | 8326 1LL << OP_UNMOUNT | 8327 1LL << OP_STATFS | 8328 1LL << OP_ACCESS | 8329 1LL << OP_GETATTR | 8330 1LL << OP_LISTXATTR; 8331 8332int 8333vfs_istraditionaltrigger(enum path_operation op, const struct componentname *cnp) 8334{ 8335 if (cnp->cn_flags & ISLASTCN) 8336 return ((1LL << op) & ignorable_pathops_mask) == 0; 8337 else 8338 return (1); 8339} 8340 8341__private_extern__ 8342void 8343vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx) 8344{ 8345 vnode_resolve_t rp; 8346 
resolver_result_t result; 8347 enum resolver_status status; 8348 uint32_t seq; 8349 8350 if ((vp->v_resolve == NULL) || 8351 (vp->v_resolve->vr_rearm_func == NULL) || 8352 (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) { 8353 return; 8354 } 8355 8356 rp = vp->v_resolve; 8357 lck_mtx_lock(&rp->vr_lock); 8358 8359 /* 8360 * Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes. 8361 */ 8362 if (rp->vr_flags & VNT_VFS_UNMOUNTED) { 8363 lck_mtx_unlock(&rp->vr_lock); 8364 return; 8365 } 8366 8367 /* Check if this vnode is already armed */ 8368 if ((rp->vr_flags & VNT_RESOLVED) == 0) { 8369 lck_mtx_unlock(&rp->vr_lock); 8370 return; 8371 } 8372 8373 lck_mtx_unlock(&rp->vr_lock); 8374 8375 result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx); 8376 status = vfs_resolver_status(result); 8377 seq = vfs_resolver_sequence(result); 8378 8379 lck_mtx_lock(&rp->vr_lock); 8380 if (seq > rp->vr_lastseq) { 8381 if (status == RESOLVER_UNRESOLVED) 8382 rp->vr_flags &= ~VNT_RESOLVED; 8383 rp->vr_lastseq = seq; 8384 } 8385 lck_mtx_unlock(&rp->vr_lock); 8386} 8387 8388__private_extern__ 8389int 8390vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx) 8391{ 8392 vnode_resolve_t rp; 8393 enum path_operation op; 8394 resolver_result_t result; 8395 enum resolver_status status; 8396 uint32_t seq; 8397 8398 /* Only trigger on topmost vnodes */ 8399 if ((vp->v_resolve == NULL) || 8400 (vp->v_resolve->vr_resolve_func == NULL) || 8401 (vp->v_mountedhere != NULL)) { 8402 return (0); 8403 } 8404 8405 rp = vp->v_resolve; 8406 lck_mtx_lock(&rp->vr_lock); 8407 8408 /* Check if this vnode is already resolved */ 8409 if (rp->vr_flags & VNT_RESOLVED) { 8410 lck_mtx_unlock(&rp->vr_lock); 8411 return (0); 8412 } 8413 8414 lck_mtx_unlock(&rp->vr_lock); 8415 8416 /* 8417 * XXX 8418 * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) 8419 * is there anyway to know this??? 
8420 * there can also be other legitimate lookups in parallel 8421 * 8422 * XXX - should we call this on a separate thread with a timeout? 8423 * 8424 * XXX - should we use ISLASTCN to pick the op value??? Perhaps only leafs should 8425 * get the richer set and non-leafs should get generic OP_LOOKUP? TBD 8426 */ 8427 op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op: OP_LOOKUP; 8428 8429 result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx); 8430 status = vfs_resolver_status(result); 8431 seq = vfs_resolver_sequence(result); 8432 8433 lck_mtx_lock(&rp->vr_lock); 8434 if (seq > rp->vr_lastseq) { 8435 if (status == RESOLVER_RESOLVED) 8436 rp->vr_flags |= VNT_RESOLVED; 8437 rp->vr_lastseq = seq; 8438 } 8439 lck_mtx_unlock(&rp->vr_lock); 8440 8441 /* On resolver errors, propagate the error back up */ 8442 return (status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0); 8443} 8444 8445static int 8446vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx) 8447{ 8448 vnode_resolve_t rp; 8449 resolver_result_t result; 8450 enum resolver_status status; 8451 uint32_t seq; 8452 8453 if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) { 8454 return (0); 8455 } 8456 8457 rp = vp->v_resolve; 8458 lck_mtx_lock(&rp->vr_lock); 8459 8460 /* Check if this vnode is already resolved */ 8461 if ((rp->vr_flags & VNT_RESOLVED) == 0) { 8462 printf("vnode_trigger_unresolve: not currently resolved\n"); 8463 lck_mtx_unlock(&rp->vr_lock); 8464 return (0); 8465 } 8466 8467 rp->vr_flags |= VNT_VFS_UNMOUNTED; 8468 8469 lck_mtx_unlock(&rp->vr_lock); 8470 8471 /* 8472 * XXX 8473 * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) 8474 * there can also be other legitimate lookups in parallel 8475 * 8476 * XXX - should we call this on a separate thread with a timeout? 
8477 */ 8478 8479 result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx); 8480 status = vfs_resolver_status(result); 8481 seq = vfs_resolver_sequence(result); 8482 8483 lck_mtx_lock(&rp->vr_lock); 8484 if (seq > rp->vr_lastseq) { 8485 if (status == RESOLVER_UNRESOLVED) 8486 rp->vr_flags &= ~VNT_RESOLVED; 8487 rp->vr_lastseq = seq; 8488 } 8489 rp->vr_flags &= ~VNT_VFS_UNMOUNTED; 8490 lck_mtx_unlock(&rp->vr_lock); 8491 8492 /* On resolver errors, propagate the error back up */ 8493 return (status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0); 8494} 8495 8496static int 8497triggerisdescendant(mount_t mp, mount_t rmp) 8498{ 8499 int match = FALSE; 8500 8501 /* 8502 * walk up vnode covered chain looking for a match 8503 */ 8504 name_cache_lock_shared(); 8505 8506 while (1) { 8507 vnode_t vp; 8508 8509 /* did we encounter "/" ? */ 8510 if (mp->mnt_flag & MNT_ROOTFS) 8511 break; 8512 8513 vp = mp->mnt_vnodecovered; 8514 if (vp == NULLVP) 8515 break; 8516 8517 mp = vp->v_mount; 8518 if (mp == rmp) { 8519 match = TRUE; 8520 break; 8521 } 8522 } 8523 8524 name_cache_unlock(); 8525 8526 return (match); 8527} 8528 8529struct trigger_unmount_info { 8530 vfs_context_t ctx; 8531 mount_t top_mp; 8532 vnode_t trigger_vp; 8533 mount_t trigger_mp; 8534 uint32_t trigger_vid; 8535 int flags; 8536}; 8537 8538static int 8539trigger_unmount_callback(mount_t mp, void * arg) 8540{ 8541 struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg; 8542 boolean_t mountedtrigger = FALSE; 8543 8544 /* 8545 * When we encounter the top level mount we're done 8546 */ 8547 if (mp == infop->top_mp) 8548 return (VFS_RETURNED_DONE); 8549 8550 if ((mp->mnt_vnodecovered == NULL) || 8551 (vnode_getwithref(mp->mnt_vnodecovered) != 0)) { 8552 return (VFS_RETURNED); 8553 } 8554 8555 if ((mp->mnt_vnodecovered->v_mountedhere == mp) && 8556 (mp->mnt_vnodecovered->v_resolve != NULL) && 8557 (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) { 8558 mountedtrigger = TRUE; 8559 
} 8560 vnode_put(mp->mnt_vnodecovered); 8561 8562 /* 8563 * When we encounter a mounted trigger, check if its under the top level mount 8564 */ 8565 if ( !mountedtrigger || !triggerisdescendant(mp, infop->top_mp) ) 8566 return (VFS_RETURNED); 8567 8568 /* 8569 * Process any pending nested mount (now that its not referenced) 8570 */ 8571 if ((infop->trigger_vp != NULLVP) && 8572 (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) { 8573 vnode_t vp = infop->trigger_vp; 8574 int error; 8575 8576 infop->trigger_vp = NULLVP; 8577 8578 if (mp == vp->v_mountedhere) { 8579 vnode_put(vp); 8580 printf("trigger_unmount_callback: unexpected match '%s'\n", 8581 mp->mnt_vfsstat.f_mntonname); 8582 return (VFS_RETURNED); 8583 } 8584 if (infop->trigger_mp != vp->v_mountedhere) { 8585 vnode_put(vp); 8586 printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n", 8587 infop->trigger_mp, vp->v_mountedhere); 8588 goto savenext; 8589 } 8590 8591 error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx); 8592 vnode_put(vp); 8593 if (error) { 8594 printf("unresolving: '%s', err %d\n", 8595 vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname : 8596 "???", error); 8597 return (VFS_RETURNED_DONE); /* stop iteration on errors */ 8598 } 8599 } 8600savenext: 8601 /* 8602 * We can't call resolver here since we hold a mount iter 8603 * ref on mp so save its covered vp for later processing 8604 */ 8605 infop->trigger_vp = mp->mnt_vnodecovered; 8606 if ((infop->trigger_vp != NULLVP) && 8607 (vnode_getwithref(infop->trigger_vp) == 0)) { 8608 if (infop->trigger_vp->v_mountedhere == mp) { 8609 infop->trigger_vid = infop->trigger_vp->v_id; 8610 infop->trigger_mp = mp; 8611 } 8612 vnode_put(infop->trigger_vp); 8613 } 8614 8615 return (VFS_RETURNED); 8616} 8617 8618/* 8619 * Attempt to unmount any trigger mounts nested underneath a mount. 8620 * This is a best effort attempt and no retries are performed here. 
8621 * 8622 * Note: mp->mnt_rwlock is held exclusively on entry (so be carefull) 8623 */ 8624__private_extern__ 8625void 8626vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx) 8627{ 8628 struct trigger_unmount_info info; 8629 8630 /* Must have trigger vnodes */ 8631 if (mp->mnt_numtriggers == 0) { 8632 return; 8633 } 8634 /* Avoid recursive requests (by checking covered vnode) */ 8635 if ((mp->mnt_vnodecovered != NULL) && 8636 (vnode_getwithref(mp->mnt_vnodecovered) == 0)) { 8637 boolean_t recursive = FALSE; 8638 8639 if ((mp->mnt_vnodecovered->v_mountedhere == mp) && 8640 (mp->mnt_vnodecovered->v_resolve != NULL) && 8641 (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) { 8642 recursive = TRUE; 8643 } 8644 vnode_put(mp->mnt_vnodecovered); 8645 if (recursive) 8646 return; 8647 } 8648 8649 /* 8650 * Attempt to unmount any nested trigger mounts (best effort) 8651 */ 8652 info.ctx = ctx; 8653 info.top_mp = mp; 8654 info.trigger_vp = NULLVP; 8655 info.trigger_vid = 0; 8656 info.trigger_mp = NULL; 8657 info.flags = flags; 8658 8659 (void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info); 8660 8661 /* 8662 * Process remaining nested mount (now that its not referenced) 8663 */ 8664 if ((info.trigger_vp != NULLVP) && 8665 (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) { 8666 vnode_t vp = info.trigger_vp; 8667 8668 if (info.trigger_mp == vp->v_mountedhere) { 8669 (void) vnode_trigger_unresolve(vp, flags, ctx); 8670 } 8671 vnode_put(vp); 8672 } 8673} 8674 8675int 8676vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx) 8677{ 8678 struct nameidata nd; 8679 int res; 8680 vnode_t rvp, vp; 8681 struct vnode_trigger_param vtp; 8682 8683 /* 8684 * Must be called for trigger callback, wherein rwlock is held 8685 */ 8686 lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD); 8687 8688 TRIG_LOG("Adding trigger at %s\n", relpath); 8689 TRIG_LOG("Trying VFS_ROOT\n"); 8690 8691 
/* 8692 * We do a lookup starting at the root of the mountpoint, unwilling 8693 * to cross into other mountpoints. 8694 */ 8695 res = VFS_ROOT(mp, &rvp, ctx); 8696 if (res != 0) { 8697 goto out; 8698 } 8699 8700 TRIG_LOG("Trying namei\n"); 8701 8702 NDINIT(&nd, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE, 8703 CAST_USER_ADDR_T(relpath), ctx); 8704 nd.ni_dvp = rvp; 8705 res = namei(&nd); 8706 if (res != 0) { 8707 vnode_put(rvp); 8708 goto out; 8709 } 8710 8711 vp = nd.ni_vp; 8712 nameidone(&nd); 8713 vnode_put(rvp); 8714 8715 TRIG_LOG("Trying vnode_resolver_create()\n"); 8716 8717 /* 8718 * Set up blob. vnode_create() takes a larger structure 8719 * with creation info, and we needed something different 8720 * for this case. One needs to win, or we need to munge both; 8721 * vnode_create() wins. 8722 */ 8723 bzero(&vtp, sizeof(vtp)); 8724 vtp.vnt_resolve_func = vtip->vti_resolve_func; 8725 vtp.vnt_unresolve_func = vtip->vti_unresolve_func; 8726 vtp.vnt_rearm_func = vtip->vti_rearm_func; 8727 vtp.vnt_reclaim_func = vtip->vti_reclaim_func; 8728 vtp.vnt_reclaim_func = vtip->vti_reclaim_func; 8729 vtp.vnt_data = vtip->vti_data; 8730 vtp.vnt_flags = vtip->vti_flags; 8731 8732 res = vnode_resolver_create(mp, vp, &vtp, TRUE); 8733 vnode_put(vp); 8734out: 8735 TRIG_LOG("Returning %d\n", res); 8736 return res; 8737} 8738 8739#endif /* CONFIG_TRIGGERS */ 8740