zfs_vnops.c revision 302721
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28/* Portions Copyright 2007 Jeremy Teo */ 29/* Portions Copyright 2010 Robert Milkowski */ 30 31#include <sys/types.h> 32#include <sys/param.h> 33#include <sys/time.h> 34#include <sys/systm.h> 35#include <sys/sysmacros.h> 36#include <sys/resource.h> 37#include <sys/vfs.h> 38#include <sys/vm.h> 39#include <sys/vnode.h> 40#include <sys/file.h> 41#include <sys/stat.h> 42#include <sys/kmem.h> 43#include <sys/taskq.h> 44#include <sys/uio.h> 45#include <sys/atomic.h> 46#include <sys/namei.h> 47#include <sys/mman.h> 48#include <sys/cmn_err.h> 49#include <sys/errno.h> 50#include <sys/unistd.h> 51#include <sys/zfs_dir.h> 52#include <sys/zfs_ioctl.h> 53#include <sys/fs/zfs.h> 54#include <sys/dmu.h> 55#include <sys/dmu_objset.h> 56#include <sys/spa.h> 57#include <sys/txg.h> 58#include <sys/dbuf.h> 59#include <sys/zap.h> 60#include <sys/sa.h> 61#include <sys/dirent.h> 62#include <sys/policy.h> 63#include <sys/sunddi.h> 64#include <sys/filio.h> 65#include <sys/sid.h> 66#include <sys/zfs_ctldir.h> 67#include <sys/zfs_fuid.h> 68#include <sys/zfs_sa.h> 69#include <sys/dnlc.h> 70#include <sys/zfs_rlock.h> 71#include <sys/extdirent.h> 72#include <sys/kidmap.h> 73#include <sys/bio.h> 74#include <sys/buf.h> 75#include <sys/sched.h> 76#include <sys/acl.h> 77#include <vm/vm_param.h> 78 79/* 80 * Programming rules. 81 * 82 * Each vnode op performs some logical unit of work. To do this, the ZPL must 83 * properly lock its in-core state, create a DMU transaction, do the work, 84 * record this work in the intent log (ZIL), commit the DMU transaction, 85 * and wait for the intent log to commit if it is a synchronous operation. 86 * Moreover, the vnode ops must work in both normal and log replay context. 87 * The ordering of events is important to avoid deadlocks and references 88 * to freed memory. The example below illustrates the following Big Rules: 89 * 90 * (1) A check must be made in each zfs thread for a mounted file system. 91 * This is done avoiding races using ZFS_ENTER(zfsvfs). 92 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 93 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 94 * can return EIO from the calling function. 95 * 96 * (2) VN_RELE() should always be the last thing except for zil_commit() 97 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 98 * First, if it's the last reference, the vnode/znode 99 * can be freed, so the zp may point to freed memory. Second, the last 100 * reference will call zfs_zinactive(), which may induce a lot of work -- 101 * pushing cached pages (which acquires range locks) and syncing out 102 * cached atime changes. Third, zfs_zinactive() may require a new tx, 103 * which could deadlock the system if you were already holding one. 104 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 105 * 106 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 107 * as they can span dmu_tx_assign() calls. 108 * 109 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 110 * dmu_tx_assign(). This is critical because we don't want to block 111 * while holding locks. 112 * 113 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This 114 * reduces lock contention and CPU usage when we must wait (note that if 115 * throughput is constrained by the storage, nearly every transaction 116 * must wait). 117 * 118 * Note, in particular, that if a lock is sometimes acquired before 119 * the tx assigns, and sometimes after (e.g. z_lock), then failing 120 * to use a non-blocking assign can deadlock the system. The scenario: 121 * 122 * Thread A has grabbed a lock before calling dmu_tx_assign(). 123 * Thread B is in an already-assigned tx, and blocks for this lock. 124 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 125 * forever, because the previous txg can't quiesce until B's tx commits. 126 * 127 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 128 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 129 * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, 130 * to indicate that this operation has already called dmu_tx_wait(). 131 * This will ensure that we don't retry forever, waiting a short bit 132 * each time. 133 * 134 * (5) If the operation succeeded, generate the intent log entry for it 135 * before dropping locks. This ensures that the ordering of events 136 * in the intent log matches the order in which they actually occurred. 137 * During ZIL replay the zfs_log_* functions will update the sequence 138 * number to indicate the zil transaction has replayed. 139 * 140 * (6) At the end of each vnode op, the DMU tx must always commit, 141 * regardless of whether there were any errors. 142 * 143 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 144 * to ensure that synchronous semantics are provided when necessary. 145 * 146 * In general, this is how things should be ordered in each vnode op: 147 * 148 * ZFS_ENTER(zfsvfs); // exit if unmounted 149 * top: 150 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) 151 * rw_enter(...); // grab any other locks you need 152 * tx = dmu_tx_create(...); // get DMU tx 153 * dmu_tx_hold_*(); // hold each object you might modify 154 * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 155 * if (error) { 156 * rw_exit(...); // drop locks 157 * zfs_dirent_unlock(dl); // unlock directory entry 158 * VN_RELE(...); // release held vnodes 159 * if (error == ERESTART) { 160 * waited = B_TRUE; 161 * dmu_tx_wait(tx); 162 * dmu_tx_abort(tx); 163 * goto top; 164 * } 165 * dmu_tx_abort(tx); // abort DMU tx 166 * ZFS_EXIT(zfsvfs); // finished in zfs 167 * return (error); // really out of space 168 * } 169 * error = do_real_work(); // do whatever this VOP does 170 * if (error == 0) 171 * zfs_log_*(...); // on success, make ZIL entry 172 * dmu_tx_commit(tx); // commit DMU tx -- error or not 173 * rw_exit(...); // drop locks 174 * zfs_dirent_unlock(dl); // unlock directory entry 175 * VN_RELE(...); // release held vnodes 176 * zil_commit(zilog, foid); // synchronous when necessary 177 * ZFS_EXIT(zfsvfs); // finished in zfs 178 * return (error); // done, report error 179 */ 180 181/* ARGSUSED */ 182static int 183zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 184{ 185 znode_t *zp = VTOZ(*vpp); 186 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 187 188 ZFS_ENTER(zfsvfs); 189 ZFS_VERIFY_ZP(zp); 190 191 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && 192 ((flag & FAPPEND) == 0)) { 193 ZFS_EXIT(zfsvfs); 194 return (SET_ERROR(EPERM)); 195 } 196 197 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 198 ZTOV(zp)->v_type == VREG && 199 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { 200 if (fs_vscan(*vpp, cr, 0) != 0) { 201 ZFS_EXIT(zfsvfs); 202 return (SET_ERROR(EACCES)); 203 } 204 } 205 206 /* Keep a count of the synchronous opens in the znode */ 207 if (flag & (FSYNC | FDSYNC)) 208 atomic_inc_32(&zp->z_sync_cnt); 209 210 ZFS_EXIT(zfsvfs); 211 return (0); 212} 213 214/* ARGSUSED */ 215static int 216zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 217 caller_context_t *ct) 218{ 219 znode_t *zp = VTOZ(vp); 220 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 221 222 /* 223 * Clean up any locks held by this process on the vp. 224 */ 225 cleanlocks(vp, ddi_get_pid(), 0); 226 cleanshares(vp, ddi_get_pid()); 227 228 ZFS_ENTER(zfsvfs); 229 ZFS_VERIFY_ZP(zp); 230 231 /* Decrement the synchronous opens in the znode */ 232 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 233 atomic_dec_32(&zp->z_sync_cnt); 234 235 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 236 ZTOV(zp)->v_type == VREG && 237 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) 238 VERIFY(fs_vscan(vp, cr, 1) == 0); 239 240 ZFS_EXIT(zfsvfs); 241 return (0); 242} 243 244/* 245 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 246 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 247 */ 248static int 249zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) 250{ 251 znode_t *zp = VTOZ(vp); 252 uint64_t noff = (uint64_t)*off; /* new offset */ 253 uint64_t file_sz; 254 int error; 255 boolean_t hole; 256 257 file_sz = zp->z_size; 258 if (noff >= file_sz) { 259 return (SET_ERROR(ENXIO)); 260 } 261 262 if (cmd == _FIO_SEEK_HOLE) 263 hole = B_TRUE; 264 else 265 hole = B_FALSE; 266 267 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 268 269 if (error == ESRCH) 270 return (SET_ERROR(ENXIO)); 271 272 /* 273 * We could find a hole that begins after the logical end-of-file, 274 * because dmu_offset_next() only works on whole blocks. If the 275 * EOF falls mid-block, then indicate that the "virtual hole" 276 * at the end of the file begins at the logical EOF, rather than 277 * at the end of the last block. 278 */ 279 if (noff > file_sz) { 280 ASSERT(hole); 281 noff = file_sz; 282 } 283 284 if (noff < *off) 285 return (error); 286 *off = noff; 287 return (error); 288} 289 290/* ARGSUSED */ 291static int 292zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, 293 int *rvalp, caller_context_t *ct) 294{ 295 offset_t off; 296 offset_t ndata; 297 dmu_object_info_t doi; 298 int error; 299 zfsvfs_t *zfsvfs; 300 znode_t *zp; 301 302 switch (com) { 303 case _FIOFFS: 304 { 305 return (0); 306 307 /* 308 * The following two ioctls are used by bfu. Faking out, 309 * necessary to avoid bfu errors. 310 */ 311 } 312 case _FIOGDIO: 313 case _FIOSDIO: 314 { 315 return (0); 316 } 317 318 case _FIO_SEEK_DATA: 319 case _FIO_SEEK_HOLE: 320 { 321#ifdef illumos 322 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 323 return (SET_ERROR(EFAULT)); 324#else 325 off = *(offset_t *)data; 326#endif 327 zp = VTOZ(vp); 328 zfsvfs = zp->z_zfsvfs; 329 ZFS_ENTER(zfsvfs); 330 ZFS_VERIFY_ZP(zp); 331 332 /* offset parameter is in/out */ 333 error = zfs_holey(vp, com, &off); 334 ZFS_EXIT(zfsvfs); 335 if (error) 336 return (error); 337#ifdef illumos 338 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 339 return (SET_ERROR(EFAULT)); 340#else 341 *(offset_t *)data = off; 342#endif 343 return (0); 344 } 345#ifdef illumos 346 case _FIO_COUNT_FILLED: 347 { 348 /* 349 * _FIO_COUNT_FILLED adds a new ioctl command which 350 * exposes the number of filled blocks in a 351 * ZFS object. 352 */ 353 zp = VTOZ(vp); 354 zfsvfs = zp->z_zfsvfs; 355 ZFS_ENTER(zfsvfs); 356 ZFS_VERIFY_ZP(zp); 357 358 /* 359 * Wait for all dirty blocks for this object 360 * to get synced out to disk, and the DMU info 361 * updated. 362 */ 363 error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); 364 if (error) { 365 ZFS_EXIT(zfsvfs); 366 return (error); 367 } 368 369 /* 370 * Retrieve fill count from DMU object. 371 */ 372 error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); 373 if (error) { 374 ZFS_EXIT(zfsvfs); 375 return (error); 376 } 377 378 ndata = doi.doi_fill_count; 379 380 ZFS_EXIT(zfsvfs); 381 if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) 382 return (SET_ERROR(EFAULT)); 383 return (0); 384 } 385#endif 386 } 387 return (SET_ERROR(ENOTTY)); 388} 389 390static vm_page_t 391page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) 392{ 393 vm_object_t obj; 394 vm_page_t pp; 395 int64_t end; 396 397 /* 398 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE 399 * aligned boundaries, if the range is not aligned. As a result a 400 * DEV_BSIZE subrange with partially dirty data may get marked as clean. 401 * It may happen that all DEV_BSIZE subranges are marked clean and thus 402 * the whole page would be considred clean despite have some dirty data. 403 * For this reason we should shrink the range to DEV_BSIZE aligned 404 * boundaries before calling vm_page_clear_dirty. 405 */ 406 end = rounddown2(off + nbytes, DEV_BSIZE); 407 off = roundup2(off, DEV_BSIZE); 408 nbytes = end - off; 409 410 obj = vp->v_object; 411 zfs_vmobject_assert_wlocked(obj); 412 413 for (;;) { 414 if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && 415 pp->valid) { 416 if (vm_page_xbusied(pp)) { 417 /* 418 * Reference the page before unlocking and 419 * sleeping so that the page daemon is less 420 * likely to reclaim it. 421 */ 422 vm_page_reference(pp); 423 vm_page_lock(pp); 424 zfs_vmobject_wunlock(obj); 425 vm_page_busy_sleep(pp, "zfsmwb"); 426 zfs_vmobject_wlock(obj); 427 continue; 428 } 429 vm_page_sbusy(pp); 430 } else if (pp == NULL) { 431 pp = vm_page_alloc(obj, OFF_TO_IDX(start), 432 VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED | 433 VM_ALLOC_SBUSY); 434 } else { 435 ASSERT(pp != NULL && !pp->valid); 436 pp = NULL; 437 } 438 439 if (pp != NULL) { 440 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); 441 vm_object_pip_add(obj, 1); 442 pmap_remove_write(pp); 443 if (nbytes != 0) 444 vm_page_clear_dirty(pp, off, nbytes); 445 } 446 break; 447 } 448 return (pp); 449} 450 451static void 452page_unbusy(vm_page_t pp) 453{ 454 455 vm_page_sunbusy(pp); 456 vm_object_pip_subtract(pp->object, 1); 457} 458 459static vm_page_t 460page_hold(vnode_t *vp, int64_t start) 461{ 462 vm_object_t obj; 463 vm_page_t pp; 464 465 obj = vp->v_object; 466 zfs_vmobject_assert_wlocked(obj); 467 468 for (;;) { 469 if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && 470 pp->valid) { 471 if (vm_page_xbusied(pp)) { 472 /* 473 * Reference the page before unlocking and 474 * sleeping so that the page daemon is less 475 * likely to reclaim it. 476 */ 477 vm_page_reference(pp); 478 vm_page_lock(pp); 479 zfs_vmobject_wunlock(obj); 480 vm_page_busy_sleep(pp, "zfsmwb"); 481 zfs_vmobject_wlock(obj); 482 continue; 483 } 484 485 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); 486 vm_page_lock(pp); 487 vm_page_hold(pp); 488 vm_page_unlock(pp); 489 490 } else 491 pp = NULL; 492 break; 493 } 494 return (pp); 495} 496 497static void 498page_unhold(vm_page_t pp) 499{ 500 501 vm_page_lock(pp); 502 vm_page_unhold(pp); 503 vm_page_unlock(pp); 504} 505 506/* 507 * When a file is memory mapped, we must keep the IO data synchronized 508 * between the DMU cache and the memory mapped pages. What this means: 509 * 510 * On Write: If we find a memory mapped page, we write to *both* 511 * the page and the dmu buffer. 512 */ 513static void 514update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, 515 int segflg, dmu_tx_t *tx) 516{ 517 vm_object_t obj; 518 struct sf_buf *sf; 519 caddr_t va; 520 int off; 521 522 ASSERT(segflg != UIO_NOCOPY); 523 ASSERT(vp->v_mount != NULL); 524 obj = vp->v_object; 525 ASSERT(obj != NULL); 526 527 off = start & PAGEOFFSET; 528 zfs_vmobject_wlock(obj); 529 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 530 vm_page_t pp; 531 int nbytes = imin(PAGESIZE - off, len); 532 533 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { 534 zfs_vmobject_wunlock(obj); 535 536 va = zfs_map_page(pp, &sf); 537 (void) dmu_read(os, oid, start+off, nbytes, 538 va+off, DMU_READ_PREFETCH);; 539 zfs_unmap_page(sf); 540 541 zfs_vmobject_wlock(obj); 542 page_unbusy(pp); 543 } 544 len -= nbytes; 545 off = 0; 546 } 547 vm_object_pip_wakeupn(obj, 0); 548 zfs_vmobject_wunlock(obj); 549} 550 551/* 552 * Read with UIO_NOCOPY flag means that sendfile(2) requests 553 * ZFS to populate a range of page cache pages with data. 554 * 555 * NOTE: this function could be optimized to pre-allocate 556 * all pages in advance, drain exclusive busy on all of them, 557 * map them into contiguous KVA region and populate them 558 * in one single dmu_read() call. 559 */ 560static int 561mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) 562{ 563 znode_t *zp = VTOZ(vp); 564 objset_t *os = zp->z_zfsvfs->z_os; 565 struct sf_buf *sf; 566 vm_object_t obj; 567 vm_page_t pp; 568 int64_t start; 569 caddr_t va; 570 int len = nbytes; 571 int off; 572 int error = 0; 573 574 ASSERT(uio->uio_segflg == UIO_NOCOPY); 575 ASSERT(vp->v_mount != NULL); 576 obj = vp->v_object; 577 ASSERT(obj != NULL); 578 ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); 579 580 zfs_vmobject_wlock(obj); 581 for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { 582 int bytes = MIN(PAGESIZE, len); 583 584 pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | 585 VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); 586 if (pp->valid == 0) { 587 zfs_vmobject_wunlock(obj); 588 va = zfs_map_page(pp, &sf); 589 error = dmu_read(os, zp->z_id, start, bytes, va, 590 DMU_READ_PREFETCH); 591 if (bytes != PAGESIZE && error == 0) 592 bzero(va + bytes, PAGESIZE - bytes); 593 zfs_unmap_page(sf); 594 zfs_vmobject_wlock(obj); 595 vm_page_sunbusy(pp); 596 vm_page_lock(pp); 597 if (error) { 598 if (pp->wire_count == 0 && pp->valid == 0 && 599 !vm_page_busied(pp)) 600 vm_page_free(pp); 601 } else { 602 pp->valid = VM_PAGE_BITS_ALL; 603 vm_page_activate(pp); 604 } 605 vm_page_unlock(pp); 606 } else { 607 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); 608 vm_page_sunbusy(pp); 609 } 610 if (error) 611 break; 612 uio->uio_resid -= bytes; 613 uio->uio_offset += bytes; 614 len -= bytes; 615 } 616 zfs_vmobject_wunlock(obj); 617 return (error); 618} 619 620/* 621 * When a file is memory mapped, we must keep the IO data synchronized 622 * between the DMU cache and the memory mapped pages. What this means: 623 * 624 * On Read: We "read" preferentially from memory mapped pages, 625 * else we default from the dmu buffer. 626 * 627 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 628 * the file is memory mapped. 629 */ 630static int 631mappedread(vnode_t *vp, int nbytes, uio_t *uio) 632{ 633 znode_t *zp = VTOZ(vp); 634 vm_object_t obj; 635 int64_t start; 636 caddr_t va; 637 int len = nbytes; 638 int off; 639 int error = 0; 640 641 ASSERT(vp->v_mount != NULL); 642 obj = vp->v_object; 643 ASSERT(obj != NULL); 644 645 start = uio->uio_loffset; 646 off = start & PAGEOFFSET; 647 zfs_vmobject_wlock(obj); 648 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 649 vm_page_t pp; 650 uint64_t bytes = MIN(PAGESIZE - off, len); 651 652 if (pp = page_hold(vp, start)) { 653 struct sf_buf *sf; 654 caddr_t va; 655 656 zfs_vmobject_wunlock(obj); 657 va = zfs_map_page(pp, &sf); 658#ifdef illumos 659 error = uiomove(va + off, bytes, UIO_READ, uio); 660#else 661 error = vn_io_fault_uiomove(va + off, bytes, uio); 662#endif 663 zfs_unmap_page(sf); 664 zfs_vmobject_wlock(obj); 665 page_unhold(pp); 666 } else { 667 zfs_vmobject_wunlock(obj); 668 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 669 uio, bytes); 670 zfs_vmobject_wlock(obj); 671 } 672 len -= bytes; 673 off = 0; 674 if (error) 675 break; 676 } 677 zfs_vmobject_wunlock(obj); 678 return (error); 679} 680 681offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 682 683/* 684 * Read bytes from specified file into supplied buffer. 685 * 686 * IN: vp - vnode of file to be read from. 687 * uio - structure supplying read location, range info, 688 * and return buffer. 689 * ioflag - SYNC flags; used to provide FRSYNC semantics. 690 * cr - credentials of caller. 691 * ct - caller context 692 * 693 * OUT: uio - updated offset and range, buffer filled. 694 * 695 * RETURN: 0 on success, error code on failure. 696 * 697 * Side Effects: 698 * vp - atime updated if byte count > 0 699 */ 700/* ARGSUSED */ 701static int 702zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 703{ 704 znode_t *zp = VTOZ(vp); 705 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 706 ssize_t n, nbytes; 707 int error = 0; 708 rl_t *rl; 709 xuio_t *xuio = NULL; 710 711 ZFS_ENTER(zfsvfs); 712 ZFS_VERIFY_ZP(zp); 713 714 if (zp->z_pflags & ZFS_AV_QUARANTINED) { 715 ZFS_EXIT(zfsvfs); 716 return (SET_ERROR(EACCES)); 717 } 718 719 /* 720 * Validate file offset 721 */ 722 if (uio->uio_loffset < (offset_t)0) { 723 ZFS_EXIT(zfsvfs); 724 return (SET_ERROR(EINVAL)); 725 } 726 727 /* 728 * Fasttrack empty reads 729 */ 730 if (uio->uio_resid == 0) { 731 ZFS_EXIT(zfsvfs); 732 return (0); 733 } 734 735 /* 736 * Check for mandatory locks 737 */ 738 if (MANDMODE(zp->z_mode)) { 739 if (error = chklock(vp, FREAD, 740 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 741 ZFS_EXIT(zfsvfs); 742 return (error); 743 } 744 } 745 746 /* 747 * If we're in FRSYNC mode, sync out this znode before reading it. 748 */ 749 if (zfsvfs->z_log && 750 (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) 751 zil_commit(zfsvfs->z_log, zp->z_id); 752 753 /* 754 * Lock the range against changes. 755 */ 756 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 757 758 /* 759 * If we are reading past end-of-file we can skip 760 * to the end; but we might still need to set atime. 761 */ 762 if (uio->uio_loffset >= zp->z_size) { 763 error = 0; 764 goto out; 765 } 766 767 ASSERT(uio->uio_loffset < zp->z_size); 768 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); 769 770#ifdef illumos 771 if ((uio->uio_extflg == UIO_XUIO) && 772 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { 773 int nblk; 774 int blksz = zp->z_blksz; 775 uint64_t offset = uio->uio_loffset; 776 777 xuio = (xuio_t *)uio; 778 if ((ISP2(blksz))) { 779 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, 780 blksz)) / blksz; 781 } else { 782 ASSERT(offset + n <= blksz); 783 nblk = 1; 784 } 785 (void) dmu_xuio_init(xuio, nblk); 786 787 if (vn_has_cached_data(vp)) { 788 /* 789 * For simplicity, we always allocate a full buffer 790 * even if we only expect to read a portion of a block. 791 */ 792 while (--nblk >= 0) { 793 (void) dmu_xuio_add(xuio, 794 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 795 blksz), 0, blksz); 796 } 797 } 798 } 799#endif /* illumos */ 800 801 while (n > 0) { 802 nbytes = MIN(n, zfs_read_chunk_size - 803 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 804 805#ifdef __FreeBSD__ 806 if (uio->uio_segflg == UIO_NOCOPY) 807 error = mappedread_sf(vp, nbytes, uio); 808 else 809#endif /* __FreeBSD__ */ 810 if (vn_has_cached_data(vp)) { 811 error = mappedread(vp, nbytes, uio); 812 } else { 813 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 814 uio, nbytes); 815 } 816 if (error) { 817 /* convert checksum errors into IO errors */ 818 if (error == ECKSUM) 819 error = SET_ERROR(EIO); 820 break; 821 } 822 823 n -= nbytes; 824 } 825out: 826 zfs_range_unlock(rl); 827 828 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 829 ZFS_EXIT(zfsvfs); 830 return (error); 831} 832 833/* 834 * Write the bytes to a file. 835 * 836 * IN: vp - vnode of file to be written to. 837 * uio - structure supplying write location, range info, 838 * and data buffer. 839 * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is 840 * set if in append mode. 841 * cr - credentials of caller. 842 * ct - caller context (NFS/CIFS fem monitor only) 843 * 844 * OUT: uio - updated offset and range. 845 * 846 * RETURN: 0 on success, error code on failure. 847 * 848 * Timestamps: 849 * vp - ctime|mtime updated if byte count > 0 850 */ 851 852/* ARGSUSED */ 853static int 854zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 855{ 856 znode_t *zp = VTOZ(vp); 857 rlim64_t limit = MAXOFFSET_T; 858 ssize_t start_resid = uio->uio_resid; 859 ssize_t tx_bytes; 860 uint64_t end_size; 861 dmu_tx_t *tx; 862 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 863 zilog_t *zilog; 864 offset_t woff; 865 ssize_t n, nbytes; 866 rl_t *rl; 867 int max_blksz = zfsvfs->z_max_blksz; 868 int error = 0; 869 arc_buf_t *abuf; 870 iovec_t *aiov = NULL; 871 xuio_t *xuio = NULL; 872 int i_iov = 0; 873 int iovcnt = uio->uio_iovcnt; 874 iovec_t *iovp = uio->uio_iov; 875 int write_eof; 876 int count = 0; 877 sa_bulk_attr_t bulk[4]; 878 uint64_t mtime[2], ctime[2]; 879 880 /* 881 * Fasttrack empty write 882 */ 883 n = start_resid; 884 if (n == 0) 885 return (0); 886 887 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 888 limit = MAXOFFSET_T; 889 890 ZFS_ENTER(zfsvfs); 891 ZFS_VERIFY_ZP(zp); 892 893 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 894 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 895 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 896 &zp->z_size, 8); 897 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 898 &zp->z_pflags, 8); 899 900 /* 901 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our 902 * callers might not be able to detect properly that we are read-only, 903 * so check it explicitly here. 904 */ 905 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 906 ZFS_EXIT(zfsvfs); 907 return (SET_ERROR(EROFS)); 908 } 909 910 /* 911 * If immutable or not appending then return EPERM 912 */ 913 if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 914 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 915 (uio->uio_loffset < zp->z_size))) { 916 ZFS_EXIT(zfsvfs); 917 return (SET_ERROR(EPERM)); 918 } 919 920 zilog = zfsvfs->z_log; 921 922 /* 923 * Validate file offset 924 */ 925 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; 926 if (woff < 0) { 927 ZFS_EXIT(zfsvfs); 928 return (SET_ERROR(EINVAL)); 929 } 930 931 /* 932 * Check for mandatory locks before calling zfs_range_lock() 933 * in order to prevent a deadlock with locks set via fcntl(). 934 */ 935 if (MANDMODE((mode_t)zp->z_mode) && 936 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 937 ZFS_EXIT(zfsvfs); 938 return (error); 939 } 940 941#ifdef illumos 942 /* 943 * Pre-fault the pages to ensure slow (eg NFS) pages 944 * don't hold up txg. 945 * Skip this if uio contains loaned arc_buf. 946 */ 947 if ((uio->uio_extflg == UIO_XUIO) && 948 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) 949 xuio = (xuio_t *)uio; 950 else 951 uio_prefaultpages(MIN(n, max_blksz), uio); 952#endif 953 954 /* 955 * If in append mode, set the io offset pointer to eof. 956 */ 957 if (ioflag & FAPPEND) { 958 /* 959 * Obtain an appending range lock to guarantee file append 960 * semantics. We reset the write offset once we have the lock. 961 */ 962 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 963 woff = rl->r_off; 964 if (rl->r_len == UINT64_MAX) { 965 /* 966 * We overlocked the file because this write will cause 967 * the file block size to increase. 968 * Note that zp_size cannot change with this lock held. 969 */ 970 woff = zp->z_size; 971 } 972 uio->uio_loffset = woff; 973 } else { 974 /* 975 * Note that if the file block size will change as a result of 976 * this write, then this range lock will lock the entire file 977 * so that we can re-write the block safely. 978 */ 979 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 980 } 981 982 if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { 983 zfs_range_unlock(rl); 984 ZFS_EXIT(zfsvfs); 985 return (EFBIG); 986 } 987 988 if (woff >= limit) { 989 zfs_range_unlock(rl); 990 ZFS_EXIT(zfsvfs); 991 return (SET_ERROR(EFBIG)); 992 } 993 994 if ((woff + n) > limit || woff > (limit - n)) 995 n = limit - woff; 996 997 /* Will this write extend the file length? */ 998 write_eof = (woff + n > zp->z_size); 999 1000 end_size = MAX(zp->z_size, woff + n); 1001 1002 /* 1003 * Write the file in reasonable size chunks. Each chunk is written 1004 * in a separate transaction; this keeps the intent log records small 1005 * and allows us to do more fine-grained space accounting. 1006 */ 1007 while (n > 0) { 1008 abuf = NULL; 1009 woff = uio->uio_loffset; 1010 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 1011 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 1012 if (abuf != NULL) 1013 dmu_return_arcbuf(abuf); 1014 error = SET_ERROR(EDQUOT); 1015 break; 1016 } 1017 1018 if (xuio && abuf == NULL) { 1019 ASSERT(i_iov < iovcnt); 1020 aiov = &iovp[i_iov]; 1021 abuf = dmu_xuio_arcbuf(xuio, i_iov); 1022 dmu_xuio_clear(xuio, i_iov); 1023 DTRACE_PROBE3(zfs_cp_write, int, i_iov, 1024 iovec_t *, aiov, arc_buf_t *, abuf); 1025 ASSERT((aiov->iov_base == abuf->b_data) || 1026 ((char *)aiov->iov_base - (char *)abuf->b_data + 1027 aiov->iov_len == arc_buf_size(abuf))); 1028 i_iov++; 1029 } else if (abuf == NULL && n >= max_blksz && 1030 woff >= zp->z_size && 1031 P2PHASE(woff, max_blksz) == 0 && 1032 zp->z_blksz == max_blksz) { 1033 /* 1034 * This write covers a full block. "Borrow" a buffer 1035 * from the dmu so that we can fill it before we enter 1036 * a transaction. This avoids the possibility of 1037 * holding up the transaction if the data copy hangs 1038 * up on a pagefault (e.g., from an NFS server mapping). 1039 */ 1040#ifdef illumos 1041 size_t cbytes; 1042#endif 1043 1044 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 1045 max_blksz); 1046 ASSERT(abuf != NULL); 1047 ASSERT(arc_buf_size(abuf) == max_blksz); 1048#ifdef illumos 1049 if (error = uiocopy(abuf->b_data, max_blksz, 1050 UIO_WRITE, uio, &cbytes)) { 1051 dmu_return_arcbuf(abuf); 1052 break; 1053 } 1054 ASSERT(cbytes == max_blksz); 1055#else 1056 ssize_t resid = uio->uio_resid; 1057 error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio); 1058 if (error != 0) { 1059 uio->uio_offset -= resid - uio->uio_resid; 1060 uio->uio_resid = resid; 1061 dmu_return_arcbuf(abuf); 1062 break; 1063 } 1064#endif 1065 } 1066 1067 /* 1068 * Start a transaction. 1069 */ 1070 tx = dmu_tx_create(zfsvfs->z_os); 1071 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1072 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 1073 zfs_sa_upgrade_txholds(tx, zp); 1074 error = dmu_tx_assign(tx, TXG_WAIT); 1075 if (error) { 1076 dmu_tx_abort(tx); 1077 if (abuf != NULL) 1078 dmu_return_arcbuf(abuf); 1079 break; 1080 } 1081 1082 /* 1083 * If zfs_range_lock() over-locked we grow the blocksize 1084 * and then reduce the lock range. This will only happen 1085 * on the first iteration since zfs_range_reduce() will 1086 * shrink down r_len to the appropriate size. 1087 */ 1088 if (rl->r_len == UINT64_MAX) { 1089 uint64_t new_blksz; 1090 1091 if (zp->z_blksz > max_blksz) { 1092 /* 1093 * File's blocksize is already larger than the 1094 * "recordsize" property. Only let it grow to 1095 * the next power of 2. 1096 */ 1097 ASSERT(!ISP2(zp->z_blksz)); 1098 new_blksz = MIN(end_size, 1099 1 << highbit64(zp->z_blksz)); 1100 } else { 1101 new_blksz = MIN(end_size, max_blksz); 1102 } 1103 zfs_grow_blocksize(zp, new_blksz, tx); 1104 zfs_range_reduce(rl, woff, n); 1105 } 1106 1107 /* 1108 * XXX - should we really limit each write to z_max_blksz? 1109 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 1110 */ 1111 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 1112 1113 if (woff + nbytes > zp->z_size) 1114 vnode_pager_setsize(vp, woff + nbytes); 1115 1116 if (abuf == NULL) { 1117 tx_bytes = uio->uio_resid; 1118 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), 1119 uio, nbytes, tx); 1120 tx_bytes -= uio->uio_resid; 1121 } else { 1122 tx_bytes = nbytes; 1123 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); 1124 /* 1125 * If this is not a full block write, but we are 1126 * extending the file past EOF and this data starts 1127 * block-aligned, use assign_arcbuf(). Otherwise, 1128 * write via dmu_write(). 1129 */ 1130 if (tx_bytes < max_blksz && (!write_eof || 1131 aiov->iov_base != abuf->b_data)) { 1132 ASSERT(xuio); 1133 dmu_write(zfsvfs->z_os, zp->z_id, woff, 1134 aiov->iov_len, aiov->iov_base, tx); 1135 dmu_return_arcbuf(abuf); 1136 xuio_stat_wbuf_copied(); 1137 } else { 1138 ASSERT(xuio || tx_bytes == max_blksz); 1139 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), 1140 woff, abuf, tx); 1141 } 1142#ifdef illumos 1143 ASSERT(tx_bytes <= uio->uio_resid); 1144 uioskip(uio, tx_bytes); 1145#endif 1146 } 1147 if (tx_bytes && vn_has_cached_data(vp)) { 1148 update_pages(vp, woff, tx_bytes, zfsvfs->z_os, 1149 zp->z_id, uio->uio_segflg, tx); 1150 } 1151 1152 /* 1153 * If we made no progress, we're done. If we made even 1154 * partial progress, update the znode and ZIL accordingly. 1155 */ 1156 if (tx_bytes == 0) { 1157 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 1158 (void *)&zp->z_size, sizeof (uint64_t), tx); 1159 dmu_tx_commit(tx); 1160 ASSERT(error != 0); 1161 break; 1162 } 1163 1164 /* 1165 * Clear Set-UID/Set-GID bits on successful write if not 1166 * privileged and at least one of the excute bits is set. 1167 * 1168 * It would be nice to to this after all writes have 1169 * been done, but that would still expose the ISUID/ISGID 1170 * to another app after the partial write is committed. 1171 * 1172 * Note: we don't call zfs_fuid_map_id() here because 1173 * user 0 is not an ephemeral uid. 1174 */ 1175 mutex_enter(&zp->z_acl_lock); 1176 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | 1177 (S_IXUSR >> 6))) != 0 && 1178 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && 1179 secpolicy_vnode_setid_retain(vp, cr, 1180 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { 1181 uint64_t newmode; 1182 zp->z_mode &= ~(S_ISUID | S_ISGID); 1183 newmode = zp->z_mode; 1184 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), 1185 (void *)&newmode, sizeof (uint64_t), tx); 1186 } 1187 mutex_exit(&zp->z_acl_lock); 1188 1189 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 1190 B_TRUE); 1191 1192 /* 1193 * Update the file size (zp_size) if it has changed; 1194 * account for possible concurrent updates. 1195 */ 1196 while ((end_size = zp->z_size) < uio->uio_loffset) { 1197 (void) atomic_cas_64(&zp->z_size, end_size, 1198 uio->uio_loffset); 1199#ifdef illumos 1200 ASSERT(error == 0); 1201#else 1202 ASSERT(error == 0 || error == EFAULT); 1203#endif 1204 } 1205 /* 1206 * If we are replaying and eof is non zero then force 1207 * the file size to the specified eof. Note, there's no 1208 * concurrency during replay. 1209 */ 1210 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 1211 zp->z_size = zfsvfs->z_replay_eof; 1212 1213 if (error == 0) 1214 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1215 else 1216 (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1217 1218 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); 1219 dmu_tx_commit(tx); 1220 1221 if (error != 0) 1222 break; 1223 ASSERT(tx_bytes == nbytes); 1224 n -= nbytes; 1225 1226#ifdef illumos 1227 if (!xuio && n > 0) 1228 uio_prefaultpages(MIN(n, max_blksz), uio); 1229#endif 1230 } 1231 1232 zfs_range_unlock(rl); 1233 1234 /* 1235 * If we're in replay mode, or we made no progress, return error. 1236 * Otherwise, it's at least a partial write, so it's successful. 1237 */ 1238 if (zfsvfs->z_replay || uio->uio_resid == start_resid) { 1239 ZFS_EXIT(zfsvfs); 1240 return (error); 1241 } 1242 1243#ifdef __FreeBSD__ 1244 /* 1245 * EFAULT means that at least one page of the source buffer was not 1246 * available. VFS will re-try remaining I/O upon this error. 1247 */ 1248 if (error == EFAULT) { 1249 ZFS_EXIT(zfsvfs); 1250 return (error); 1251 } 1252#endif 1253 1254 if (ioflag & (FSYNC | FDSYNC) || 1255 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1256 zil_commit(zilog, zp->z_id); 1257 1258 ZFS_EXIT(zfsvfs); 1259 return (0); 1260} 1261 1262void 1263zfs_get_done(zgd_t *zgd, int error) 1264{ 1265 znode_t *zp = zgd->zgd_private; 1266 objset_t *os = zp->z_zfsvfs->z_os; 1267 1268 if (zgd->zgd_db) 1269 dmu_buf_rele(zgd->zgd_db, zgd); 1270 1271 zfs_range_unlock(zgd->zgd_rl); 1272 1273 /* 1274 * Release the vnode asynchronously as we currently have the 1275 * txg stopped from syncing. 1276 */ 1277 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 1278 1279 if (error == 0 && zgd->zgd_bp) 1280 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 1281 1282 kmem_free(zgd, sizeof (zgd_t)); 1283} 1284 1285#ifdef DEBUG 1286static int zil_fault_io = 0; 1287#endif 1288 1289/* 1290 * Get data to generate a TX_WRITE intent log record. 1291 */ 1292int 1293zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 1294{ 1295 zfsvfs_t *zfsvfs = arg; 1296 objset_t *os = zfsvfs->z_os; 1297 znode_t *zp; 1298 uint64_t object = lr->lr_foid; 1299 uint64_t offset = lr->lr_offset; 1300 uint64_t size = lr->lr_length; 1301 blkptr_t *bp = &lr->lr_blkptr; 1302 dmu_buf_t *db; 1303 zgd_t *zgd; 1304 int error = 0; 1305 1306 ASSERT(zio != NULL); 1307 ASSERT(size != 0); 1308 1309 /* 1310 * Nothing to do if the file has been removed 1311 */ 1312 if (zfs_zget(zfsvfs, object, &zp) != 0) 1313 return (SET_ERROR(ENOENT)); 1314 if (zp->z_unlinked) { 1315 /* 1316 * Release the vnode asynchronously as we currently have the 1317 * txg stopped from syncing. 1318 */ 1319 VN_RELE_ASYNC(ZTOV(zp), 1320 dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 1321 return (SET_ERROR(ENOENT)); 1322 } 1323 1324 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); 1325 zgd->zgd_zilog = zfsvfs->z_log; 1326 zgd->zgd_private = zp; 1327 1328 /* 1329 * Write records come in two flavors: immediate and indirect. 1330 * For small writes it's cheaper to store the data with the 1331 * log record (immediate); for large writes it's cheaper to 1332 * sync the data and get a pointer to it (indirect) so that 1333 * we don't have to write the data twice. 1334 */ 1335 if (buf != NULL) { /* immediate write */ 1336 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); 1337 /* test for truncation needs to be done while range locked */ 1338 if (offset >= zp->z_size) { 1339 error = SET_ERROR(ENOENT); 1340 } else { 1341 error = dmu_read(os, object, offset, size, buf, 1342 DMU_READ_NO_PREFETCH); 1343 } 1344 ASSERT(error == 0 || error == ENOENT); 1345 } else { /* indirect write */ 1346 /* 1347 * Have to lock the whole block to ensure when it's 1348 * written out and it's checksum is being calculated 1349 * that no one can change the data. We need to re-check 1350 * blocksize after we get the lock in case it's changed! 1351 */ 1352 for (;;) { 1353 uint64_t blkoff; 1354 size = zp->z_blksz; 1355 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; 1356 offset -= blkoff; 1357 zgd->zgd_rl = zfs_range_lock(zp, offset, size, 1358 RL_READER); 1359 if (zp->z_blksz == size) 1360 break; 1361 offset += blkoff; 1362 zfs_range_unlock(zgd->zgd_rl); 1363 } 1364 /* test for truncation needs to be done while range locked */ 1365 if (lr->lr_offset >= zp->z_size) 1366 error = SET_ERROR(ENOENT); 1367#ifdef DEBUG 1368 if (zil_fault_io) { 1369 error = SET_ERROR(EIO); 1370 zil_fault_io = 0; 1371 } 1372#endif 1373 if (error == 0) 1374 error = dmu_buf_hold(os, object, offset, zgd, &db, 1375 DMU_READ_NO_PREFETCH); 1376 1377 if (error == 0) { 1378 blkptr_t *obp = dmu_buf_get_blkptr(db); 1379 if (obp) { 1380 ASSERT(BP_IS_HOLE(bp)); 1381 *bp = *obp; 1382 } 1383 1384 zgd->zgd_db = db; 1385 zgd->zgd_bp = bp; 1386 1387 ASSERT(db->db_offset == offset); 1388 ASSERT(db->db_size == size); 1389 1390 error = dmu_sync(zio, lr->lr_common.lrc_txg, 1391 zfs_get_done, zgd); 1392 ASSERT(error || lr->lr_length <= zp->z_blksz); 1393 1394 /* 1395 * On success, we need to wait for the write I/O 1396 * initiated by dmu_sync() to complete before we can 1397 * release this dbuf. We will finish everything up 1398 * in the zfs_get_done() callback. 1399 */ 1400 if (error == 0) 1401 return (0); 1402 1403 if (error == EALREADY) { 1404 lr->lr_common.lrc_txtype = TX_WRITE2; 1405 error = 0; 1406 } 1407 } 1408 } 1409 1410 zfs_get_done(zgd, error); 1411 1412 return (error); 1413} 1414 1415/*ARGSUSED*/ 1416static int 1417zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, 1418 caller_context_t *ct) 1419{ 1420 znode_t *zp = VTOZ(vp); 1421 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1422 int error; 1423 1424 ZFS_ENTER(zfsvfs); 1425 ZFS_VERIFY_ZP(zp); 1426 1427 if (flag & V_ACE_MASK) 1428 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); 1429 else 1430 error = zfs_zaccess_rwx(zp, mode, flag, cr); 1431 1432 ZFS_EXIT(zfsvfs); 1433 return (error); 1434} 1435 1436/* 1437 * If vnode is for a device return a specfs vnode instead. 1438 */ 1439static int 1440specvp_check(vnode_t **vpp, cred_t *cr) 1441{ 1442 int error = 0; 1443 1444 if (IS_DEVVP(*vpp)) { 1445 struct vnode *svp; 1446 1447 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1448 VN_RELE(*vpp); 1449 if (svp == NULL) 1450 error = SET_ERROR(ENOSYS); 1451 *vpp = svp; 1452 } 1453 return (error); 1454} 1455 1456 1457/* 1458 * Lookup an entry in a directory, or an extended attribute directory. 1459 * If it exists, return a held vnode reference for it. 1460 * 1461 * IN: dvp - vnode of directory to search. 1462 * nm - name of entry to lookup. 1463 * pnp - full pathname to lookup [UNUSED]. 1464 * flags - LOOKUP_XATTR set if looking for an attribute. 1465 * rdir - root directory vnode [UNUSED]. 1466 * cr - credentials of caller. 1467 * ct - caller context 1468 * direntflags - directory lookup flags 1469 * realpnp - returned pathname. 1470 * 1471 * OUT: vpp - vnode of located entry, NULL if not found. 1472 * 1473 * RETURN: 0 on success, error code on failure. 1474 * 1475 * Timestamps: 1476 * NA 1477 */ 1478/* ARGSUSED */ 1479static int 1480zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, 1481 int nameiop, cred_t *cr, kthread_t *td, int flags) 1482{ 1483 znode_t *zdp = VTOZ(dvp); 1484 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1485 int error = 0; 1486 int *direntflags = NULL; 1487 void *realpnp = NULL; 1488 1489 /* fast path */ 1490 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { 1491 1492 if (dvp->v_type != VDIR) { 1493 return (SET_ERROR(ENOTDIR)); 1494 } else if (zdp->z_sa_hdl == NULL) { 1495 return (SET_ERROR(EIO)); 1496 } 1497 1498 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { 1499 error = zfs_fastaccesschk_execute(zdp, cr); 1500 if (!error) { 1501 *vpp = dvp; 1502 VN_HOLD(*vpp); 1503 return (0); 1504 } 1505 return (error); 1506 } else { 1507 vnode_t *tvp = dnlc_lookup(dvp, nm); 1508 1509 if (tvp) { 1510 error = zfs_fastaccesschk_execute(zdp, cr); 1511 if (error) { 1512 VN_RELE(tvp); 1513 return (error); 1514 } 1515 if (tvp == DNLC_NO_VNODE) { 1516 VN_RELE(tvp); 1517 return (SET_ERROR(ENOENT)); 1518 } else { 1519 *vpp = tvp; 1520 return (specvp_check(vpp, cr)); 1521 } 1522 } 1523 } 1524 } 1525 1526 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); 1527 1528 ZFS_ENTER(zfsvfs); 1529 ZFS_VERIFY_ZP(zdp); 1530 1531 *vpp = NULL; 1532 1533 if (flags & LOOKUP_XATTR) { 1534#ifdef TODO 1535 /* 1536 * If the xattr property is off, refuse the lookup request. 1537 */ 1538 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1539 ZFS_EXIT(zfsvfs); 1540 return (SET_ERROR(EINVAL)); 1541 } 1542#endif 1543 1544 /* 1545 * We don't allow recursive attributes.. 1546 * Maybe someday we will. 1547 */ 1548 if (zdp->z_pflags & ZFS_XATTR) { 1549 ZFS_EXIT(zfsvfs); 1550 return (SET_ERROR(EINVAL)); 1551 } 1552 1553 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1554 ZFS_EXIT(zfsvfs); 1555 return (error); 1556 } 1557 1558 /* 1559 * Do we have permission to get into attribute directory? 1560 */ 1561 1562 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1563 B_FALSE, cr)) { 1564 VN_RELE(*vpp); 1565 *vpp = NULL; 1566 } 1567 1568 ZFS_EXIT(zfsvfs); 1569 return (error); 1570 } 1571 1572 if (dvp->v_type != VDIR) { 1573 ZFS_EXIT(zfsvfs); 1574 return (SET_ERROR(ENOTDIR)); 1575 } 1576 1577 /* 1578 * Check accessibility of directory. 1579 */ 1580 1581 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1582 ZFS_EXIT(zfsvfs); 1583 return (error); 1584 } 1585 1586 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1587 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1588 ZFS_EXIT(zfsvfs); 1589 return (SET_ERROR(EILSEQ)); 1590 } 1591 1592 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); 1593 if (error == 0) 1594 error = specvp_check(vpp, cr); 1595 1596 /* Translate errors and add SAVENAME when needed. */ 1597 if (cnp->cn_flags & ISLASTCN) { 1598 switch (nameiop) { 1599 case CREATE: 1600 case RENAME: 1601 if (error == ENOENT) { 1602 error = EJUSTRETURN; 1603 cnp->cn_flags |= SAVENAME; 1604 break; 1605 } 1606 /* FALLTHROUGH */ 1607 case DELETE: 1608 if (error == 0) 1609 cnp->cn_flags |= SAVENAME; 1610 break; 1611 } 1612 } 1613 if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) { 1614 int ltype = 0; 1615 1616 if (cnp->cn_flags & ISDOTDOT) { 1617 ltype = VOP_ISLOCKED(dvp); 1618 VOP_UNLOCK(dvp, 0); 1619 } 1620 ZFS_EXIT(zfsvfs); 1621 error = vn_lock(*vpp, cnp->cn_lkflags); 1622 if (cnp->cn_flags & ISDOTDOT) 1623 vn_lock(dvp, ltype | LK_RETRY); 1624 if (error != 0) { 1625 VN_RELE(*vpp); 1626 *vpp = NULL; 1627 return (error); 1628 } 1629 } else { 1630 ZFS_EXIT(zfsvfs); 1631 } 1632 1633#ifdef FREEBSD_NAMECACHE 1634 /* 1635 * Insert name into cache (as non-existent) if appropriate. 1636 */ 1637 if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) 1638 cache_enter(dvp, *vpp, cnp); 1639 /* 1640 * Insert name into cache if appropriate. 1641 */ 1642 if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { 1643 if (!(cnp->cn_flags & ISLASTCN) || 1644 (nameiop != DELETE && nameiop != RENAME)) { 1645 cache_enter(dvp, *vpp, cnp); 1646 } 1647 } 1648#endif 1649 1650 return (error); 1651} 1652 1653/* 1654 * Attempt to create a new entry in a directory. If the entry 1655 * already exists, truncate the file if permissible, else return 1656 * an error. Return the vp of the created or trunc'd file. 1657 * 1658 * IN: dvp - vnode of directory to put new file entry in. 1659 * name - name of new file entry. 1660 * vap - attributes of new file. 1661 * excl - flag indicating exclusive or non-exclusive mode. 1662 * mode - mode to open file with. 1663 * cr - credentials of caller. 1664 * flag - large file flag [UNUSED]. 1665 * ct - caller context 1666 * vsecp - ACL to be set 1667 * 1668 * OUT: vpp - vnode of created or trunc'd entry. 1669 * 1670 * RETURN: 0 on success, error code on failure. 1671 * 1672 * Timestamps: 1673 * dvp - ctime|mtime updated if new entry created 1674 * vp - ctime|mtime always, atime if new 1675 */ 1676 1677/* ARGSUSED */ 1678static int 1679zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, 1680 vnode_t **vpp, cred_t *cr, kthread_t *td) 1681{ 1682 znode_t *zp, *dzp = VTOZ(dvp); 1683 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1684 zilog_t *zilog; 1685 objset_t *os; 1686 zfs_dirlock_t *dl; 1687 dmu_tx_t *tx; 1688 int error; 1689 ksid_t *ksid; 1690 uid_t uid; 1691 gid_t gid = crgetgid(cr); 1692 zfs_acl_ids_t acl_ids; 1693 boolean_t fuid_dirtied; 1694 boolean_t have_acl = B_FALSE; 1695 boolean_t waited = B_FALSE; 1696 void *vsecp = NULL; 1697 int flag = 0; 1698 1699 /* 1700 * If we have an ephemeral id, ACL, or XVATTR then 1701 * make sure file system is at proper version 1702 */ 1703 1704 ksid = crgetsid(cr, KSID_OWNER); 1705 if (ksid) 1706 uid = ksid_getid(ksid); 1707 else 1708 uid = crgetuid(cr); 1709 1710 if (zfsvfs->z_use_fuids == B_FALSE && 1711 (vsecp || (vap->va_mask & AT_XVATTR) || 1712 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1713 return (SET_ERROR(EINVAL)); 1714 1715 ZFS_ENTER(zfsvfs); 1716 ZFS_VERIFY_ZP(dzp); 1717 os = zfsvfs->z_os; 1718 zilog = zfsvfs->z_log; 1719 1720 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 1721 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1722 ZFS_EXIT(zfsvfs); 1723 return (SET_ERROR(EILSEQ)); 1724 } 1725 1726 if (vap->va_mask & AT_XVATTR) { 1727 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, 1728 crgetuid(cr), cr, vap->va_type)) != 0) { 1729 ZFS_EXIT(zfsvfs); 1730 return (error); 1731 } 1732 } 1733 1734 getnewvnode_reserve(1); 1735 1736top: 1737 *vpp = NULL; 1738 1739 if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) 1740 vap->va_mode &= ~S_ISVTX; 1741 1742 if (*name == '\0') { 1743 /* 1744 * Null component name refers to the directory itself. 1745 */ 1746 VN_HOLD(dvp); 1747 zp = dzp; 1748 dl = NULL; 1749 error = 0; 1750 } else { 1751 /* possible VN_HOLD(zp) */ 1752 int zflg = 0; 1753 1754 if (flag & FIGNORECASE) 1755 zflg |= ZCILOOK; 1756 1757 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1758 NULL, NULL); 1759 if (error) { 1760 if (have_acl) 1761 zfs_acl_ids_free(&acl_ids); 1762 if (strcmp(name, "..") == 0) 1763 error = SET_ERROR(EISDIR); 1764 getnewvnode_drop_reserve(); 1765 ZFS_EXIT(zfsvfs); 1766 return (error); 1767 } 1768 } 1769 1770 if (zp == NULL) { 1771 uint64_t txtype; 1772 1773 /* 1774 * Create a new file object and update the directory 1775 * to reference it. 1776 */ 1777 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 1778 if (have_acl) 1779 zfs_acl_ids_free(&acl_ids); 1780 goto out; 1781 } 1782 1783 /* 1784 * We only support the creation of regular files in 1785 * extended attribute directories. 1786 */ 1787 1788 if ((dzp->z_pflags & ZFS_XATTR) && 1789 (vap->va_type != VREG)) { 1790 if (have_acl) 1791 zfs_acl_ids_free(&acl_ids); 1792 error = SET_ERROR(EINVAL); 1793 goto out; 1794 } 1795 1796 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 1797 cr, vsecp, &acl_ids)) != 0) 1798 goto out; 1799 have_acl = B_TRUE; 1800 1801 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 1802 zfs_acl_ids_free(&acl_ids); 1803 error = SET_ERROR(EDQUOT); 1804 goto out; 1805 } 1806 1807 tx = dmu_tx_create(os); 1808 1809 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1810 ZFS_SA_BASE_ATTR_SIZE); 1811 1812 fuid_dirtied = zfsvfs->z_fuid_dirty; 1813 if (fuid_dirtied) 1814 zfs_fuid_txhold(zfsvfs, tx); 1815 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 1816 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 1817 if (!zfsvfs->z_use_sa && 1818 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 1819 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 1820 0, acl_ids.z_aclp->z_acl_bytes); 1821 } 1822 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 1823 if (error) { 1824 zfs_dirent_unlock(dl); 1825 if (error == ERESTART) { 1826 waited = B_TRUE; 1827 dmu_tx_wait(tx); 1828 dmu_tx_abort(tx); 1829 goto top; 1830 } 1831 zfs_acl_ids_free(&acl_ids); 1832 dmu_tx_abort(tx); 1833 getnewvnode_drop_reserve(); 1834 ZFS_EXIT(zfsvfs); 1835 return (error); 1836 } 1837 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 1838 1839 if (fuid_dirtied) 1840 zfs_fuid_sync(zfsvfs, tx); 1841 1842 (void) zfs_link_create(dl, zp, tx, ZNEW); 1843 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 1844 if (flag & FIGNORECASE) 1845 txtype |= TX_CI; 1846 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 1847 vsecp, acl_ids.z_fuidp, vap); 1848 zfs_acl_ids_free(&acl_ids); 1849 dmu_tx_commit(tx); 1850 } else { 1851 int aflags = (flag & FAPPEND) ? V_APPEND : 0; 1852 1853 if (have_acl) 1854 zfs_acl_ids_free(&acl_ids); 1855 have_acl = B_FALSE; 1856 1857 /* 1858 * A directory entry already exists for this name. 1859 */ 1860 /* 1861 * Can't truncate an existing file if in exclusive mode. 1862 */ 1863 if (excl == EXCL) { 1864 error = SET_ERROR(EEXIST); 1865 goto out; 1866 } 1867 /* 1868 * Can't open a directory for writing. 1869 */ 1870 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { 1871 error = SET_ERROR(EISDIR); 1872 goto out; 1873 } 1874 /* 1875 * Verify requested access to file. 1876 */ 1877 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { 1878 goto out; 1879 } 1880 1881 mutex_enter(&dzp->z_lock); 1882 dzp->z_seq++; 1883 mutex_exit(&dzp->z_lock); 1884 1885 /* 1886 * Truncate regular files if requested. 1887 */ 1888 if ((ZTOV(zp)->v_type == VREG) && 1889 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { 1890 /* we can't hold any locks when calling zfs_freesp() */ 1891 zfs_dirent_unlock(dl); 1892 dl = NULL; 1893 error = zfs_freesp(zp, 0, 0, mode, TRUE); 1894 if (error == 0) { 1895 vnevent_create(ZTOV(zp), ct); 1896 } 1897 } 1898 } 1899out: 1900 getnewvnode_drop_reserve(); 1901 if (dl) 1902 zfs_dirent_unlock(dl); 1903 1904 if (error) { 1905 if (zp) 1906 VN_RELE(ZTOV(zp)); 1907 } else { 1908 *vpp = ZTOV(zp); 1909 error = specvp_check(vpp, cr); 1910 } 1911 1912 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1913 zil_commit(zilog, 0); 1914 1915 ZFS_EXIT(zfsvfs); 1916 return (error); 1917} 1918 1919/* 1920 * Remove an entry from a directory. 1921 * 1922 * IN: dvp - vnode of directory to remove entry from. 1923 * name - name of entry to remove. 1924 * cr - credentials of caller. 1925 * ct - caller context 1926 * flags - case flags 1927 * 1928 * RETURN: 0 on success, error code on failure. 1929 * 1930 * Timestamps: 1931 * dvp - ctime|mtime 1932 * vp - ctime (if nlink > 0) 1933 */ 1934 1935uint64_t null_xattr = 0; 1936 1937/*ARGSUSED*/ 1938static int 1939zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, 1940 int flags) 1941{ 1942 znode_t *zp, *dzp = VTOZ(dvp); 1943 znode_t *xzp; 1944 vnode_t *vp; 1945 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 1946 zilog_t *zilog; 1947 uint64_t acl_obj, xattr_obj; 1948 uint64_t xattr_obj_unlinked = 0; 1949 uint64_t obj = 0; 1950 zfs_dirlock_t *dl; 1951 dmu_tx_t *tx; 1952 boolean_t may_delete_now, delete_now = FALSE; 1953 boolean_t unlinked, toobig = FALSE; 1954 uint64_t txtype; 1955 pathname_t *realnmp = NULL; 1956 pathname_t realnm; 1957 int error; 1958 int zflg = ZEXISTS; 1959 boolean_t waited = B_FALSE; 1960 1961 ZFS_ENTER(zfsvfs); 1962 ZFS_VERIFY_ZP(dzp); 1963 zilog = zfsvfs->z_log; 1964 1965 if (flags & FIGNORECASE) { 1966 zflg |= ZCILOOK; 1967 pn_alloc(&realnm); 1968 realnmp = &realnm; 1969 } 1970 1971top: 1972 xattr_obj = 0; 1973 xzp = NULL; 1974 /* 1975 * Attempt to lock directory; fail if entry doesn't exist. 1976 */ 1977 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1978 NULL, realnmp)) { 1979 if (realnmp) 1980 pn_free(realnmp); 1981 ZFS_EXIT(zfsvfs); 1982 return (error); 1983 } 1984 1985 vp = ZTOV(zp); 1986 1987 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 1988 goto out; 1989 } 1990 1991 /* 1992 * Need to use rmdir for removing directories. 1993 */ 1994 if (vp->v_type == VDIR) { 1995 error = SET_ERROR(EPERM); 1996 goto out; 1997 } 1998 1999 vnevent_remove(vp, dvp, name, ct); 2000 2001 if (realnmp) 2002 dnlc_remove(dvp, realnmp->pn_buf); 2003 else 2004 dnlc_remove(dvp, name); 2005 2006 VI_LOCK(vp); 2007 may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); 2008 VI_UNLOCK(vp); 2009 2010 /* 2011 * We may delete the znode now, or we may put it in the unlinked set; 2012 * it depends on whether we're the last link, and on whether there are 2013 * other holds on the vnode. So we dmu_tx_hold() the right things to 2014 * allow for either case. 2015 */ 2016 obj = zp->z_id; 2017 tx = dmu_tx_create(zfsvfs->z_os); 2018 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 2019 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2020 zfs_sa_upgrade_txholds(tx, zp); 2021 zfs_sa_upgrade_txholds(tx, dzp); 2022 if (may_delete_now) { 2023 toobig = 2024 zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; 2025 /* if the file is too big, only hold_free a token amount */ 2026 dmu_tx_hold_free(tx, zp->z_id, 0, 2027 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); 2028 } 2029 2030 /* are there any extended attributes? */ 2031 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2032 &xattr_obj, sizeof (xattr_obj)); 2033 if (error == 0 && xattr_obj) { 2034 error = zfs_zget(zfsvfs, xattr_obj, &xzp); 2035 ASSERT0(error); 2036 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2037 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 2038 } 2039 2040 mutex_enter(&zp->z_lock); 2041 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) 2042 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 2043 mutex_exit(&zp->z_lock); 2044 2045 /* charge as an update -- would be nice not to charge at all */ 2046 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2047 2048 /* 2049 * Mark this transaction as typically resulting in a net free of space 2050 */ 2051 dmu_tx_mark_netfree(tx); 2052 2053 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 2054 if (error) { 2055 zfs_dirent_unlock(dl); 2056 VN_RELE(vp); 2057 if (xzp) 2058 VN_RELE(ZTOV(xzp)); 2059 if (error == ERESTART) { 2060 waited = B_TRUE; 2061 dmu_tx_wait(tx); 2062 dmu_tx_abort(tx); 2063 goto top; 2064 } 2065 if (realnmp) 2066 pn_free(realnmp); 2067 dmu_tx_abort(tx); 2068 ZFS_EXIT(zfsvfs); 2069 return (error); 2070 } 2071 2072 /* 2073 * Remove the directory entry. 2074 */ 2075 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); 2076 2077 if (error) { 2078 dmu_tx_commit(tx); 2079 goto out; 2080 } 2081 2082 if (unlinked) { 2083 /* 2084 * Hold z_lock so that we can make sure that the ACL obj 2085 * hasn't changed. Could have been deleted due to 2086 * zfs_sa_upgrade(). 2087 */ 2088 mutex_enter(&zp->z_lock); 2089 VI_LOCK(vp); 2090 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2091 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); 2092 delete_now = may_delete_now && !toobig && 2093 vp->v_count == 1 && !vn_has_cached_data(vp) && 2094 xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == 2095 acl_obj; 2096 VI_UNLOCK(vp); 2097 } 2098 2099 if (delete_now) { 2100#ifdef __FreeBSD__ 2101 panic("zfs_remove: delete_now branch taken"); 2102#endif 2103 if (xattr_obj_unlinked) { 2104 ASSERT3U(xzp->z_links, ==, 2); 2105 mutex_enter(&xzp->z_lock); 2106 xzp->z_unlinked = 1; 2107 xzp->z_links = 0; 2108 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), 2109 &xzp->z_links, sizeof (xzp->z_links), tx); 2110 ASSERT3U(error, ==, 0); 2111 mutex_exit(&xzp->z_lock); 2112 zfs_unlinked_add(xzp, tx); 2113 2114 if (zp->z_is_sa) 2115 error = sa_remove(zp->z_sa_hdl, 2116 SA_ZPL_XATTR(zfsvfs), tx); 2117 else 2118 error = sa_update(zp->z_sa_hdl, 2119 SA_ZPL_XATTR(zfsvfs), &null_xattr, 2120 sizeof (uint64_t), tx); 2121 ASSERT0(error); 2122 } 2123 VI_LOCK(vp); 2124 vp->v_count--; 2125 ASSERT0(vp->v_count); 2126 VI_UNLOCK(vp); 2127 mutex_exit(&zp->z_lock); 2128 zfs_znode_delete(zp, tx); 2129 } else if (unlinked) { 2130 mutex_exit(&zp->z_lock); 2131 zfs_unlinked_add(zp, tx); 2132#ifdef __FreeBSD__ 2133 vp->v_vflag |= VV_NOSYNC; 2134#endif 2135 } 2136 2137 txtype = TX_REMOVE; 2138 if (flags & FIGNORECASE) 2139 txtype |= TX_CI; 2140 zfs_log_remove(zilog, tx, txtype, dzp, name, obj); 2141 2142 dmu_tx_commit(tx); 2143out: 2144 if (realnmp) 2145 pn_free(realnmp); 2146 2147 zfs_dirent_unlock(dl); 2148 2149 if (!delete_now) 2150 VN_RELE(vp); 2151 if (xzp) 2152 VN_RELE(ZTOV(xzp)); 2153 2154 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2155 zil_commit(zilog, 0); 2156 2157 ZFS_EXIT(zfsvfs); 2158 return (error); 2159} 2160 2161/* 2162 * Create a new directory and insert it into dvp using the name 2163 * provided. Return a pointer to the inserted directory. 2164 * 2165 * IN: dvp - vnode of directory to add subdir to. 2166 * dirname - name of new directory. 2167 * vap - attributes of new directory. 2168 * cr - credentials of caller. 2169 * ct - caller context 2170 * flags - case flags 2171 * vsecp - ACL to be set 2172 * 2173 * OUT: vpp - vnode of created directory. 2174 * 2175 * RETURN: 0 on success, error code on failure. 2176 * 2177 * Timestamps: 2178 * dvp - ctime|mtime updated 2179 * vp - ctime|mtime|atime updated 2180 */ 2181/*ARGSUSED*/ 2182static int 2183zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, 2184 caller_context_t *ct, int flags, vsecattr_t *vsecp) 2185{ 2186 znode_t *zp, *dzp = VTOZ(dvp); 2187 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2188 zilog_t *zilog; 2189 zfs_dirlock_t *dl; 2190 uint64_t txtype; 2191 dmu_tx_t *tx; 2192 int error; 2193 int zf = ZNEW; 2194 ksid_t *ksid; 2195 uid_t uid; 2196 gid_t gid = crgetgid(cr); 2197 zfs_acl_ids_t acl_ids; 2198 boolean_t fuid_dirtied; 2199 boolean_t waited = B_FALSE; 2200 2201 ASSERT(vap->va_type == VDIR); 2202 2203 /* 2204 * If we have an ephemeral id, ACL, or XVATTR then 2205 * make sure file system is at proper version 2206 */ 2207 2208 ksid = crgetsid(cr, KSID_OWNER); 2209 if (ksid) 2210 uid = ksid_getid(ksid); 2211 else 2212 uid = crgetuid(cr); 2213 if (zfsvfs->z_use_fuids == B_FALSE && 2214 (vsecp || (vap->va_mask & AT_XVATTR) || 2215 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 2216 return (SET_ERROR(EINVAL)); 2217 2218 ZFS_ENTER(zfsvfs); 2219 ZFS_VERIFY_ZP(dzp); 2220 zilog = zfsvfs->z_log; 2221 2222 if (dzp->z_pflags & ZFS_XATTR) { 2223 ZFS_EXIT(zfsvfs); 2224 return (SET_ERROR(EINVAL)); 2225 } 2226 2227 if (zfsvfs->z_utf8 && u8_validate(dirname, 2228 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2229 ZFS_EXIT(zfsvfs); 2230 return (SET_ERROR(EILSEQ)); 2231 } 2232 if (flags & FIGNORECASE) 2233 zf |= ZCILOOK; 2234 2235 if (vap->va_mask & AT_XVATTR) { 2236 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, 2237 crgetuid(cr), cr, vap->va_type)) != 0) { 2238 ZFS_EXIT(zfsvfs); 2239 return (error); 2240 } 2241 } 2242 2243 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 2244 vsecp, &acl_ids)) != 0) { 2245 ZFS_EXIT(zfsvfs); 2246 return (error); 2247 } 2248 2249 getnewvnode_reserve(1); 2250 2251 /* 2252 * First make sure the new directory doesn't exist. 2253 * 2254 * Existence is checked first to make sure we don't return 2255 * EACCES instead of EEXIST which can cause some applications 2256 * to fail. 2257 */ 2258top: 2259 *vpp = NULL; 2260 2261 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 2262 NULL, NULL)) { 2263 zfs_acl_ids_free(&acl_ids); 2264 getnewvnode_drop_reserve(); 2265 ZFS_EXIT(zfsvfs); 2266 return (error); 2267 } 2268 2269 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { 2270 zfs_acl_ids_free(&acl_ids); 2271 zfs_dirent_unlock(dl); 2272 getnewvnode_drop_reserve(); 2273 ZFS_EXIT(zfsvfs); 2274 return (error); 2275 } 2276 2277 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 2278 zfs_acl_ids_free(&acl_ids); 2279 zfs_dirent_unlock(dl); 2280 getnewvnode_drop_reserve(); 2281 ZFS_EXIT(zfsvfs); 2282 return (SET_ERROR(EDQUOT)); 2283 } 2284 2285 /* 2286 * Add a new entry to the directory. 2287 */ 2288 tx = dmu_tx_create(zfsvfs->z_os); 2289 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 2290 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2291 fuid_dirtied = zfsvfs->z_fuid_dirty; 2292 if (fuid_dirtied) 2293 zfs_fuid_txhold(zfsvfs, tx); 2294 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2295 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 2296 acl_ids.z_aclp->z_acl_bytes); 2297 } 2298 2299 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 2300 ZFS_SA_BASE_ATTR_SIZE); 2301 2302 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 2303 if (error) { 2304 zfs_dirent_unlock(dl); 2305 if (error == ERESTART) { 2306 waited = B_TRUE; 2307 dmu_tx_wait(tx); 2308 dmu_tx_abort(tx); 2309 goto top; 2310 } 2311 zfs_acl_ids_free(&acl_ids); 2312 dmu_tx_abort(tx); 2313 getnewvnode_drop_reserve(); 2314 ZFS_EXIT(zfsvfs); 2315 return (error); 2316 } 2317 2318 /* 2319 * Create new node. 2320 */ 2321 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 2322 2323 if (fuid_dirtied) 2324 zfs_fuid_sync(zfsvfs, tx); 2325 2326 /* 2327 * Now put new name in parent dir. 2328 */ 2329 (void) zfs_link_create(dl, zp, tx, ZNEW); 2330 2331 *vpp = ZTOV(zp); 2332 2333 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 2334 if (flags & FIGNORECASE) 2335 txtype |= TX_CI; 2336 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, 2337 acl_ids.z_fuidp, vap); 2338 2339 zfs_acl_ids_free(&acl_ids); 2340 2341 dmu_tx_commit(tx); 2342 2343 getnewvnode_drop_reserve(); 2344 2345 zfs_dirent_unlock(dl); 2346 2347 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2348 zil_commit(zilog, 0); 2349 2350 ZFS_EXIT(zfsvfs); 2351 return (0); 2352} 2353 2354/* 2355 * Remove a directory subdir entry. If the current working 2356 * directory is the same as the subdir to be removed, the 2357 * remove will fail. 2358 * 2359 * IN: dvp - vnode of directory to remove from. 2360 * name - name of directory to be removed. 2361 * cwd - vnode of current working directory. 2362 * cr - credentials of caller. 2363 * ct - caller context 2364 * flags - case flags 2365 * 2366 * RETURN: 0 on success, error code on failure. 2367 * 2368 * Timestamps: 2369 * dvp - ctime|mtime updated 2370 */ 2371/*ARGSUSED*/ 2372static int 2373zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, 2374 caller_context_t *ct, int flags) 2375{ 2376 znode_t *dzp = VTOZ(dvp); 2377 znode_t *zp; 2378 vnode_t *vp; 2379 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2380 zilog_t *zilog; 2381 zfs_dirlock_t *dl; 2382 dmu_tx_t *tx; 2383 int error; 2384 int zflg = ZEXISTS; 2385 boolean_t waited = B_FALSE; 2386 2387 ZFS_ENTER(zfsvfs); 2388 ZFS_VERIFY_ZP(dzp); 2389 zilog = zfsvfs->z_log; 2390 2391 if (flags & FIGNORECASE) 2392 zflg |= ZCILOOK; 2393top: 2394 zp = NULL; 2395 2396 /* 2397 * Attempt to lock directory; fail if entry doesn't exist. 2398 */ 2399 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 2400 NULL, NULL)) { 2401 ZFS_EXIT(zfsvfs); 2402 return (error); 2403 } 2404 2405 vp = ZTOV(zp); 2406 2407 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 2408 goto out; 2409 } 2410 2411 if (vp->v_type != VDIR) { 2412 error = SET_ERROR(ENOTDIR); 2413 goto out; 2414 } 2415 2416 if (vp == cwd) { 2417 error = SET_ERROR(EINVAL); 2418 goto out; 2419 } 2420 2421 vnevent_rmdir(vp, dvp, name, ct); 2422 2423 /* 2424 * Grab a lock on the directory to make sure that noone is 2425 * trying to add (or lookup) entries while we are removing it. 2426 */ 2427 rw_enter(&zp->z_name_lock, RW_WRITER); 2428 2429 /* 2430 * Grab a lock on the parent pointer to make sure we play well 2431 * with the treewalk and directory rename code. 2432 */ 2433 rw_enter(&zp->z_parent_lock, RW_WRITER); 2434 2435 tx = dmu_tx_create(zfsvfs->z_os); 2436 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 2437 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2438 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2439 zfs_sa_upgrade_txholds(tx, zp); 2440 zfs_sa_upgrade_txholds(tx, dzp); 2441 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 2442 if (error) { 2443 rw_exit(&zp->z_parent_lock); 2444 rw_exit(&zp->z_name_lock); 2445 zfs_dirent_unlock(dl); 2446 VN_RELE(vp); 2447 if (error == ERESTART) { 2448 waited = B_TRUE; 2449 dmu_tx_wait(tx); 2450 dmu_tx_abort(tx); 2451 goto top; 2452 } 2453 dmu_tx_abort(tx); 2454 ZFS_EXIT(zfsvfs); 2455 return (error); 2456 } 2457 2458#ifdef FREEBSD_NAMECACHE 2459 cache_purge(dvp); 2460#endif 2461 2462 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 2463 2464 if (error == 0) { 2465 uint64_t txtype = TX_RMDIR; 2466 if (flags & FIGNORECASE) 2467 txtype |= TX_CI; 2468 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); 2469 } 2470 2471 dmu_tx_commit(tx); 2472 2473 rw_exit(&zp->z_parent_lock); 2474 rw_exit(&zp->z_name_lock); 2475#ifdef FREEBSD_NAMECACHE 2476 cache_purge(vp); 2477#endif 2478out: 2479 zfs_dirent_unlock(dl); 2480 2481 VN_RELE(vp); 2482 2483 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2484 zil_commit(zilog, 0); 2485 2486 ZFS_EXIT(zfsvfs); 2487 return (error); 2488} 2489 2490/* 2491 * Read as many directory entries as will fit into the provided 2492 * buffer from the given directory cursor position (specified in 2493 * the uio structure). 2494 * 2495 * IN: vp - vnode of directory to read. 2496 * uio - structure supplying read location, range info, 2497 * and return buffer. 2498 * cr - credentials of caller. 2499 * ct - caller context 2500 * flags - case flags 2501 * 2502 * OUT: uio - updated offset and range, buffer filled. 2503 * eofp - set to true if end-of-file detected. 2504 * 2505 * RETURN: 0 on success, error code on failure. 2506 * 2507 * Timestamps: 2508 * vp - atime updated 2509 * 2510 * Note that the low 4 bits of the cookie returned by zap is always zero. 2511 * This allows us to use the low range for "special" directory entries: 2512 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 2513 * we use the offset 2 for the '.zfs' directory. 2514 */ 2515/* ARGSUSED */ 2516static int 2517zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) 2518{ 2519 znode_t *zp = VTOZ(vp); 2520 iovec_t *iovp; 2521 edirent_t *eodp; 2522 dirent64_t *odp; 2523 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2524 objset_t *os; 2525 caddr_t outbuf; 2526 size_t bufsize; 2527 zap_cursor_t zc; 2528 zap_attribute_t zap; 2529 uint_t bytes_wanted; 2530 uint64_t offset; /* must be unsigned; checks for < 1 */ 2531 uint64_t parent; 2532 int local_eof; 2533 int outcount; 2534 int error; 2535 uint8_t prefetch; 2536 boolean_t check_sysattrs; 2537 uint8_t type; 2538 int ncooks; 2539 u_long *cooks = NULL; 2540 int flags = 0; 2541 2542 ZFS_ENTER(zfsvfs); 2543 ZFS_VERIFY_ZP(zp); 2544 2545 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 2546 &parent, sizeof (parent))) != 0) { 2547 ZFS_EXIT(zfsvfs); 2548 return (error); 2549 } 2550 2551 /* 2552 * If we are not given an eof variable, 2553 * use a local one. 2554 */ 2555 if (eofp == NULL) 2556 eofp = &local_eof; 2557 2558 /* 2559 * Check for valid iov_len. 2560 */ 2561 if (uio->uio_iov->iov_len <= 0) { 2562 ZFS_EXIT(zfsvfs); 2563 return (SET_ERROR(EINVAL)); 2564 } 2565 2566 /* 2567 * Quit if directory has been removed (posix) 2568 */ 2569 if ((*eofp = zp->z_unlinked) != 0) { 2570 ZFS_EXIT(zfsvfs); 2571 return (0); 2572 } 2573 2574 error = 0; 2575 os = zfsvfs->z_os; 2576 offset = uio->uio_loffset; 2577 prefetch = zp->z_zn_prefetch; 2578 2579 /* 2580 * Initialize the iterator cursor. 2581 */ 2582 if (offset <= 3) { 2583 /* 2584 * Start iteration from the beginning of the directory. 2585 */ 2586 zap_cursor_init(&zc, os, zp->z_id); 2587 } else { 2588 /* 2589 * The offset is a serialized cursor. 2590 */ 2591 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2592 } 2593 2594 /* 2595 * Get space to change directory entries into fs independent format. 2596 */ 2597 iovp = uio->uio_iov; 2598 bytes_wanted = iovp->iov_len; 2599 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { 2600 bufsize = bytes_wanted; 2601 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2602 odp = (struct dirent64 *)outbuf; 2603 } else { 2604 bufsize = bytes_wanted; 2605 outbuf = NULL; 2606 odp = (struct dirent64 *)iovp->iov_base; 2607 } 2608 eodp = (struct edirent *)odp; 2609 2610 if (ncookies != NULL) { 2611 /* 2612 * Minimum entry size is dirent size and 1 byte for a file name. 2613 */ 2614 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); 2615 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); 2616 *cookies = cooks; 2617 *ncookies = ncooks; 2618 } 2619 /* 2620 * If this VFS supports the system attribute view interface; and 2621 * we're looking at an extended attribute directory; and we care 2622 * about normalization conflicts on this vfs; then we must check 2623 * for normalization conflicts with the sysattr name space. 2624 */ 2625#ifdef TODO 2626 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2627 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2628 (flags & V_RDDIR_ENTFLAGS); 2629#else 2630 check_sysattrs = 0; 2631#endif 2632 2633 /* 2634 * Transform to file-system independent format 2635 */ 2636 outcount = 0; 2637 while (outcount < bytes_wanted) { 2638 ino64_t objnum; 2639 ushort_t reclen; 2640 off64_t *next = NULL; 2641 2642 /* 2643 * Special case `.', `..', and `.zfs'. 2644 */ 2645 if (offset == 0) { 2646 (void) strcpy(zap.za_name, "."); 2647 zap.za_normalization_conflict = 0; 2648 objnum = zp->z_id; 2649 type = DT_DIR; 2650 } else if (offset == 1) { 2651 (void) strcpy(zap.za_name, ".."); 2652 zap.za_normalization_conflict = 0; 2653 objnum = parent; 2654 type = DT_DIR; 2655 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2656 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2657 zap.za_normalization_conflict = 0; 2658 objnum = ZFSCTL_INO_ROOT; 2659 type = DT_DIR; 2660 } else { 2661 /* 2662 * Grab next entry. 2663 */ 2664 if (error = zap_cursor_retrieve(&zc, &zap)) { 2665 if ((*eofp = (error == ENOENT)) != 0) 2666 break; 2667 else 2668 goto update; 2669 } 2670 2671 if (zap.za_integer_length != 8 || 2672 zap.za_num_integers != 1) { 2673 cmn_err(CE_WARN, "zap_readdir: bad directory " 2674 "entry, obj = %lld, offset = %lld\n", 2675 (u_longlong_t)zp->z_id, 2676 (u_longlong_t)offset); 2677 error = SET_ERROR(ENXIO); 2678 goto update; 2679 } 2680 2681 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2682 /* 2683 * MacOS X can extract the object type here such as: 2684 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2685 */ 2686 type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2687 2688 if (check_sysattrs && !zap.za_normalization_conflict) { 2689#ifdef TODO 2690 zap.za_normalization_conflict = 2691 xattr_sysattr_casechk(zap.za_name); 2692#else 2693 panic("%s:%u: TODO", __func__, __LINE__); 2694#endif 2695 } 2696 } 2697 2698 if (flags & V_RDDIR_ACCFILTER) { 2699 /* 2700 * If we have no access at all, don't include 2701 * this entry in the returned information 2702 */ 2703 znode_t *ezp; 2704 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) 2705 goto skip_entry; 2706 if (!zfs_has_access(ezp, cr)) { 2707 VN_RELE(ZTOV(ezp)); 2708 goto skip_entry; 2709 } 2710 VN_RELE(ZTOV(ezp)); 2711 } 2712 2713 if (flags & V_RDDIR_ENTFLAGS) 2714 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2715 else 2716 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2717 2718 /* 2719 * Will this entry fit in the buffer? 2720 */ 2721 if (outcount + reclen > bufsize) { 2722 /* 2723 * Did we manage to fit anything in the buffer? 2724 */ 2725 if (!outcount) { 2726 error = SET_ERROR(EINVAL); 2727 goto update; 2728 } 2729 break; 2730 } 2731 if (flags & V_RDDIR_ENTFLAGS) { 2732 /* 2733 * Add extended flag entry: 2734 */ 2735 eodp->ed_ino = objnum; 2736 eodp->ed_reclen = reclen; 2737 /* NOTE: ed_off is the offset for the *next* entry */ 2738 next = &(eodp->ed_off); 2739 eodp->ed_eflags = zap.za_normalization_conflict ? 2740 ED_CASE_CONFLICT : 0; 2741 (void) strncpy(eodp->ed_name, zap.za_name, 2742 EDIRENT_NAMELEN(reclen)); 2743 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2744 } else { 2745 /* 2746 * Add normal entry: 2747 */ 2748 odp->d_ino = objnum; 2749 odp->d_reclen = reclen; 2750 odp->d_namlen = strlen(zap.za_name); 2751 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); 2752 odp->d_type = type; 2753 odp = (dirent64_t *)((intptr_t)odp + reclen); 2754 } 2755 outcount += reclen; 2756 2757 ASSERT(outcount <= bufsize); 2758 2759 /* Prefetch znode */ 2760 if (prefetch) 2761 dmu_prefetch(os, objnum, 0, 0, 0, 2762 ZIO_PRIORITY_SYNC_READ); 2763 2764 skip_entry: 2765 /* 2766 * Move to the next entry, fill in the previous offset. 2767 */ 2768 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2769 zap_cursor_advance(&zc); 2770 offset = zap_cursor_serialize(&zc); 2771 } else { 2772 offset += 1; 2773 } 2774 2775 if (cooks != NULL) { 2776 *cooks++ = offset; 2777 ncooks--; 2778 KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); 2779 } 2780 } 2781 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2782 2783 /* Subtract unused cookies */ 2784 if (ncookies != NULL) 2785 *ncookies -= ncooks; 2786 2787 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { 2788 iovp->iov_base += outcount; 2789 iovp->iov_len -= outcount; 2790 uio->uio_resid -= outcount; 2791 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { 2792 /* 2793 * Reset the pointer. 2794 */ 2795 offset = uio->uio_loffset; 2796 } 2797 2798update: 2799 zap_cursor_fini(&zc); 2800 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) 2801 kmem_free(outbuf, bufsize); 2802 2803 if (error == ENOENT) 2804 error = 0; 2805 2806 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 2807 2808 uio->uio_loffset = offset; 2809 ZFS_EXIT(zfsvfs); 2810 if (error != 0 && cookies != NULL) { 2811 free(*cookies, M_TEMP); 2812 *cookies = NULL; 2813 *ncookies = 0; 2814 } 2815 return (error); 2816} 2817 2818ulong_t zfs_fsync_sync_cnt = 4; 2819 2820static int 2821zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 2822{ 2823 znode_t *zp = VTOZ(vp); 2824 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2825 2826 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); 2827 2828 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 2829 ZFS_ENTER(zfsvfs); 2830 ZFS_VERIFY_ZP(zp); 2831 zil_commit(zfsvfs->z_log, zp->z_id); 2832 ZFS_EXIT(zfsvfs); 2833 } 2834 return (0); 2835} 2836 2837 2838/* 2839 * Get the requested file attributes and place them in the provided 2840 * vattr structure. 2841 * 2842 * IN: vp - vnode of file. 2843 * vap - va_mask identifies requested attributes. 2844 * If AT_XVATTR set, then optional attrs are requested 2845 * flags - ATTR_NOACLCHECK (CIFS server context) 2846 * cr - credentials of caller. 2847 * ct - caller context 2848 * 2849 * OUT: vap - attribute values. 2850 * 2851 * RETURN: 0 (always succeeds). 2852 */ 2853/* ARGSUSED */ 2854static int 2855zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 2856 caller_context_t *ct) 2857{ 2858 znode_t *zp = VTOZ(vp); 2859 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2860 int error = 0; 2861 uint32_t blksize; 2862 u_longlong_t nblocks; 2863 uint64_t links; 2864 uint64_t mtime[2], ctime[2], crtime[2], rdev; 2865 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 2866 xoptattr_t *xoap = NULL; 2867 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 2868 sa_bulk_attr_t bulk[4]; 2869 int count = 0; 2870 2871 ZFS_ENTER(zfsvfs); 2872 ZFS_VERIFY_ZP(zp); 2873 2874 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 2875 2876 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 2877 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 2878 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); 2879 if (vp->v_type == VBLK || vp->v_type == VCHR) 2880 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, 2881 &rdev, 8); 2882 2883 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { 2884 ZFS_EXIT(zfsvfs); 2885 return (error); 2886 } 2887 2888 /* 2889 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 2890 * Also, if we are the owner don't bother, since owner should 2891 * always be allowed to read basic attributes of file. 2892 */ 2893 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && 2894 (vap->va_uid != crgetuid(cr))) { 2895 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 2896 skipaclchk, cr)) { 2897 ZFS_EXIT(zfsvfs); 2898 return (error); 2899 } 2900 } 2901 2902 /* 2903 * Return all attributes. It's cheaper to provide the answer 2904 * than to determine whether we were asked the question. 2905 */ 2906 2907 mutex_enter(&zp->z_lock); 2908 vap->va_type = IFTOVT(zp->z_mode); 2909 vap->va_mode = zp->z_mode & ~S_IFMT; 2910#ifdef illumos 2911 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 2912#else 2913 vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; 2914#endif 2915 vap->va_nodeid = zp->z_id; 2916 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 2917 links = zp->z_links + 1; 2918 else 2919 links = zp->z_links; 2920 vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ 2921 vap->va_size = zp->z_size; 2922#ifdef illumos 2923 vap->va_rdev = vp->v_rdev; 2924#else 2925 if (vp->v_type == VBLK || vp->v_type == VCHR) 2926 vap->va_rdev = zfs_cmpldev(rdev); 2927#endif 2928 vap->va_seq = zp->z_seq; 2929 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ 2930 vap->va_filerev = zp->z_seq; 2931 2932 /* 2933 * Add in any requested optional attributes and the create time. 2934 * Also set the corresponding bits in the returned attribute bitmap. 2935 */ 2936 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 2937 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 2938 xoap->xoa_archive = 2939 ((zp->z_pflags & ZFS_ARCHIVE) != 0); 2940 XVA_SET_RTN(xvap, XAT_ARCHIVE); 2941 } 2942 2943 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 2944 xoap->xoa_readonly = 2945 ((zp->z_pflags & ZFS_READONLY) != 0); 2946 XVA_SET_RTN(xvap, XAT_READONLY); 2947 } 2948 2949 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 2950 xoap->xoa_system = 2951 ((zp->z_pflags & ZFS_SYSTEM) != 0); 2952 XVA_SET_RTN(xvap, XAT_SYSTEM); 2953 } 2954 2955 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 2956 xoap->xoa_hidden = 2957 ((zp->z_pflags & ZFS_HIDDEN) != 0); 2958 XVA_SET_RTN(xvap, XAT_HIDDEN); 2959 } 2960 2961 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2962 xoap->xoa_nounlink = 2963 ((zp->z_pflags & ZFS_NOUNLINK) != 0); 2964 XVA_SET_RTN(xvap, XAT_NOUNLINK); 2965 } 2966 2967 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2968 xoap->xoa_immutable = 2969 ((zp->z_pflags & ZFS_IMMUTABLE) != 0); 2970 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 2971 } 2972 2973 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2974 xoap->xoa_appendonly = 2975 ((zp->z_pflags & ZFS_APPENDONLY) != 0); 2976 XVA_SET_RTN(xvap, XAT_APPENDONLY); 2977 } 2978 2979 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2980 xoap->xoa_nodump = 2981 ((zp->z_pflags & ZFS_NODUMP) != 0); 2982 XVA_SET_RTN(xvap, XAT_NODUMP); 2983 } 2984 2985 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 2986 xoap->xoa_opaque = 2987 ((zp->z_pflags & ZFS_OPAQUE) != 0); 2988 XVA_SET_RTN(xvap, XAT_OPAQUE); 2989 } 2990 2991 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2992 xoap->xoa_av_quarantined = 2993 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); 2994 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 2995 } 2996 2997 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2998 xoap->xoa_av_modified = 2999 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); 3000 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 3001 } 3002 3003 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 3004 vp->v_type == VREG) { 3005 zfs_sa_get_scanstamp(zp, xvap); 3006 } 3007 3008 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 3009 uint64_t times[2]; 3010 3011 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), 3012 times, sizeof (times)); 3013 ZFS_TIME_DECODE(&xoap->xoa_createtime, times); 3014 XVA_SET_RTN(xvap, XAT_CREATETIME); 3015 } 3016 3017 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 3018 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); 3019 XVA_SET_RTN(xvap, XAT_REPARSE); 3020 } 3021 if (XVA_ISSET_REQ(xvap, XAT_GEN)) { 3022 xoap->xoa_generation = zp->z_gen; 3023 XVA_SET_RTN(xvap, XAT_GEN); 3024 } 3025 3026 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 3027 xoap->xoa_offline = 3028 ((zp->z_pflags & ZFS_OFFLINE) != 0); 3029 XVA_SET_RTN(xvap, XAT_OFFLINE); 3030 } 3031 3032 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 3033 xoap->xoa_sparse = 3034 ((zp->z_pflags & ZFS_SPARSE) != 0); 3035 XVA_SET_RTN(xvap, XAT_SPARSE); 3036 } 3037 } 3038 3039 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); 3040 ZFS_TIME_DECODE(&vap->va_mtime, mtime); 3041 ZFS_TIME_DECODE(&vap->va_ctime, ctime); 3042 ZFS_TIME_DECODE(&vap->va_birthtime, crtime); 3043 3044 mutex_exit(&zp->z_lock); 3045 3046 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 3047 vap->va_blksize = blksize; 3048 vap->va_bytes = nblocks << 9; /* nblocks * 512 */ 3049 3050 if (zp->z_blksz == 0) { 3051 /* 3052 * Block size hasn't been set; suggest maximal I/O transfers. 3053 */ 3054 vap->va_blksize = zfsvfs->z_max_blksz; 3055 } 3056 3057 ZFS_EXIT(zfsvfs); 3058 return (0); 3059} 3060 3061/* 3062 * Set the file attributes to the values contained in the 3063 * vattr structure. 3064 * 3065 * IN: vp - vnode of file to be modified. 3066 * vap - new attribute values. 3067 * If AT_XVATTR set, then optional attrs are being set 3068 * flags - ATTR_UTIME set if non-default time values provided. 3069 * - ATTR_NOACLCHECK (CIFS context only). 3070 * cr - credentials of caller. 3071 * ct - caller context 3072 * 3073 * RETURN: 0 on success, error code on failure. 3074 * 3075 * Timestamps: 3076 * vp - ctime updated, mtime updated if size changed. 3077 */ 3078/* ARGSUSED */ 3079static int 3080zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 3081 caller_context_t *ct) 3082{ 3083 znode_t *zp = VTOZ(vp); 3084 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3085 zilog_t *zilog; 3086 dmu_tx_t *tx; 3087 vattr_t oldva; 3088 xvattr_t tmpxvattr; 3089 uint_t mask = vap->va_mask; 3090 uint_t saved_mask = 0; 3091 uint64_t saved_mode; 3092 int trim_mask = 0; 3093 uint64_t new_mode; 3094 uint64_t new_uid, new_gid; 3095 uint64_t xattr_obj; 3096 uint64_t mtime[2], ctime[2]; 3097 znode_t *attrzp; 3098 int need_policy = FALSE; 3099 int err, err2; 3100 zfs_fuid_info_t *fuidp = NULL; 3101 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 3102 xoptattr_t *xoap; 3103 zfs_acl_t *aclp; 3104 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 3105 boolean_t fuid_dirtied = B_FALSE; 3106 sa_bulk_attr_t bulk[7], xattr_bulk[7]; 3107 int count = 0, xattr_count = 0; 3108 3109 if (mask == 0) 3110 return (0); 3111 3112 if (mask & AT_NOSET) 3113 return (SET_ERROR(EINVAL)); 3114 3115 ZFS_ENTER(zfsvfs); 3116 ZFS_VERIFY_ZP(zp); 3117 3118 zilog = zfsvfs->z_log; 3119 3120 /* 3121 * Make sure that if we have ephemeral uid/gid or xvattr specified 3122 * that file system is at proper version level 3123 */ 3124 3125 if (zfsvfs->z_use_fuids == B_FALSE && 3126 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 3127 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || 3128 (mask & AT_XVATTR))) { 3129 ZFS_EXIT(zfsvfs); 3130 return (SET_ERROR(EINVAL)); 3131 } 3132 3133 if (mask & AT_SIZE && vp->v_type == VDIR) { 3134 ZFS_EXIT(zfsvfs); 3135 return (SET_ERROR(EISDIR)); 3136 } 3137 3138 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { 3139 ZFS_EXIT(zfsvfs); 3140 return (SET_ERROR(EINVAL)); 3141 } 3142 3143 /* 3144 * If this is an xvattr_t, then get a pointer to the structure of 3145 * optional attributes. If this is NULL, then we have a vattr_t. 3146 */ 3147 xoap = xva_getxoptattr(xvap); 3148 3149 xva_init(&tmpxvattr); 3150 3151 /* 3152 * Immutable files can only alter immutable bit and atime 3153 */ 3154 if ((zp->z_pflags & ZFS_IMMUTABLE) && 3155 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || 3156 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 3157 ZFS_EXIT(zfsvfs); 3158 return (SET_ERROR(EPERM)); 3159 } 3160 3161 if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { 3162 ZFS_EXIT(zfsvfs); 3163 return (SET_ERROR(EPERM)); 3164 } 3165 3166 /* 3167 * Verify timestamps doesn't overflow 32 bits. 3168 * ZFS can handle large timestamps, but 32bit syscalls can't 3169 * handle times greater than 2039. This check should be removed 3170 * once large timestamps are fully supported. 3171 */ 3172 if (mask & (AT_ATIME | AT_MTIME)) { 3173 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 3174 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 3175 ZFS_EXIT(zfsvfs); 3176 return (SET_ERROR(EOVERFLOW)); 3177 } 3178 } 3179 3180top: 3181 attrzp = NULL; 3182 aclp = NULL; 3183 3184 /* Can this be moved to before the top label? */ 3185 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 3186 ZFS_EXIT(zfsvfs); 3187 return (SET_ERROR(EROFS)); 3188 } 3189 3190 /* 3191 * First validate permissions 3192 */ 3193 3194 if (mask & AT_SIZE) { 3195 /* 3196 * XXX - Note, we are not providing any open 3197 * mode flags here (like FNDELAY), so we may 3198 * block if there are locks present... this 3199 * should be addressed in openat(). 3200 */ 3201 /* XXX - would it be OK to generate a log record here? */ 3202 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 3203 if (err) { 3204 ZFS_EXIT(zfsvfs); 3205 return (err); 3206 } 3207 } 3208 3209 if (mask & (AT_ATIME|AT_MTIME) || 3210 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 3211 XVA_ISSET_REQ(xvap, XAT_READONLY) || 3212 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 3213 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 3214 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 3215 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 3216 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 3217 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 3218 skipaclchk, cr); 3219 } 3220 3221 if (mask & (AT_UID|AT_GID)) { 3222 int idmask = (mask & (AT_UID|AT_GID)); 3223 int take_owner; 3224 int take_group; 3225 3226 /* 3227 * NOTE: even if a new mode is being set, 3228 * we may clear S_ISUID/S_ISGID bits. 3229 */ 3230 3231 if (!(mask & AT_MODE)) 3232 vap->va_mode = zp->z_mode; 3233 3234 /* 3235 * Take ownership or chgrp to group we are a member of 3236 */ 3237 3238 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 3239 take_group = (mask & AT_GID) && 3240 zfs_groupmember(zfsvfs, vap->va_gid, cr); 3241 3242 /* 3243 * If both AT_UID and AT_GID are set then take_owner and 3244 * take_group must both be set in order to allow taking 3245 * ownership. 3246 * 3247 * Otherwise, send the check through secpolicy_vnode_setattr() 3248 * 3249 */ 3250 3251 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 3252 ((idmask == AT_UID) && take_owner) || 3253 ((idmask == AT_GID) && take_group)) { 3254 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 3255 skipaclchk, cr) == 0) { 3256 /* 3257 * Remove setuid/setgid for non-privileged users 3258 */ 3259 secpolicy_setid_clear(vap, vp, cr); 3260 trim_mask = (mask & (AT_UID|AT_GID)); 3261 } else { 3262 need_policy = TRUE; 3263 } 3264 } else { 3265 need_policy = TRUE; 3266 } 3267 } 3268 3269 mutex_enter(&zp->z_lock); 3270 oldva.va_mode = zp->z_mode; 3271 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 3272 if (mask & AT_XVATTR) { 3273 /* 3274 * Update xvattr mask to include only those attributes 3275 * that are actually changing. 3276 * 3277 * the bits will be restored prior to actually setting 3278 * the attributes so the caller thinks they were set. 3279 */ 3280 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 3281 if (xoap->xoa_appendonly != 3282 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 3283 need_policy = TRUE; 3284 } else { 3285 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 3286 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); 3287 } 3288 } 3289 3290 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 3291 if (xoap->xoa_nounlink != 3292 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 3293 need_policy = TRUE; 3294 } else { 3295 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 3296 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); 3297 } 3298 } 3299 3300 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 3301 if (xoap->xoa_immutable != 3302 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 3303 need_policy = TRUE; 3304 } else { 3305 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 3306 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); 3307 } 3308 } 3309 3310 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 3311 if (xoap->xoa_nodump != 3312 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 3313 need_policy = TRUE; 3314 } else { 3315 XVA_CLR_REQ(xvap, XAT_NODUMP); 3316 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); 3317 } 3318 } 3319 3320 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 3321 if (xoap->xoa_av_modified != 3322 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 3323 need_policy = TRUE; 3324 } else { 3325 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 3326 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); 3327 } 3328 } 3329 3330 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 3331 if ((vp->v_type != VREG && 3332 xoap->xoa_av_quarantined) || 3333 xoap->xoa_av_quarantined != 3334 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 3335 need_policy = TRUE; 3336 } else { 3337 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 3338 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); 3339 } 3340 } 3341 3342 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 3343 mutex_exit(&zp->z_lock); 3344 ZFS_EXIT(zfsvfs); 3345 return (SET_ERROR(EPERM)); 3346 } 3347 3348 if (need_policy == FALSE && 3349 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 3350 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 3351 need_policy = TRUE; 3352 } 3353 } 3354 3355 mutex_exit(&zp->z_lock); 3356 3357 if (mask & AT_MODE) { 3358 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 3359 err = secpolicy_setid_setsticky_clear(vp, vap, 3360 &oldva, cr); 3361 if (err) { 3362 ZFS_EXIT(zfsvfs); 3363 return (err); 3364 } 3365 trim_mask |= AT_MODE; 3366 } else { 3367 need_policy = TRUE; 3368 } 3369 } 3370 3371 if (need_policy) { 3372 /* 3373 * If trim_mask is set then take ownership 3374 * has been granted or write_acl is present and user 3375 * has the ability to modify mode. In that case remove 3376 * UID|GID and or MODE from mask so that 3377 * secpolicy_vnode_setattr() doesn't revoke it. 3378 */ 3379 3380 if (trim_mask) { 3381 saved_mask = vap->va_mask; 3382 vap->va_mask &= ~trim_mask; 3383 if (trim_mask & AT_MODE) { 3384 /* 3385 * Save the mode, as secpolicy_vnode_setattr() 3386 * will overwrite it with ova.va_mode. 3387 */ 3388 saved_mode = vap->va_mode; 3389 } 3390 } 3391 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 3392 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 3393 if (err) { 3394 ZFS_EXIT(zfsvfs); 3395 return (err); 3396 } 3397 3398 if (trim_mask) { 3399 vap->va_mask |= saved_mask; 3400 if (trim_mask & AT_MODE) { 3401 /* 3402 * Recover the mode after 3403 * secpolicy_vnode_setattr(). 3404 */ 3405 vap->va_mode = saved_mode; 3406 } 3407 } 3408 } 3409 3410 /* 3411 * secpolicy_vnode_setattr, or take ownership may have 3412 * changed va_mask 3413 */ 3414 mask = vap->va_mask; 3415 3416 if ((mask & (AT_UID | AT_GID))) { 3417 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 3418 &xattr_obj, sizeof (xattr_obj)); 3419 3420 if (err == 0 && xattr_obj) { 3421 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); 3422 if (err) 3423 goto out2; 3424 } 3425 if (mask & AT_UID) { 3426 new_uid = zfs_fuid_create(zfsvfs, 3427 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 3428 if (new_uid != zp->z_uid && 3429 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { 3430 if (attrzp) 3431 VN_RELE(ZTOV(attrzp)); 3432 err = SET_ERROR(EDQUOT); 3433 goto out2; 3434 } 3435 } 3436 3437 if (mask & AT_GID) { 3438 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, 3439 cr, ZFS_GROUP, &fuidp); 3440 if (new_gid != zp->z_gid && 3441 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { 3442 if (attrzp) 3443 VN_RELE(ZTOV(attrzp)); 3444 err = SET_ERROR(EDQUOT); 3445 goto out2; 3446 } 3447 } 3448 } 3449 tx = dmu_tx_create(zfsvfs->z_os); 3450 3451 if (mask & AT_MODE) { 3452 uint64_t pmode = zp->z_mode; 3453 uint64_t acl_obj; 3454 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 3455 3456 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && 3457 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 3458 err = SET_ERROR(EPERM); 3459 goto out; 3460 } 3461 3462 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) 3463 goto out; 3464 3465 mutex_enter(&zp->z_lock); 3466 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 3467 /* 3468 * Are we upgrading ACL from old V0 format 3469 * to V1 format? 3470 */ 3471 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 3472 zfs_znode_acl_version(zp) == 3473 ZFS_ACL_VERSION_INITIAL) { 3474 dmu_tx_hold_free(tx, acl_obj, 0, 3475 DMU_OBJECT_END); 3476 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3477 0, aclp->z_acl_bytes); 3478 } else { 3479 dmu_tx_hold_write(tx, acl_obj, 0, 3480 aclp->z_acl_bytes); 3481 } 3482 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3483 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3484 0, aclp->z_acl_bytes); 3485 } 3486 mutex_exit(&zp->z_lock); 3487 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3488 } else { 3489 if ((mask & AT_XVATTR) && 3490 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3491 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3492 else 3493 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3494 } 3495 3496 if (attrzp) { 3497 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 3498 } 3499 3500 fuid_dirtied = zfsvfs->z_fuid_dirty; 3501 if (fuid_dirtied) 3502 zfs_fuid_txhold(zfsvfs, tx); 3503 3504 zfs_sa_upgrade_txholds(tx, zp); 3505 3506 err = dmu_tx_assign(tx, TXG_WAIT); 3507 if (err) 3508 goto out; 3509 3510 count = 0; 3511 /* 3512 * Set each attribute requested. 3513 * We group settings according to the locks they need to acquire. 3514 * 3515 * Note: you cannot set ctime directly, although it will be 3516 * updated as a side-effect of calling this function. 3517 */ 3518 3519 3520 if (mask & (AT_UID|AT_GID|AT_MODE)) 3521 mutex_enter(&zp->z_acl_lock); 3522 mutex_enter(&zp->z_lock); 3523 3524 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 3525 &zp->z_pflags, sizeof (zp->z_pflags)); 3526 3527 if (attrzp) { 3528 if (mask & (AT_UID|AT_GID|AT_MODE)) 3529 mutex_enter(&attrzp->z_acl_lock); 3530 mutex_enter(&attrzp->z_lock); 3531 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3532 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 3533 sizeof (attrzp->z_pflags)); 3534 } 3535 3536 if (mask & (AT_UID|AT_GID)) { 3537 3538 if (mask & AT_UID) { 3539 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 3540 &new_uid, sizeof (new_uid)); 3541 zp->z_uid = new_uid; 3542 if (attrzp) { 3543 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3544 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 3545 sizeof (new_uid)); 3546 attrzp->z_uid = new_uid; 3547 } 3548 } 3549 3550 if (mask & AT_GID) { 3551 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 3552 NULL, &new_gid, sizeof (new_gid)); 3553 zp->z_gid = new_gid; 3554 if (attrzp) { 3555 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3556 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 3557 sizeof (new_gid)); 3558 attrzp->z_gid = new_gid; 3559 } 3560 } 3561 if (!(mask & AT_MODE)) { 3562 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 3563 NULL, &new_mode, sizeof (new_mode)); 3564 new_mode = zp->z_mode; 3565 } 3566 err = zfs_acl_chown_setattr(zp); 3567 ASSERT(err == 0); 3568 if (attrzp) { 3569 err = zfs_acl_chown_setattr(attrzp); 3570 ASSERT(err == 0); 3571 } 3572 } 3573 3574 if (mask & AT_MODE) { 3575 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 3576 &new_mode, sizeof (new_mode)); 3577 zp->z_mode = new_mode; 3578 ASSERT3U((uintptr_t)aclp, !=, 0); 3579 err = zfs_aclset_common(zp, aclp, cr, tx); 3580 ASSERT0(err); 3581 if (zp->z_acl_cached) 3582 zfs_acl_free(zp->z_acl_cached); 3583 zp->z_acl_cached = aclp; 3584 aclp = NULL; 3585 } 3586 3587 3588 if (mask & AT_ATIME) { 3589 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); 3590 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 3591 &zp->z_atime, sizeof (zp->z_atime)); 3592 } 3593 3594 if (mask & AT_MTIME) { 3595 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 3596 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 3597 mtime, sizeof (mtime)); 3598 } 3599 3600 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 3601 if (mask & AT_SIZE && !(mask & AT_MTIME)) { 3602 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 3603 NULL, mtime, sizeof (mtime)); 3604 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3605 &ctime, sizeof (ctime)); 3606 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 3607 B_TRUE); 3608 } else if (mask != 0) { 3609 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3610 &ctime, sizeof (ctime)); 3611 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, 3612 B_TRUE); 3613 if (attrzp) { 3614 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3615 SA_ZPL_CTIME(zfsvfs), NULL, 3616 &ctime, sizeof (ctime)); 3617 zfs_tstamp_update_setup(attrzp, STATE_CHANGED, 3618 mtime, ctime, B_TRUE); 3619 } 3620 } 3621 /* 3622 * Do this after setting timestamps to prevent timestamp 3623 * update from toggling bit 3624 */ 3625 3626 if (xoap && (mask & AT_XVATTR)) { 3627 3628 /* 3629 * restore trimmed off masks 3630 * so that return masks can be set for caller. 3631 */ 3632 3633 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { 3634 XVA_SET_REQ(xvap, XAT_APPENDONLY); 3635 } 3636 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { 3637 XVA_SET_REQ(xvap, XAT_NOUNLINK); 3638 } 3639 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { 3640 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 3641 } 3642 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { 3643 XVA_SET_REQ(xvap, XAT_NODUMP); 3644 } 3645 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { 3646 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 3647 } 3648 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { 3649 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 3650 } 3651 3652 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3653 ASSERT(vp->v_type == VREG); 3654 3655 zfs_xvattr_set(zp, xvap, tx); 3656 } 3657 3658 if (fuid_dirtied) 3659 zfs_fuid_sync(zfsvfs, tx); 3660 3661 if (mask != 0) 3662 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 3663 3664 mutex_exit(&zp->z_lock); 3665 if (mask & (AT_UID|AT_GID|AT_MODE)) 3666 mutex_exit(&zp->z_acl_lock); 3667 3668 if (attrzp) { 3669 if (mask & (AT_UID|AT_GID|AT_MODE)) 3670 mutex_exit(&attrzp->z_acl_lock); 3671 mutex_exit(&attrzp->z_lock); 3672 } 3673out: 3674 if (err == 0 && attrzp) { 3675 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 3676 xattr_count, tx); 3677 ASSERT(err2 == 0); 3678 } 3679 3680 if (attrzp) 3681 VN_RELE(ZTOV(attrzp)); 3682 3683 if (aclp) 3684 zfs_acl_free(aclp); 3685 3686 if (fuidp) { 3687 zfs_fuid_info_free(fuidp); 3688 fuidp = NULL; 3689 } 3690 3691 if (err) { 3692 dmu_tx_abort(tx); 3693 if (err == ERESTART) 3694 goto top; 3695 } else { 3696 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 3697 dmu_tx_commit(tx); 3698 } 3699 3700out2: 3701 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3702 zil_commit(zilog, 0); 3703 3704 ZFS_EXIT(zfsvfs); 3705 return (err); 3706} 3707 3708typedef struct zfs_zlock { 3709 krwlock_t *zl_rwlock; /* lock we acquired */ 3710 znode_t *zl_znode; /* znode we held */ 3711 struct zfs_zlock *zl_next; /* next in list */ 3712} zfs_zlock_t; 3713 3714/* 3715 * Drop locks and release vnodes that were held by zfs_rename_lock(). 3716 */ 3717static void 3718zfs_rename_unlock(zfs_zlock_t **zlpp) 3719{ 3720 zfs_zlock_t *zl; 3721 3722 while ((zl = *zlpp) != NULL) { 3723 if (zl->zl_znode != NULL) 3724 VN_RELE(ZTOV(zl->zl_znode)); 3725 rw_exit(zl->zl_rwlock); 3726 *zlpp = zl->zl_next; 3727 kmem_free(zl, sizeof (*zl)); 3728 } 3729} 3730 3731/* 3732 * Search back through the directory tree, using the ".." entries. 3733 * Lock each directory in the chain to prevent concurrent renames. 3734 * Fail any attempt to move a directory into one of its own descendants. 3735 * XXX - z_parent_lock can overlap with map or grow locks 3736 */ 3737static int 3738zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 3739{ 3740 zfs_zlock_t *zl; 3741 znode_t *zp = tdzp; 3742 uint64_t rootid = zp->z_zfsvfs->z_root; 3743 uint64_t oidp = zp->z_id; 3744 krwlock_t *rwlp = &szp->z_parent_lock; 3745 krw_t rw = RW_WRITER; 3746 3747 /* 3748 * First pass write-locks szp and compares to zp->z_id. 3749 * Later passes read-lock zp and compare to zp->z_parent. 3750 */ 3751 do { 3752 if (!rw_tryenter(rwlp, rw)) { 3753 /* 3754 * Another thread is renaming in this path. 3755 * Note that if we are a WRITER, we don't have any 3756 * parent_locks held yet. 3757 */ 3758 if (rw == RW_READER && zp->z_id > szp->z_id) { 3759 /* 3760 * Drop our locks and restart 3761 */ 3762 zfs_rename_unlock(&zl); 3763 *zlpp = NULL; 3764 zp = tdzp; 3765 oidp = zp->z_id; 3766 rwlp = &szp->z_parent_lock; 3767 rw = RW_WRITER; 3768 continue; 3769 } else { 3770 /* 3771 * Wait for other thread to drop its locks 3772 */ 3773 rw_enter(rwlp, rw); 3774 } 3775 } 3776 3777 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 3778 zl->zl_rwlock = rwlp; 3779 zl->zl_znode = NULL; 3780 zl->zl_next = *zlpp; 3781 *zlpp = zl; 3782 3783 if (oidp == szp->z_id) /* We're a descendant of szp */ 3784 return (SET_ERROR(EINVAL)); 3785 3786 if (oidp == rootid) /* We've hit the top */ 3787 return (0); 3788 3789 if (rw == RW_READER) { /* i.e. not the first pass */ 3790 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); 3791 if (error) 3792 return (error); 3793 zl->zl_znode = zp; 3794 } 3795 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), 3796 &oidp, sizeof (oidp)); 3797 rwlp = &zp->z_parent_lock; 3798 rw = RW_READER; 3799 3800 } while (zp->z_id != sdzp->z_id); 3801 3802 return (0); 3803} 3804 3805/* 3806 * Move an entry from the provided source directory to the target 3807 * directory. Change the entry name as indicated. 3808 * 3809 * IN: sdvp - Source directory containing the "old entry". 3810 * snm - Old entry name. 3811 * tdvp - Target directory to contain the "new entry". 3812 * tnm - New entry name. 3813 * cr - credentials of caller. 3814 * ct - caller context 3815 * flags - case flags 3816 * 3817 * RETURN: 0 on success, error code on failure. 3818 * 3819 * Timestamps: 3820 * sdvp,tdvp - ctime|mtime updated 3821 */ 3822/*ARGSUSED*/ 3823static int 3824zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, 3825 caller_context_t *ct, int flags) 3826{ 3827 znode_t *tdzp, *sdzp, *szp, *tzp; 3828 zfsvfs_t *zfsvfs; 3829 zilog_t *zilog; 3830 vnode_t *realvp; 3831 zfs_dirlock_t *sdl, *tdl; 3832 dmu_tx_t *tx; 3833 zfs_zlock_t *zl; 3834 int cmp, serr, terr; 3835 int error = 0; 3836 int zflg = 0; 3837 boolean_t waited = B_FALSE; 3838 3839 tdzp = VTOZ(tdvp); 3840 ZFS_VERIFY_ZP(tdzp); 3841 zfsvfs = tdzp->z_zfsvfs; 3842 ZFS_ENTER(zfsvfs); 3843 zilog = zfsvfs->z_log; 3844 sdzp = VTOZ(sdvp); 3845 3846 /* 3847 * In case sdzp is not valid, let's be sure to exit from the right 3848 * zfsvfs_t. 3849 */ 3850 if (sdzp->z_sa_hdl == NULL) { 3851 ZFS_EXIT(zfsvfs); 3852 return (SET_ERROR(EIO)); 3853 } 3854 3855 /* 3856 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 3857 * ctldir appear to have the same v_vfsp. 3858 */ 3859 if (sdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) { 3860 ZFS_EXIT(zfsvfs); 3861 return (SET_ERROR(EXDEV)); 3862 } 3863 3864 if (zfsvfs->z_utf8 && u8_validate(tnm, 3865 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3866 ZFS_EXIT(zfsvfs); 3867 return (SET_ERROR(EILSEQ)); 3868 } 3869 3870 if (flags & FIGNORECASE) 3871 zflg |= ZCILOOK; 3872 3873top: 3874 szp = NULL; 3875 tzp = NULL; 3876 zl = NULL; 3877 3878 /* 3879 * This is to prevent the creation of links into attribute space 3880 * by renaming a linked file into/outof an attribute directory. 3881 * See the comment in zfs_link() for why this is considered bad. 3882 */ 3883 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 3884 ZFS_EXIT(zfsvfs); 3885 return (SET_ERROR(EINVAL)); 3886 } 3887 3888 /* 3889 * Lock source and target directory entries. To prevent deadlock, 3890 * a lock ordering must be defined. We lock the directory with 3891 * the smallest object id first, or if it's a tie, the one with 3892 * the lexically first name. 3893 */ 3894 if (sdzp->z_id < tdzp->z_id) { 3895 cmp = -1; 3896 } else if (sdzp->z_id > tdzp->z_id) { 3897 cmp = 1; 3898 } else { 3899 /* 3900 * First compare the two name arguments without 3901 * considering any case folding. 3902 */ 3903 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 3904 3905 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 3906 ASSERT(error == 0 || !zfsvfs->z_utf8); 3907 if (cmp == 0) { 3908 /* 3909 * POSIX: "If the old argument and the new argument 3910 * both refer to links to the same existing file, 3911 * the rename() function shall return successfully 3912 * and perform no other action." 3913 */ 3914 ZFS_EXIT(zfsvfs); 3915 return (0); 3916 } 3917 /* 3918 * If the file system is case-folding, then we may 3919 * have some more checking to do. A case-folding file 3920 * system is either supporting mixed case sensitivity 3921 * access or is completely case-insensitive. Note 3922 * that the file system is always case preserving. 3923 * 3924 * In mixed sensitivity mode case sensitive behavior 3925 * is the default. FIGNORECASE must be used to 3926 * explicitly request case insensitive behavior. 3927 * 3928 * If the source and target names provided differ only 3929 * by case (e.g., a request to rename 'tim' to 'Tim'), 3930 * we will treat this as a special case in the 3931 * case-insensitive mode: as long as the source name 3932 * is an exact match, we will allow this to proceed as 3933 * a name-change request. 3934 */ 3935 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 3936 (zfsvfs->z_case == ZFS_CASE_MIXED && 3937 flags & FIGNORECASE)) && 3938 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 3939 &error) == 0) { 3940 /* 3941 * case preserving rename request, require exact 3942 * name matches 3943 */ 3944 zflg |= ZCIEXACT; 3945 zflg &= ~ZCILOOK; 3946 } 3947 } 3948 3949 /* 3950 * If the source and destination directories are the same, we should 3951 * grab the z_name_lock of that directory only once. 3952 */ 3953 if (sdzp == tdzp) { 3954 zflg |= ZHAVELOCK; 3955 rw_enter(&sdzp->z_name_lock, RW_READER); 3956 } 3957 3958 if (cmp < 0) { 3959 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 3960 ZEXISTS | zflg, NULL, NULL); 3961 terr = zfs_dirent_lock(&tdl, 3962 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 3963 } else { 3964 terr = zfs_dirent_lock(&tdl, 3965 tdzp, tnm, &tzp, zflg, NULL, NULL); 3966 serr = zfs_dirent_lock(&sdl, 3967 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 3968 NULL, NULL); 3969 } 3970 3971 if (serr) { 3972 /* 3973 * Source entry invalid or not there. 3974 */ 3975 if (!terr) { 3976 zfs_dirent_unlock(tdl); 3977 if (tzp) 3978 VN_RELE(ZTOV(tzp)); 3979 } 3980 3981 if (sdzp == tdzp) 3982 rw_exit(&sdzp->z_name_lock); 3983 3984 /* 3985 * FreeBSD: In OpenSolaris they only check if rename source is 3986 * ".." here, because "." is handled in their lookup. This is 3987 * not the case for FreeBSD, so we check for "." explicitly. 3988 */ 3989 if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) 3990 serr = SET_ERROR(EINVAL); 3991 ZFS_EXIT(zfsvfs); 3992 return (serr); 3993 } 3994 if (terr) { 3995 zfs_dirent_unlock(sdl); 3996 VN_RELE(ZTOV(szp)); 3997 3998 if (sdzp == tdzp) 3999 rw_exit(&sdzp->z_name_lock); 4000 4001 if (strcmp(tnm, "..") == 0) 4002 terr = SET_ERROR(EINVAL); 4003 ZFS_EXIT(zfsvfs); 4004 return (terr); 4005 } 4006 4007 /* 4008 * Must have write access at the source to remove the old entry 4009 * and write access at the target to create the new entry. 4010 * Note that if target and source are the same, this can be 4011 * done in a single check. 4012 */ 4013 4014 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 4015 goto out; 4016 4017 if (ZTOV(szp)->v_type == VDIR) { 4018 /* 4019 * Check to make sure rename is valid. 4020 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 4021 */ 4022 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) 4023 goto out; 4024 } 4025 4026 /* 4027 * Does target exist? 4028 */ 4029 if (tzp) { 4030 /* 4031 * Source and target must be the same type. 4032 */ 4033 if (ZTOV(szp)->v_type == VDIR) { 4034 if (ZTOV(tzp)->v_type != VDIR) { 4035 error = SET_ERROR(ENOTDIR); 4036 goto out; 4037 } 4038 } else { 4039 if (ZTOV(tzp)->v_type == VDIR) { 4040 error = SET_ERROR(EISDIR); 4041 goto out; 4042 } 4043 } 4044 /* 4045 * POSIX dictates that when the source and target 4046 * entries refer to the same file object, rename 4047 * must do nothing and exit without error. 4048 */ 4049 if (szp->z_id == tzp->z_id) { 4050 error = 0; 4051 goto out; 4052 } 4053 } 4054 4055 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); 4056 if (tzp) 4057 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); 4058 4059 /* 4060 * notify the target directory if it is not the same 4061 * as source directory. 4062 */ 4063 if (tdvp != sdvp) { 4064 vnevent_rename_dest_dir(tdvp, ct); 4065 } 4066 4067 tx = dmu_tx_create(zfsvfs->z_os); 4068 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4069 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 4070 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 4071 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 4072 if (sdzp != tdzp) { 4073 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 4074 zfs_sa_upgrade_txholds(tx, tdzp); 4075 } 4076 if (tzp) { 4077 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 4078 zfs_sa_upgrade_txholds(tx, tzp); 4079 } 4080 4081 zfs_sa_upgrade_txholds(tx, szp); 4082 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 4083 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 4084 if (error) { 4085 if (zl != NULL) 4086 zfs_rename_unlock(&zl); 4087 zfs_dirent_unlock(sdl); 4088 zfs_dirent_unlock(tdl); 4089 4090 if (sdzp == tdzp) 4091 rw_exit(&sdzp->z_name_lock); 4092 4093 VN_RELE(ZTOV(szp)); 4094 if (tzp) 4095 VN_RELE(ZTOV(tzp)); 4096 if (error == ERESTART) { 4097 waited = B_TRUE; 4098 dmu_tx_wait(tx); 4099 dmu_tx_abort(tx); 4100 goto top; 4101 } 4102 dmu_tx_abort(tx); 4103 ZFS_EXIT(zfsvfs); 4104 return (error); 4105 } 4106 4107 if (tzp) /* Attempt to remove the existing target */ 4108 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); 4109 4110 if (error == 0) { 4111 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 4112 if (error == 0) { 4113 szp->z_pflags |= ZFS_AV_MODIFIED; 4114 4115 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 4116 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 4117 ASSERT0(error); 4118 4119 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 4120 if (error == 0) { 4121 zfs_log_rename(zilog, tx, TX_RENAME | 4122 (flags & FIGNORECASE ? TX_CI : 0), sdzp, 4123 sdl->dl_name, tdzp, tdl->dl_name, szp); 4124 4125 /* 4126 * Update path information for the target vnode 4127 */ 4128 vn_renamepath(tdvp, ZTOV(szp), tnm, 4129 strlen(tnm)); 4130 } else { 4131 /* 4132 * At this point, we have successfully created 4133 * the target name, but have failed to remove 4134 * the source name. Since the create was done 4135 * with the ZRENAMING flag, there are 4136 * complications; for one, the link count is 4137 * wrong. The easiest way to deal with this 4138 * is to remove the newly created target, and 4139 * return the original error. This must 4140 * succeed; fortunately, it is very unlikely to 4141 * fail, since we just created it. 4142 */ 4143 VERIFY3U(zfs_link_destroy(tdl, szp, tx, 4144 ZRENAMING, NULL), ==, 0); 4145 } 4146 } 4147#ifdef FREEBSD_NAMECACHE 4148 if (error == 0) { 4149 cache_purge(sdvp); 4150 cache_purge(tdvp); 4151 cache_purge(ZTOV(szp)); 4152 if (tzp) 4153 cache_purge(ZTOV(tzp)); 4154 } 4155#endif 4156 } 4157 4158 dmu_tx_commit(tx); 4159out: 4160 if (zl != NULL) 4161 zfs_rename_unlock(&zl); 4162 4163 zfs_dirent_unlock(sdl); 4164 zfs_dirent_unlock(tdl); 4165 4166 if (sdzp == tdzp) 4167 rw_exit(&sdzp->z_name_lock); 4168 4169 4170 VN_RELE(ZTOV(szp)); 4171 if (tzp) 4172 VN_RELE(ZTOV(tzp)); 4173 4174 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4175 zil_commit(zilog, 0); 4176 4177 ZFS_EXIT(zfsvfs); 4178 4179 return (error); 4180} 4181 4182/* 4183 * Insert the indicated symbolic reference entry into the directory. 4184 * 4185 * IN: dvp - Directory to contain new symbolic link. 4186 * link - Name for new symlink entry. 4187 * vap - Attributes of new entry. 4188 * cr - credentials of caller. 4189 * ct - caller context 4190 * flags - case flags 4191 * 4192 * RETURN: 0 on success, error code on failure. 4193 * 4194 * Timestamps: 4195 * dvp - ctime|mtime updated 4196 */ 4197/*ARGSUSED*/ 4198static int 4199zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, 4200 cred_t *cr, kthread_t *td) 4201{ 4202 znode_t *zp, *dzp = VTOZ(dvp); 4203 zfs_dirlock_t *dl; 4204 dmu_tx_t *tx; 4205 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4206 zilog_t *zilog; 4207 uint64_t len = strlen(link); 4208 int error; 4209 int zflg = ZNEW; 4210 zfs_acl_ids_t acl_ids; 4211 boolean_t fuid_dirtied; 4212 uint64_t txtype = TX_SYMLINK; 4213 boolean_t waited = B_FALSE; 4214 int flags = 0; 4215 4216 ASSERT(vap->va_type == VLNK); 4217 4218 ZFS_ENTER(zfsvfs); 4219 ZFS_VERIFY_ZP(dzp); 4220 zilog = zfsvfs->z_log; 4221 4222 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 4223 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4224 ZFS_EXIT(zfsvfs); 4225 return (SET_ERROR(EILSEQ)); 4226 } 4227 if (flags & FIGNORECASE) 4228 zflg |= ZCILOOK; 4229 4230 if (len > MAXPATHLEN) { 4231 ZFS_EXIT(zfsvfs); 4232 return (SET_ERROR(ENAMETOOLONG)); 4233 } 4234 4235 if ((error = zfs_acl_ids_create(dzp, 0, 4236 vap, cr, NULL, &acl_ids)) != 0) { 4237 ZFS_EXIT(zfsvfs); 4238 return (error); 4239 } 4240 4241 getnewvnode_reserve(1); 4242 4243top: 4244 /* 4245 * Attempt to lock directory; fail if entry already exists. 4246 */ 4247 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 4248 if (error) { 4249 zfs_acl_ids_free(&acl_ids); 4250 getnewvnode_drop_reserve(); 4251 ZFS_EXIT(zfsvfs); 4252 return (error); 4253 } 4254 4255 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4256 zfs_acl_ids_free(&acl_ids); 4257 zfs_dirent_unlock(dl); 4258 getnewvnode_drop_reserve(); 4259 ZFS_EXIT(zfsvfs); 4260 return (error); 4261 } 4262 4263 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 4264 zfs_acl_ids_free(&acl_ids); 4265 zfs_dirent_unlock(dl); 4266 getnewvnode_drop_reserve(); 4267 ZFS_EXIT(zfsvfs); 4268 return (SET_ERROR(EDQUOT)); 4269 } 4270 tx = dmu_tx_create(zfsvfs->z_os); 4271 fuid_dirtied = zfsvfs->z_fuid_dirty; 4272 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 4273 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4274 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 4275 ZFS_SA_BASE_ATTR_SIZE + len); 4276 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 4277 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 4278 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 4279 acl_ids.z_aclp->z_acl_bytes); 4280 } 4281 if (fuid_dirtied) 4282 zfs_fuid_txhold(zfsvfs, tx); 4283 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 4284 if (error) { 4285 zfs_dirent_unlock(dl); 4286 if (error == ERESTART) { 4287 waited = B_TRUE; 4288 dmu_tx_wait(tx); 4289 dmu_tx_abort(tx); 4290 goto top; 4291 } 4292 zfs_acl_ids_free(&acl_ids); 4293 dmu_tx_abort(tx); 4294 getnewvnode_drop_reserve(); 4295 ZFS_EXIT(zfsvfs); 4296 return (error); 4297 } 4298 4299 /* 4300 * Create a new object for the symlink. 4301 * for version 4 ZPL datsets the symlink will be an SA attribute 4302 */ 4303 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 4304 4305 if (fuid_dirtied) 4306 zfs_fuid_sync(zfsvfs, tx); 4307 4308 mutex_enter(&zp->z_lock); 4309 if (zp->z_is_sa) 4310 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 4311 link, len, tx); 4312 else 4313 zfs_sa_symlink(zp, link, len, tx); 4314 mutex_exit(&zp->z_lock); 4315 4316 zp->z_size = len; 4317 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 4318 &zp->z_size, sizeof (zp->z_size), tx); 4319 /* 4320 * Insert the new object into the directory. 4321 */ 4322 (void) zfs_link_create(dl, zp, tx, ZNEW); 4323 4324 if (flags & FIGNORECASE) 4325 txtype |= TX_CI; 4326 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 4327 *vpp = ZTOV(zp); 4328 4329 zfs_acl_ids_free(&acl_ids); 4330 4331 dmu_tx_commit(tx); 4332 4333 getnewvnode_drop_reserve(); 4334 4335 zfs_dirent_unlock(dl); 4336 4337 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4338 zil_commit(zilog, 0); 4339 4340 ZFS_EXIT(zfsvfs); 4341 return (error); 4342} 4343 4344/* 4345 * Return, in the buffer contained in the provided uio structure, 4346 * the symbolic path referred to by vp. 4347 * 4348 * IN: vp - vnode of symbolic link. 4349 * uio - structure to contain the link path. 4350 * cr - credentials of caller. 4351 * ct - caller context 4352 * 4353 * OUT: uio - structure containing the link path. 4354 * 4355 * RETURN: 0 on success, error code on failure. 4356 * 4357 * Timestamps: 4358 * vp - atime updated 4359 */ 4360/* ARGSUSED */ 4361static int 4362zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 4363{ 4364 znode_t *zp = VTOZ(vp); 4365 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4366 int error; 4367 4368 ZFS_ENTER(zfsvfs); 4369 ZFS_VERIFY_ZP(zp); 4370 4371 mutex_enter(&zp->z_lock); 4372 if (zp->z_is_sa) 4373 error = sa_lookup_uio(zp->z_sa_hdl, 4374 SA_ZPL_SYMLINK(zfsvfs), uio); 4375 else 4376 error = zfs_sa_readlink(zp, uio); 4377 mutex_exit(&zp->z_lock); 4378 4379 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4380 4381 ZFS_EXIT(zfsvfs); 4382 return (error); 4383} 4384 4385/* 4386 * Insert a new entry into directory tdvp referencing svp. 4387 * 4388 * IN: tdvp - Directory to contain new entry. 4389 * svp - vnode of new entry. 4390 * name - name of new entry. 4391 * cr - credentials of caller. 4392 * ct - caller context 4393 * 4394 * RETURN: 0 on success, error code on failure. 4395 * 4396 * Timestamps: 4397 * tdvp - ctime|mtime updated 4398 * svp - ctime updated 4399 */ 4400/* ARGSUSED */ 4401static int 4402zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 4403 caller_context_t *ct, int flags) 4404{ 4405 znode_t *dzp = VTOZ(tdvp); 4406 znode_t *tzp, *szp; 4407 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4408 zilog_t *zilog; 4409 zfs_dirlock_t *dl; 4410 dmu_tx_t *tx; 4411 vnode_t *realvp; 4412 int error; 4413 int zf = ZNEW; 4414 uint64_t parent; 4415 uid_t owner; 4416 boolean_t waited = B_FALSE; 4417 4418 ASSERT(tdvp->v_type == VDIR); 4419 4420 ZFS_ENTER(zfsvfs); 4421 ZFS_VERIFY_ZP(dzp); 4422 zilog = zfsvfs->z_log; 4423 4424 if (VOP_REALVP(svp, &realvp, ct) == 0) 4425 svp = realvp; 4426 4427 /* 4428 * POSIX dictates that we return EPERM here. 4429 * Better choices include ENOTSUP or EISDIR. 4430 */ 4431 if (svp->v_type == VDIR) { 4432 ZFS_EXIT(zfsvfs); 4433 return (SET_ERROR(EPERM)); 4434 } 4435 4436 szp = VTOZ(svp); 4437 ZFS_VERIFY_ZP(szp); 4438 4439 if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { 4440 ZFS_EXIT(zfsvfs); 4441 return (SET_ERROR(EPERM)); 4442 } 4443 4444 /* 4445 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the 4446 * ctldir appear to have the same v_vfsp. 4447 */ 4448 if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) { 4449 ZFS_EXIT(zfsvfs); 4450 return (SET_ERROR(EXDEV)); 4451 } 4452 4453 /* Prevent links to .zfs/shares files */ 4454 4455 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 4456 &parent, sizeof (uint64_t))) != 0) { 4457 ZFS_EXIT(zfsvfs); 4458 return (error); 4459 } 4460 if (parent == zfsvfs->z_shares_dir) { 4461 ZFS_EXIT(zfsvfs); 4462 return (SET_ERROR(EPERM)); 4463 } 4464 4465 if (zfsvfs->z_utf8 && u8_validate(name, 4466 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4467 ZFS_EXIT(zfsvfs); 4468 return (SET_ERROR(EILSEQ)); 4469 } 4470 if (flags & FIGNORECASE) 4471 zf |= ZCILOOK; 4472 4473 /* 4474 * We do not support links between attributes and non-attributes 4475 * because of the potential security risk of creating links 4476 * into "normal" file space in order to circumvent restrictions 4477 * imposed in attribute space. 4478 */ 4479 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { 4480 ZFS_EXIT(zfsvfs); 4481 return (SET_ERROR(EINVAL)); 4482 } 4483 4484 4485 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); 4486 if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { 4487 ZFS_EXIT(zfsvfs); 4488 return (SET_ERROR(EPERM)); 4489 } 4490 4491 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4492 ZFS_EXIT(zfsvfs); 4493 return (error); 4494 } 4495 4496top: 4497 /* 4498 * Attempt to lock directory; fail if entry already exists. 4499 */ 4500 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); 4501 if (error) { 4502 ZFS_EXIT(zfsvfs); 4503 return (error); 4504 } 4505 4506 tx = dmu_tx_create(zfsvfs->z_os); 4507 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4508 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4509 zfs_sa_upgrade_txholds(tx, szp); 4510 zfs_sa_upgrade_txholds(tx, dzp); 4511 error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 4512 if (error) { 4513 zfs_dirent_unlock(dl); 4514 if (error == ERESTART) { 4515 waited = B_TRUE; 4516 dmu_tx_wait(tx); 4517 dmu_tx_abort(tx); 4518 goto top; 4519 } 4520 dmu_tx_abort(tx); 4521 ZFS_EXIT(zfsvfs); 4522 return (error); 4523 } 4524 4525 error = zfs_link_create(dl, szp, tx, 0); 4526 4527 if (error == 0) { 4528 uint64_t txtype = TX_LINK; 4529 if (flags & FIGNORECASE) 4530 txtype |= TX_CI; 4531 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 4532 } 4533 4534 dmu_tx_commit(tx); 4535 4536 zfs_dirent_unlock(dl); 4537 4538 if (error == 0) { 4539 vnevent_link(svp, ct); 4540 } 4541 4542 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4543 zil_commit(zilog, 0); 4544 4545 ZFS_EXIT(zfsvfs); 4546 return (error); 4547} 4548 4549#ifdef illumos 4550/* 4551 * zfs_null_putapage() is used when the file system has been force 4552 * unmounted. It just drops the pages. 4553 */ 4554/* ARGSUSED */ 4555static int 4556zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 4557 size_t *lenp, int flags, cred_t *cr) 4558{ 4559 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); 4560 return (0); 4561} 4562 4563/* 4564 * Push a page out to disk, klustering if possible. 4565 * 4566 * IN: vp - file to push page to. 4567 * pp - page to push. 4568 * flags - additional flags. 4569 * cr - credentials of caller. 4570 * 4571 * OUT: offp - start of range pushed. 4572 * lenp - len of range pushed. 4573 * 4574 * RETURN: 0 on success, error code on failure. 4575 * 4576 * NOTE: callers must have locked the page to be pushed. On 4577 * exit, the page (and all other pages in the kluster) must be 4578 * unlocked. 4579 */ 4580/* ARGSUSED */ 4581static int 4582zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, 4583 size_t *lenp, int flags, cred_t *cr) 4584{ 4585 znode_t *zp = VTOZ(vp); 4586 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4587 dmu_tx_t *tx; 4588 u_offset_t off, koff; 4589 size_t len, klen; 4590 int err; 4591 4592 off = pp->p_offset; 4593 len = PAGESIZE; 4594 /* 4595 * If our blocksize is bigger than the page size, try to kluster 4596 * multiple pages so that we write a full block (thus avoiding 4597 * a read-modify-write). 4598 */ 4599 if (off < zp->z_size && zp->z_blksz > PAGESIZE) { 4600 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); 4601 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; 4602 ASSERT(koff <= zp->z_size); 4603 if (koff + klen > zp->z_size) 4604 klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); 4605 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); 4606 } 4607 ASSERT3U(btop(len), ==, btopr(len)); 4608 4609 /* 4610 * Can't push pages past end-of-file. 4611 */ 4612 if (off >= zp->z_size) { 4613 /* ignore all pages */ 4614 err = 0; 4615 goto out; 4616 } else if (off + len > zp->z_size) { 4617 int npages = btopr(zp->z_size - off); 4618 page_t *trunc; 4619 4620 page_list_break(&pp, &trunc, npages); 4621 /* ignore pages past end of file */ 4622 if (trunc) 4623 pvn_write_done(trunc, flags); 4624 len = zp->z_size - off; 4625 } 4626 4627 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 4628 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 4629 err = SET_ERROR(EDQUOT); 4630 goto out; 4631 } 4632 tx = dmu_tx_create(zfsvfs->z_os); 4633 dmu_tx_hold_write(tx, zp->z_id, off, len); 4634 4635 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4636 zfs_sa_upgrade_txholds(tx, zp); 4637 err = dmu_tx_assign(tx, TXG_WAIT); 4638 if (err != 0) { 4639 dmu_tx_abort(tx); 4640 goto out; 4641 } 4642 4643 if (zp->z_blksz <= PAGESIZE) { 4644 caddr_t va = zfs_map_page(pp, S_READ); 4645 ASSERT3U(len, <=, PAGESIZE); 4646 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); 4647 zfs_unmap_page(pp, va); 4648 } else { 4649 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); 4650 } 4651 4652 if (err == 0) { 4653 uint64_t mtime[2], ctime[2]; 4654 sa_bulk_attr_t bulk[3]; 4655 int count = 0; 4656 4657 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 4658 &mtime, 16); 4659 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 4660 &ctime, 16); 4661 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 4662 &zp->z_pflags, 8); 4663 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 4664 B_TRUE); 4665 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); 4666 } 4667 dmu_tx_commit(tx); 4668 4669out: 4670 pvn_write_done(pp, (err ? B_ERROR : 0) | flags); 4671 if (offp) 4672 *offp = off; 4673 if (lenp) 4674 *lenp = len; 4675 4676 return (err); 4677} 4678 4679/* 4680 * Copy the portion of the file indicated from pages into the file. 4681 * The pages are stored in a page list attached to the files vnode. 4682 * 4683 * IN: vp - vnode of file to push page data to. 4684 * off - position in file to put data. 4685 * len - amount of data to write. 4686 * flags - flags to control the operation. 4687 * cr - credentials of caller. 4688 * ct - caller context. 4689 * 4690 * RETURN: 0 on success, error code on failure. 4691 * 4692 * Timestamps: 4693 * vp - ctime|mtime updated 4694 */ 4695/*ARGSUSED*/ 4696static int 4697zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 4698 caller_context_t *ct) 4699{ 4700 znode_t *zp = VTOZ(vp); 4701 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4702 page_t *pp; 4703 size_t io_len; 4704 u_offset_t io_off; 4705 uint_t blksz; 4706 rl_t *rl; 4707 int error = 0; 4708 4709 ZFS_ENTER(zfsvfs); 4710 ZFS_VERIFY_ZP(zp); 4711 4712 /* 4713 * Align this request to the file block size in case we kluster. 4714 * XXX - this can result in pretty aggresive locking, which can 4715 * impact simultanious read/write access. One option might be 4716 * to break up long requests (len == 0) into block-by-block 4717 * operations to get narrower locking. 4718 */ 4719 blksz = zp->z_blksz; 4720 if (ISP2(blksz)) 4721 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); 4722 else 4723 io_off = 0; 4724 if (len > 0 && ISP2(blksz)) 4725 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); 4726 else 4727 io_len = 0; 4728 4729 if (io_len == 0) { 4730 /* 4731 * Search the entire vp list for pages >= io_off. 4732 */ 4733 rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); 4734 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); 4735 goto out; 4736 } 4737 rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); 4738 4739 if (off > zp->z_size) { 4740 /* past end of file */ 4741 zfs_range_unlock(rl); 4742 ZFS_EXIT(zfsvfs); 4743 return (0); 4744 } 4745 4746 len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off); 4747 4748 for (off = io_off; io_off < off + len; io_off += io_len) { 4749 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 4750 pp = page_lookup(vp, io_off, 4751 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); 4752 } else { 4753 pp = page_lookup_nowait(vp, io_off, 4754 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 4755 } 4756 4757 if (pp != NULL && pvn_getdirty(pp, flags)) { 4758 int err; 4759 4760 /* 4761 * Found a dirty page to push 4762 */ 4763 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); 4764 if (err) 4765 error = err; 4766 } else { 4767 io_len = PAGESIZE; 4768 } 4769 } 4770out: 4771 zfs_range_unlock(rl); 4772 if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4773 zil_commit(zfsvfs->z_log, zp->z_id); 4774 ZFS_EXIT(zfsvfs); 4775 return (error); 4776} 4777#endif /* illumos */ 4778 4779/*ARGSUSED*/ 4780void 4781zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4782{ 4783 znode_t *zp = VTOZ(vp); 4784 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4785 int error; 4786 4787 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4788 if (zp->z_sa_hdl == NULL) { 4789 /* 4790 * The fs has been unmounted, or we did a 4791 * suspend/resume and this file no longer exists. 4792 */ 4793 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4794 vrecycle(vp); 4795 return; 4796 } 4797 4798 mutex_enter(&zp->z_lock); 4799 if (zp->z_unlinked) { 4800 /* 4801 * Fast path to recycle a vnode of a removed file. 4802 */ 4803 mutex_exit(&zp->z_lock); 4804 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4805 vrecycle(vp); 4806 return; 4807 } 4808 mutex_exit(&zp->z_lock); 4809 4810 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 4811 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4812 4813 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4814 zfs_sa_upgrade_txholds(tx, zp); 4815 error = dmu_tx_assign(tx, TXG_WAIT); 4816 if (error) { 4817 dmu_tx_abort(tx); 4818 } else { 4819 mutex_enter(&zp->z_lock); 4820 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4821 (void *)&zp->z_atime, sizeof (zp->z_atime), tx); 4822 zp->z_atime_dirty = 0; 4823 mutex_exit(&zp->z_lock); 4824 dmu_tx_commit(tx); 4825 } 4826 } 4827 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4828} 4829 4830#ifdef illumos 4831/* 4832 * Bounds-check the seek operation. 4833 * 4834 * IN: vp - vnode seeking within 4835 * ooff - old file offset 4836 * noffp - pointer to new file offset 4837 * ct - caller context 4838 * 4839 * RETURN: 0 on success, EINVAL if new offset invalid. 4840 */ 4841/* ARGSUSED */ 4842static int 4843zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, 4844 caller_context_t *ct) 4845{ 4846 if (vp->v_type == VDIR) 4847 return (0); 4848 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 4849} 4850 4851/* 4852 * Pre-filter the generic locking function to trap attempts to place 4853 * a mandatory lock on a memory mapped file. 4854 */ 4855static int 4856zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, 4857 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) 4858{ 4859 znode_t *zp = VTOZ(vp); 4860 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4861 4862 ZFS_ENTER(zfsvfs); 4863 ZFS_VERIFY_ZP(zp); 4864 4865 /* 4866 * We are following the UFS semantics with respect to mapcnt 4867 * here: If we see that the file is mapped already, then we will 4868 * return an error, but we don't worry about races between this 4869 * function and zfs_map(). 4870 */ 4871 if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) { 4872 ZFS_EXIT(zfsvfs); 4873 return (SET_ERROR(EAGAIN)); 4874 } 4875 ZFS_EXIT(zfsvfs); 4876 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 4877} 4878 4879/* 4880 * If we can't find a page in the cache, we will create a new page 4881 * and fill it with file data. For efficiency, we may try to fill 4882 * multiple pages at once (klustering) to fill up the supplied page 4883 * list. Note that the pages to be filled are held with an exclusive 4884 * lock to prevent access by other threads while they are being filled. 4885 */ 4886static int 4887zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, 4888 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) 4889{ 4890 znode_t *zp = VTOZ(vp); 4891 page_t *pp, *cur_pp; 4892 objset_t *os = zp->z_zfsvfs->z_os; 4893 u_offset_t io_off, total; 4894 size_t io_len; 4895 int err; 4896 4897 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { 4898 /* 4899 * We only have a single page, don't bother klustering 4900 */ 4901 io_off = off; 4902 io_len = PAGESIZE; 4903 pp = page_create_va(vp, io_off, io_len, 4904 PG_EXCL | PG_WAIT, seg, addr); 4905 } else { 4906 /* 4907 * Try to find enough pages to fill the page list 4908 */ 4909 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 4910 &io_len, off, plsz, 0); 4911 } 4912 if (pp == NULL) { 4913 /* 4914 * The page already exists, nothing to do here. 4915 */ 4916 *pl = NULL; 4917 return (0); 4918 } 4919 4920 /* 4921 * Fill the pages in the kluster. 4922 */ 4923 cur_pp = pp; 4924 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { 4925 caddr_t va; 4926 4927 ASSERT3U(io_off, ==, cur_pp->p_offset); 4928 va = zfs_map_page(cur_pp, S_WRITE); 4929 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, 4930 DMU_READ_PREFETCH); 4931 zfs_unmap_page(cur_pp, va); 4932 if (err) { 4933 /* On error, toss the entire kluster */ 4934 pvn_read_done(pp, B_ERROR); 4935 /* convert checksum errors into IO errors */ 4936 if (err == ECKSUM) 4937 err = SET_ERROR(EIO); 4938 return (err); 4939 } 4940 cur_pp = cur_pp->p_next; 4941 } 4942 4943 /* 4944 * Fill in the page list array from the kluster starting 4945 * from the desired offset `off'. 4946 * NOTE: the page list will always be null terminated. 4947 */ 4948 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 4949 ASSERT(pl == NULL || (*pl)->p_offset == off); 4950 4951 return (0); 4952} 4953 4954/* 4955 * Return pointers to the pages for the file region [off, off + len] 4956 * in the pl array. If plsz is greater than len, this function may 4957 * also return page pointers from after the specified region 4958 * (i.e. the region [off, off + plsz]). These additional pages are 4959 * only returned if they are already in the cache, or were created as 4960 * part of a klustered read. 4961 * 4962 * IN: vp - vnode of file to get data from. 4963 * off - position in file to get data from. 4964 * len - amount of data to retrieve. 4965 * plsz - length of provided page list. 4966 * seg - segment to obtain pages for. 4967 * addr - virtual address of fault. 4968 * rw - mode of created pages. 4969 * cr - credentials of caller. 4970 * ct - caller context. 4971 * 4972 * OUT: protp - protection mode of created pages. 4973 * pl - list of pages created. 4974 * 4975 * RETURN: 0 on success, error code on failure. 4976 * 4977 * Timestamps: 4978 * vp - atime updated 4979 */ 4980/* ARGSUSED */ 4981static int 4982zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 4983 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 4984 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 4985{ 4986 znode_t *zp = VTOZ(vp); 4987 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4988 page_t **pl0 = pl; 4989 int err = 0; 4990 4991 /* we do our own caching, faultahead is unnecessary */ 4992 if (pl == NULL) 4993 return (0); 4994 else if (len > plsz) 4995 len = plsz; 4996 else 4997 len = P2ROUNDUP(len, PAGESIZE); 4998 ASSERT(plsz >= len); 4999 5000 ZFS_ENTER(zfsvfs); 5001 ZFS_VERIFY_ZP(zp); 5002 5003 if (protp) 5004 *protp = PROT_ALL; 5005 5006 /* 5007 * Loop through the requested range [off, off + len) looking 5008 * for pages. If we don't find a page, we will need to create 5009 * a new page and fill it with data from the file. 5010 */ 5011 while (len > 0) { 5012 if (*pl = page_lookup(vp, off, SE_SHARED)) 5013 *(pl+1) = NULL; 5014 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) 5015 goto out; 5016 while (*pl) { 5017 ASSERT3U((*pl)->p_offset, ==, off); 5018 off += PAGESIZE; 5019 addr += PAGESIZE; 5020 if (len > 0) { 5021 ASSERT3U(len, >=, PAGESIZE); 5022 len -= PAGESIZE; 5023 } 5024 ASSERT3U(plsz, >=, PAGESIZE); 5025 plsz -= PAGESIZE; 5026 pl++; 5027 } 5028 } 5029 5030 /* 5031 * Fill out the page array with any pages already in the cache. 5032 */ 5033 while (plsz > 0 && 5034 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { 5035 off += PAGESIZE; 5036 plsz -= PAGESIZE; 5037 } 5038out: 5039 if (err) { 5040 /* 5041 * Release any pages we have previously locked. 5042 */ 5043 while (pl > pl0) 5044 page_unlock(*--pl); 5045 } else { 5046 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 5047 } 5048 5049 *pl = NULL; 5050 5051 ZFS_EXIT(zfsvfs); 5052 return (err); 5053} 5054 5055/* 5056 * Request a memory map for a section of a file. This code interacts 5057 * with common code and the VM system as follows: 5058 * 5059 * - common code calls mmap(), which ends up in smmap_common() 5060 * - this calls VOP_MAP(), which takes you into (say) zfs 5061 * - zfs_map() calls as_map(), passing segvn_create() as the callback 5062 * - segvn_create() creates the new segment and calls VOP_ADDMAP() 5063 * - zfs_addmap() updates z_mapcnt 5064 */ 5065/*ARGSUSED*/ 5066static int 5067zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 5068 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 5069 caller_context_t *ct) 5070{ 5071 znode_t *zp = VTOZ(vp); 5072 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5073 segvn_crargs_t vn_a; 5074 int error; 5075 5076 ZFS_ENTER(zfsvfs); 5077 ZFS_VERIFY_ZP(zp); 5078 5079 if ((prot & PROT_WRITE) && (zp->z_pflags & 5080 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { 5081 ZFS_EXIT(zfsvfs); 5082 return (SET_ERROR(EPERM)); 5083 } 5084 5085 if ((prot & (PROT_READ | PROT_EXEC)) && 5086 (zp->z_pflags & ZFS_AV_QUARANTINED)) { 5087 ZFS_EXIT(zfsvfs); 5088 return (SET_ERROR(EACCES)); 5089 } 5090 5091 if (vp->v_flag & VNOMAP) { 5092 ZFS_EXIT(zfsvfs); 5093 return (SET_ERROR(ENOSYS)); 5094 } 5095 5096 if (off < 0 || len > MAXOFFSET_T - off) { 5097 ZFS_EXIT(zfsvfs); 5098 return (SET_ERROR(ENXIO)); 5099 } 5100 5101 if (vp->v_type != VREG) { 5102 ZFS_EXIT(zfsvfs); 5103 return (SET_ERROR(ENODEV)); 5104 } 5105 5106 /* 5107 * If file is locked, disallow mapping. 5108 */ 5109 if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) { 5110 ZFS_EXIT(zfsvfs); 5111 return (SET_ERROR(EAGAIN)); 5112 } 5113 5114 as_rangelock(as); 5115 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 5116 if (error != 0) { 5117 as_rangeunlock(as); 5118 ZFS_EXIT(zfsvfs); 5119 return (error); 5120 } 5121 5122 vn_a.vp = vp; 5123 vn_a.offset = (u_offset_t)off; 5124 vn_a.type = flags & MAP_TYPE; 5125 vn_a.prot = prot; 5126 vn_a.maxprot = maxprot; 5127 vn_a.cred = cr; 5128 vn_a.amp = NULL; 5129 vn_a.flags = flags & ~MAP_TYPE; 5130 vn_a.szc = 0; 5131 vn_a.lgrp_mem_policy_flags = 0; 5132 5133 error = as_map(as, *addrp, len, segvn_create, &vn_a); 5134 5135 as_rangeunlock(as); 5136 ZFS_EXIT(zfsvfs); 5137 return (error); 5138} 5139 5140/* ARGSUSED */ 5141static int 5142zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 5143 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 5144 caller_context_t *ct) 5145{ 5146 uint64_t pages = btopr(len); 5147 5148 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); 5149 return (0); 5150} 5151 5152/* 5153 * The reason we push dirty pages as part of zfs_delmap() is so that we get a 5154 * more accurate mtime for the associated file. Since we don't have a way of 5155 * detecting when the data was actually modified, we have to resort to 5156 * heuristics. If an explicit msync() is done, then we mark the mtime when the 5157 * last page is pushed. The problem occurs when the msync() call is omitted, 5158 * which by far the most common case: 5159 * 5160 * open() 5161 * mmap() 5162 * <modify memory> 5163 * munmap() 5164 * close() 5165 * <time lapse> 5166 * putpage() via fsflush 5167 * 5168 * If we wait until fsflush to come along, we can have a modification time that 5169 * is some arbitrary point in the future. In order to prevent this in the 5170 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is 5171 * torn down. 5172 */ 5173/* ARGSUSED */ 5174static int 5175zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 5176 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 5177 caller_context_t *ct) 5178{ 5179 uint64_t pages = btopr(len); 5180 5181 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); 5182 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); 5183 5184 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && 5185 vn_has_cached_data(vp)) 5186 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); 5187 5188 return (0); 5189} 5190 5191/* 5192 * Free or allocate space in a file. Currently, this function only 5193 * supports the `F_FREESP' command. However, this command is somewhat 5194 * misnamed, as its functionality includes the ability to allocate as 5195 * well as free space. 5196 * 5197 * IN: vp - vnode of file to free data in. 5198 * cmd - action to take (only F_FREESP supported). 5199 * bfp - section of file to free/alloc. 5200 * flag - current file open mode flags. 5201 * offset - current file offset. 5202 * cr - credentials of caller [UNUSED]. 5203 * ct - caller context. 5204 * 5205 * RETURN: 0 on success, error code on failure. 5206 * 5207 * Timestamps: 5208 * vp - ctime|mtime updated 5209 */ 5210/* ARGSUSED */ 5211static int 5212zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, 5213 offset_t offset, cred_t *cr, caller_context_t *ct) 5214{ 5215 znode_t *zp = VTOZ(vp); 5216 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5217 uint64_t off, len; 5218 int error; 5219 5220 ZFS_ENTER(zfsvfs); 5221 ZFS_VERIFY_ZP(zp); 5222 5223 if (cmd != F_FREESP) { 5224 ZFS_EXIT(zfsvfs); 5225 return (SET_ERROR(EINVAL)); 5226 } 5227 5228 /* 5229 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our 5230 * callers might not be able to detect properly that we are read-only, 5231 * so check it explicitly here. 5232 */ 5233 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 5234 ZFS_EXIT(zfsvfs); 5235 return (SET_ERROR(EROFS)); 5236 } 5237 5238 if (error = convoff(vp, bfp, 0, offset)) { 5239 ZFS_EXIT(zfsvfs); 5240 return (error); 5241 } 5242 5243 if (bfp->l_len < 0) { 5244 ZFS_EXIT(zfsvfs); 5245 return (SET_ERROR(EINVAL)); 5246 } 5247 5248 off = bfp->l_start; 5249 len = bfp->l_len; /* 0 means from off to end of file */ 5250 5251 error = zfs_freesp(zp, off, len, flag, TRUE); 5252 5253 ZFS_EXIT(zfsvfs); 5254 return (error); 5255} 5256#endif /* illumos */ 5257 5258CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); 5259CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); 5260 5261/*ARGSUSED*/ 5262static int 5263zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 5264{ 5265 znode_t *zp = VTOZ(vp); 5266 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5267 uint32_t gen; 5268 uint64_t gen64; 5269 uint64_t object = zp->z_id; 5270 zfid_short_t *zfid; 5271 int size, i, error; 5272 5273 ZFS_ENTER(zfsvfs); 5274 ZFS_VERIFY_ZP(zp); 5275 5276 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 5277 &gen64, sizeof (uint64_t))) != 0) { 5278 ZFS_EXIT(zfsvfs); 5279 return (error); 5280 } 5281 5282 gen = (uint32_t)gen64; 5283 5284 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; 5285 5286#ifdef illumos 5287 if (fidp->fid_len < size) { 5288 fidp->fid_len = size; 5289 ZFS_EXIT(zfsvfs); 5290 return (SET_ERROR(ENOSPC)); 5291 } 5292#else 5293 fidp->fid_len = size; 5294#endif 5295 5296 zfid = (zfid_short_t *)fidp; 5297 5298 zfid->zf_len = size; 5299 5300 for (i = 0; i < sizeof (zfid->zf_object); i++) 5301 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 5302 5303 /* Must have a non-zero generation number to distinguish from .zfs */ 5304 if (gen == 0) 5305 gen = 1; 5306 for (i = 0; i < sizeof (zfid->zf_gen); i++) 5307 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 5308 5309 if (size == LONG_FID_LEN) { 5310 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 5311 zfid_long_t *zlfid; 5312 5313 zlfid = (zfid_long_t *)fidp; 5314 5315 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 5316 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 5317 5318 /* XXX - this should be the generation number for the objset */ 5319 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 5320 zlfid->zf_setgen[i] = 0; 5321 } 5322 5323 ZFS_EXIT(zfsvfs); 5324 return (0); 5325} 5326 5327static int 5328zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 5329 caller_context_t *ct) 5330{ 5331 znode_t *zp, *xzp; 5332 zfsvfs_t *zfsvfs; 5333 zfs_dirlock_t *dl; 5334 int error; 5335 5336 switch (cmd) { 5337 case _PC_LINK_MAX: 5338 *valp = INT_MAX; 5339 return (0); 5340 5341 case _PC_FILESIZEBITS: 5342 *valp = 64; 5343 return (0); 5344#ifdef illumos 5345 case _PC_XATTR_EXISTS: 5346 zp = VTOZ(vp); 5347 zfsvfs = zp->z_zfsvfs; 5348 ZFS_ENTER(zfsvfs); 5349 ZFS_VERIFY_ZP(zp); 5350 *valp = 0; 5351 error = zfs_dirent_lock(&dl, zp, "", &xzp, 5352 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL); 5353 if (error == 0) { 5354 zfs_dirent_unlock(dl); 5355 if (!zfs_dirempty(xzp)) 5356 *valp = 1; 5357 VN_RELE(ZTOV(xzp)); 5358 } else if (error == ENOENT) { 5359 /* 5360 * If there aren't extended attributes, it's the 5361 * same as having zero of them. 5362 */ 5363 error = 0; 5364 } 5365 ZFS_EXIT(zfsvfs); 5366 return (error); 5367 5368 case _PC_SATTR_ENABLED: 5369 case _PC_SATTR_EXISTS: 5370 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 5371 (vp->v_type == VREG || vp->v_type == VDIR); 5372 return (0); 5373 5374 case _PC_ACCESS_FILTERING: 5375 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && 5376 vp->v_type == VDIR; 5377 return (0); 5378 5379 case _PC_ACL_ENABLED: 5380 *valp = _ACL_ACE_ENABLED; 5381 return (0); 5382#endif /* illumos */ 5383 case _PC_MIN_HOLE_SIZE: 5384 *valp = (int)SPA_MINBLOCKSIZE; 5385 return (0); 5386#ifdef illumos 5387 case _PC_TIMESTAMP_RESOLUTION: 5388 /* nanosecond timestamp resolution */ 5389 *valp = 1L; 5390 return (0); 5391#endif 5392 case _PC_ACL_EXTENDED: 5393 *valp = 0; 5394 return (0); 5395 5396 case _PC_ACL_NFS4: 5397 *valp = 1; 5398 return (0); 5399 5400 case _PC_ACL_PATH_MAX: 5401 *valp = ACL_MAX_ENTRIES; 5402 return (0); 5403 5404 default: 5405 return (EOPNOTSUPP); 5406 } 5407} 5408 5409/*ARGSUSED*/ 5410static int 5411zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 5412 caller_context_t *ct) 5413{ 5414 znode_t *zp = VTOZ(vp); 5415 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5416 int error; 5417 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 5418 5419 ZFS_ENTER(zfsvfs); 5420 ZFS_VERIFY_ZP(zp); 5421 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 5422 ZFS_EXIT(zfsvfs); 5423 5424 return (error); 5425} 5426 5427/*ARGSUSED*/ 5428int 5429zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 5430 caller_context_t *ct) 5431{ 5432 znode_t *zp = VTOZ(vp); 5433 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5434 int error; 5435 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 5436 zilog_t *zilog = zfsvfs->z_log; 5437 5438 ZFS_ENTER(zfsvfs); 5439 ZFS_VERIFY_ZP(zp); 5440 5441 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 5442 5443 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 5444 zil_commit(zilog, 0); 5445 5446 ZFS_EXIT(zfsvfs); 5447 return (error); 5448} 5449 5450#ifdef illumos 5451/* 5452 * The smallest read we may consider to loan out an arcbuf. 5453 * This must be a power of 2. 5454 */ 5455int zcr_blksz_min = (1 << 10); /* 1K */ 5456/* 5457 * If set to less than the file block size, allow loaning out of an 5458 * arcbuf for a partial block read. This must be a power of 2. 5459 */ 5460int zcr_blksz_max = (1 << 17); /* 128K */ 5461 5462/*ARGSUSED*/ 5463static int 5464zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr, 5465 caller_context_t *ct) 5466{ 5467 znode_t *zp = VTOZ(vp); 5468 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5469 int max_blksz = zfsvfs->z_max_blksz; 5470 uio_t *uio = &xuio->xu_uio; 5471 ssize_t size = uio->uio_resid; 5472 offset_t offset = uio->uio_loffset; 5473 int blksz; 5474 int fullblk, i; 5475 arc_buf_t *abuf; 5476 ssize_t maxsize; 5477 int preamble, postamble; 5478 5479 if (xuio->xu_type != UIOTYPE_ZEROCOPY) 5480 return (SET_ERROR(EINVAL)); 5481 5482 ZFS_ENTER(zfsvfs); 5483 ZFS_VERIFY_ZP(zp); 5484 switch (ioflag) { 5485 case UIO_WRITE: 5486 /* 5487 * Loan out an arc_buf for write if write size is bigger than 5488 * max_blksz, and the file's block size is also max_blksz. 5489 */ 5490 blksz = max_blksz; 5491 if (size < blksz || zp->z_blksz != blksz) { 5492 ZFS_EXIT(zfsvfs); 5493 return (SET_ERROR(EINVAL)); 5494 } 5495 /* 5496 * Caller requests buffers for write before knowing where the 5497 * write offset might be (e.g. NFS TCP write). 5498 */ 5499 if (offset == -1) { 5500 preamble = 0; 5501 } else { 5502 preamble = P2PHASE(offset, blksz); 5503 if (preamble) { 5504 preamble = blksz - preamble; 5505 size -= preamble; 5506 } 5507 } 5508 5509 postamble = P2PHASE(size, blksz); 5510 size -= postamble; 5511 5512 fullblk = size / blksz; 5513 (void) dmu_xuio_init(xuio, 5514 (preamble != 0) + fullblk + (postamble != 0)); 5515 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble, 5516 int, postamble, int, 5517 (preamble != 0) + fullblk + (postamble != 0)); 5518 5519 /* 5520 * Have to fix iov base/len for partial buffers. They 5521 * currently represent full arc_buf's. 5522 */ 5523 if (preamble) { 5524 /* data begins in the middle of the arc_buf */ 5525 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5526 blksz); 5527 ASSERT(abuf); 5528 (void) dmu_xuio_add(xuio, abuf, 5529 blksz - preamble, preamble); 5530 } 5531 5532 for (i = 0; i < fullblk; i++) { 5533 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5534 blksz); 5535 ASSERT(abuf); 5536 (void) dmu_xuio_add(xuio, abuf, 0, blksz); 5537 } 5538 5539 if (postamble) { 5540 /* data ends in the middle of the arc_buf */ 5541 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 5542 blksz); 5543 ASSERT(abuf); 5544 (void) dmu_xuio_add(xuio, abuf, 0, postamble); 5545 } 5546 break; 5547 case UIO_READ: 5548 /* 5549 * Loan out an arc_buf for read if the read size is larger than 5550 * the current file block size. Block alignment is not 5551 * considered. Partial arc_buf will be loaned out for read. 5552 */ 5553 blksz = zp->z_blksz; 5554 if (blksz < zcr_blksz_min) 5555 blksz = zcr_blksz_min; 5556 if (blksz > zcr_blksz_max) 5557 blksz = zcr_blksz_max; 5558 /* avoid potential complexity of dealing with it */ 5559 if (blksz > max_blksz) { 5560 ZFS_EXIT(zfsvfs); 5561 return (SET_ERROR(EINVAL)); 5562 } 5563 5564 maxsize = zp->z_size - uio->uio_loffset; 5565 if (size > maxsize) 5566 size = maxsize; 5567 5568 if (size < blksz || vn_has_cached_data(vp)) { 5569 ZFS_EXIT(zfsvfs); 5570 return (SET_ERROR(EINVAL)); 5571 } 5572 break; 5573 default: 5574 ZFS_EXIT(zfsvfs); 5575 return (SET_ERROR(EINVAL)); 5576 } 5577 5578 uio->uio_extflg = UIO_XUIO; 5579 XUIO_XUZC_RW(xuio) = ioflag; 5580 ZFS_EXIT(zfsvfs); 5581 return (0); 5582} 5583 5584/*ARGSUSED*/ 5585static int 5586zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct) 5587{ 5588 int i; 5589 arc_buf_t *abuf; 5590 int ioflag = XUIO_XUZC_RW(xuio); 5591 5592 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); 5593 5594 i = dmu_xuio_cnt(xuio); 5595 while (i-- > 0) { 5596 abuf = dmu_xuio_arcbuf(xuio, i); 5597 /* 5598 * if abuf == NULL, it must be a write buffer 5599 * that has been returned in zfs_write(). 5600 */ 5601 if (abuf) 5602 dmu_return_arcbuf(abuf); 5603 ASSERT(abuf || ioflag == UIO_WRITE); 5604 } 5605 5606 dmu_xuio_fini(xuio); 5607 return (0); 5608} 5609 5610/* 5611 * Predeclare these here so that the compiler assumes that 5612 * this is an "old style" function declaration that does 5613 * not include arguments => we won't get type mismatch errors 5614 * in the initializations that follow. 5615 */ 5616static int zfs_inval(); 5617static int zfs_isdir(); 5618 5619static int 5620zfs_inval() 5621{ 5622 return (SET_ERROR(EINVAL)); 5623} 5624 5625static int 5626zfs_isdir() 5627{ 5628 return (SET_ERROR(EISDIR)); 5629} 5630/* 5631 * Directory vnode operations template 5632 */ 5633vnodeops_t *zfs_dvnodeops; 5634const fs_operation_def_t zfs_dvnodeops_template[] = { 5635 VOPNAME_OPEN, { .vop_open = zfs_open }, 5636 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5637 VOPNAME_READ, { .error = zfs_isdir }, 5638 VOPNAME_WRITE, { .error = zfs_isdir }, 5639 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5640 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5641 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5642 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5643 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5644 VOPNAME_CREATE, { .vop_create = zfs_create }, 5645 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 5646 VOPNAME_LINK, { .vop_link = zfs_link }, 5647 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5648 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir }, 5649 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 5650 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 5651 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink }, 5652 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5653 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5654 VOPNAME_FID, { .vop_fid = zfs_fid }, 5655 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5656 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5657 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5658 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5659 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5660 NULL, NULL 5661}; 5662 5663/* 5664 * Regular file vnode operations template 5665 */ 5666vnodeops_t *zfs_fvnodeops; 5667const fs_operation_def_t zfs_fvnodeops_template[] = { 5668 VOPNAME_OPEN, { .vop_open = zfs_open }, 5669 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5670 VOPNAME_READ, { .vop_read = zfs_read }, 5671 VOPNAME_WRITE, { .vop_write = zfs_write }, 5672 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5673 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5674 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5675 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5676 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5677 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5678 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5679 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5680 VOPNAME_FID, { .vop_fid = zfs_fid }, 5681 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5682 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock }, 5683 VOPNAME_SPACE, { .vop_space = zfs_space }, 5684 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage }, 5685 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage }, 5686 VOPNAME_MAP, { .vop_map = zfs_map }, 5687 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap }, 5688 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap }, 5689 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5690 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5691 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5692 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5693 VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf }, 5694 VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf }, 5695 NULL, NULL 5696}; 5697 5698/* 5699 * Symbolic link vnode operations template 5700 */ 5701vnodeops_t *zfs_symvnodeops; 5702const fs_operation_def_t zfs_symvnodeops_template[] = { 5703 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5704 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5705 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5706 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5707 VOPNAME_READLINK, { .vop_readlink = zfs_readlink }, 5708 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5709 VOPNAME_FID, { .vop_fid = zfs_fid }, 5710 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5711 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5712 NULL, NULL 5713}; 5714 5715/* 5716 * special share hidden files vnode operations template 5717 */ 5718vnodeops_t *zfs_sharevnodeops; 5719const fs_operation_def_t zfs_sharevnodeops_template[] = { 5720 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5721 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5722 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5723 VOPNAME_FID, { .vop_fid = zfs_fid }, 5724 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5725 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5726 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5727 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5728 NULL, NULL 5729}; 5730 5731/* 5732 * Extended attribute directory vnode operations template 5733 * 5734 * This template is identical to the directory vnodes 5735 * operation template except for restricted operations: 5736 * VOP_MKDIR() 5737 * VOP_SYMLINK() 5738 * 5739 * Note that there are other restrictions embedded in: 5740 * zfs_create() - restrict type to VREG 5741 * zfs_link() - no links into/out of attribute space 5742 * zfs_rename() - no moves into/out of attribute space 5743 */ 5744vnodeops_t *zfs_xdvnodeops; 5745const fs_operation_def_t zfs_xdvnodeops_template[] = { 5746 VOPNAME_OPEN, { .vop_open = zfs_open }, 5747 VOPNAME_CLOSE, { .vop_close = zfs_close }, 5748 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl }, 5749 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, 5750 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, 5751 VOPNAME_ACCESS, { .vop_access = zfs_access }, 5752 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup }, 5753 VOPNAME_CREATE, { .vop_create = zfs_create }, 5754 VOPNAME_REMOVE, { .vop_remove = zfs_remove }, 5755 VOPNAME_LINK, { .vop_link = zfs_link }, 5756 VOPNAME_RENAME, { .vop_rename = zfs_rename }, 5757 VOPNAME_MKDIR, { .error = zfs_inval }, 5758 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir }, 5759 VOPNAME_READDIR, { .vop_readdir = zfs_readdir }, 5760 VOPNAME_SYMLINK, { .error = zfs_inval }, 5761 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync }, 5762 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5763 VOPNAME_FID, { .vop_fid = zfs_fid }, 5764 VOPNAME_SEEK, { .vop_seek = zfs_seek }, 5765 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5766 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, 5767 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, 5768 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 5769 NULL, NULL 5770}; 5771 5772/* 5773 * Error vnode operations template 5774 */ 5775vnodeops_t *zfs_evnodeops; 5776const fs_operation_def_t zfs_evnodeops_template[] = { 5777 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, 5778 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, 5779 NULL, NULL 5780}; 5781#endif /* illumos */ 5782 5783static int 5784ioflags(int ioflags) 5785{ 5786 int flags = 0; 5787 5788 if (ioflags & IO_APPEND) 5789 flags |= FAPPEND; 5790 if (ioflags & IO_NDELAY) 5791 flags |= FNONBLOCK; 5792 if (ioflags & IO_SYNC) 5793 flags |= (FSYNC | FDSYNC | FRSYNC); 5794 5795 return (flags); 5796} 5797 5798static int 5799zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage) 5800{ 5801 znode_t *zp = VTOZ(vp); 5802 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5803 objset_t *os = zp->z_zfsvfs->z_os; 5804 vm_page_t mfirst, mlast, mreq; 5805 vm_object_t object; 5806 caddr_t va; 5807 struct sf_buf *sf; 5808 off_t startoff, endoff; 5809 int i, error; 5810 vm_pindex_t reqstart, reqend; 5811 int pcount, lsize, reqsize, size; 5812 5813 ZFS_ENTER(zfsvfs); 5814 ZFS_VERIFY_ZP(zp); 5815 5816 pcount = OFF_TO_IDX(round_page(count)); 5817 mreq = m[reqpage]; 5818 object = mreq->object; 5819 error = 0; 5820 5821 KASSERT(vp->v_object == object, ("mismatching object")); 5822 5823 if (pcount > 1 && zp->z_blksz > PAGESIZE) { 5824 startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz); 5825 reqstart = OFF_TO_IDX(round_page(startoff)); 5826 if (reqstart < m[0]->pindex) 5827 reqstart = 0; 5828 else 5829 reqstart = reqstart - m[0]->pindex; 5830 endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE, 5831 zp->z_blksz); 5832 reqend = OFF_TO_IDX(trunc_page(endoff)) - 1; 5833 if (reqend > m[pcount - 1]->pindex) 5834 reqend = m[pcount - 1]->pindex; 5835 reqsize = reqend - m[reqstart]->pindex + 1; 5836 KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize, 5837 ("reqpage beyond [reqstart, reqstart + reqsize[ bounds")); 5838 } else { 5839 reqstart = reqpage; 5840 reqsize = 1; 5841 } 5842 mfirst = m[reqstart]; 5843 mlast = m[reqstart + reqsize - 1]; 5844 5845 zfs_vmobject_wlock(object); 5846 5847 for (i = 0; i < reqstart; i++) { 5848 vm_page_lock(m[i]); 5849 vm_page_free(m[i]); 5850 vm_page_unlock(m[i]); 5851 } 5852 for (i = reqstart + reqsize; i < pcount; i++) { 5853 vm_page_lock(m[i]); 5854 vm_page_free(m[i]); 5855 vm_page_unlock(m[i]); 5856 } 5857 5858 if (mreq->valid && reqsize == 1) { 5859 if (mreq->valid != VM_PAGE_BITS_ALL) 5860 vm_page_zero_invalid(mreq, TRUE); 5861 zfs_vmobject_wunlock(object); 5862 ZFS_EXIT(zfsvfs); 5863 return (zfs_vm_pagerret_ok); 5864 } 5865 5866 PCPU_INC(cnt.v_vnodein); 5867 PCPU_ADD(cnt.v_vnodepgsin, reqsize); 5868 5869 if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) { 5870 for (i = reqstart; i < reqstart + reqsize; i++) { 5871 if (i != reqpage) { 5872 vm_page_lock(m[i]); 5873 vm_page_free(m[i]); 5874 vm_page_unlock(m[i]); 5875 } 5876 } 5877 zfs_vmobject_wunlock(object); 5878 ZFS_EXIT(zfsvfs); 5879 return (zfs_vm_pagerret_bad); 5880 } 5881 5882 lsize = PAGE_SIZE; 5883 if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size) 5884 lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex); 5885 5886 zfs_vmobject_wunlock(object); 5887 5888 for (i = reqstart; i < reqstart + reqsize; i++) { 5889 size = PAGE_SIZE; 5890 if (i == (reqstart + reqsize - 1)) 5891 size = lsize; 5892 va = zfs_map_page(m[i], &sf); 5893 error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex), 5894 size, va, DMU_READ_PREFETCH); 5895 if (size != PAGE_SIZE) 5896 bzero(va + size, PAGE_SIZE - size); 5897 zfs_unmap_page(sf); 5898 if (error != 0) 5899 break; 5900 } 5901 5902 zfs_vmobject_wlock(object); 5903 5904 for (i = reqstart; i < reqstart + reqsize; i++) { 5905 if (!error) 5906 m[i]->valid = VM_PAGE_BITS_ALL; 5907 KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i])); 5908 if (i != reqpage) 5909 vm_page_readahead_finish(m[i]); 5910 } 5911 5912 zfs_vmobject_wunlock(object); 5913 5914 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 5915 ZFS_EXIT(zfsvfs); 5916 return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok); 5917} 5918 5919static int 5920zfs_freebsd_getpages(ap) 5921 struct vop_getpages_args /* { 5922 struct vnode *a_vp; 5923 vm_page_t *a_m; 5924 int a_count; 5925 int a_reqpage; 5926 vm_ooffset_t a_offset; 5927 } */ *ap; 5928{ 5929 5930 return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage)); 5931} 5932 5933static int 5934zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, 5935 int *rtvals) 5936{ 5937 znode_t *zp = VTOZ(vp); 5938 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5939 rl_t *rl; 5940 dmu_tx_t *tx; 5941 struct sf_buf *sf; 5942 vm_object_t object; 5943 vm_page_t m; 5944 caddr_t va; 5945 size_t tocopy; 5946 size_t lo_len; 5947 vm_ooffset_t lo_off; 5948 vm_ooffset_t off; 5949 uint_t blksz; 5950 int ncount; 5951 int pcount; 5952 int err; 5953 int i; 5954 5955 ZFS_ENTER(zfsvfs); 5956 ZFS_VERIFY_ZP(zp); 5957 5958 object = vp->v_object; 5959 pcount = btoc(len); 5960 ncount = pcount; 5961 5962 KASSERT(ma[0]->object == object, ("mismatching object")); 5963 KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); 5964 5965 for (i = 0; i < pcount; i++) 5966 rtvals[i] = zfs_vm_pagerret_error; 5967 5968 off = IDX_TO_OFF(ma[0]->pindex); 5969 blksz = zp->z_blksz; 5970 lo_off = rounddown(off, blksz); 5971 lo_len = roundup(len + (off - lo_off), blksz); 5972 rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER); 5973 5974 zfs_vmobject_wlock(object); 5975 if (len + off > object->un_pager.vnp.vnp_size) { 5976 if (object->un_pager.vnp.vnp_size > off) { 5977 int pgoff; 5978 5979 len = object->un_pager.vnp.vnp_size - off; 5980 ncount = btoc(len); 5981 if ((pgoff = (int)len & PAGE_MASK) != 0) { 5982 /* 5983 * If the object is locked and the following 5984 * conditions hold, then the page's dirty 5985 * field cannot be concurrently changed by a 5986 * pmap operation. 5987 */ 5988 m = ma[ncount - 1]; 5989 vm_page_assert_sbusied(m); 5990 KASSERT(!pmap_page_is_write_mapped(m), 5991 ("zfs_putpages: page %p is not read-only", m)); 5992 vm_page_clear_dirty(m, pgoff, PAGE_SIZE - 5993 pgoff); 5994 } 5995 } else { 5996 len = 0; 5997 ncount = 0; 5998 } 5999 if (ncount < pcount) { 6000 for (i = ncount; i < pcount; i++) { 6001 rtvals[i] = zfs_vm_pagerret_bad; 6002 } 6003 } 6004 } 6005 zfs_vmobject_wunlock(object); 6006 6007 if (ncount == 0) 6008 goto out; 6009 6010 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 6011 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 6012 goto out; 6013 } 6014 6015top: 6016 tx = dmu_tx_create(zfsvfs->z_os); 6017 dmu_tx_hold_write(tx, zp->z_id, off, len); 6018 6019 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 6020 zfs_sa_upgrade_txholds(tx, zp); 6021 err = dmu_tx_assign(tx, TXG_NOWAIT); 6022 if (err != 0) { 6023 if (err == ERESTART) { 6024 dmu_tx_wait(tx); 6025 dmu_tx_abort(tx); 6026 goto top; 6027 } 6028 dmu_tx_abort(tx); 6029 goto out; 6030 } 6031 6032 if (zp->z_blksz < PAGE_SIZE) { 6033 i = 0; 6034 for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { 6035 tocopy = len > PAGE_SIZE ? PAGE_SIZE : len; 6036 va = zfs_map_page(ma[i], &sf); 6037 dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); 6038 zfs_unmap_page(sf); 6039 } 6040 } else { 6041 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); 6042 } 6043 6044 if (err == 0) { 6045 uint64_t mtime[2], ctime[2]; 6046 sa_bulk_attr_t bulk[3]; 6047 int count = 0; 6048 6049 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 6050 &mtime, 16); 6051 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 6052 &ctime, 16); 6053 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 6054 &zp->z_pflags, 8); 6055 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 6056 B_TRUE); 6057 (void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 6058 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); 6059 6060 zfs_vmobject_wlock(object); 6061 for (i = 0; i < ncount; i++) { 6062 rtvals[i] = zfs_vm_pagerret_ok; 6063 vm_page_undirty(ma[i]); 6064 } 6065 zfs_vmobject_wunlock(object); 6066 PCPU_INC(cnt.v_vnodeout); 6067 PCPU_ADD(cnt.v_vnodepgsout, ncount); 6068 } 6069 dmu_tx_commit(tx); 6070 6071out: 6072 zfs_range_unlock(rl); 6073 if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || 6074 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 6075 zil_commit(zfsvfs->z_log, zp->z_id); 6076 ZFS_EXIT(zfsvfs); 6077 return (rtvals[0]); 6078} 6079 6080int 6081zfs_freebsd_putpages(ap) 6082 struct vop_putpages_args /* { 6083 struct vnode *a_vp; 6084 vm_page_t *a_m; 6085 int a_count; 6086 int a_sync; 6087 int *a_rtvals; 6088 vm_ooffset_t a_offset; 6089 } */ *ap; 6090{ 6091 6092 return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, 6093 ap->a_rtvals)); 6094} 6095 6096static int 6097zfs_freebsd_bmap(ap) 6098 struct vop_bmap_args /* { 6099 struct vnode *a_vp; 6100 daddr_t a_bn; 6101 struct bufobj **a_bop; 6102 daddr_t *a_bnp; 6103 int *a_runp; 6104 int *a_runb; 6105 } */ *ap; 6106{ 6107 6108 if (ap->a_bop != NULL) 6109 *ap->a_bop = &ap->a_vp->v_bufobj; 6110 if (ap->a_bnp != NULL) 6111 *ap->a_bnp = ap->a_bn; 6112 if (ap->a_runp != NULL) 6113 *ap->a_runp = 0; 6114 if (ap->a_runb != NULL) 6115 *ap->a_runb = 0; 6116 6117 return (0); 6118} 6119 6120static int 6121zfs_freebsd_open(ap) 6122 struct vop_open_args /* { 6123 struct vnode *a_vp; 6124 int a_mode; 6125 struct ucred *a_cred; 6126 struct thread *a_td; 6127 } */ *ap; 6128{ 6129 vnode_t *vp = ap->a_vp; 6130 znode_t *zp = VTOZ(vp); 6131 int error; 6132 6133 error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); 6134 if (error == 0) 6135 vnode_create_vobject(vp, zp->z_size, ap->a_td); 6136 return (error); 6137} 6138 6139static int 6140zfs_freebsd_close(ap) 6141 struct vop_close_args /* { 6142 struct vnode *a_vp; 6143 int a_fflag; 6144 struct ucred *a_cred; 6145 struct thread *a_td; 6146 } */ *ap; 6147{ 6148 6149 return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); 6150} 6151 6152static int 6153zfs_freebsd_ioctl(ap) 6154 struct vop_ioctl_args /* { 6155 struct vnode *a_vp; 6156 u_long a_command; 6157 caddr_t a_data; 6158 int a_fflag; 6159 struct ucred *cred; 6160 struct thread *td; 6161 } */ *ap; 6162{ 6163 6164 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, 6165 ap->a_fflag, ap->a_cred, NULL, NULL)); 6166} 6167 6168static int 6169zfs_freebsd_read(ap) 6170 struct vop_read_args /* { 6171 struct vnode *a_vp; 6172 struct uio *a_uio; 6173 int a_ioflag; 6174 struct ucred *a_cred; 6175 } */ *ap; 6176{ 6177 6178 return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), 6179 ap->a_cred, NULL)); 6180} 6181 6182static int 6183zfs_freebsd_write(ap) 6184 struct vop_write_args /* { 6185 struct vnode *a_vp; 6186 struct uio *a_uio; 6187 int a_ioflag; 6188 struct ucred *a_cred; 6189 } */ *ap; 6190{ 6191 6192 return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), 6193 ap->a_cred, NULL)); 6194} 6195 6196static int 6197zfs_freebsd_access(ap) 6198 struct vop_access_args /* { 6199 struct vnode *a_vp; 6200 accmode_t a_accmode; 6201 struct ucred *a_cred; 6202 struct thread *a_td; 6203 } */ *ap; 6204{ 6205 vnode_t *vp = ap->a_vp; 6206 znode_t *zp = VTOZ(vp); 6207 accmode_t accmode; 6208 int error = 0; 6209 6210 /* 6211 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, 6212 */ 6213 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); 6214 if (accmode != 0) 6215 error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); 6216 6217 /* 6218 * VADMIN has to be handled by vaccess(). 6219 */ 6220 if (error == 0) { 6221 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); 6222 if (accmode != 0) { 6223 error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, 6224 zp->z_gid, accmode, ap->a_cred, NULL); 6225 } 6226 } 6227 6228 /* 6229 * For VEXEC, ensure that at least one execute bit is set for 6230 * non-directories. 6231 */ 6232 if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && 6233 (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { 6234 error = EACCES; 6235 } 6236 6237 return (error); 6238} 6239 6240static int 6241zfs_freebsd_lookup(ap) 6242 struct vop_lookup_args /* { 6243 struct vnode *a_dvp; 6244 struct vnode **a_vpp; 6245 struct componentname *a_cnp; 6246 } */ *ap; 6247{ 6248 struct componentname *cnp = ap->a_cnp; 6249 char nm[NAME_MAX + 1]; 6250 6251 ASSERT(cnp->cn_namelen < sizeof(nm)); 6252 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); 6253 6254 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, 6255 cnp->cn_cred, cnp->cn_thread, 0)); 6256} 6257 6258static int 6259zfs_freebsd_create(ap) 6260 struct vop_create_args /* { 6261 struct vnode *a_dvp; 6262 struct vnode **a_vpp; 6263 struct componentname *a_cnp; 6264 struct vattr *a_vap; 6265 } */ *ap; 6266{ 6267 struct componentname *cnp = ap->a_cnp; 6268 vattr_t *vap = ap->a_vap; 6269 int error, mode; 6270 6271 ASSERT(cnp->cn_flags & SAVENAME); 6272 6273 vattr_init_mask(vap); 6274 mode = vap->va_mode & ALLPERMS; 6275 6276 error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, 6277 ap->a_vpp, cnp->cn_cred, cnp->cn_thread); 6278#ifdef FREEBSD_NAMECACHE 6279 if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) 6280 cache_enter(ap->a_dvp, *ap->a_vpp, cnp); 6281#endif 6282 return (error); 6283} 6284 6285static int 6286zfs_freebsd_remove(ap) 6287 struct vop_remove_args /* { 6288 struct vnode *a_dvp; 6289 struct vnode *a_vp; 6290 struct componentname *a_cnp; 6291 } */ *ap; 6292{ 6293 6294 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 6295 6296 return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr, 6297 ap->a_cnp->cn_cred, NULL, 0)); 6298} 6299 6300static int 6301zfs_freebsd_mkdir(ap) 6302 struct vop_mkdir_args /* { 6303 struct vnode *a_dvp; 6304 struct vnode **a_vpp; 6305 struct componentname *a_cnp; 6306 struct vattr *a_vap; 6307 } */ *ap; 6308{ 6309 vattr_t *vap = ap->a_vap; 6310 6311 ASSERT(ap->a_cnp->cn_flags & SAVENAME); 6312 6313 vattr_init_mask(vap); 6314 6315 return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, 6316 ap->a_cnp->cn_cred, NULL, 0, NULL)); 6317} 6318 6319static int 6320zfs_freebsd_rmdir(ap) 6321 struct vop_rmdir_args /* { 6322 struct vnode *a_dvp; 6323 struct vnode *a_vp; 6324 struct componentname *a_cnp; 6325 } */ *ap; 6326{ 6327 struct componentname *cnp = ap->a_cnp; 6328 6329 ASSERT(cnp->cn_flags & SAVENAME); 6330 6331 return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0)); 6332} 6333 6334static int 6335zfs_freebsd_readdir(ap) 6336 struct vop_readdir_args /* { 6337 struct vnode *a_vp; 6338 struct uio *a_uio; 6339 struct ucred *a_cred; 6340 int *a_eofflag; 6341 int *a_ncookies; 6342 u_long **a_cookies; 6343 } */ *ap; 6344{ 6345 6346 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, 6347 ap->a_ncookies, ap->a_cookies)); 6348} 6349 6350static int 6351zfs_freebsd_fsync(ap) 6352 struct vop_fsync_args /* { 6353 struct vnode *a_vp; 6354 int a_waitfor; 6355 struct thread *a_td; 6356 } */ *ap; 6357{ 6358 6359 vop_stdfsync(ap); 6360 return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); 6361} 6362 6363static int 6364zfs_freebsd_getattr(ap) 6365 struct vop_getattr_args /* { 6366 struct vnode *a_vp; 6367 struct vattr *a_vap; 6368 struct ucred *a_cred; 6369 } */ *ap; 6370{ 6371 vattr_t *vap = ap->a_vap; 6372 xvattr_t xvap; 6373 u_long fflags = 0; 6374 int error; 6375 6376 xva_init(&xvap); 6377 xvap.xva_vattr = *vap; 6378 xvap.xva_vattr.va_mask |= AT_XVATTR; 6379 6380 /* Convert chflags into ZFS-type flags. */ 6381 /* XXX: what about SF_SETTABLE?. */ 6382 XVA_SET_REQ(&xvap, XAT_IMMUTABLE); 6383 XVA_SET_REQ(&xvap, XAT_APPENDONLY); 6384 XVA_SET_REQ(&xvap, XAT_NOUNLINK); 6385 XVA_SET_REQ(&xvap, XAT_NODUMP); 6386 XVA_SET_REQ(&xvap, XAT_READONLY); 6387 XVA_SET_REQ(&xvap, XAT_ARCHIVE); 6388 XVA_SET_REQ(&xvap, XAT_SYSTEM); 6389 XVA_SET_REQ(&xvap, XAT_HIDDEN); 6390 XVA_SET_REQ(&xvap, XAT_REPARSE); 6391 XVA_SET_REQ(&xvap, XAT_OFFLINE); 6392 XVA_SET_REQ(&xvap, XAT_SPARSE); 6393 6394 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); 6395 if (error != 0) 6396 return (error); 6397 6398 /* Convert ZFS xattr into chflags. */ 6399#define FLAG_CHECK(fflag, xflag, xfield) do { \ 6400 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ 6401 fflags |= (fflag); \ 6402} while (0) 6403 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, 6404 xvap.xva_xoptattrs.xoa_immutable); 6405 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, 6406 xvap.xva_xoptattrs.xoa_appendonly); 6407 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, 6408 xvap.xva_xoptattrs.xoa_nounlink); 6409 FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, 6410 xvap.xva_xoptattrs.xoa_archive); 6411 FLAG_CHECK(UF_NODUMP, XAT_NODUMP, 6412 xvap.xva_xoptattrs.xoa_nodump); 6413 FLAG_CHECK(UF_READONLY, XAT_READONLY, 6414 xvap.xva_xoptattrs.xoa_readonly); 6415 FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, 6416 xvap.xva_xoptattrs.xoa_system); 6417 FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, 6418 xvap.xva_xoptattrs.xoa_hidden); 6419 FLAG_CHECK(UF_REPARSE, XAT_REPARSE, 6420 xvap.xva_xoptattrs.xoa_reparse); 6421 FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, 6422 xvap.xva_xoptattrs.xoa_offline); 6423 FLAG_CHECK(UF_SPARSE, XAT_SPARSE, 6424 xvap.xva_xoptattrs.xoa_sparse); 6425 6426#undef FLAG_CHECK 6427 *vap = xvap.xva_vattr; 6428 vap->va_flags = fflags; 6429 return (0); 6430} 6431 6432static int 6433zfs_freebsd_setattr(ap) 6434 struct vop_setattr_args /* { 6435 struct vnode *a_vp; 6436 struct vattr *a_vap; 6437 struct ucred *a_cred; 6438 } */ *ap; 6439{ 6440 vnode_t *vp = ap->a_vp; 6441 vattr_t *vap = ap->a_vap; 6442 cred_t *cred = ap->a_cred; 6443 xvattr_t xvap; 6444 u_long fflags; 6445 uint64_t zflags; 6446 6447 vattr_init_mask(vap); 6448 vap->va_mask &= ~AT_NOSET; 6449 6450 xva_init(&xvap); 6451 xvap.xva_vattr = *vap; 6452 6453 zflags = VTOZ(vp)->z_pflags; 6454 6455 if (vap->va_flags != VNOVAL) { 6456 zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; 6457 int error; 6458 6459 if (zfsvfs->z_use_fuids == B_FALSE) 6460 return (EOPNOTSUPP); 6461 6462 fflags = vap->va_flags; 6463 /* 6464 * XXX KDM 6465 * We need to figure out whether it makes sense to allow 6466 * UF_REPARSE through, since we don't really have other 6467 * facilities to handle reparse points and zfs_setattr() 6468 * doesn't currently allow setting that attribute anyway. 6469 */ 6470 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| 6471 UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| 6472 UF_OFFLINE|UF_SPARSE)) != 0) 6473 return (EOPNOTSUPP); 6474 /* 6475 * Unprivileged processes are not permitted to unset system 6476 * flags, or modify flags if any system flags are set. 6477 * Privileged non-jail processes may not modify system flags 6478 * if securelevel > 0 and any existing system flags are set. 6479 * Privileged jail processes behave like privileged non-jail 6480 * processes if the security.jail.chflags_allowed sysctl is 6481 * is non-zero; otherwise, they behave like unprivileged 6482 * processes. 6483 */ 6484 if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || 6485 priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) { 6486 if (zflags & 6487 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 6488 error = securelevel_gt(cred, 0); 6489 if (error != 0) 6490 return (error); 6491 } 6492 } else { 6493 /* 6494 * Callers may only modify the file flags on objects they 6495 * have VADMIN rights for. 6496 */ 6497 if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) 6498 return (error); 6499 if (zflags & 6500 (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { 6501 return (EPERM); 6502 } 6503 if (fflags & 6504 (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { 6505 return (EPERM); 6506 } 6507 } 6508 6509#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 6510 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 6511 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 6512 XVA_SET_REQ(&xvap, (xflag)); \ 6513 (xfield) = ((fflags & (fflag)) != 0); \ 6514 } \ 6515} while (0) 6516 /* Convert chflags into ZFS-type flags. */ 6517 /* XXX: what about SF_SETTABLE?. */ 6518 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 6519 xvap.xva_xoptattrs.xoa_immutable); 6520 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 6521 xvap.xva_xoptattrs.xoa_appendonly); 6522 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 6523 xvap.xva_xoptattrs.xoa_nounlink); 6524 FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, 6525 xvap.xva_xoptattrs.xoa_archive); 6526 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 6527 xvap.xva_xoptattrs.xoa_nodump); 6528 FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, 6529 xvap.xva_xoptattrs.xoa_readonly); 6530 FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, 6531 xvap.xva_xoptattrs.xoa_system); 6532 FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, 6533 xvap.xva_xoptattrs.xoa_hidden); 6534 FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, 6535 xvap.xva_xoptattrs.xoa_hidden); 6536 FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, 6537 xvap.xva_xoptattrs.xoa_offline); 6538 FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, 6539 xvap.xva_xoptattrs.xoa_sparse); 6540#undef FLAG_CHANGE 6541 } 6542 return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL)); 6543} 6544 6545static int 6546zfs_freebsd_rename(ap) 6547 struct vop_rename_args /* { 6548 struct vnode *a_fdvp; 6549 struct vnode *a_fvp; 6550 struct componentname *a_fcnp; 6551 struct vnode *a_tdvp; 6552 struct vnode *a_tvp; 6553 struct componentname *a_tcnp; 6554 } */ *ap; 6555{ 6556 vnode_t *fdvp = ap->a_fdvp; 6557 vnode_t *fvp = ap->a_fvp; 6558 vnode_t *tdvp = ap->a_tdvp; 6559 vnode_t *tvp = ap->a_tvp; 6560 int error; 6561 6562 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); 6563 ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); 6564 6565 /* 6566 * Check for cross-device rename. 6567 */ 6568 if ((fdvp->v_mount != tdvp->v_mount) || 6569 (tvp && (fdvp->v_mount != tvp->v_mount))) 6570 error = EXDEV; 6571 else 6572 error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp, 6573 ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0); 6574 if (tdvp == tvp) 6575 VN_RELE(tdvp); 6576 else 6577 VN_URELE(tdvp); 6578 if (tvp) 6579 VN_URELE(tvp); 6580 VN_RELE(fdvp); 6581 VN_RELE(fvp); 6582 6583 return (error); 6584} 6585 6586static int 6587zfs_freebsd_symlink(ap) 6588 struct vop_symlink_args /* { 6589 struct vnode *a_dvp; 6590 struct vnode **a_vpp; 6591 struct componentname *a_cnp; 6592 struct vattr *a_vap; 6593 char *a_target; 6594 } */ *ap; 6595{ 6596 struct componentname *cnp = ap->a_cnp; 6597 vattr_t *vap = ap->a_vap; 6598 6599 ASSERT(cnp->cn_flags & SAVENAME); 6600 6601 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ 6602 vattr_init_mask(vap); 6603 6604 return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, 6605 ap->a_target, cnp->cn_cred, cnp->cn_thread)); 6606} 6607 6608static int 6609zfs_freebsd_readlink(ap) 6610 struct vop_readlink_args /* { 6611 struct vnode *a_vp; 6612 struct uio *a_uio; 6613 struct ucred *a_cred; 6614 } */ *ap; 6615{ 6616 6617 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); 6618} 6619 6620static int 6621zfs_freebsd_link(ap) 6622 struct vop_link_args /* { 6623 struct vnode *a_tdvp; 6624 struct vnode *a_vp; 6625 struct componentname *a_cnp; 6626 } */ *ap; 6627{ 6628 struct componentname *cnp = ap->a_cnp; 6629 vnode_t *vp = ap->a_vp; 6630 vnode_t *tdvp = ap->a_tdvp; 6631 6632 if (tdvp->v_mount != vp->v_mount) 6633 return (EXDEV); 6634 6635 ASSERT(cnp->cn_flags & SAVENAME); 6636 6637 return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); 6638} 6639 6640static int 6641zfs_freebsd_inactive(ap) 6642 struct vop_inactive_args /* { 6643 struct vnode *a_vp; 6644 struct thread *a_td; 6645 } */ *ap; 6646{ 6647 vnode_t *vp = ap->a_vp; 6648 6649 zfs_inactive(vp, ap->a_td->td_ucred, NULL); 6650 return (0); 6651} 6652 6653static int 6654zfs_freebsd_reclaim(ap) 6655 struct vop_reclaim_args /* { 6656 struct vnode *a_vp; 6657 struct thread *a_td; 6658 } */ *ap; 6659{ 6660 vnode_t *vp = ap->a_vp; 6661 znode_t *zp = VTOZ(vp); 6662 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 6663 6664 ASSERT(zp != NULL); 6665 6666 /* Destroy the vm object and flush associated pages. */ 6667 vnode_destroy_vobject(vp); 6668 6669 /* 6670 * z_teardown_inactive_lock protects from a race with 6671 * zfs_znode_dmu_fini in zfsvfs_teardown during 6672 * force unmount. 6673 */ 6674 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 6675 if (zp->z_sa_hdl == NULL) 6676 zfs_znode_free(zp); 6677 else 6678 zfs_zinactive(zp); 6679 rw_exit(&zfsvfs->z_teardown_inactive_lock); 6680 6681 vp->v_data = NULL; 6682 return (0); 6683} 6684 6685static int 6686zfs_freebsd_fid(ap) 6687 struct vop_fid_args /* { 6688 struct vnode *a_vp; 6689 struct fid *a_fid; 6690 } */ *ap; 6691{ 6692 6693 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); 6694} 6695 6696static int 6697zfs_freebsd_pathconf(ap) 6698 struct vop_pathconf_args /* { 6699 struct vnode *a_vp; 6700 int a_name; 6701 register_t *a_retval; 6702 } */ *ap; 6703{ 6704 ulong_t val; 6705 int error; 6706 6707 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); 6708 if (error == 0) 6709 *ap->a_retval = val; 6710 else if (error == EOPNOTSUPP) 6711 error = vop_stdpathconf(ap); 6712 return (error); 6713} 6714 6715static int 6716zfs_freebsd_fifo_pathconf(ap) 6717 struct vop_pathconf_args /* { 6718 struct vnode *a_vp; 6719 int a_name; 6720 register_t *a_retval; 6721 } */ *ap; 6722{ 6723 6724 switch (ap->a_name) { 6725 case _PC_ACL_EXTENDED: 6726 case _PC_ACL_NFS4: 6727 case _PC_ACL_PATH_MAX: 6728 case _PC_MAC_PRESENT: 6729 return (zfs_freebsd_pathconf(ap)); 6730 default: 6731 return (fifo_specops.vop_pathconf(ap)); 6732 } 6733} 6734 6735/* 6736 * FreeBSD's extended attributes namespace defines file name prefix for ZFS' 6737 * extended attribute name: 6738 * 6739 * NAMESPACE PREFIX 6740 * system freebsd:system: 6741 * user (none, can be used to access ZFS fsattr(5) attributes 6742 * created on Solaris) 6743 */ 6744static int 6745zfs_create_attrname(int attrnamespace, const char *name, char *attrname, 6746 size_t size) 6747{ 6748 const char *namespace, *prefix, *suffix; 6749 6750 /* We don't allow '/' character in attribute name. */ 6751 if (strchr(name, '/') != NULL) 6752 return (EINVAL); 6753 /* We don't allow attribute names that start with "freebsd:" string. */ 6754 if (strncmp(name, "freebsd:", 8) == 0) 6755 return (EINVAL); 6756 6757 bzero(attrname, size); 6758 6759 switch (attrnamespace) { 6760 case EXTATTR_NAMESPACE_USER: 6761#if 0 6762 prefix = "freebsd:"; 6763 namespace = EXTATTR_NAMESPACE_USER_STRING; 6764 suffix = ":"; 6765#else 6766 /* 6767 * This is the default namespace by which we can access all 6768 * attributes created on Solaris. 6769 */ 6770 prefix = namespace = suffix = ""; 6771#endif 6772 break; 6773 case EXTATTR_NAMESPACE_SYSTEM: 6774 prefix = "freebsd:"; 6775 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; 6776 suffix = ":"; 6777 break; 6778 case EXTATTR_NAMESPACE_EMPTY: 6779 default: 6780 return (EINVAL); 6781 } 6782 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, 6783 name) >= size) { 6784 return (ENAMETOOLONG); 6785 } 6786 return (0); 6787} 6788 6789/* 6790 * Vnode operating to retrieve a named extended attribute. 6791 */ 6792static int 6793zfs_getextattr(struct vop_getextattr_args *ap) 6794/* 6795vop_getextattr { 6796 IN struct vnode *a_vp; 6797 IN int a_attrnamespace; 6798 IN const char *a_name; 6799 INOUT struct uio *a_uio; 6800 OUT size_t *a_size; 6801 IN struct ucred *a_cred; 6802 IN struct thread *a_td; 6803}; 6804*/ 6805{ 6806 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 6807 struct thread *td = ap->a_td; 6808 struct nameidata nd; 6809 char attrname[255]; 6810 struct vattr va; 6811 vnode_t *xvp = NULL, *vp; 6812 int error, flags; 6813 6814 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 6815 ap->a_cred, ap->a_td, VREAD); 6816 if (error != 0) 6817 return (error); 6818 6819 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 6820 sizeof(attrname)); 6821 if (error != 0) 6822 return (error); 6823 6824 ZFS_ENTER(zfsvfs); 6825 6826 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 6827 LOOKUP_XATTR); 6828 if (error != 0) { 6829 ZFS_EXIT(zfsvfs); 6830 return (error); 6831 } 6832 6833 flags = FREAD; 6834 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, 6835 xvp, td); 6836 error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); 6837 vp = nd.ni_vp; 6838 NDFREE(&nd, NDF_ONLY_PNBUF); 6839 if (error != 0) { 6840 ZFS_EXIT(zfsvfs); 6841 if (error == ENOENT) 6842 error = ENOATTR; 6843 return (error); 6844 } 6845 6846 if (ap->a_size != NULL) { 6847 error = VOP_GETATTR(vp, &va, ap->a_cred); 6848 if (error == 0) 6849 *ap->a_size = (size_t)va.va_size; 6850 } else if (ap->a_uio != NULL) 6851 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); 6852 6853 VOP_UNLOCK(vp, 0); 6854 vn_close(vp, flags, ap->a_cred, td); 6855 ZFS_EXIT(zfsvfs); 6856 6857 return (error); 6858} 6859 6860/* 6861 * Vnode operation to remove a named attribute. 6862 */ 6863int 6864zfs_deleteextattr(struct vop_deleteextattr_args *ap) 6865/* 6866vop_deleteextattr { 6867 IN struct vnode *a_vp; 6868 IN int a_attrnamespace; 6869 IN const char *a_name; 6870 IN struct ucred *a_cred; 6871 IN struct thread *a_td; 6872}; 6873*/ 6874{ 6875 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 6876 struct thread *td = ap->a_td; 6877 struct nameidata nd; 6878 char attrname[255]; 6879 struct vattr va; 6880 vnode_t *xvp = NULL, *vp; 6881 int error, flags; 6882 6883 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 6884 ap->a_cred, ap->a_td, VWRITE); 6885 if (error != 0) 6886 return (error); 6887 6888 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 6889 sizeof(attrname)); 6890 if (error != 0) 6891 return (error); 6892 6893 ZFS_ENTER(zfsvfs); 6894 6895 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 6896 LOOKUP_XATTR); 6897 if (error != 0) { 6898 ZFS_EXIT(zfsvfs); 6899 return (error); 6900 } 6901 6902 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, 6903 UIO_SYSSPACE, attrname, xvp, td); 6904 error = namei(&nd); 6905 vp = nd.ni_vp; 6906 if (error != 0) { 6907 ZFS_EXIT(zfsvfs); 6908 NDFREE(&nd, NDF_ONLY_PNBUF); 6909 if (error == ENOENT) 6910 error = ENOATTR; 6911 return (error); 6912 } 6913 6914 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); 6915 NDFREE(&nd, NDF_ONLY_PNBUF); 6916 6917 vput(nd.ni_dvp); 6918 if (vp == nd.ni_dvp) 6919 vrele(vp); 6920 else 6921 vput(vp); 6922 ZFS_EXIT(zfsvfs); 6923 6924 return (error); 6925} 6926 6927/* 6928 * Vnode operation to set a named attribute. 6929 */ 6930static int 6931zfs_setextattr(struct vop_setextattr_args *ap) 6932/* 6933vop_setextattr { 6934 IN struct vnode *a_vp; 6935 IN int a_attrnamespace; 6936 IN const char *a_name; 6937 INOUT struct uio *a_uio; 6938 IN struct ucred *a_cred; 6939 IN struct thread *a_td; 6940}; 6941*/ 6942{ 6943 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 6944 struct thread *td = ap->a_td; 6945 struct nameidata nd; 6946 char attrname[255]; 6947 struct vattr va; 6948 vnode_t *xvp = NULL, *vp; 6949 int error, flags; 6950 6951 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 6952 ap->a_cred, ap->a_td, VWRITE); 6953 if (error != 0) 6954 return (error); 6955 6956 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, 6957 sizeof(attrname)); 6958 if (error != 0) 6959 return (error); 6960 6961 ZFS_ENTER(zfsvfs); 6962 6963 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 6964 LOOKUP_XATTR | CREATE_XATTR_DIR); 6965 if (error != 0) { 6966 ZFS_EXIT(zfsvfs); 6967 return (error); 6968 } 6969 6970 flags = FFLAGS(O_WRONLY | O_CREAT); 6971 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, 6972 xvp, td); 6973 error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); 6974 vp = nd.ni_vp; 6975 NDFREE(&nd, NDF_ONLY_PNBUF); 6976 if (error != 0) { 6977 ZFS_EXIT(zfsvfs); 6978 return (error); 6979 } 6980 6981 VATTR_NULL(&va); 6982 va.va_size = 0; 6983 error = VOP_SETATTR(vp, &va, ap->a_cred); 6984 if (error == 0) 6985 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); 6986 6987 VOP_UNLOCK(vp, 0); 6988 vn_close(vp, flags, ap->a_cred, td); 6989 ZFS_EXIT(zfsvfs); 6990 6991 return (error); 6992} 6993 6994/* 6995 * Vnode operation to retrieve extended attributes on a vnode. 6996 */ 6997static int 6998zfs_listextattr(struct vop_listextattr_args *ap) 6999/* 7000vop_listextattr { 7001 IN struct vnode *a_vp; 7002 IN int a_attrnamespace; 7003 INOUT struct uio *a_uio; 7004 OUT size_t *a_size; 7005 IN struct ucred *a_cred; 7006 IN struct thread *a_td; 7007}; 7008*/ 7009{ 7010 zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; 7011 struct thread *td = ap->a_td; 7012 struct nameidata nd; 7013 char attrprefix[16]; 7014 u_char dirbuf[sizeof(struct dirent)]; 7015 struct dirent *dp; 7016 struct iovec aiov; 7017 struct uio auio, *uio = ap->a_uio; 7018 size_t *sizep = ap->a_size; 7019 size_t plen; 7020 vnode_t *xvp = NULL, *vp; 7021 int done, error, eof, pos; 7022 7023 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, 7024 ap->a_cred, ap->a_td, VREAD); 7025 if (error != 0) 7026 return (error); 7027 7028 error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, 7029 sizeof(attrprefix)); 7030 if (error != 0) 7031 return (error); 7032 plen = strlen(attrprefix); 7033 7034 ZFS_ENTER(zfsvfs); 7035 7036 if (sizep != NULL) 7037 *sizep = 0; 7038 7039 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, 7040 LOOKUP_XATTR); 7041 if (error != 0) { 7042 ZFS_EXIT(zfsvfs); 7043 /* 7044 * ENOATTR means that the EA directory does not yet exist, 7045 * i.e. there are no extended attributes there. 7046 */ 7047 if (error == ENOATTR) 7048 error = 0; 7049 return (error); 7050 } 7051 7052 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, 7053 UIO_SYSSPACE, ".", xvp, td); 7054 error = namei(&nd); 7055 vp = nd.ni_vp; 7056 NDFREE(&nd, NDF_ONLY_PNBUF); 7057 if (error != 0) { 7058 ZFS_EXIT(zfsvfs); 7059 return (error); 7060 } 7061 7062 auio.uio_iov = &aiov; 7063 auio.uio_iovcnt = 1; 7064 auio.uio_segflg = UIO_SYSSPACE; 7065 auio.uio_td = td; 7066 auio.uio_rw = UIO_READ; 7067 auio.uio_offset = 0; 7068 7069 do { 7070 u_char nlen; 7071 7072 aiov.iov_base = (void *)dirbuf; 7073 aiov.iov_len = sizeof(dirbuf); 7074 auio.uio_resid = sizeof(dirbuf); 7075 error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); 7076 done = sizeof(dirbuf) - auio.uio_resid; 7077 if (error != 0) 7078 break; 7079 for (pos = 0; pos < done;) { 7080 dp = (struct dirent *)(dirbuf + pos); 7081 pos += dp->d_reclen; 7082 /* 7083 * XXX: Temporarily we also accept DT_UNKNOWN, as this 7084 * is what we get when attribute was created on Solaris. 7085 */ 7086 if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) 7087 continue; 7088 if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) 7089 continue; 7090 else if (strncmp(dp->d_name, attrprefix, plen) != 0) 7091 continue; 7092 nlen = dp->d_namlen - plen; 7093 if (sizep != NULL) 7094 *sizep += 1 + nlen; 7095 else if (uio != NULL) { 7096 /* 7097 * Format of extattr name entry is one byte for 7098 * length and the rest for name. 7099 */ 7100 error = uiomove(&nlen, 1, uio->uio_rw, uio); 7101 if (error == 0) { 7102 error = uiomove(dp->d_name + plen, nlen, 7103 uio->uio_rw, uio); 7104 } 7105 if (error != 0) 7106 break; 7107 } 7108 } 7109 } while (!eof && error == 0); 7110 7111 vput(vp); 7112 ZFS_EXIT(zfsvfs); 7113 7114 return (error); 7115} 7116 7117int 7118zfs_freebsd_getacl(ap) 7119 struct vop_getacl_args /* { 7120 struct vnode *vp; 7121 acl_type_t type; 7122 struct acl *aclp; 7123 struct ucred *cred; 7124 struct thread *td; 7125 } */ *ap; 7126{ 7127 int error; 7128 vsecattr_t vsecattr; 7129 7130 if (ap->a_type != ACL_TYPE_NFS4) 7131 return (EINVAL); 7132 7133 vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; 7134 if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) 7135 return (error); 7136 7137 error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); 7138 if (vsecattr.vsa_aclentp != NULL) 7139 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); 7140 7141 return (error); 7142} 7143 7144int 7145zfs_freebsd_setacl(ap) 7146 struct vop_setacl_args /* { 7147 struct vnode *vp; 7148 acl_type_t type; 7149 struct acl *aclp; 7150 struct ucred *cred; 7151 struct thread *td; 7152 } */ *ap; 7153{ 7154 int error; 7155 vsecattr_t vsecattr; 7156 int aclbsize; /* size of acl list in bytes */ 7157 aclent_t *aaclp; 7158 7159 if (ap->a_type != ACL_TYPE_NFS4) 7160 return (EINVAL); 7161 7162 if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) 7163 return (EINVAL); 7164 7165 /* 7166 * With NFSv4 ACLs, chmod(2) may need to add additional entries, 7167 * splitting every entry into two and appending "canonical six" 7168 * entries at the end. Don't allow for setting an ACL that would 7169 * cause chmod(2) to run out of ACL entries. 7170 */ 7171 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) 7172 return (ENOSPC); 7173 7174 error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); 7175 if (error != 0) 7176 return (error); 7177 7178 vsecattr.vsa_mask = VSA_ACE; 7179 aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); 7180 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); 7181 aaclp = vsecattr.vsa_aclentp; 7182 vsecattr.vsa_aclentsz = aclbsize; 7183 7184 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); 7185 error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); 7186 kmem_free(aaclp, aclbsize); 7187 7188 return (error); 7189} 7190 7191int 7192zfs_freebsd_aclcheck(ap) 7193 struct vop_aclcheck_args /* { 7194 struct vnode *vp; 7195 acl_type_t type; 7196 struct acl *aclp; 7197 struct ucred *cred; 7198 struct thread *td; 7199 } */ *ap; 7200{ 7201 7202 return (EOPNOTSUPP); 7203} 7204 7205struct vop_vector zfs_vnodeops; 7206struct vop_vector zfs_fifoops; 7207struct vop_vector zfs_shareops; 7208 7209struct vop_vector zfs_vnodeops = { 7210 .vop_default = &default_vnodeops, 7211 .vop_inactive = zfs_freebsd_inactive, 7212 .vop_reclaim = zfs_freebsd_reclaim, 7213 .vop_access = zfs_freebsd_access, 7214#ifdef FREEBSD_NAMECACHE 7215 .vop_lookup = vfs_cache_lookup, 7216 .vop_cachedlookup = zfs_freebsd_lookup, 7217#else 7218 .vop_lookup = zfs_freebsd_lookup, 7219#endif 7220 .vop_getattr = zfs_freebsd_getattr, 7221 .vop_setattr = zfs_freebsd_setattr, 7222 .vop_create = zfs_freebsd_create, 7223 .vop_mknod = zfs_freebsd_create, 7224 .vop_mkdir = zfs_freebsd_mkdir, 7225 .vop_readdir = zfs_freebsd_readdir, 7226 .vop_fsync = zfs_freebsd_fsync, 7227 .vop_open = zfs_freebsd_open, 7228 .vop_close = zfs_freebsd_close, 7229 .vop_rmdir = zfs_freebsd_rmdir, 7230 .vop_ioctl = zfs_freebsd_ioctl, 7231 .vop_link = zfs_freebsd_link, 7232 .vop_symlink = zfs_freebsd_symlink, 7233 .vop_readlink = zfs_freebsd_readlink, 7234 .vop_read = zfs_freebsd_read, 7235 .vop_write = zfs_freebsd_write, 7236 .vop_remove = zfs_freebsd_remove, 7237 .vop_rename = zfs_freebsd_rename, 7238 .vop_pathconf = zfs_freebsd_pathconf, 7239 .vop_bmap = zfs_freebsd_bmap, 7240 .vop_fid = zfs_freebsd_fid, 7241 .vop_getextattr = zfs_getextattr, 7242 .vop_deleteextattr = zfs_deleteextattr, 7243 .vop_setextattr = zfs_setextattr, 7244 .vop_listextattr = zfs_listextattr, 7245 .vop_getacl = zfs_freebsd_getacl, 7246 .vop_setacl = zfs_freebsd_setacl, 7247 .vop_aclcheck = zfs_freebsd_aclcheck, 7248 .vop_getpages = zfs_freebsd_getpages, 7249 .vop_putpages = zfs_freebsd_putpages, 7250}; 7251 7252struct vop_vector zfs_fifoops = { 7253 .vop_default = &fifo_specops, 7254 .vop_fsync = zfs_freebsd_fsync, 7255 .vop_access = zfs_freebsd_access, 7256 .vop_getattr = zfs_freebsd_getattr, 7257 .vop_inactive = zfs_freebsd_inactive, 7258 .vop_read = VOP_PANIC, 7259 .vop_reclaim = zfs_freebsd_reclaim, 7260 .vop_setattr = zfs_freebsd_setattr, 7261 .vop_write = VOP_PANIC, 7262 .vop_pathconf = zfs_freebsd_fifo_pathconf, 7263 .vop_fid = zfs_freebsd_fid, 7264 .vop_getacl = zfs_freebsd_getacl, 7265 .vop_setacl = zfs_freebsd_setacl, 7266 .vop_aclcheck = zfs_freebsd_aclcheck, 7267}; 7268 7269/* 7270 * special share hidden files vnode operations template 7271 */ 7272struct vop_vector zfs_shareops = { 7273 .vop_default = &default_vnodeops, 7274 .vop_access = zfs_freebsd_access, 7275 .vop_inactive = zfs_freebsd_inactive, 7276 .vop_reclaim = zfs_freebsd_reclaim, 7277 .vop_fid = zfs_freebsd_fid, 7278 .vop_pathconf = zfs_freebsd_pathconf, 7279}; 7280